Advertisement
Guest User

Untitled

a guest
Jul 18th, 2019
69
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.57 KB | None | 0 0
  # Calendar range covered by the scrape: every month of 2015 through 2018.
  years = list(range(2015, 2019))
  months = list(range(1, 13))
  3. import requests
  4. from bs4 import BeautifulSoup as bsp
  def a2m(ahi):
      """Map an anchor tag to a dict describing the game it belongs to.

      Args:
          ahi: An anchor element (BeautifulSoup tag) that has an ``href``
              attribute, a parent containing an ``h2`` and a grandparent
              containing an ``h3``.

      Returns:
          dict with four keys:
              'game'   -- text of the ``h2`` inside the anchor's parent,
              'time'   -- text of the node two siblings before the anchor,
              'url'    -- the anchor's ``href`` value,
              'tvalue' -- text of the ``h3`` inside the anchor's grandparent.
      """
      container = ahi.parent
      return {
          'game': container.h2.text,
          # Two hops back: presumably the first hop is a whitespace/text node
          # sitting between the time element and the link -- TODO confirm
          # against the live page markup.
          'time': ahi.previous_sibling.previous_sibling.text,
          'url': ahi['href'],
          'tvalue': container.parent.h3.text,
      }
  def saveobjs(objs, at="./export.csv"):
      """Append rows to a CSV file (the "db handler", CSV for now).

      Each value is converted with ``str`` and written through ``csv.writer``
      so that fields containing commas, quotes or newlines are quoted instead
      of corrupting the column layout. An empty batch writes nothing.

      Args:
          objs: Iterable of rows; each row is an iterable of values written
              in order as one record.
          at: Path of the file to append to; it is created if missing.

      Returns:
          1, always, so callers can use the call as a truthy success flag.
      """
      import csv  # local import so this block does not rely on file-level imports

      with open(at, 'a') as f:
          # lineterminator='\n' keeps the one-record-per-line layout that the
          # previous hand-joined output produced.
          writer = csv.writer(f, lineterminator='\n')
          writer.writerows([str(value) for value in row] for row in objs)
      return 1
  def scrapeHomePage():
      """Collect the "Record Chart" links from the site's home page.

      Downloads https://sattakingdarbar.com/, keeps every anchor whose text is
      exactly "Record Chart", and converts each one with ``a2m``.

      Returns:
          dict mapping each link's URL to ``[game, time, tvalue]`` as produced
          by ``a2m``. Later anchors with the same URL overwrite earlier ones.

      Raises:
          requests.exceptions.RequestException: if the page cannot be fetched,
              including when the server does not answer within the timeout.
      """
      # Without a timeout requests can block forever on an unresponsive server.
      response = requests.get('https://sattakingdarbar.com/', timeout=30)
      soup = bsp(response.text, 'html.parser')
      # href=True skips anchors that have no href, which a2m would choke on.
      chart_links = (
          anchor for anchor in soup.findAll('a', href=True)
          if anchor.text == "Record Chart"
      )
      return {
          info['url']: [info['game'], info['time'], info['tvalue']]
          for info in map(a2m, chart_links)
      }
  22.  
  def urlpath(url):
      """Return *url* with everything after the path removed.

      On Python 3 the scheme, network location and path are kept while the
      params, query string and fragment are blanked out. On Python 2 the URL
      is cut at its query string via ``urllib.splitquery``.
      """
      import sys

      running_py3 = sys.version_info[0] >= 3
      if running_py3:
          from urllib.parse import urlparse, urlunparse

          scheme, netloc, path = urlparse(url)[:3]
          return urlunparse((scheme, netloc, path, '', '', ''))

      import urllib
      base, _query = urllib.splitquery(url)
      return base
  31.  
  def scrapeThisPage(url):
      """Scrape the monthly record tables of one chart page.

      The query string of *url* is dropped (via ``urlpath``) and the page is
      requested once for every combination of the module-level ``years`` and
      ``months`` using ``?month=<m>&year=<y>``.

      On each page the ``td.number`` cells are read in document order and
      treated as a row-major grid: one row per ``td.day`` cell and one column
      per ``th.name`` header.

      Args:
          url: Address of a chart page; any existing query string is ignored.

      Returns:
          list of rows ``[day_text, month, year, name_text, value_text, '']``.

      Raises:
          requests.exceptions.RequestException: if a page cannot be fetched,
              including when the server does not answer within the timeout.
      """
      rows = []
      hitu = urlpath(url) + "?month={}&year={}"
      for y in years:
          for m in months:
              # Without a timeout requests can block forever on a dead server.
              response = requests.get(hitu.format(m, y), timeout=30)
              soup = bsp(response.text, 'html.parser')
              dates = soup.findAll('td', attrs={'class': 'day'})
              vals = soup.findAll('td', attrs={'class': 'number'})
              names = soup.findAll('th', attrs={'class': 'name'})
              columns = len(names)
              if not columns:
                  # No header cells: the grid cannot be indexed, and dividing
                  # by zero below would abort the whole scrape.
                  continue
              for i, cell in enumerate(vals):
                  day_index, name_index = divmod(i, columns)
                  if day_index >= len(dates):
                      # More value cells than day rows: ignore the surplus
                      # rather than failing with IndexError.
                      break
                  rows.append([
                      dates[day_index].text,
                      m,
                      y,
                      names[name_index].text,
                      cell.text,
                      '',
                  ])
      return rows
  46.  
  def scrapAll():
      """Scrape every chart linked from the home page and persist the rows.

      Links come from ``scrapeHomePage``; each one is scraped with
      ``scrapeThisPage`` and the rows are buffered. The buffer is written with
      ``saveobjs`` whenever it holds more than 500 rows, and once more at the
      end so that the final partial batch is not lost.
      """
      pending = []
      for link in scrapeHomePage():
          pending += scrapeThisPage(link)
          if len(pending) > 500 and saveobjs(pending):
              pending = []
      # Flush the remainder: without this, whatever is still buffered after
      # the last link (up to 500 rows) would never reach the file.
      if pending:
          saveobjs(pending)
  # Script entry point: starts the full scrape immediately. The call sits at
  # module level, so it also fires if this file is imported.
  53. scrapAll()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement