Guest User

Untitled

a guest
Jul 21st, 2018
102
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.23 KB | None | 0 0
  1. #!/usr/bin/env python
  2. import urllib2
  3. import psycopg2
  4. from time import strftime
  5. from datetime import date, timedelta, datetime
  6. from geopy import geocoders
  7. from BeautifulSoup import BeautifulSoup
  8. from multiprocessing import Pool, cpu_count
  9.  
  10. conn = psycopg2.connect("dbname=mydb user=ideamonk password=geodb")
  11. cur = conn.cursor()
  12. today = (date.today()).strftime("%a %b %d") # think west, think yesterday
  13. sql_today = (date.today() - timedelta(1)).strftime("%Y-%m-%d")
  14. sql_time = (datetime.now()).strftime("%H:%M:%S")
  15.  
  16. def updateStats(id,url, old_count, nesting_level):
  17. stat_count = old_count
  18. if nesting_level==0:
  19. try:
  20. soup = BeautifulSoup(urllib2.urlopen(url).read())
  21. except:
  22. print "Fetch FAIL"
  23. return (id, stat_count)
  24. else:
  25. try:
  26. soup = BeautifulSoup(urllib2.urlopen("%s/index%d.html"%(url,nesting_level)).read())
  27. except:
  28. print "Fetch FAIL"
  29. return (id, stat_count)
  30.  
  31. masthead = soup.findAll('h4')
  32.  
  33. if len(masthead)>0:
  34. if masthead[0].contents[0].strip() != today and len(masthead[0].contents[0].strip())>3:
  35. print "quitting at level ", nesting_level, masthead[0].contents[0].strip(), today, url
  36. return (id, stat_count)
  37.  
  38. start = masthead[0].nextSibling
  39. natural_break = False
  40.  
  41. while start:
  42. if str(type(start)) == "<class 'BeautifulSoup.Tag'>":
  43. if start.name=="p":
  44. stat_count+=1
  45. if start.name=="h4":
  46. natural_break = True
  47. break
  48. start = start.nextSibling
  49.  
  50. if soup.find('a', {'href':'index%s.html' % (nesting_level+100)}) and not natural_break:
  51. print "nesting in", nesting_level+100
  52. return updateStats(id, url, stat_count, nesting_level+100)
  53.  
  54. print url
  55. return (id, stat_count)
  56.  
  57. cur.execute("select * from craigs_statistics;")
  58. city = cur.fetchone()
  59.  
  60. city_list = []
  61. while city:
  62. city_list.append( (city[0], city[3] + "/sss", 0, 0) )
  63. city = cur.fetchone()
  64.  
  65. ''' # uniprocessing random fluke test
  66. r=46
  67. print updateStats(city_list[r][0],city_list[r][1],city_list[r][2],city_list[r][3])
  68. '''
  69.  
  70. pool = Pool(processes=10)
  71. results = [pool.apply_async(updateStats, a) for a in city_list]
  72. results = [r.get() for r in results]
  73.  
  74. for id,count in results:
  75. cur.execute("INSERT into craigs_timed(city_id_id, date, time, count) VALUES (%d,'%s','%s',%d);" % (id,sql_today, sql_time,count))
  76. conn.commit()
  77. print "done"
Add Comment
Please, Sign In to add comment