#!/usr/bin/env python
import urllib2
import psycopg2
from datetime import date, timedelta, datetime
from BeautifulSoup import BeautifulSoup, Tag
from multiprocessing import Pool

conn = psycopg2.connect("dbname=mydb user=ideamonk password=geodb")
cur = conn.cursor()
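
# Assumed table shapes, inferred from the queries below (not spelled out in
# the original paste): craigs_statistics rows carry the city id in column 0
# and the site's base URL in column 3; craigs_timed takes
# (city_id_id, date, time, count).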

today = date.today().strftime("%a %b %d")  # think west, think yesterday
sql_today = (date.today() - timedelta(1)).strftime("%Y-%m-%d")
sql_time = datetime.now().strftime("%H:%M:%S")
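
# A sketch of the values these produce, for a hypothetical run late on
# 2010-03-09 (actual values depend on the run date):
#   today     -> "Tue Mar 09"   (matches Craigslist's <h4> date mastheads)
#   sql_today -> "2010-03-08"
#   sql_time  -> "23:58:41"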

def updateStats(id, url, old_count, nesting_level):
    """Count today's <p> listings for one city, following the
    index100.html, index200.html, ... pagination recursively."""
    stat_count = old_count
    if nesting_level == 0:
        try:
            soup = BeautifulSoup(urllib2.urlopen(url).read())
        except Exception:
            print "Fetch FAIL"
            return (id, stat_count)
    else:
        try:
            soup = BeautifulSoup(urllib2.urlopen("%s/index%d.html" % (url, nesting_level)).read())
        except Exception:
            print "Fetch FAIL"
            return (id, stat_count)

    masthead = soup.findAll('h4')
    if len(masthead) > 0:
        # Stop if the first date masthead on this page is no longer today.
        if masthead[0].contents[0].strip() != today and len(masthead[0].contents[0].strip()) > 3:
            print "quitting at level ", nesting_level, masthead[0].contents[0].strip(), today, url
            return (id, stat_count)
        # Count <p> siblings until the next <h4> (an older day) breaks the run.
        start = masthead[0].nextSibling
        natural_break = False
        while start:
            if isinstance(start, Tag):
                if start.name == "p":
                    stat_count += 1
                if start.name == "h4":
                    natural_break = True
                    break
            start = start.nextSibling
        # Craigslist paginates in steps of 100: index100.html, index200.html, ...
        if soup.find('a', {'href': 'index%s.html' % (nesting_level + 100)}) and not natural_break:
            print "nesting in", nesting_level + 100
            return updateStats(id, url, stat_count, nesting_level + 100)

    print url
    return (id, stat_count)
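
# The traversal assumes Craigslist's circa-2010 "sss" listing markup, roughly
# (a sketch, not an exact capture of any one city page):
#
#   <h4>Tue Mar 09</h4>
#   <p>... listing ...</p>
#   <p>... listing ...</p>
#   <h4>Mon Mar 08</h4>          <-- older day reached: natural_break
#   ...
#   <a href="index100.html">next 100 postings</a>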
- cur.execute("select * from craigs_statistics;")
- city = cur.fetchone()
- city_list = []
- while city:
- city_list.append( (city[0], city[3] + "/sss", 0, 0) )
- city = cur.fetchone()
- ''' # uniprocessing random fluke test
- r=46
- print updateStats(city_list[r][0],city_list[r][1],city_list[r][2],city_list[r][3])
- '''

    # Fan the fetches out across worker processes; each call returns (id, count).
    pool = Pool(processes=10)
    results = [pool.apply_async(updateStats, a) for a in city_list]
    results = [r.get() for r in results]

    for id, count in results:
        # Let psycopg2 escape the parameters instead of Python % formatting.
        cur.execute(
            "INSERT into craigs_timed(city_id_id, date, time, count) VALUES (%s, %s, %s, %s);",
            (id, sql_today, sql_time, count))

    conn.commit()
    print "done"