Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # -*- coding: utf-8 -*-
- #-------------------------------------------------------------------------------
- # Name: Foursquare checkins crawler-processor
- # Author: Alexander Tolmach
- # Created: 15.03.2013
- # Licence: CC BY-SA
- #-------------------------------------------------------------------------------
- #!/usr/bin/env python
- import re
- import time
- import datetime
- import urlparse
- from urllib2 import urlopen
- import json
- import PySQLPool
- import MySQLdb
- import tweetstream
- import threading
- # Twitter streaming credentials (the values look like placeholders) and the area to follow
- twitter_login = 'TWITTER_LOGIN'
- twitter_password = 'TWITTER_PASSWORD'
- twitter_locations = ['37.286,55.438', '37.984,55.99'] # Moscow region coordinates: two corner strings, apparently in "longitude,latitude" order
- # bit.ly API credentials, read by expandlink()
- bitly_login = 'BITLY_LOGIN'
- bitly_apikey = 'BITLY_API_KEY'
- # Foursquare API credentials, read by checkfoursquare()
- fsq_clientid = 'FOURSQUARE_CLINET_ID' # NOTE(review): the placeholder text is spelled "CLINET"
- fsq_clientsecret = 'FOURSQUARE_CLIENT_SECRET'
- # ReKognition API credentials, read by facerekognition()
- rek_key = 'REKOGNITION_KEY'
- rek_secret = 'REKOGNITION_SECRET'
- # MySQL: one module-wide PySQLPool connection object used by every worker thread below
- PySQLPool.getNewPool().maxActiveConnections = 1 # sets maxActiveConnections to 1 on the pool returned by getNewPool()
- db_connection = PySQLPool.getNewConnection(username='DB_USER', password='DB_PASSWORD', host='DB_SERVER', db='DB_NAME')
class TwitterThread(threading.Thread):
    """Harvest Foursquare check-in links from the Twitter filter stream.

    Opens a ``tweetstream.FilterStream`` restricted to ``twitter_locations``
    and stores every ``4sq.com`` link found in a message's URL entities as a
    new row of the ``checkins`` table (``twitlink`` column).  Any failure is
    reported on stdout and followed by a reconnect after a growing pause.
    """

    # Bounds, in seconds, for the pause before reconnecting after a failure.
    _BACKOFF_START = 60
    _BACKOFF_MAX = 60 * 60

    @staticmethod
    def _checkin_links(tweet):
        """Return the expanded ``4sq.com`` URLs carried by one stream message.

        Messages that have no URL entities, and entities whose
        ``expanded_url`` is missing or empty, contribute nothing, so a
        message of an unexpected shape no longer forces a reconnect.
        """
        if not isinstance(tweet, dict):
            return []
        entities = tweet.get('entities') or {}
        links = []
        for element in entities.get('urls') or ():
            expanded = element.get('expanded_url') or ''
            if '4sq.com' in expanded:
                links.append(expanded)
        return links

    def run(self):
        """Consume the stream forever, reconnecting with capped back-off."""
        passtime = self._BACKOFF_START
        while True:
            try:
                stream = tweetstream.FilterStream(
                    twitter_login, twitter_password, locations=twitter_locations)
                for tweet in stream:
                    # Reset the back-off only once data actually arrives;
                    # resetting right after constructing the stream object
                    # would defeat the back-off if the connection is opened
                    # lazily on first iteration.
                    passtime = self._BACKOFF_START
                    for link in self._checkin_links(tweet):
                        query = PySQLPool.getNewQuery(db_connection, commitOnEnd=True)
                        # Bound parameter: the URL comes from untrusted tweets.
                        query.Query('INSERT INTO checkins(twitlink) VALUES (%s);', (link,))
            except Exception as exc:
                # Reconnect boundary: report, wait, then try again.
                print('ERROR: {0!r}'.format(exc))
                time.sleep(passtime)
                passtime = min(passtime * 2, self._BACKOFF_MAX)
class BitlyThread(threading.Thread):
    """Resolve stored short links into Foursquare check-in id and signature.

    Repeatedly takes one ``checkins`` row with ``twitunboxed=0``, expands its
    ``twitlink`` through ``expandlink()`` and writes the result back, setting
    ``twitunboxed`` to 1 on success or 9 when the expansion failed.
    """

    def run(self):
        """Process one pending row per iteration, forever."""
        while True:
            query = PySQLPool.getNewQuery(db_connection, commitOnEnd=True)
            query.Query('SELECT * FROM checkins WHERE twitunboxed=0 LIMIT 1;')
            if not query.record:
                # Nothing waiting: check again in a minute.
                time.sleep(60)
                continue
            dbrow = query.record[0]
            checkinid, signature = expandlink(dbrow['twitlink'])
            twitstat = 9 if checkinid == 'error' else 1
            # Values originate from an external API response, so they are
            # passed as bound parameters instead of being formatted into SQL.
            query.Query(
                'UPDATE checkins SET twitunboxed=%s, checkinid=%s, signature=%s WHERE twitid=%s;',
                (twitstat, checkinid, signature, dbrow['twitid']))
            # Short pause between two consecutive expansions.
            time.sleep(3)
class FoursquareThread(threading.Thread):
    """Enrich expanded check-ins with Foursquare user and venue data.

    Repeatedly takes one ``checkins`` row with ``twitunboxed=1``, looks the
    check-in up through ``checkfoursquare()``, stores user id, venue id and
    check-in time on the row (``twitunboxed`` becomes 2, or 9 when no venue
    could be read) and adds the venue and the user to the ``venues`` and
    ``people`` tables if they are not there yet.
    """

    # Minimum wall-clock seconds spent per processed row (request throttle).
    _MIN_INTERVAL = 8

    def run(self):
        """Process one expanded check-in per iteration, forever."""
        while True:
            query = PySQLPool.getNewQuery(db_connection, commitOnEnd=True)
            query.Query('SELECT * FROM checkins WHERE twitunboxed=1 LIMIT 1;')
            if not query.record:
                time.sleep(60)
                continue
            dbrow = query.record[0]

            # Wall-clock start; time.clock() measures CPU time on Unix and
            # would make the throttle below ignore time spent waiting on I/O.
            started = time.time()
            checkindict, userdict, venuedict = checkfoursquare(dbrow['checkinid'], dbrow['signature'])
            # checkindict = {'time': ..., 'venueid': ..., 'userid': ...}
            # userdict    = {'id': ..., 'photo': ...}
            # venuedict   = {'id', 'name', 'lat', 'lng', 'cat1', 'cat2'}
            failed = venuedict['id'] == 'error'

            # All values come from an external API: bind them, never format.
            query.Query(
                'UPDATE checkins SET twitunboxed=%s, userid=%s, venueid=%s, chekintime=%s WHERE twitid=%s;',
                (9 if failed else 2,
                 checkindict['userid'],
                 checkindict['venueid'],
                 checkindict['time'],
                 dbrow['twitid']))

            # Do not store the 'error' sentinel as if it were a real venue.
            if not failed:
                query.Query('SELECT * FROM venues WHERE venueid=%s;', (venuedict['id'],))
                if not query.record:
                    query.Query(
                        'INSERT INTO venues(venueid, name, lat, lng, cat1, cat2) VALUES (%s, %s, %s, %s, %s, %s);',
                        (venuedict['id'],
                         venuedict['name'].encode('utf-8', 'replace'),
                         venuedict['lat'],
                         venuedict['lng'],
                         venuedict['cat1'],
                         venuedict['cat2']))

            # Same for the user: skip the 'error' sentinel id.
            if userdict['id'] != 'error':
                query.Query('SELECT * FROM people WHERE userid=%s;', (userdict['id'],))
                if not query.record:
                    query.Query(
                        'INSERT INTO people(userid, photo) VALUES (%s, %s);',
                        (userdict['id'], userdict['photo']))

            # Compute the remaining time once so the sleep argument can never
            # become negative between the check and the call.
            remaining = self._MIN_INTERVAL - (time.time() - started)
            if remaining > 0:
                time.sleep(remaining)
class MonitorThread(threading.Thread):
    """Write an hourly snapshot of the pipeline counters to ``stats``.

    Each snapshot holds the number of check-ins per processing state, the
    number of known venues and users, and a timestamp shifted four hours
    ahead of UTC.
    """

    def run(self):
        """Collect the counters, insert one ``stats`` row, sleep an hour."""
        while True:
            query = PySQLPool.getNewQuery(db_connection, commitOnEnd=True)

            # Row counts keyed by the twitunboxed state; states that do not
            # occur in the table keep a count of zero.
            tally = {0: 0, 1: 0, 2: 0, 9: 0}
            query.Query('SELECT twitunboxed, COUNT(*) FROM checkins GROUP BY twitunboxed;')
            for row in query.record:
                state = row['twitunboxed']
                if state in tally:
                    tally[state] = row['COUNT(*)']

            query.Query('SELECT COUNT(*) FROM venues;')
            venue_total = query.record[0]['COUNT(*)']
            query.Query('SELECT COUNT(*) FROM people;')
            user_total = query.record[0]['COUNT(*)']

            shifted_now = time.gmtime(time.time() + (4 * 60 * 60))
            stamp = time.strftime('%d.%m.%Y %H:%M:%S', shifted_now)

            # Column order: timer, checkins (state 2), users, venues,
            # waiting (states 1 and 0 together), errors (state 9).
            values = (stamp, tally[2], user_total, venue_total, tally[1] + tally[0], tally[9])
            query.Query('INSERT INTO stats(timer,checkins,users,venues,waiting,errors) VALUES ("{0}","{1}","{2}","{3}","{4}","{5}");'.format(*values))

            time.sleep(60 * 60)
class ReKognitionThread(threading.Thread):
    """Estimate age and gender for stored users from their profile photo.

    Repeatedly takes one ``people`` row whose ``age_prob`` is NULL and whose
    photo is not the ``"error"`` marker, sends the photo URL to
    ``facerekognition()`` and stores the returned estimates on the row.
    """

    def run(self):
        """Process at most one user every three minutes, forever."""
        while True:
            query = PySQLPool.getNewQuery(db_connection, commitOnEnd=True)
            query.Query('SELECT * FROM people WHERE age_prob is NULL and not(photo = "error") LIMIT 1;')
            if query.record:
                dbrow = query.record[0]
                facedict = facerekognition('https://is1.4sqi.net/userpix/{0}'.format(dbrow['photo']))
                # Bound parameters: userid is stored as a string elsewhere in
                # this file, so it must not be spliced into the SQL unquoted.
                query.Query(
                    'UPDATE people SET age=%s, age_prob=%s, gender=%s, gender_prob=%s WHERE userid=%s;',
                    (facedict['age'],
                     facedict['age_prob'],
                     facedict['sex'],
                     facedict['sex_prob'],
                     dbrow['userid']))
            # Same pause whether or not a row was processed.
            time.sleep(60 * 3)
def expandlink(link):
    """Expand a short link through bit.ly and extract the check-in reference.

    The expanded URL's last path segment is taken as the check-in id and its
    ``s`` query parameter as the signature.

    :param link: short URL to expand.
    :returns: ``(checkin_id, signature)``; ``('error', 'error')`` when the
        bit.ly answer is missing, malformed, or lacks either part.
    """
    json_tx = jsonfromurl('https://api-ssl.bitly.com/v3/expand?shortUrl={0}&login={1}&apiKey={2}'.format(link, bitly_login, bitly_apikey))
    try:
        # Parse the long URL once and reuse the result for both parts.
        long_url = urlparse.urlparse(json_tx['data']['expand'][0]['long_url'])
        checkin_id = long_url.path.split('/')[-1]
        signature = urlparse.parse_qs(long_url.query)['s'][0]
    except (KeyError, IndexError, TypeError, AttributeError, ValueError):
        # Covers an empty/failed response, an error entry without
        # 'long_url', a non-string URL and a URL without the 's' parameter.
        return 'error', 'error'
    return checkin_id, signature
def checkfoursquare(check, code):
    """Fetch one Foursquare check-in and flatten it into three dicts.

    Every field is extracted independently: a field that is missing or has
    an unexpected shape falls back to its own default (``'error'`` for ids
    and names, ``'1900-01-01 00:00:00'`` for the time, ``0.`` for both
    coordinates) without affecting the others.

    :param check: check-in id.
    :param code: check-in signature.
    :returns: ``(checkindict, userdict, venuedict)`` where
        ``checkindict = {'time', 'venueid', 'userid'}``,
        ``userdict = {'id', 'photo'}`` and
        ``venuedict = {'id', 'name', 'lat', 'lng', 'cat1', 'cat2'}``.
    """
    # Errors that mean "this field is absent or malformed in the response".
    field_errors = (KeyError, IndexError, TypeError, ValueError,
                    AttributeError, OverflowError)

    json_ch = jsonfromurl('https://api.foursquare.com/v2/checkins/{0}?signature={1}&client_id={2}&client_secret={3}&v={4}'.format(
        check,
        code,
        fsq_clientid,
        fsq_clientsecret,
        datetime.datetime.now().strftime('%Y%m%d')))

    # Resolve the common prefix once; an empty dict makes every lookup below
    # fall through to its default.
    try:
        checkin = json_ch['response']['checkin']
    except field_errors:
        checkin = {}

    try:
        # Creation time shifted four hours ahead of UTC.
        dt = time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime(int(checkin['createdAt']) + (4 * 60 * 60)))
    except field_errors:
        dt = '1900-01-01 00:00:00'

    try:
        venueid = checkin['venue']['id']
    except field_errors:
        venueid = 'error'

    try:
        userid = checkin['user']['id']
    except field_errors:
        userid = 'error'

    try:
        userphoto = checkin['user']['photo']['suffix']
        # A suffix containing 'blank' is treated like a missing photo.
        if 'blank' in userphoto:
            userphoto = 'error'
    except field_errors:
        userphoto = 'error'

    try:
        venuename = checkin['venue']['name']
    except field_errors:
        venuename = 'error'

    try:
        # Both coordinates or neither: a missing one resets the pair.
        venuelat = checkin['venue']['location']['lat']
        venuelng = checkin['venue']['location']['lng']
    except field_errors:
        venuelat = venuelng = 0.

    # cat1: id of the first category flagged as primary.
    cat1 = 'error'
    try:
        for category in checkin['venue']['categories']:
            if 'primary' in category and category['primary'] is True:
                cat1 = category['id']
                break
    except field_errors:
        pass

    # cat2: icon prefix of the first category without the fixed URL part and
    # without a trailing underscore.  A literal replace is used because the
    # prefix is plain text, not a pattern.
    try:
        cat2 = checkin['venue']['categories'][0]['icon']['prefix'].replace(
            'https://foursquare.com/img/categories_v2/', '')
        if cat2[-1:] == '_':
            cat2 = cat2[:-1]
    except field_errors:
        cat2 = 'error'

    checkindict = {'time': dt, 'venueid': venueid, 'userid': userid}
    userdict = {'id': userid, 'photo': userphoto}
    venuedict = {'id': venueid, 'name': venuename, 'lat': venuelat, 'lng': venuelng, 'cat1': cat1, 'cat2': cat2}
    return checkindict, userdict, venuedict
def facerekognition(url):
    """Ask the ReKognition API for age and gender of the face in a photo.

    When several faces are reported, the one with the highest
    ``confidence`` is used.

    :param url: URL of the photo to analyse.
    :returns: dict with keys ``sex``, ``sex_prob``, ``age`` and ``age_prob``.
        Without a usable detection the result is
        ``{'sex': 9, 'age': 0, 'age_prob': 9, 'sex_prob': 9}``; a detection
        lacking ``sex`` reports 9 for ``sex``/``sex_prob`` and one lacking
        ``age`` reports 0 for ``age``.
    """
    json_fc = jsonfromurl('http://rekognition.com/func/api/?api_key={0}&api_secret={1}&jobs=face_gender_age&urls={2}'.format(
        rek_key,
        rek_secret,
        url
    ))
    no_face = {'sex': 9, 'age': 0, 'age_prob': 9, 'sex_prob': 9}

    try:
        faces = json_fc['face_detection']
    except (KeyError, TypeError):
        return no_face
    if not faces:
        return no_face

    try:
        # First entry with the highest confidence.
        face = max(faces, key=lambda item: item['confidence'])
        confidence = face['confidence']
    except (KeyError, TypeError):
        # Malformed detection list: treat it like "no face" instead of
        # letting the exception end the calling worker thread.
        return no_face

    try:
        raw_sex = face['sex']
        # Distance of the score from 0.5, rescaled to the 0..1 range.
        sexprob = abs(.5 - raw_sex) * 2
        sex = round(raw_sex, 0)
    except (KeyError, TypeError):
        sexprob = sex = 9

    try:
        age = face['age']
    except (KeyError, TypeError):
        age = 0

    return {'sex': sex, 'sex_prob': sexprob, 'age': age, 'age_prob': confidence}
def jsonfromurl(url, attempts=5, timeout=30, delay=10):
    """Download *url* and decode the body as JSON, retrying on failure.

    :param url: address to fetch.
    :param attempts: maximum number of tries; at least one try is always made.
    :param timeout: seconds to wait on the connection before a try counts as
        failed, so a stalled server cannot block the caller forever.
    :param delay: seconds to pause between two tries.
    :returns: the decoded JSON value, or ``{}`` when every try failed.
    """
    errorcount = 0
    while True:
        try:
            response = urlopen(url, timeout=timeout)
            try:
                return json.loads(response.read())
            finally:
                # Always release the connection, also when decoding fails.
                response.close()
        except Exception:
            # Retry boundary: network, HTTP and decoding errors are all
            # treated as one failed try.
            errorcount += 1
            if errorcount >= attempts:
                return {}
            time.sleep(delay)
- TwitterThread().start() # start the five workers one after another, with a 60-second pause between starts
- time.sleep(60) # NOTE(review): presumably gives the previous stage time to produce rows - confirm
- BitlyThread().start() # handles checkins rows with twitunboxed=0
- time.sleep(60)
- FoursquareThread().start() # handles checkins rows with twitunboxed=1
- time.sleep(60)
- ReKognitionThread().start() # handles people rows whose age_prob is NULL
- time.sleep(60)
- MonitorThread().start() # inserts one stats row per hour
Add Comment
Please, Sign In to add comment