Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # -*- coding:utf8 -*-
- from robobrowser import RoboBrowser
- import diskcache
- import unicodedata
- import graphviz as gv
- from collections import Counter
# Words ignored by analyse(); the file 'french' is read one entry per line and
# each line is stripped of surrounding whitespace.
with open('french') as stopword_file:
    stopwords = [entry.strip() for entry in stopword_file]

# Original author's note: sleep 14 to 30 seconds.
# NOTE(review): presumably a planned pause between requests; nothing in this
# file actually sleeps -- confirm.


def str(x):
    """Return *x* NFKD-normalised and encoded to ASCII, dropping what is left.

    Characters that still have no ASCII form after normalisation are ignored.
    NOTE: the name deliberately stays ``str`` (shadowing the builtin) because
    crawl() calls this helper under that name.
    """
    return unicodedata.normalize('NFKD', x).encode('ascii', 'ignore')


# On-disk cache kept in the 'cache' directory.
# NOTE(review): 604800 would be 7 days if read as seconds, but `expire` is
# passed here as a constructor setting rather than per `set()` call -- confirm
# against the diskcache documentation that it has the intended effect.
cache = diskcache.Cache('cache', expire=604800, size_limit=500000)
def analyse(n=10):
    """Print the most frequent non-stopword words in the cached descriptions.

    Each cached profile's ``'description'`` is lower-cased and split on
    whitespace; words found in the module-level ``stopwords`` are ignored.
    The result is printed as a list of ``(word, count)`` pairs.

    Args:
        n: How many of the most common words to print. Defaults to 10, the
            value that used to be hard-coded.
    """
    # Build the lookup set once: testing membership in a list costs
    # O(len(stopwords)) for every single word.
    ignored = set(stopwords)
    counts = Counter()
    for key in cache:
        entry = cache.get(key)
        if entry is None:
            # The key is listed but no value came back for it; skip it
            # instead of failing on ``None['description']``.
            continue
        # split() without an argument splits on any run of whitespace, so
        # doubled spaces no longer yield '' and newlines or tabs no longer
        # glue two words into one token.
        words = entry['description'].lower().split()
        counts.update(word for word in words if word not in ignored)
    print(counts.most_common(n))
def search(d):
    """Print every cached profile whose fields contain the requested values.

    Args:
        d: Mapping of profile field name (for example ``"city"``) to a value
            that must be contained in that field, i.e.
            ``value in profile[field]`` must hold for every item. An empty
            mapping matches every profile.

    Raises:
        KeyError: If ``d`` names a field that a cached profile does not have.
    """
    for key in cache:
        profile = cache.get(key)
        if profile is None:
            # The key is listed but no value came back for it; there is
            # nothing to match against.
            continue
        # Distinct names for the criteria keep the cache key from being
        # shadowed inside the loop.
        if all(needle in profile[field] for field, needle in d.items()):
            print(profile)
def crawl(q):
    """Open the search page for *q* and cache every profile it lists.

    Relies on the module-level browser ``rb`` (created in the ``__main__``
    section) and on the module-level ``cache``. Each profile is stored under
    its username as a dict with the keys ``url``, ``image_url``, ``username``,
    ``description``, ``age`` and ``city``.

    Args:
        q: Search term inserted into the query string of the search URL.
    """
    rb.open('https://www.adopteunmec.com/gogole?q={}'.format(q))
    for card in rb.find_all('div', 'someone'):
        link = card.div.div.a
        record = {
            'url': link.get('href'),
            'image_url': link.img.get('src'),
        }

        details = card.find('div', 'user-infos')
        record['username'] = details.find('h4', 'user-grid-title').text.strip()
        record['description'] = details.find('div', 'user-desc').text.strip()

        basics = card.find('div', 'user-infos-basics')
        # Only the first two characters of the age text are kept.
        record['age'] = basics.find('span', 'age').text[:2]
        record['city'] = basics.find('span', 'city').text

        cache.set(record['username'], record)

        # `str` here is this module's ASCII-folding helper, not the builtin.
        # NOTE(review): the conversion runs after the record was handed to
        # the cache and `record` is not used afterwards, so it presumably has
        # no effect on what is stored -- confirm whether it was meant to run
        # before cache.set().
        for field in list(record):
            record[field] = str(record[field])
def graph():
    """Render a graph that links every cache key to its profile's city.

    Builds an undirected Graphviz graph (``fdp`` engine, PNG output) with one
    edge per cached profile, from the cache key (crawl() stores profiles
    under their username) to the profile's ``'city'``. The graph is rendered
    under the file name ``'sirix.dot'`` and opened in a viewer.
    """
    G = gv.Graph(
        format='png',
        engine='fdp',
        graph_attr={'title': 'sirix', 'overlap': 'scale'},
    )
    for key in cache:
        # Read each entry once; it used to be fetched twice per key.
        profile = cache.get(key)
        if profile is None:
            # The key is listed but no value came back for it; skip it.
            continue
        G.edge(key, profile['city'])
    G.render('sirix.dot', view=True)
if __name__ == '__main__':
    import os

    # Set to True to log in and refresh the cache before analysing it.
    bCrawl = False
    if bCrawl:
        # Credentials are read from the environment so that no secret is
        # stored in the source file. A missing variable fails fast with
        # KeyError before any request is made.
        account = os.environ['AUM_USERNAME']
        secret = os.environ['AUM_PASSWORD']

        login_url = "https://www.adopteunmec.com//auth/login"
        # `rb` must stay a module-level name: crawl() reads it as a global.
        rb = RoboBrowser(None, 'html5lib')
        rb.open(login_url)
        form = rb.get_form(class_='form-loggin')
        form['remember'].value = 'on'
        form['username'].value = account
        form['password'].value = secret
        rb.submit_form(form)

        for ville in ['Toulouse', 'Colomiers', 'Paris']:
            crawl(ville)

    # Example query: search({"description": "jeux", "city": "Toulouse"})
    analyse()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement