Advertisement
Guest User

Untitled

a guest
Jan 16th, 2018
498
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.17 KB | None | 0 0
  1. # -*- coding:utf8 -*-
  2. from robobrowser import RoboBrowser
  3. import diskcache
  4. import unicodedata
  5. import graphviz as gv
  6. from collections import Counter
  7.  
  8. with open('french') as f:
  9. stopwords = [line.strip() for line in f.readlines()]
  10.  
  11. # sleep : 14 a 30 secondes
  12.  
  13. str = lambda x:unicodedata.normalize('NFKD', x).encode('ascii','ignore')
  14. cache = diskcache.Cache('cache',expire=604800,size_limit=int(500000))
  15.  
  16. def analyse():
  17. c = Counter()
  18. for k in cache:
  19. desc = cache.get(k)['description'].lower()
  20. for word in desc.split(' '):
  21. if not word in stopwords:
  22. c[word] += 1
  23. print(c.most_common(10))
  24.  
  25.  
  26. def search(d):
  27. for k in cache:
  28. e = cache.get(k)
  29. match = True
  30. for k,v in d.items():
  31. if not v in e[k]:
  32. match = False
  33. break
  34. if match:
  35. print(e)
  36.  
  37. def crawl(q):
  38. rb.open('https://www.adopteunmec.com/gogole?q={}'.format(q))
  39.  
  40. for profile in rb.find_all('div','someone'):
  41. dProfile = {}
  42. a = profile.div.div.a
  43. dProfile['url'] = a.get('href')
  44. dProfile['image_url'] = a.img.get('src')
  45. infos = profile.find('div','user-infos')
  46. dProfile['username'] = infos.find('h4','user-grid-title').text.strip()
  47. dProfile['description'] = infos.find('div','user-desc').text.strip()
  48. infos = profile.find('div','user-infos-basics')
  49. dProfile['age'] = infos.find('span','age').text[:2]
  50. dProfile['city'] = infos.find('span','city').text
  51. cache.set(dProfile['username'],dProfile)
  52.  
  53. for k,v in dProfile.items():
  54. dProfile[k] = str(v)
  55.  
  56. def graph():
  57. G = gv.Graph(format='png',engine='fdp',graph_attr={'title':'sirix','overlap':'scale'})
  58. for k in cache:
  59. e = cache.get(k)
  60. G.edge(k,cache.get(k)['city'])
  61. G.render('sirix.dot',view=True)
  62.  
  63.  
  64.  
  65.  
  66. if __name__ == '__main__':
  67. bCrawl=False
  68. if bCrawl:
  69. login_url = "https://www.adopteunmec.com//auth/login"
  70.  
  71. rb = RoboBrowser(None,'html5lib')
  72.  
  73. rb.open(login_url)
  74.  
  75. form = rb.get_form(class_='form-loggin')
  76. form['remember'].value = 'on'
  77. form['username'].value = 'redman@hotmail.com'
  78. form['password'].value = 'Bb12481632'
  79.  
  80. rb.submit_form(form)
  81. for ville in ['Toulouse','Colomiers','Paris']:
  82. crawl(ville)
  83. # search({"description":"jeux","city":"Toulouse"})
  84. analyse()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement