parse-frenchy.py
mmu_man
Mar 19th, 2014
#! /usr/bin/python
# -*- coding: utf-8 -*-

# Parses the pages of the frenchy-commerce.com site, e.g.:
# ./parse-frenchy.py http://frenchy-commerce.com/commerce/v2/26/26120-CHABEUIL.php
#
# Copyright(C) 2014 François Revol
# Licensed under MIT

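# Example: with the URL above, the script fetches every result page listed in
# the page's pagination block and writes the directory to 26120-CHABEUIL.dq.csv
# in the current directory (the filename is derived from the last URL path component).
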
import sys
import csv
import mechanize
import lxml.html
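# Note: mechanize and lxml are the only third-party dependencies (both on PyPI);
# the script is Python 2 code (print statements, unicode()), so install them
# with a Python 2 pip, e.g. "pip install mechanize lxml".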

annuaire = [['Categorie', 'Nom', 'Adresse', 'Telephone']]

br = mechanize.Browser()
#br.set_all_readonly(False)    # allow everything to be written to
br.set_handle_robots(False)   # ignore robots
br.set_handle_refresh(False)  # can sometimes hang without this
#br.addheaders = []             # [('User-agent', 'Firefox')]

url = sys.argv[1]
outfile = url.split('/')[-1].replace('.php', '.csv')

response = br.open(url)
#print response.read()      # the text of the page
tree = lxml.html.fromstring(response.read())

pages = tree.xpath('//div[@id="pagination"]/a')
if len(pages) > 0:
    pages = [u.attrib['href'] for u in pages]
else:
    # no pagination links in the page, assume a single page
    pages = [url]
#print pages

for url in pages:
    print 'Page: %s' % url
    try:
        response = br.open(url)
    except mechanize.HTTPError as e:
        print e
        continue
    tree = lxml.html.fromstring(response.read())

    categorie = ''
    lines = []

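    # Each category on these pages appears to be an <h2> heading followed by three
    # column <div>s: col1 holds the names, col2 the addresses and col3 the phone
    # numbers, with individual entries separated by <br> (assumption inferred from
    # the markup handled below).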
    # find all '<h2>' (categories) and the col1,2,3 <div>s in document order
    for div in tree.xpath('//h2|//div[@class="col1"]|//div[@class="col2"]|//div[@class="col3"]'):
        #print lxml.html.tostring(div)
        #print div.tag
        if div.tag == 'h2':
            categorie = unicode(div.text).encode('utf-8')
            lines = []
            continue

        # div class=col1,2,3: split the column on <br> into one <div> per entry
        s = lxml.html.tostring(div)
        s = s.replace('<br>', '</div><div>')
        s = s.replace(' class="col1"', '').replace(' class="col2"', '').replace(' class="col3"', '')
        s = s.replace('&#160;', '').replace('<div></div>', '')
        s = '<div>%s</div>' % s
        #print s
        fixed_divs = lxml.html.fragment_fromstring(s)
        for i, child in enumerate(fixed_divs.iterchildren()):
            # some entries may be empty, so guard against child.text being None
            text = unicode(child.text or '').encode('utf-8')
            print text
            if i >= len(lines):
                lines.append([categorie, '', '', ''])

            if div.attrib['class'] == "col1":
                c = 1
            elif div.attrib['class'] == "col2":
                c = 2
            else:  # "col3"
                c = 3
            lines[i][c] = text
        # col3 is the last column of a category: flush its lines into the directory
        if div.attrib['class'] == "col3":
            annuaire.extend(lines)
            print lines
            lines = []

print annuaire


# write out the collected directory; the output name ends in .dq.csv
outfile = outfile.replace('.csv', '.dq.csv')
with open(outfile, 'wb') as outcsvfile:
    writer = csv.writer(outcsvfile, lineterminator='\n')
    try:
        for row in annuaire:
            print row
            writer.writerow(row)
    except csv.Error as e:
        sys.exit('file %s: %s' % (outfile, e))
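
# A minimal sanity-check sketch (assumptions: Python 2, and the .dq.csv file
# produced by the example run above): count the rows collected per category.
#
#   import csv
#   from collections import Counter
#   with open('26120-CHABEUIL.dq.csv', 'rb') as f:
#       rows = list(csv.reader(f))
#   print Counter(row[0] for row in rows[1:])  # skip the header row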