Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #! /usr/bin/python
- # -*- coding: utf-8 -*-
- # parse les pages du site frenchy-commerce.com
- # ./parse-frenchy.py http://frenchy-commerce.com/commerce/v2/26/26120-CHABEUIL.php
- #
- # Copyright(C) 2014 François Revol
- # Licensed under MIT
- import sys
- import csv
- import mechanize
- import lxml.html
# Scraper for the business directory pages of frenchy-commerce.com.
#
# The script fetches the page given on the command line, follows the
# pagination links found on it, collects one row per business
# (category, name, address, phone) and writes everything to a CSV file
# named after the last path segment of the URL.
#
# It is written for Python 2: lxml.html.tostring() is expected to return a
# byte string and the CSV file is opened in binary mode.

# Header row of the generated CSV; scraped rows use the same column order.
CSV_HEADER = ['Categorie', 'Nom', 'Adresse', 'Telephone']

# Maps the class of a column <div> to the index of the CSV cell it fills.
COLUMN_INDEX = {'col1': 1, 'col2': 2, 'col3': 3}

# Selects, in document order, the category headings and the three columns.
ENTRIES_XPATH = ('//h2|//div[@class="col1"]|//div[@class="col2"]'
                 '|//div[@class="col3"]')


def utf8_text(text):
    """Return *text* encoded as UTF-8, or '' when the node has no text.

    lxml reports a missing text as None; encoding it blindly either raises
    AttributeError or, through unicode(None), yields the string 'None'.
    """
    if text is None:
        return ''
    return text.encode('utf-8')


def csv_path_for(url):
    """Derive the output CSV file name from the last path segment of *url*.

    'xxx.php' becomes 'xxx.dq.csv'.  A URL ending with '/' has an empty last
    segment, which cannot be opened as a file, so 'index.dq.csv' is used.
    """
    name = url.split('/')[-1].replace('.php', '.csv')
    name = name.replace('.csv', '.dq.csv')
    return name or 'index.dq.csv'


def split_on_br(markup):
    """Rewrite a serialized column <div> so every <br>-separated line
    becomes its own child <div> of a single wrapper <div>.
    """
    markup = markup.replace('<br>', '</div><div>')
    for css_class in ('col1', 'col2', 'col3'):
        markup = markup.replace(' class="%s"' % css_class, '')
    # NOTE(review): as written this strips every plain space from the markup,
    # including the ones inside names and addresses.  The literal may have
    # been an HTML entity or a non-breaking space before the file was
    # pasted -- confirm against the original script before changing it.
    markup = markup.replace(' ', '').replace('<div></div>', '')
    return '<div>%s</div>' % markup


def page_urls(tree, url):
    """Return the hrefs of the pagination links in *tree*.

    When the page has no pagination block, it is assumed to be the only
    page and [url] is returned.
    """
    links = tree.xpath('//div[@id="pagination"]/a')
    if links:
        return [link.attrib['href'] for link in links]
    return [url]


def parse_page(tree):
    """Extract the directory rows of one page.

    Walks the <h2> headings (categories) and the col1/col2/col3 <div>s in
    document order.  The n-th line of each column fills the n-th pending
    row; a col3 <div> completes the pending rows and flushes them.

    Returns a list of [category, name, address, phone] lists.
    """
    rows = []
    categorie = ''
    pending = []
    for node in tree.xpath(ENTRIES_XPATH):
        if node.tag == 'h2':
            # A new category starts; rows not yet flushed are dropped.
            categorie = utf8_text(node.text)
            pending = []
            continue

        css_class = node.attrib['class']
        column = COLUMN_INDEX[css_class]
        fixed_divs = lxml.html.fragment_fromstring(
            split_on_br(lxml.html.tostring(node)))
        for i, child in enumerate(fixed_divs.iterchildren()):
            value = utf8_text(child.text)
            print(value)
            if i >= len(pending):
                pending.append([categorie, '', '', ''])
            pending[i][column] = value

        if css_class == 'col3':
            rows.extend(pending)
            print(pending)
            pending = []
    return rows


def main(argv=None):
    """Scrape the directory starting at argv[1] and write it as CSV.

    argv defaults to sys.argv.  Exits with a usage message when no URL is
    given, and with an error message when a row cannot be written.
    Pages that fail with an HTTP error are reported and skipped.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        sys.exit('usage: parse-frenchy.py URL')
    url = argv[1]
    out_path = csv_path_for(url)

    br = mechanize.Browser()
    br.set_handle_robots(False)   # ignore robots
    br.set_handle_refresh(False)  # can sometimes hang without this

    annuaire = [list(CSV_HEADER)]
    first_page = lxml.html.fromstring(br.open(url).read())
    for page_url in page_urls(first_page, url):
        print('Page: %s' % page_url)
        try:
            response = br.open(page_url)
        except mechanize.HTTPError as e:
            print(e)
            continue
        annuaire.extend(parse_page(lxml.html.fromstring(response.read())))
    print(annuaire)

    with open(out_path, 'wb') as outcsvfile:
        writer = csv.writer(outcsvfile, lineterminator='\n')
        try:
            for row in annuaire:
                print(row)
                writer.writerow(row)
        except csv.Error as e:
            # Report the file that is actually being written.
            sys.exit('file %s: %s' % (out_path, e))


if __name__ == '__main__':
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement