Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# coding: utf-8
"""Fetch a page and print the pap.fr listing links found on it.

The script targets Python 2: it relies on ``raw_input``, ``urllib.urlopen``
and the ``urlparse`` module.  It prompts for a URL and a page count, then
downloads that URL once per requested page, flags fetch problems in the
``Results`` table of ``spider.sqlite`` and prints the de-duplicated listing
URLs extracted from the page's anchors.
"""
import re
import sqlite3
import ssl
import sys
import urllib
from urlparse import urljoin
from urlparse import urlparse

from bs4 import BeautifulSoup

# Site that relative hrefs are resolved against.
BASE_URL = 'http://www.pap.fr'

# Absolute listing URL; the match ends at the last "r<digits>" token, so
# anything following it (e.g. a query string) is dropped from the result.
LISTING_URL = re.compile(r'^http.*/annonce.+r[0-9]+')

# Not used anywhere in this script; kept so the module globals are unchanged.
scontext = None


def _listing_links(soup):
    """Return the sorted, de-duplicated listing URLs linked from *soup*.

    Each anchor's ``href`` is resolved against ``BASE_URL`` and kept only
    when it matches ``LISTING_URL``.  Anchors without an ``href`` are
    skipped.
    """
    links = set()
    for tag in soup('a'):
        target = tag.get('href')
        if not target:
            continue
        # urljoin copes with both relative and already-absolute hrefs,
        # which plain string concatenation with the base URL does not.
        match = LISTING_URL.match(urljoin(BASE_URL, target))
        if match:
            links.add(match.group(0))
    return sorted(links)


conn = sqlite3.connect('spider.sqlite')
cur = conn.cursor()

cur.execute('''CREATE TABLE IF NOT EXISTS Results
(id INTEGER PRIMARY KEY, url TEXT UNIQUE, titre TEXT,
prix INTEGER, zone TEXT, cp INTEGER, pieces INTEGER,
surface INTEGER, metro1 TEXT, metro2 TEXT, metro3
TEXT, tel TEXT, error INTEGER)''')
cur.execute('''CREATE TABLE IF NOT EXISTS Webs (url TEXT UNIQUE)''')
conn.commit()

url = raw_input('Enter a url to crawl')
sval = raw_input('How many pages:')
many = int(sval)

try:
    # NOTE(review): url is never reassigned, so every iteration fetches the
    # same page again.
    while many > 0:
        many = many - 1
        try:
            enter = urllib.urlopen(url)
            try:
                html = enter.read()
                status = enter.getcode()
                content_type = enter.info().gettype()
            finally:
                enter.close()

            # NOTE(review): nothing in this script inserts into Results, so
            # the UPDATE statements below only affect rows that already exist.
            if status != 200:
                # The failure is recorded, but the page is still parsed below.
                print("Error on page:  %s" % status)
                cur.execute('UPDATE Results SET error=? WHERE url=?',
                            (status, url))

            if 'text/html' != content_type:
                print("Ignore non text/html page")
                cur.execute('UPDATE Results SET error=-1 WHERE url=?', (url,))
                conn.commit()
                continue

            # Page size, left on the same output line as the link list.
            sys.stdout.write('(' + str(len(html)) + ') ')
            # Name the parser explicitly so results do not depend on which
            # optional parsers happen to be installed.
            soup = BeautifulSoup(html, 'html.parser')
        except KeyboardInterrupt:
            print('')
            print('Program interrupted by user...')
            break
        except Exception:
            print("Unable to retrieve or parse page")
            cur.execute('UPDATE Results SET error=-1 WHERE url=?', (url,))
            conn.commit()
            continue

        # Matching links, with duplicates removed.
        href = _listing_links(soup)
        print(href)

        # TODO: the original carried a disabled draft (inside a string
        # literal) that would INSERT OR IGNORE each link into Webs and count
        # the inserts.  It was never enabled, so links are only printed.
finally:
    # Flush pending error flags and release the database handles.
    conn.commit()
    cur.close()
    conn.close()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement