#coding: utf-8
# Untitled

import sqlite3
import urllib
import re
import ssl
from urlparse import urljoin
from urlparse import urlparse
from bs4 import BeautifulSoup


# Not referenced anywhere in the visible code; presumably a placeholder for an
# SSL context -- TODO confirm before removing.
scontext = None

# DDL run at start-up, in order. IF NOT EXISTS makes re-running the script
# against an existing database file harmless.
SCHEMA_STATEMENTS = (
    '''CREATE TABLE IF NOT EXISTS Results
    (id INTEGER PRIMARY KEY, url TEXT UNIQUE, titre TEXT,
    prix INTEGER, zone TEXT, cp INTEGER, pieces INTEGER,
    surface INTEGER, metro1 TEXT, metro2 TEXT, metro3
    TEXT, tel TEXT, error INTEGER)''',
    '''CREATE TABLE IF NOT EXISTS Webs (url TEXT UNIQUE)''',
)

# Open (or create) the database file in the current working directory and
# keep one shared cursor for the rest of the script.
conn = sqlite3.connect('spider.sqlite')
cur = conn.cursor()

for statement in SCHEMA_STATEMENTS:
    cur.execute(statement)
conn.commit()

# Site that relative links found on a page are resolved against.
LISTING_BASE = 'http://www.pap.fr'

# An absolute URL that contains "/annonce" and, later on, the letter "r"
# followed by digits. The match stops after those digits, so anything that
# follows them (for example a query string) is not part of the result.
LISTING_RE = re.compile(r'^http.*/annonce.+r[0-9]+')


def listing_urls(raw_hrefs):
    """Return the distinct listing URLs found among raw ``href`` values.

    Args:
        raw_hrefs: iterable of ``href`` attribute values as read from ``<a>``
            tags. Entries may be ``None`` or empty (anchors without an href);
            those are skipped.

    Returns:
        A list of URL strings, each the part of the resolved link matched by
        ``LISTING_RE``. Duplicates are dropped and first-seen order is kept.
    """
    seen = set()
    unique = []
    for raw in raw_hrefs:
        if not raw:
            continue
        # urljoin resolves relative links against the site root and leaves
        # already-absolute links untouched; plain concatenation would glue
        # the base in front of an absolute URL.
        match = LISTING_RE.match(urljoin(LISTING_BASE, raw))
        if match is None:
            continue
        link = match.group(0)
        if link not in seen:
            seen.add(link)
            unique.append(link)
    return unique


def main():
    """Prompt for a URL and a repeat count, then fetch the page and list links.

    For each iteration the page is downloaded, checked for HTTP status 200 and
    a ``text/html`` content type, parsed, and the listing URLs it links to are
    printed. Failures are written to the ``error`` column of ``Results`` for
    that URL and the iteration is skipped. Ctrl-C stops the loop.

    Uses the module-level ``conn`` and ``cur`` database handles.
    """
    # print(...) with a single argument behaves the same as the Python 2
    # print statement this file otherwise relies on.
    url = raw_input('Enter a url to crawl')
    sval = raw_input('How many pages:')
    many = int(sval)

    # NOTE(review): `url` is never reassigned inside the loop, so the same
    # page is fetched `many` times -- confirm the intended crawl order.
    # NOTE(review): nothing in the visible code inserts rows into Results, so
    # the UPDATE statements below may not match any row -- verify.
    while many > 0:
        many = many - 1

        try:
            enter = urllib.urlopen(url)
            html = enter.read()

            status = enter.getcode()
            if status != 200:
                # Same output as `print "Error on page: ", status`.
                print("%s %s" % ("Error on page: ", status))
                cur.execute('UPDATE Results SET error=? WHERE url=?', (status, url))
                # Persist the status and skip the page, like the other
                # failure branches; an error page holds no listings to scan.
                conn.commit()
                continue

            if 'text/html' != enter.info().gettype():
                print("Ignore non text/html page")
                cur.execute('UPDATE Results SET error=-1 WHERE url=?', (url, ))
                conn.commit()
                continue

            print('(' + str(len(html)) + ')')

            soup = BeautifulSoup(html)
        except KeyboardInterrupt:
            print('')
            print('Program interrupted by user...')
            break
        except Exception:
            # KeyboardInterrupt is handled above; anything else raised while
            # downloading or parsing marks the URL as failed.
            print("Unable to retrieve or parse page")
            cur.execute('UPDATE Results SET error=-1 WHERE url=?', (url, ))
            conn.commit()
            continue

        # Filtered and de-duplicated in one pass.
        href = listing_urls(tag.get('href') for tag in soup('a'))
        print(href)


if __name__ == '__main__':
    main()


# NOTE: everything between the triple quotes below is a bare string literal,
# not live code, so none of it is executed. It holds a disabled draft that
# would prefix each href with the site root, INSERT OR IGNORE it into the Webs
# table, count the inserts, commit, print the count and finally close the
# cursor. As written the draft is incomplete (a `try` with no `except` and an
# `else` with no matching `if`), so it cannot simply be uncommented.
"""
        try:

                href = "http://www.pap.fr"+href
                cur.execute('INSERT OR IGNORE INTO Webs (url) VALUES ( ? )', ( href, ) )
                count = count + 1
                print href
            else: continue
            conn.commit()

    print count


cur.close()
"""