SHARE
TWEET

Untitled

a guest May 19th, 2017 39 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. #coding: utf-8
  2. import sqlite3
  3. import urllib
  4. import re
  5. import ssl
  6. from urlparse import urljoin
  7. from urlparse import urlparse
  8. from bs4 import BeautifulSoup
  9.  
  10.  
  11. scontext = None
  12.  
  13. conn = sqlite3.connect('spider.sqlite')
  14. cur = conn.cursor()
  15.  
  16. cur.execute('''CREATE TABLE IF NOT EXISTS Results
  17.     (id INTEGER PRIMARY KEY, url TEXT UNIQUE, titre TEXT,
  18.     prix INTEGER, zone TEXT, cp INTEGER, pieces INTEGER,
  19.     surface INTEGER, metro1 TEXT, metro2 TEXT, metro3
  20.     TEXT, tel TEXT, error INTEGER)''')
  21. cur.execute('''CREATE TABLE IF NOT EXISTS Webs (url TEXT UNIQUE)''')
  22. conn.commit()
  23.  
  24. url = raw_input('Enter a url to crawl')
  25. sval = raw_input('How many pages:')
  26. many = int(sval)
  27. while many>0:
  28.  
  29.  
  30.     many = many - 1
  31.  
  32.     try:
  33.         enter = urllib.urlopen(url)
  34.  
  35.         html = enter.read()
  36.         if enter.getcode() != 200 :
  37.             print "Error on page: ",enter.getcode()
  38.             cur.execute('UPDATE Results SET error=? WHERE url=?', (enter.getcode(), url) )
  39.  
  40.         if 'text/html' != enter.info().gettype() :
  41.             print "Ignore non text/html page"
  42.             cur.execute('UPDATE Results SET error=-1 WHERE url=?', (url, ) )
  43.             conn.commit()
  44.             continue
  45.  
  46.         print '('+str(len(html))+')',
  47.  
  48.         soup = BeautifulSoup(html)
  49.     except KeyboardInterrupt:
  50.         print ''
  51.         print 'Program interrupted by user...'
  52.         break
  53.     except:
  54.         print "Unable to retrieve or parse page"
  55.         cur.execute('UPDATE Results SET error=-1 WHERE url=?', (url, ) )
  56.         conn.commit()
  57.         continue
  58.  
  59.  
  60.     tags = soup('a')
  61.  
  62.     href = list()
  63.     count = 0
  64.     for tag in tags:
  65.         try:
  66.             addr = 'http://www.pap.fr'+tag.get('href')
  67.             addr=re.findall('^http.*/annonce.+r[0-9]+',addr)
  68.             href.append(addr)
  69.         except:
  70.             continue
  71.  
  72.     href = [i for i in href if should_keep(i)]
  73.     print href
  74.     href = list(set(href))
  75.         #去重
  76.     print href
  77.  
  78.  
  79.  
  80. #        print href
  81. """
  82.         try:
  83.  
  84.                 href = "http://www.pap.fr"+href
  85.                 cur.execute('INSERT OR IGNORE INTO Webs (url) VALUES ( ? )', ( href, ) )
  86.                 count = count + 1
  87.                 print href
  88.             else: continue
  89.             conn.commit()
  90.  
  91.     print count
  92.  
  93.  
  94. cur.close()
  95. """
RAW Paste Data
Top