pendekar_langit

scrapping

Feb 29th, 2016
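A small Python 2 crawler: starting from a seed URL it follows links from page to page, keeps its URL queue and visited set in a local SQLite database (emails.db), and records every mailto: address it encounters.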
Python
#!/usr/bin/python
# Simple link-following crawler (Python 2) that harvests mailto: addresses
# into a local SQLite database. Dependencies: requests, beautifulsoup4.
import sqlite3
import urllib
import urlparse

import requests
from bs4 import BeautifulSoup


def conn_db():
    return sqlite3.connect('emails.db')


def create_db(conn):
    # lists    = queue of URLs still to be fetched
    # visiteds = every URL ever queued, so no page is crawled twice
    # emails   = harvested mailto: addresses
    cursor = conn.cursor()
    cursor.execute("CREATE TABLE IF NOT EXISTS lists (urls text)")
    cursor.execute("CREATE TABLE IF NOT EXISTS visiteds (visited text)")
    cursor.execute("CREATE TABLE IF NOT EXISTS emails (email text)")
    conn.commit()


def insert_db(url, conn):
    # Seed the queue with the start URL unless it is already there.
    cursor = conn.cursor()
    cursor.execute("SELECT urls FROM lists WHERE urls = ?", (url,))
    if not cursor.fetchall():
        cursor.execute("INSERT INTO lists VALUES (?)", (url,))
        cursor.execute("INSERT INTO visiteds VALUES (?)", (url,))
        conn.commit()


def get_all_count(table, conn):
    cursor = conn.cursor()
    return cursor.execute("SELECT * FROM %s" % table).fetchall()


def get_all_data(table, conn):
    cursor = conn.cursor()
    return [r[0] for r in cursor.execute("SELECT * FROM %s" % table).fetchall()]


def get_data(table, conn):
    # Next row of the table, or None when it is empty.
    cursor = conn.cursor()
    return cursor.execute("SELECT * FROM %s" % table).fetchone()


def del_data(value, table, column, conn):
    cursor = conn.cursor()
    cursor.execute("DELETE FROM %s WHERE %s = ?" % (table, column), (value,))
    conn.commit()


def insert_data(data, table, conn):
    cursor = conn.cursor()
    cursor.execute("INSERT INTO %s VALUES (?)" % table, (data,))
    conn.commit()


def crawling(start_url):
    conn = conn_db()
    create_db(conn)
    insert_db(start_url, conn)
    while True:
        row = get_data("lists", conn)
        if row is None:
            break                       # queue empty: crawl finished
        url = row[0]
        # Remove the URL from the queue first so a failing page
        # is not retried forever.
        del_data(url, "lists", "urls", conn)
        try:
            res = requests.get(url, timeout=30)  # bounded timeout per page
            ht = res.content
        except requests.RequestException:
            print "failed: %s" % url
            continue
        print len(get_all_count("lists", conn))  # URLs still queued
        soup = BeautifulSoup(ht, "html.parser")
        try:
            for tag in soup.findAll('a', href=True):
                # Resolve relative links against the page's scheme://host/.
                base = '{uri.scheme}://{uri.netloc}/'.format(uri=urlparse.urlparse(url))
                href = urlparse.urljoin(base, tag['href'])
                if (href not in get_all_data("visiteds", conn)
                        and '#' not in href
                        and not href.startswith("mailto")
                        and "javascript" not in href):
                    # New crawlable link: queue it and mark it as seen.
                    insert_data(href, "lists", conn)
                    insert_data(href, "visiteds", conn)
                elif href.startswith("mailto"):
                    email = urllib.unquote(href.split(':', 1)[1]).decode('utf-8')
                    if email not in get_all_data('emails', conn):
                        print email
                        insert_data(email, 'emails', conn)
        except Exception as exc:
            print "parse error on %s: %s" % (url, exc)
    conn.close()


def main():
    url = "https://www.ietf.org"
    crawling(url)


if __name__ == "__main__":
    main()
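
The script needs the requests and beautifulsoup4 packages (pip install requests beautifulsoup4) and runs under Python 2, since it uses urlparse, urllib.unquote and print statements. If more than one seed URL has to be crawled, crawling() could be driven by a multiprocessing worker pool. The sketch below is an assumption, not part of the paste itself: the module name, seed list and pool size are illustrative, and every worker would write to the same emails.db, so SQLite write locking may become a bottleneck.

# Sketch: crawl several seed URLs in parallel with a worker pool.
# Assumes the script above is saved as crawler.py; the seed list and
# pool size are illustrative only.
from multiprocessing import Pool

from crawler import crawling


def crawl_many(seeds, workers=4):
    pool = Pool(processes=workers)
    try:
        # Each worker runs the sequential crawler on one seed URL.
        pool.map(crawling, seeds)
    finally:
        pool.close()
        pool.join()


if __name__ == "__main__":
    crawl_many(["https://www.ietf.org", "https://www.iana.org"])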