#!/usr/bin/env python3
# Simple breadth-first email harvester: starting from a seed URL, it queues
# every new link in SQLite and records any mailto: addresses it finds.
import sqlite3
import urllib.parse

import requests
from bs4 import BeautifulSoup


def conn_db():
    return sqlite3.connect('emails.db')


def create_db(conn):
    cursor = conn.cursor()
    # Create the crawl queue, the visited set, and the result table.
    cursor.execute("CREATE TABLE IF NOT EXISTS lists (urls text)")
    cursor.execute("CREATE TABLE IF NOT EXISTS visiteds (visited text)")
    cursor.execute("CREATE TABLE IF NOT EXISTS emails (email text)")
    conn.commit()


def insert_db(url, conn):
    cursor = conn.cursor()
    # Parameterized queries keep URLs containing quotes from breaking the SQL.
    cursor.execute("SELECT urls FROM lists WHERE urls = ?", (url,))
    if not cursor.fetchall():
        # Insert a row of data, then save (commit) the changes.
        cursor.execute("INSERT INTO lists VALUES (?)", (url,))
        cursor.execute("INSERT INTO visiteds VALUES (?)", (url,))
        conn.commit()


# Table and column names cannot be bound as SQL parameters; the helpers below
# interpolate them directly, which is safe here because every caller passes a
# hard-coded identifier.
def get_all_count(table, conn):
    cursor = conn.cursor()
    return cursor.execute("SELECT * FROM %s" % table).fetchall()


def get_all_data(table, conn):
    cursor = conn.cursor()
    return [r[0] for r in cursor.execute("SELECT * FROM %s" % table).fetchall()]


def get_data(table, conn):
    cursor = conn.cursor()
    return cursor.execute("SELECT * FROM %s" % table).fetchone()


def del_data(url, table, column, conn):
    cursor = conn.cursor()
    cursor.execute("DELETE FROM %s WHERE %s = ?" % (table, column), (url,))
    conn.commit()


def insert_data(data, table, conn):
    cursor = conn.cursor()
    cursor.execute("INSERT INTO %s VALUES (?)" % table, (data,))
    conn.commit()


def crawling(uri):
    conn = conn_db()
    create_db(conn)
    insert_db(uri, conn)
    ln = len(get_all_count("lists", conn))
    conn.close()
    while ln > 0:
        ht = None
        conn = conn_db()
        url = get_data("lists", conn)[0]
        try:
            # A finite timeout keeps one dead host from stalling the crawl.
            res = requests.get(url, timeout=30)
            ht = res.content
        except requests.RequestException:
            print(url)
            # Drop unreachable URLs so the queue still drains on errors.
            del_data(url, "lists", "urls", conn)
        if ht is not None:
            soup = BeautifulSoup(ht, "html.parser")
            del_data(url, "lists", "urls", conn)
            print(len(get_all_count("lists", conn)))
            try:
                for tag in soup.find_all('a', href=True):
                    # Resolve relative links against the page's scheme and host.
                    base = '{u.scheme}://{u.netloc}/'.format(u=urllib.parse.urlparse(url))
                    tag['href'] = urllib.parse.urljoin(base, tag['href'])
                    href = tag['href']
                    if (href not in get_all_data("visiteds", conn)
                            and '#' not in href
                            and not href.startswith("mailto")
                            and "javascript" not in href):
                        insert_data(href, "lists", conn)
                        insert_data(href, "visiteds", conn)
                    elif href.startswith("mailto"):
                        email = urllib.parse.unquote(href.split(':', 1)[1])
                        if email not in get_all_data('emails', conn):
                            print(email)
                            insert_data(email, 'emails', conn)
            except Exception:
                pass
        # Recompute the queue length so the loop ends when the queue empties
        # (the original never updated ln, so the loop could not terminate).
        ln = len(get_all_count("lists", conn))
        conn.close()


def main():
    url = "https://www.ietf.org"
    crawling(url)


if __name__ == "__main__":
    main()
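
The original script imports multiprocessing.Pool but never uses it. A minimal sketch of how it might have been intended, assuming several seed URLs (the SEEDS list below is illustrative, not from the paste): one worker per seed, each running crawling() on its own. Caveat: all workers share emails.db, and SQLite serializes writers, so heavy contention can raise "database is locked" errors; this sketch assumes a small number of seeds and light write traffic.

# Hypothetical parallel driver (assumption: not part of the original script).
from multiprocessing import Pool

SEEDS = [
    "https://www.ietf.org",
    # additional seed URLs would go here
]

def parallel_main():
    # One worker per seed; each crawling() call opens its own SQLite
    # connection, but all of them write to the same emails.db file.
    with Pool(processes=len(SEEDS)) as pool:
        pool.map(crawling, SEEDS)

if __name__ == "__main__":
    parallel_main()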
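
Once a crawl has run, the harvested addresses can be read back with a few lines of sqlite3. A usage sketch, not part of the paste; dump_emails is a hypothetical helper name:

import sqlite3

def dump_emails(db_path='emails.db'):
    # Print every address the crawler recorded in the emails table.
    conn = sqlite3.connect(db_path)
    for (email,) in conn.execute("SELECT email FROM emails"):
        print(email)
    conn.close()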