Advertisement
thiagobodruk

Python link crawler

Jul 4th, 2014
267
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. #!/usr/bin/env python
  2. # -*- encoding: utf-8 -*-
  3. from bs4 import BeautifulSoup
  4. import urllib.request
  5. import re
  6.  
  # Module-level work list of URLs: readList() and readUrl() append to it,
  # and crawl() iterates over it.
  url = list()
  8.  
  def readList():
      """Load the URLs listed in ``list.txt`` into the module-level ``url`` list.

      Every non-blank line of the file is appended to ``url`` with surrounding
      whitespace (including the line terminator) removed.  If the file cannot
      be opened or decoded, a message is printed instead of raising; entries
      appended before the failure are kept.
      """
      try:
          # ``with`` closes the handle even if iteration fails part-way.
          with open('list.txt', 'r', encoding='utf-8') as handle:
              for line in handle:
                  entry = line.strip()
                  # A blank line would otherwise end up as an empty URL that
                  # crawl() hands to urlopen().
                  if entry:
                      url.append(entry)
      except (OSError, UnicodeError):
          print('Cannot open the file list.txt!')
  16.  
  def readUrl():
      """Prompt for a single URL and append it to the module-level ``url`` list.

      Surrounding whitespace is stripped.  If the entry does not start with an
      ``http://`` or ``https://`` scheme (compared case-insensitively),
      ``http://`` is prepended.  Empty input is reported and nothing is added.
      """
      u = input('URL: ').strip()
      if not u:
          print('No URL entered!')
          return
      # Check the scheme prefix only: a substring search for "http" would also
      # match text later in the address (e.g. "example.com/http-guide").
      if not u.lower().startswith(('http://', 'https://')):
          u = 'http://' + u
      url.append(u)
  22.  
  def readPage(page=None):
      """Read an open page object, close it, and return the parsed document.

      Args:
          page: Object exposing ``read()`` and ``close()``, such as the
              response returned by ``urllib.request.urlopen``.  When omitted,
              the module-level name ``page`` is looked up, which is what the
              zero-argument form of this function relied on.

      Returns:
          The ``BeautifulSoup`` tree built from the page content.

      Raises:
          NameError: If no ``page`` argument is given and no module-level
              ``page`` exists.
      """
      if page is None:
          try:
              page = globals()['page']
          except KeyError:
              raise NameError("name 'page' is not defined") from None
      try:
          html = page.read()
      finally:
          # Release the connection even when reading fails.
          page.close()
      # Name the parser explicitly so the result does not depend on which
      # optional parsers happen to be installed.
      return BeautifulSoup(html, 'html.parser')
  27.  
  def options():
      """Show the menu and run the URL source chosen by the user.

      Option ``1`` calls ``readList()`` and option ``2`` calls ``readUrl()``.
      Anything else (non-numeric text or an unknown number) is reported and
      the prompt is shown again, rather than crashing on ``int()`` or
      silently doing nothing.
      """
      print('Web Crawler v1.0\n')
      actions = {1: readList, 2: readUrl}
      while True:
          choice = input('[1] Crawl from List \n[2] Crawl from URL\n\nOption:').strip()
          try:
              action = actions[int(choice)]
          except (ValueError, KeyError):
              # ValueError: not a number; KeyError: number not in the menu.
              print('Invalid option: ' + choice)
              continue
          action()
          return
  35.  
  def crawl():
      """Fetch every URL in the module-level ``url`` list and record its links.

      For each page, the ``href`` of every ``<a>`` tag is resolved against the
      page URL, de-duplicated, sorted, printed, and written one per line to
      ``links.txt`` (the file is truncated at the start of the run).

      Bare ``#`` and ``/`` hrefs are skipped.  Anchors without an ``href`` are
      reported and skipped.  A URL that cannot be fetched is reported and the
      crawl continues with the next one.
      """
      from urllib.parse import urljoin

      # ``with`` ensures links.txt is flushed and closed even if a page fails.
      with open('links.txt', 'w', encoding='utf-8') as out:
          for u in url:
              u = u.strip()
              if not u:
                  continue
              try:
                  with urllib.request.urlopen(u) as page:
                      html = page.read()
              except (OSError, ValueError) as err:
                  # URLError/HTTPError derive from OSError; ValueError covers
                  # malformed or scheme-less URLs.
                  print('ERROR: Cant open ' + u + ' (' + str(err) + ')')
                  continue
              soup = BeautifulSoup(html, 'html.parser')
              links = set()
              for a in soup.find_all('a'):
                  href = a.get('href')
                  if href is None:
                      print('ERROR: Cant parse ' + str(a))
                      continue
                  href = href.strip()
                  # Filter before resolving: once joined with the page URL
                  # these placeholders can no longer be recognised.
                  if href in ('#', '/'):
                      continue
                  # urljoin keeps absolute links intact and resolves relative
                  # ones correctly, unlike plain string concatenation.
                  links.add(urljoin(u, href))
              for link in sorted(links):
                  print(link + '\n')
                  out.write(link + '\n')
  # Run the interactive crawler only when executed as a script, so importing
  # this module does not prompt for input or touch the network.
  if __name__ == '__main__':
      options()
      crawl()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement