import sys
import os
import urllib2
from collections import deque
from time import localtime, strftime
from urlparse import urlparse
from bs4 import BeautifulSoup
import lxml

#
#  Start Main function - Called at the very end to allow every function to be declared first.
#
def main():

    # Starter links!
    links = deque(["http://www.reddit.com/r/circlejerk/comments/18dzo9",
            "http://xkcd.com",
            "http://reddit.com",
            "http://news.google.com",
            "http://www.cnn.com",
            "http://abcnews.go.com/",
            "http://en.wikipedia.org/wiki/Main_Page",
            "http://popurls.com/",
            "http://www.foxnews.com/",
            "http://twitter.com/news"
            ])
    try:
        # The guts of our program are called here.
        crawl_web(links, set(), 10000)
    except RuntimeError as r:
        log(str(r))

# END main()


#
#  get_page() takes a url, tries to open a socket, reads the html,
#  then closes the socket and returns the html as a string. Also handles
#  exceptions for unreachable hosts and malformed URLs.
#
def get_page(url):
    try:
        usock = urllib2.urlopen(url)
        data = usock.read()
        usock.close()
        return data
    except IOError:
        log(url, "bad_link")
    except ValueError:
        log(url, "bad_page")
# END get_page(url)
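
#
#  NOTE (added sketch, not part of the original paste): get_page() above uses
#  no timeout and no User-Agent header, so a slow or bot-hostile server can
#  stall the crawl. The variant below is a minimal, hypothetical alternative
#  using only the urllib2 calls already imported; the name
#  get_page_with_timeout and the 10-second default are assumptions made for
#  illustration.
#
def get_page_with_timeout(url, timeout=10):
    try:
        # Some sites reject the default Python-urllib User-Agent, so send a custom one.
        request = urllib2.Request(url, headers={'User-Agent': 'crawlies/0.1'})
        usock = urllib2.urlopen(request, timeout=timeout)
        data = usock.read()
        usock.close()
        return data
    except IOError:
        log(url, "bad_link")
    except ValueError:
        log(url, "bad_page")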


#
#  get_all_links() takes a page URL, fetches its html, and returns a list of
#  all the absolute links found in the page.
#
#  TODO: add the base path to the args list, so that links like
#  /blah/blah.html can be found and fixed, instead of being ignored
#  (see the urljoin sketch after this function).
#
def get_all_links(url):
    page = get_page(url)
    links = []

    # Check to make sure the page is valid. If it isn't, log it.
    if page:
        # Break down our URL so relative links can be resolved against it.
        u = urlparse(url)
        base_url = u.scheme + "://" + u.hostname

        # Parse the HTML of the page (lxml is imported above, so use it as the parser).
        soup = BeautifulSoup(page, 'lxml')

        # Get all the links in the page, and add them to our links variable.
        for link in soup.findAll('a'):
            try:
                # Looks like a valid absolute URL.
                if (link['href'][0:7] == "http://") or (link['href'][0:8] == "https://"):
                    links.append(link['href'])
                # This might be valid, but is a relative link. We need to
                # add it to the base url.
                elif link['href'][0] == '/':
                    newLink = base_url + link['href']
                    links.append(newLink)
                # Doesn't look like a valid url, log it as such.
                else:
                    log(link['href'], "bad_link")
            except KeyError as k:
                # <a> tag with no href attribute.
                log("Key Error: " + str(link), "bad_link")
            except IndexError as i:
                # Empty href attribute.
                log("Index Error Detected: " + str(i))
        # End for link loop
    # END if page
    else:
        log(url, "bad_page")

    return links
# END get_all_links(page)
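
#
#  NOTE (added sketch, not part of the original paste): the TODO above asks for
#  proper handling of relative links. The standard-library urljoin() already
#  does this, resolving "/blah/blah.html", "blah.html", and "../blah.html"
#  alike against the URL of the page they were found on. This is a minimal,
#  hypothetical variant of the link-filtering step; the function name
#  resolve_link is an assumption.
#
from urlparse import urljoin

def resolve_link(page_url, href):
    # Absolute links come back unchanged; relative links are resolved
    # against the page URL.
    absolute = urljoin(page_url, href)
    # Only keep http/https results (drops mailto:, javascript:, etc.).
    if absolute.startswith("http://") or absolute.startswith("https://"):
        return absolute
    return None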


#
#  The crawl_web function takes a seed queue of URLs, a set used as the
#  visited-URL database, and a page limit as arguments, and crawls pages
#  gathering links until it has crawled num_pages pages or runs out of links.
#
def crawl_web(seed, db, num_pages):
    tocrawl = seed
    crawled = []
    pages_crawled = 1

    while pages_crawled < num_pages and len(tocrawl) > 0:
        # Get the next page from the tocrawl queue.
        page = tocrawl.popleft()

        # Check to make sure we haven't been to this page already.
        if page not in crawled:
            # Grab all the links from the page, and put them in our links list.
            links = get_all_links(page)

            # Queue up every link we haven't visited yet.
            for linky in links:
                if not visited(linky, db):
                    linky = linky.replace("'", "")
                    tocrawl.append(linky)
                    insert_url(db, linky)
                # end not visited if
            # End for linky in links

            # Add to the visited list so we don't come back here again.
            crawled.append(page)

            # Print a completion percentage.
            print str("%0.2f" % ((float(pages_crawled) / float(num_pages)) * 100)) + "% complete. (" + str(pages_crawled) + " / " + str(num_pages) + ")"

            # Increment pages crawled.
            pages_crawled += 1
        # END if page not in crawled
    # END while pages_crawled < num_pages and len(tocrawl) > 0
# END crawl_web(seed)
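
#
#  NOTE (added sketch, not part of the original paste): crawl_web() above
#  fetches pages as fast as it can and ignores robots.txt. A real crawl would
#  normally throttle itself and check robots.txt first. The sketch below shows
#  one way to do both with only the standard library; the function name, the
#  "crawlies" user-agent string, and the 1-second delay are assumptions made
#  for illustration.
#
import time
import robotparser

def allowed_by_robots(url, agent="crawlies"):
    # Fetch and parse robots.txt for the url's host, then ask whether this
    # user-agent may fetch the url. Failures are treated as "allowed".
    try:
        u = urlparse(url)
        rp = robotparser.RobotFileParser()
        rp.set_url(u.scheme + "://" + u.hostname + "/robots.txt")
        rp.read()
        return rp.can_fetch(agent, url)
    except IOError:
        return True

# Inside the while loop of crawl_web(), one could then do something like:
#     if not allowed_by_robots(page):
#         continue
#     time.sleep(1)   # be polite: at most one request per second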

#
#  The visited function takes a url as an argument, and if it's in the
#  database (the set passed in as db), returns True. Otherwise, returns False.
#
def visited(url, db):
    try:
        url = url.replace("'", "")
        url = url.replace('"', "")
        return url in db
    except UnicodeEncodeError:
        return True
# END visited(url)


#
#  log() writes a timestamped message to a log file under ./crawlies/logs/.
#  The log_type argument picks the file (bad_link, bad_page, valid_page);
#  anything else goes to the general error log.
#
def log(message, log_type='error'):
    try:
        # Create a custom timestamp.
        timestamp = strftime("%B %d, %Y @ %H:%M - ", localtime())

        # Replace them pesky single and double quotes.
        message = message.replace("'", "")
        message = message.replace('"', "")

        # Added because the OP's code assumed all directories already exist.
        try:
            os.makedirs('./crawlies/logs')
        except OSError:
            pass

        # Find what type of message it is, and log that mofo. Default is error.
        if log_type == "bad_link":
            log_file = open('./crawlies/logs/bad_link.txt', 'a')
        elif log_type == "bad_page":
            log_file = open('./crawlies/logs/bad_page.txt', 'a')
        elif log_type == "valid_page":
            log_file = open('./crawlies/logs/valid_page.txt', 'a')
        else:
            log_file = open('./crawlies/logs/error_log.txt', 'a')
        log_file.write(timestamp + message + "\r")

        # Close the file.
        log_file.close()
    except UnicodeEncodeError:
        log_file = open('./crawlies/logs/unicode_error.txt', 'a')
        log_file.write(timestamp + "Unicode Error Detected\r")
        log_file.close()
# END log(message, log_type)
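
#
#  NOTE (added sketch, not part of the original paste): the hand-rolled log()
#  above reopens a file for every message. The standard logging module can do
#  the same job with less code; this is a minimal, hypothetical setup for just
#  the bad-link log, with the format string chosen only for illustration.
#
import logging

def make_bad_link_logger():
    # Assumes ./crawlies/logs already exists (log() above creates it on first use).
    logger = logging.getLogger("crawlies.bad_link")
    handler = logging.FileHandler("./crawlies/logs/bad_link.txt")
    handler.setFormatter(logging.Formatter("%(asctime)s - %(message)s"))
    logger.addHandler(handler)
    logger.setLevel(logging.INFO)
    return logger

# Usage would then look like:
#     bad_link_logger = make_bad_link_logger()
#     bad_link_logger.info(url)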


#
#  insert_url() strips quotes from a url and adds it to the visited-URL
#  set (db), logging any url that can't be encoded.
#
def insert_url(db, url):
    try:
        url = url.replace("'", "")
        url = url.replace('"', "")
        db.add(url)
    except UnicodeEncodeError:
        log(url, "bad_link")
# END insert_url(db, url)


# Call the main function, starting the program.
if __name__ == '__main__':
    main()
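
#
#  NOTE (added sketch, not part of the original paste): for a quick test run,
#  one could bypass main() and call crawl_web() directly with a much smaller
#  page budget, e.g. from an interactive session (the filename crawler.py is
#  an assumption):
#
#      from crawler import crawl_web
#      from collections import deque
#      crawl_web(deque(["http://xkcd.com"]), set(), 25)
#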