alvations

Simple Crawl - Single domain crawler

Jan 4th, 2013
import urllib

def get_page(url):
  # Fetch a page and return its contents as a unicode string, or "" on failure.
  try:
    return urllib.urlopen(url).read().decode('utf8')
    # Alternative using urllib2:
    # req = urllib2.Request(url)
    # response = urllib2.urlopen(req)
    # return response.read().decode('utf8')
  except Exception:
    return ""

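# Rough Python 3 equivalent of get_page, for reference only (this paste is
# Python 2 code; urllib.urlopen moved to urllib.request.urlopen in Python 3):
#
#   from urllib.request import urlopen
#   def get_page(url):
#     try:
#       return urlopen(url).read().decode('utf8')
#     except Exception:
#       return ""
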
def getDomain(page):
  # Extract the domain name (e.g. "www.pmo.gov.sg") from a URL.
  d_start = 0
  if page[0:4] == "http":
    d_start = 7 # skip "http://" (first 7 characters)
  d_end = page.find(".gov.sg/") + 7 # end just after ".gov.sg" (7 characters)
  domain = page[d_start:d_end]
  #print domain
  return domain

def cleanURL(page, domain):
  #if page == None: return domain;
  clean_url = page
  # Prepend the domain to internal (relative) pages.
  if page[0] == "/" and page[-5:] == ".html":
    clean_url = "".join([domain, page])
  # Strip the scheme and insert "www." if it is missing from the URL.
  if clean_url[0:7] == "http://" and clean_url[7:11] != "www.":
    clean_url = "www." + clean_url[7:]
  elif clean_url[0:11] == "http://www.":
    clean_url = clean_url[7:]
  elif clean_url[0:12] == "https://www.":
    clean_url = clean_url[8:]
  # Re-prepend "http://" so every URL comes out in a uniform form.
  clean_url = "".join(["http://", clean_url])
  return clean_url

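# A quick illustration of cleanURL (the domain is taken from the seed site
# below; the relative path is a hypothetical internal link):
#   cleanURL("/content/pmosite/aboutpmo.html", "www.pmo.gov.sg")
#   -> "http://www.pmo.gov.sg/content/pmosite/aboutpmo.html"
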
def get_next_target_list(page, only_html=True):
  # Return the set of outgoing links found on the given page.
  import BeautifulSoup
  domain = getDomain(page) # Gets the domain from the url.
  soup = BeautifulSoup.BeautifulSoup(get_page(page))
  url_list = []
  for link in soup.findAll('a'):
    url = str(link.get('href'))
    # Skip empty, None and javascript urls.
    if url == "" or url == "None":
      continue
    if url[0:10] == "javascript":
      continue
    full_url = cleanURL(url, domain) # Cleans the url.
    # If crawling only html/htm pages.
    if only_html:
      if full_url[-5:] == ".html" or full_url[-4:] == ".htm":
        url_list.append(full_url)
    else:
      url_list.append(full_url)
  return set(url_list)

def add_to_index(index, url, html):
  # Append the html under an existing url entry, or create a new entry.
  for entry in index:
    if entry[0] == url:
      entry[1].append(html)
      return
  index.append([url, [html]])

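# The index is a list of [url, [html, ...]] pairs, e.g. starting from an
# empty index:
#   add_to_index(index, "http://www.pmo.gov.sg/content/pmosite/home.html", content)
#   index == [["http://www.pmo.gov.sg/content/pmosite/home.html", [content]]]
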
def union(p, q):
  # Merge two lists of urls without duplicates.
  return list(set(p).union(set(q)))

def crawl_web(seed, max_pages, max_depth):
  # Crawl out from the seed page, one depth level at a time (breadth-first).
  index = []
  tocrawl = [seed]; crawled = []
  depth = 0; next_depth = []
  while tocrawl and depth <= max_depth:
    page = tocrawl.pop()
    print page
    if page not in crawled and len(crawled) < max_pages:
      content = get_page(page)
      #if content == "": continue
      add_to_index(index, page, content)
      next_depth = union(next_depth, get_next_target_list(page))
      crawled.append(page)
    # Current level exhausted; move on to the next depth level.
    if not tocrawl:
      tocrawl, next_depth = next_depth, []
      depth += 1
  return index

#print get_page("http://www.pmo.gov.sg/content/pmosite/home.html")

seed = "http://www.pmo.gov.sg/content/pmosite/home.html"
#seed = "http://app.singaporeedu.gov.sg/ct/asp/index.asp"
'''seed = ['http://www.pmo.gov.sg', 'http://www.pmo.gov.sg/content/pmosite/aboutpmo.html']
for j in seed:
  print "here"
  print get_page(j)
  for i in get_next_target_list(j):
    print i'''

index = crawl_web(seed, 10000000000, 50000000)

#for i in index:
#  print i
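
# A minimal sketch of a smaller run (the limits above effectively mean "crawl
# everything"; the values here are just illustrative assumptions):
#
#   index = crawl_web(seed, 20, 2)   # at most 20 pages, 2 levels deep
#   for url, pages in index:
#     print url, len(pages[0])       # crawled url and page size in characters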