alvations

Simple Crawl - Single domain crawler

Jan 4th, 2013
import urllib

def get_page(url):
  # Fetch a page and return its contents as a unicode string, or "" on failure.
  try:
    return urllib.urlopen(url).read().decode('utf8')
    # Alternative using urllib2:
    # req = urllib2.Request(url)
    # response = urllib2.urlopen(req)
    # return response.read().decode('utf8')
  except Exception:
    return ""

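# Rough Python 3 equivalent of get_page, for reference only (this paste is
# Python 2 code; urllib.urlopen moved to urllib.request.urlopen in Python 3):
#
#   from urllib.request import urlopen
#   def get_page(url):
#     try:
#       return urlopen(url).read().decode('utf8')
#     except Exception:
#       return ""
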
def getDomain(page):
  # Extract the domain name (e.g. "www.pmo.gov.sg") from a URL.
  d_start = 0
  if page[0:4] == "http":
    d_start = 7 # skip "http://" (first 7 characters)
  d_end = page.find(".gov.sg/") + 7 # end just after ".gov.sg" (7 characters)
  domain = page[d_start:d_end]
  #print domain
  return domain

def cleanURL(page, domain):
  #if page == None: return domain;
  clean_url = page
  # Prepend the domain to internal (relative) pages.
  if page[0] == "/" and page[-5:] == ".html":
    clean_url = "".join([domain, page])
  # Strip the scheme and insert "www." if it is missing from the URL.
  if clean_url[0:7] == "http://" and clean_url[7:11] != "www.":
    clean_url = "www." + clean_url[7:]
  elif clean_url[0:11] == "http://www.":
    clean_url = clean_url[7:]
  elif clean_url[0:12] == "https://www.":
    clean_url = clean_url[8:]
  # Re-prepend "http://" so every URL comes out in a uniform form.
  clean_url = "".join(["http://", clean_url])
  return clean_url

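# A quick illustration of cleanURL (the domain is taken from the seed site
# below; the relative path is a hypothetical internal link):
#   cleanURL("/content/pmosite/aboutpmo.html", "www.pmo.gov.sg")
#   -> "http://www.pmo.gov.sg/content/pmosite/aboutpmo.html"
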
def get_next_target_list(page, only_html=True):
  # Return the set of outgoing links found on the given page.
  import BeautifulSoup
  domain = getDomain(page) # Gets the domain from the url.
  soup = BeautifulSoup.BeautifulSoup(get_page(page))
  url_list = []
  for link in soup.findAll('a'):
    url = str(link.get('href'))
    # Skip empty, None and javascript urls.
    if url == "" or url == "None":
      continue
    if url[0:10] == "javascript":
      continue
    full_url = cleanURL(url, domain) # Cleans the url.
    # If crawling only html/htm pages.
    if only_html:
      if full_url[-5:] == ".html" or full_url[-4:] == ".htm":
        url_list.append(full_url)
    else:
      url_list.append(full_url)
  return set(url_list)

def add_to_index(index, url, html):
  # Append the html under an existing url entry, or create a new entry.
  for entry in index:
    if entry[0] == url:
      entry[1].append(html)
      return
  index.append([url, [html]])

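# The index is a list of [url, [html, ...]] pairs, e.g. starting from an
# empty index:
#   add_to_index(index, "http://www.pmo.gov.sg/content/pmosite/home.html", content)
#   index == [["http://www.pmo.gov.sg/content/pmosite/home.html", [content]]]
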
def union(p, q):
  # Merge two lists of urls without duplicates.
  return list(set(p).union(set(q)))

def crawl_web(seed, max_pages, max_depth):
  # Crawl out from the seed page, one depth level at a time (breadth-first).
  index = []
  tocrawl = [seed]; crawled = []
  depth = 0; next_depth = []
  while tocrawl and depth <= max_depth:
    page = tocrawl.pop()
    print page
    if page not in crawled and len(crawled) < max_pages:
      content = get_page(page)
      #if content == "": continue
      add_to_index(index, page, content)
      next_depth = union(next_depth, get_next_target_list(page))
      crawled.append(page)
    # Current level exhausted; move on to the next depth level.
    if not tocrawl:
      tocrawl, next_depth = next_depth, []
      depth += 1
  return index

#print get_page("http://www.pmo.gov.sg/content/pmosite/home.html")

seed = "http://www.pmo.gov.sg/content/pmosite/home.html"
#seed = "http://app.singaporeedu.gov.sg/ct/asp/index.asp"
'''seed = ['http://www.pmo.gov.sg', 'http://www.pmo.gov.sg/content/pmosite/aboutpmo.html']
for j in seed:
  print "here"
  print get_page(j)
  for i in get_next_target_list(j):
    print i'''

index = crawl_web(seed, 10000000000, 50000000)

#for i in index:
#  print i
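
# A minimal sketch of a smaller run (the limits above effectively mean "crawl
# everything"; the values here are just illustrative assumptions):
#
#   index = crawl_web(seed, 20, 2)   # at most 20 pages, 2 levels deep
#   for url, pages in index:
#     print url, len(pages[0])       # crawled url and page size in characters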