def get_page(url):
    # Fetch a URL and return its body as a unicode string, or "" on failure.
    try:
        import urllib
        return urllib.urlopen(url).read().decode('utf8')
        # Alternative using urllib2:
        # import urllib2
        # req = urllib2.Request(url)
        # response = urllib2.urlopen(req)
        # return response.read().decode('utf8')
    except Exception:
        return ""
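# Expected behaviour, as a sketch (inferred from the code above, not verified):
#   get_page("http://www.pmo.gov.sg/")  -> the page's HTML as a unicode string
#   get_page("not-a-url")               -> ""  (any error is swallowed above)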
def getDomain(url):
    # Extract the domain name (e.g. "www.pmo.gov.sg") from a .gov.sg URL.
    d_start = 0
    if url[0:8] == "https://":
        d_start = 8  # skip "https://" (8 characters)
    elif url[0:4] == "http":
        d_start = 7  # skip "http://" (7 characters)
    d_end = url.find(".gov.sg/") + 7  # ".gov.sg" is 7 characters long
    return url[d_start:d_end]
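# For example (an assumption inferred from the slicing above):
#   getDomain("http://www.pmo.gov.sg/content/pmosite/home.html") -> "www.pmo.gov.sg"
# Note that if ".gov.sg/" is absent, find() returns -1, d_end becomes 6 and the
# result comes back empty or truncated, so this helper assumes .gov.sg URLs.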
def cleanURL(page, domain):
    # Normalise a link into a full "http://www." URL.
    clean_url = page
    # Prepend the domain to internal (relative) links.
    if page[0] == "/" and page[-5:] == ".html":
        clean_url = "".join([domain, page])
    # Strip any scheme and insert "www." if missing; a uniform
    # "http://" prefix is re-attached at the end.
    if clean_url[0:7] == "http://" and clean_url[7:11] != "www.":
        clean_url = "www." + clean_url[7:]
    elif clean_url[0:11] == "http://www.":
        clean_url = clean_url[7:]
    elif clean_url[0:12] == "https://www.":
        clean_url = clean_url[8:]
    clean_url = "".join(["http://", clean_url])
    return clean_url
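# A couple of sanity checks, traced by hand from the branches above (assumed,
# not verified output):
#   cleanURL("/content/pmosite/home.html", "www.pmo.gov.sg")
#       -> "http://www.pmo.gov.sg/content/pmosite/home.html"
#   cleanURL("https://www.pmo.gov.sg/a.html", "www.pmo.gov.sg")
#       -> "http://www.pmo.gov.sg/a.html"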
def get_next_target_list(page, only_html=True):
    # Fetch the page at this URL and return the set of cleaned outbound links.
    import BeautifulSoup  # BeautifulSoup 3; with bs4 use "from bs4 import BeautifulSoup"
    domain = getDomain(page)  # Gets the domain from the url.
    soup = BeautifulSoup.BeautifulSoup(get_page(page))
    url_list = []
    for link in soup.findAll('a'):
        url = str(link.get('href'))
        # Skip empty, missing (None) and javascript: hrefs.
        if url == "" or url == "None":
            continue
        if url[0:10] == "javascript":
            continue
        full_url = cleanURL(url, domain)  # Cleans the url.
        # If crawling only html/htm pages, filter by extension.
        if only_html:
            if full_url[-5:] == ".html" or full_url[-4:] == ".htm":
                url_list.append(full_url)
        else:
            url_list.append(full_url)
    return set(url_list)
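# A minimal usage sketch (assumes Python 2, BeautifulSoup 3 and network access):
#   for link in get_next_target_list("http://www.pmo.gov.sg/content/pmosite/home.html"):
#       print link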
def add_to_index(index, url, html):
    # Append html under an existing url entry, or start a new entry.
    for entry in index:
        if entry[0] == url:
            entry[1].append(html)
            return
    index.append([url, [html]])
def union(p, q):
    return list(set(p).union(set(q)))
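# e.g. union([1, 2], [2, 3]) -> [1, 2, 3] (element order is not guaranteed,
# since the lists pass through an unordered set)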
def crawl_web(seed, max_pages, max_depth):
    # Crawl outward from seed one depth level at a time (level-order, not
    # depth-first), up to max_pages pages or max_depth levels.
    index = []
    tocrawl = [seed]
    crawled = []
    depth = 0
    next_depth = []
    while tocrawl and depth <= max_depth:
        page = tocrawl.pop()
        print page
        if page not in crawled and len(crawled) < max_pages:
            content = get_page(page)
            add_to_index(index, page, content)
            next_depth = union(next_depth, get_next_target_list(page))
            crawled.append(page)
        if not tocrawl:
            # Current level exhausted: descend to the next depth level.
            tocrawl, next_depth = next_depth, []
            depth += 1
    return index
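# A gentler invocation for testing, as a sketch (the limits here are arbitrary
# small values, not from the original script):
#   index = crawl_web("http://www.pmo.gov.sg/content/pmosite/home.html", 20, 2)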
#print get_page("http://www.pmo.gov.sg/content/pmosite/home.html")
seed = "http://www.pmo.gov.sg/content/pmosite/home.html"
#seed = "http://app.singaporeedu.gov.sg/ct/asp/index.asp"
'''Ad-hoc link-extraction test:
seed = ['http://www.pmo.gov.sg', 'http://www.pmo.gov.sg/content/pmosite/aboutpmo.html']
for j in seed:
    print "here"
    print get_page(j)
    for i in get_next_target_list(j):
        print i
'''
index = crawl_web(seed, 10000000000, 50000000)  # effectively unbounded limits
#for i in index:
#    print i
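# A hypothetical lookup helper (not part of the original paste) for querying
# the [url, [content]] index that crawl_web returns:
def lookup(index, url):
    # Return the stored content list for url, or None if it was never crawled.
    for entry in index:
        if entry[0] == url:
            return entry[1]
    return None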