from bs4 import BeautifulSoup
import html5lib
import urllib2
import robotparser
# Written by David Harris, 2012
# Colorado Springs, CO, USA
# e-mail: forbiddenvoid@gmail.com
# Released under a Creative Commons License for use by Udacity.com
# -----------------------------
#
# The following is my 'finished' search engine, built as an
# entry to the contest at the end of the first run of Udacity's
# CS101 course: Build a Search Engine.
# A few preliminary notes:
# I have used two external modules: BeautifulSoup 4, for cleaning
# up web page content, and html5lib, which serves as a parser for BS4.
# urllib2 and robotparser are in the standard library for Python 2.7.
# I was pretty excited to find out that there was already a tool for parsing
# robots.txt, because it seemed like a pretty daunting task.
# Otherwise, I have adapted the code from the Udacity course and made a number
# of modifications:
#
# 1. Allowing for multiple search terms (exact phrase matches only).
# 2. I modified the 'URank' algorithm. It now returns more relevant results.
#    - A more thorough explanation is attached to the multi_ranks function.
# 3. I have made the crawler more polite by following the instructions in robots.txt.
# 4. I am incorporating both the max_pages and max_depth arguments in crawl_web.
#    - This allows more control over how much is crawled.
# 5. I incorporated my own split_string function instead of the built-in .split.
# 6. I have set everything to lower case to eliminate duplicate, unconnected index entries.
# 7. I have built a console UI for executing searches with a number of features.
#    - Search results are limited to a certain number per 'page.' The default is 10, changeable by the user.
#    - Able to navigate forward and backward through result pages.
#    - Able to jump to a specific result page.
#    - Constraints to prevent going to pages out of bounds.
#    - The results page lists the number of matches, number of pages, current page and current result range.
# There is a brief explanation of each function preceding it. Feel free to e-mail me if you have
# questions about a specific function or about any part of the code.
# crawl_web is the main function of the web crawler portion of the code.
# This function builds an index of keywords, a directed graph of links,
# and stores web page content in a cache for later use. (The contents are not
# currently written to file due to time constraints, so the crawler must be
# run prior to every search. This will be fixed at a later time, but for now
# the code is more important than the execution.)
# A few notes:
# Twitter statuses are not blocked by twitter.com's robots.txt file. I have built a filter into
# the function to avoid most of these pages, because they would quickly overrun the crawler graph.
# 199.59.148.11 is twitter.com's public IP address.
#
# .RobotFileParser() and can_fetch() are both part of the robotparser module. I enclosed the .read()
# and can_fetch() calls in a try block because I got errors on a few pages. For simplicity, an error raised
# while trying to access a robots.txt file defaults to "not allowed to crawl."
#
# BeautifulSoup() is the primary method used when creating a BS4 object out of the HTML content of
# a web page. It adds some nice features to the object, of which I use only three in this whole program, to
# preserve the 101 simplicity. .prettify() organizes the content into a neat structure, and get_text() extracts
# all of the non-HTML text from the page. (This currently also includes javascript; I'm looking into
# ways to remove the js as well.)
#
# I changed crawled from a list to a dictionary to speed up lookup times. I found that lookup time increased
# dramatically starting at around 2500 pages crawled with the list. The hashed dictionary structure is
# much faster.
def crawl_web(seed, max_depth, max_pages):
    tocrawl = [seed]
    crawled = {}
    graph = {}
    index = {}
    cache = {}
    depth = [0]
    num_pages = 0
    while tocrawl and num_pages < max_pages:
        page = tocrawl.pop(0)
        page = page.lower()
        if page[len(page) - 1] == '/':
            page = page[:len(page) - 1]
        not_twitter = True
        if (page.find('twitter.com') > 0 or page.find('199.59.148.11') > 0):
            if len(page) > 28:
                not_twitter = False
            else:
                not_twitter = True
        current_depth = depth.pop(0)
        robot = robotparser.RobotFileParser()
        robot.set_url(roboize(page))
        try:
            robot.read()
            allowed = robot.can_fetch("*", page)
        except:
            allowed = False
        if page not in crawled and current_depth <= max_depth and num_pages < max_pages and allowed and not_twitter:
            num_pages += 1
            try:
                soup = BeautifulSoup(get_page(page), "html5lib")
                soup.prettify()
            except:
                soup = ''
            if soup:
                content = soup.get_text()
                cache[page] = content
                outlinks = get_all_links(soup, page)
                add_page_to_index(index, page, content)
            else:
                outlinks = []
            for link in outlinks:
                depth.append(current_depth + 1)
            graph[page] = outlinks
            union(tocrawl, outlinks)
            crawled[page] = True
    return index, graph, cache
# This is a fairly simple function that creates the robots.txt url from the page being crawled.
def roboize(page):
    start_page = page.find('/') + 2
    if page.find('/', start_page) == -1:
        return page + '/robots.txt'
    else:
        return page[:page.find('/', start_page)] + '/robots.txt'
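# For illustration (the URLs below are just examples), roboize maps a page URL to the
# robots.txt URL at the root of its host:
#   roboize('http://www.udacity.com/cs101')  ->  'http://www.udacity.com/robots.txt'
#   roboize('http://www.udacity.com')        ->  'http://www.udacity.com/robots.txt'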
# Pretty straightforward. This function contacts a webpage and returns the contents.
def get_page(url):
    try:
        return urllib2.urlopen(url, None, 10).read()
    except:
        return ""
# This is the third feature I use from BeautifulSoup: soup.find_all('a') with link.get('href')
# gets all of the links (internal and external) on a page. I've modified get_all_links from what
# we did in class to convert internal links to their full external counterparts.
def get_all_links(soup, page):
    links = []
    last = page.find('/', 8)
    if last > 0:
        page = page[:last]
    for link in soup.find_all('a'):
        if link.get('href'):
            if link.get('href')[0] == '/':
                links.append(page + link.get('href'))
            else:
                links.append(link.get('href'))
    return links
# The following two functions work in tandem to split all of the content of a web page and
# add each word to the index, along with the url of the page and a list of positions on the page
# where the word appears. The positions are needed for searching multiple terms.
def add_page_to_index(index, url, content):
    words = split_string(content, "!@#$%^&*(),./?><[]}\"{':;=-~`|\\ \n")
    counter = 0
    for word in words:
        counter += 1
        add_to_index(index, regex(word), url, counter)
def add_to_index(index, keyword, url, pos):
    if keyword in index:
        if url in index[keyword]:
            index[keyword][url].append(pos)
        else:
            index[keyword][url] = [pos]
    else:
        index[keyword] = {url: [pos]}
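# A sketch of the resulting structure (the URL and positions are made up): if the word
# 'engine' appears as the 3rd and 40th token of a page, its index entry looks like
#   index['engine'] == {'http://example.com/page': [3, 40]}
# i.e. keyword -> {url: [positions]}, which is the shape multi_lookup relies on below.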
# Logical Set Union, nothing to see here, move along. :)
def union(a, b):
    for e in b:
        if e not in a:
            a.append(e)
# My split_string function. There are more efficient ones, but
# since this is the one I wrote for class, I used it.
def split_string(source, splitlist):
    end_split = []
    if source == "":
        return end_split
    marker = 0
    for pos in range(0, len(source) - 1):
        for j in range(0, len(splitlist)):
            if source[pos] == splitlist[j]:
                if len(source[marker:pos]) > 0:
                    end_split.append(source[marker:pos])
                    marker = pos + 1
                    break
                else:
                    marker = pos + 1
    # Handle the final character separately, since the loop above stops one short.
    pos = len(source) - 1
    flag = False
    for j in range(0, len(splitlist)):
        if source[pos] == splitlist[j]:
            if len(source[marker:pos]) > 0:
                end_split.append(source[marker:pos])
            flag = True
    if not flag and len(source[marker:]) > 0:
        end_split.append(source[marker:])
    return end_split
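# A quick illustration with a made-up input: punctuation and whitespace in splitlist are
# treated as separators and empty pieces are dropped (lower-casing happens later, in regex):
#   split_string("The quick, brown fox!", "!, ")  ->  ['The', 'quick', 'brown', 'fox']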
# I don't have time to write a real regex parser, but this function eliminates
# most possessives and converts all words to lowercase.
def regex(word):
    result = word.lower()
    if result[-2:] == "'s":
        result = result[:-2]
    return result
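# A couple of illustrative calls: regex("David's") -> "david", regex("Engine") -> "engine".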
# This function looks up the given query and returns all of the urls where the query
# appears exactly. I'd considered doing a .find on the cache, but quickly realized that
# would take a very, very long time as the cache increased in size. Essentially, this
# function gets a list of possible urls from the first keyword and eliminates all of
# those that do not fit the criteria (pos + 1) on subsequent keywords.
def multi_lookup(index, query):
    result = []
    possible = []
    not_possible = []
    for keyword in query:
        if keyword not in index:
            return None
    for url in index[query[0]]:
        for pos in index[query[0]][url]:
            possible.append([url, pos])
    for i in range(0, len(query) - 1):
        for url in possible:
            if url[0] not in index[query[i + 1]]:
                not_possible.append(url)
            else:
                flag = False
                for pos in index[query[i + 1]][url[0]]:
                    if url[1] + i + 1 == pos:
                        flag = True
                if not flag:
                    not_possible.append(url)
    for entry in not_possible:
        if entry in possible:
            possible.remove(entry)
    for entry in possible:
        if entry[0] not in result:
            result.append(entry[0])
    return result
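# A small worked example (URLs and positions are hypothetical): for the query
# ['search', 'engine'], suppose
#   index['search'] == {'http://example.com/a': [7]}
#   index['engine'] == {'http://example.com/a': [8, 31]}
# Since 8 == 7 + 1, the two words appear back to back, so multi_lookup returns
# ['http://example.com/a']. If 'engine' only appeared at position 31, the candidate
# would land in not_possible and be filtered out.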
# This is the 'URank' function we defined in class, adapted to the data structure I'm using.
def compute_ranks(graph):
    d = 0.8  # damping factor
    numloops = 10
    ranks = {}
    npages = len(graph)
    for page in graph:
        ranks[page] = 1.0 / npages
    for i in range(0, numloops):
        newranks = {}
        for page in graph:
            newrank = (1 - d) / npages
            for node in graph:
                if page in graph[node]:
                    newrank = newrank + d * (ranks[node] / len(graph[node]))
            newranks[page] = newrank
        ranks = newranks
    return ranks
# This is the magical function that fixes the issues with URank/vanilla PageRank.
# A brief explanation, first. PageRank will order all returned pages in a search
# by their absolute ranks. This is useful, but when you're searching for a certain word,
# say, sushi, if that word happens to appear on twitter.com, you'll likely get twitter as
# the first result, which seems inappropriate to the search itself.
#
# The multi_ranks process simply converts the existing graph and returns a graph of only
# those pages that actually contain the search terms. There is a disadvantage, in that
# the graph has to be built at runtime, but the results are far more relevant to the
# query than the basic URank that we used before.
def multi_ranks(index, graph, query):
    newgraph = {}
    has_query = multi_lookup(index, query)
    if not has_query:
        return newgraph
    for entry in graph:
        url_with_query = []
        if entry in has_query:
            for url in graph[entry]:
                if url in has_query:
                    url_with_query.append(url)
            newgraph[entry] = url_with_query
    return newgraph
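# To illustrate with a made-up graph: if graph == {'A': ['B', 'C'], 'B': ['A'], 'C': []}
# and only pages 'A' and 'B' contain the query, the returned subgraph is
# {'A': ['B'], 'B': ['A']}; 'C' and the link to it simply disappear, so compute_ranks
# only distributes rank among pages that actually match the query.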
# Quicksort, ordering [url, rank] pairs from highest rank to lowest. Not really much
# explanation needed.
def quicksort(rlist):
    if len(rlist) <= 1:
        return rlist
    left = []
    right = []
    pivot = rlist[0]
    for element in rlist:
        if element != pivot:
            if element[1] >= pivot[1]:
                left.append(element)
            else:
                right.append(element)
    return quicksort(left) + [pivot] + quicksort(right)
# This parses the ranks returned by compute_ranks and uses quicksort to order them.
# The last step removes the actual ranks from the list, leaving only the ordered URLs.
def ordered_search(ranks, query):
    for keyword in query:
        if keyword not in index:
            return None
    result = []
    new_rank = []
    for element in ranks:
        new_rank.append([element, ranks[element]])
    result = quicksort(new_rank)
    final = []
    for element in result:
        final.append(element[0])
    return final
# This is the primary search function. It takes the index, graph and cache built by the crawler
# and allows the user to search for the term of their choice. It runs until given the 'quit'
# command. So, no, you can't search for 'quit,' but really, why would you want to?
# It uses the split_string function to split up the query into a list of words.
# In order, the function calls multi_ranks to build the graph of pages containing the search terms,
# then it computes the ranks of those pages, then orders them. Finally, it calls search_result
# to present the results.
def do_search(index, graph, cache):
    print "*******************"
    print "Welcome to FVSearch"
    print "*******************"
    while True:
        query = str(raw_input("What can I help you find? (Type 'quit' to exit) >>"))
        pass_query = query
        if len(query) > 1 or (len(query) == 1 and len(query[0]) > 1):
            query = split_string(query, "!@#$%^&*(),./?><[]}\"{':;=-~`|\\ \n")
        else:
            query = [query]
        # Guard against a query made up entirely of separator characters.
        if not query:
            continue
        for i in range(0, len(query)):
            query[i] = query[i].lower()
        if query[0] == 'quit':
            break
        multi_graph = multi_ranks(index, graph, query)
        if multi_graph:
            ranks = compute_ranks(multi_graph)
        else:
            ranks = []
        if ranks:
            results = ordered_search(ranks, query)
        else:
            results = []
        if results:
            search_result(results, pass_query, cache)
        else:
            print 'No results found. Try another search term.'
# Here are our prettified search results. The function gives the total number of results
# for the query and breaks them up into pages. The action switch uses user input
# to generate either the next page, the previous page or a specific page.
# It also returns to the main search function if 's' or 'S' is typed.
# I inlined a function to change the number of results per page.
def search_result(results, query, cache):
    print '**************'
    print 'SEARCH RESULTS'
    print '**************'
    print 'Found ', len(results), ' matches for ', "'", query, "'"
    page = 1
    results_per_page = 10
    while True:
        if len(results) % results_per_page > 0:
            pages = len(results) / results_per_page + 1
        else:
            pages = len(results) / results_per_page
        show_page(results, page, results_per_page, query, cache)
        action = get_next_action()
        if action in ['s', 'S']:
            return
        if action in ['p', 'P']:
            if page > 1:
                page -= 1
            else:
                print
                print "Can't go back any more!"
                print
        if action in ['n', 'N']:
            if page < pages:
                page += 1
            else:
                print
                print 'No more results!'
                print
        if action in ['j', 'J']:
            page = jump_page(len(results), results_per_page)
        if action in ['o', 'O']:
            print
            print '1. Change number of results per page'
            print '2. Would you like to play a game?'
            print
            choice = raw_input('> ')
            choice = int(choice)
            if choice == 1:
                print
                get_answer = ''
                while not get_answer:
                    print 'How many results per page?'
                    get_answer = raw_input('> ')
                    if get_answer:
                        get_answer = int(get_answer)
                        results_per_page = get_answer
                print
            if choice != 1:
                print
                print "I'm sorry, Dave, but I can't do that."
                print
# This function just prints the results on the current page. I had built a function to
# also display an excerpt of the page, but it wasn't pretty (lots of extra whitespace, newlines
# and javascript inside) and turned out not to be very helpful.
def show_page(results, page, results_per_page, query, cache):
    if len(results) % results_per_page > 0:
        pages = len(results) / results_per_page + 1
    else:
        pages = len(results) / results_per_page
    start = ((page - 1) * results_per_page) + 1
    end = min([page * results_per_page, len(results)])
    print 'Showing results ', start, '-', end, ' ( Page ', page, ' of', pages, ')\n'
    j = 1
    for i in range(start - 1, end):
        print j, ' - ', results[i]
        print
        j += 1
# This function just gets user input to find out what they want to do once they receive a page of results.
def get_next_action():
    action_flag = True
    while action_flag:
        print '\n'
        print 'What would you like to do?'
        print '(N)ext Page - (P)revious Page - (J)ump to Page - New (S)earch - (O)ptions'
        print '\n'
        action = raw_input('> ')
        if action not in ['n', 'N', 'p', 'P', 'j', 'J', 's', 'S', 'o', 'O']:
            print "I don't understand what you're trying to do."
        else:
            action_flag = False
    return action
# This turned out to be more complex than I thought it would be. The process has to make sure that
# the selected page is a valid page and also give an accurate count of the total pages based on the
# number of results and the results per page.
def jump_page(num_results, results_per_page):
    if num_results % results_per_page > 0:
        pages = num_results / results_per_page + 1
    else:
        pages = num_results / results_per_page
    while True:
        print '\n'
        print 'What page? Choose 1 -', pages
        page = ''
        while not page:
            page = raw_input('> ')
        page = int(page)
        print '\n'
        if page < 1 or page > pages:
            print '\n'
            print 'No such page!'
            print '\n'
        else:
            return page
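# The page count above is just integer ceiling division: with 23 results and 10 per page,
# 23 % 10 > 0, so pages = 23 / 10 + 1 = 3 (Python 2 integer division).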
## WARNING ##
# It takes a LONG time to crawl even 5000 pages, probably several hours.
# I don't recommend using numbers any larger. Udacity also seems to be a decent
# seed page, but you can experiment with others.
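# For a quicker test of the engine, a much smaller crawl (commented out below) also works;
# the parameters are only a suggestion, and any reachable seed page should do:
# index, graph, cache = crawl_web('http://www.udacity.com', 2, 50)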
index, graph, cache = crawl_web('http://www.udacity.com', 4, 5000)
do_search(index, graph, cache)