from bs4 import BeautifulSoup
import html5lib
import urllib2
import robotparser
# Written by David Harris, 2012
# Colorado Springs, CO, USA
# e-mail: forbiddenvoid@gmail.com
# Released under a Creative Commons License for use by Udacity.com
# -----------------------------
#
# The following is my 'finished' search engine, built as an
# entry to the contest at the end of the first run of Udacity's
# CS101 course: Build a Search Engine.
# A few preliminary notes:
# I have used two external modules: BeautifulSoup 4, for cleaning
# up web page content, and html5lib, which serves as a parser for BS4.
# urllib2 and robotparser are in the standard library for Python 2.7.
# I was pretty excited to find out that there was already a tool for parsing
# robots.txt, because it seemed like a pretty daunting task.
# Otherwise, I have adapted the code from the Udacity course and made a number
# of modifications:
#
# 1. Allowing for multiple search terms (exact phrase matches only).
# 2. I modified the 'URank' algorithm. It now returns more relevant results.
#    - A more thorough explanation is attached to the multi_ranks function.
# 3. I have made the crawler more polite by following the instructions in robots.txt.
# 4. I am incorporating both the max_pages and max_depth arguments in crawl_web.
#    - This allows more control over how much is crawled.
# 5. I incorporated my own split_string function instead of the built-in .split.
# 6. I have set everything to lower case to eliminate duplicate, unconnected index entries.
# 7. I have built a console UI for executing searches with a number of features.
#    - Search results are limited to a certain number per 'page.' The default is 10, changeable by the user.
#    - Able to navigate forward and backward through result pages.
#    - Able to jump to a specific result page.
#    - Constraints to prevent going to pages out of bounds.
#    - The results page lists the number of matches, number of pages, current page and current result range.
# There is a brief explanation of each function preceding it. Feel free to e-mail me if you have
# questions about a specific function or about any part of the code.
# crawl_web is the main function of the web crawler portion of the code.
# This function builds an index of keywords, a directed graph of links,
# and stores web page content in a cache for later use. (The contents are not
# currently written to file due to time constraints, so the crawler must be
# run prior to every search. This will be fixed at a later time, but for now
# the code is more important than the execution.)
# A few notes:
# Twitter statuses are not blocked by twitter.com's robots.txt file. I have built a filter into
# the function to avoid most of these pages, because they would quickly overrun the crawler graph.
# 199.59.148.11 is twitter.com's public IP address.
#
# .RobotFileParser() and can_fetch() are both part of the robotparser module. I enclosed the .read()
# and can_fetch() calls in a try block because I got errors on a few pages. For simplicity, an error raised
# while trying to access a robots.txt file defaults to "not allowed to crawl."
#
# BeautifulSoup() is the primary method used when creating a BS4 object out of the HTML content of
# a web page. It adds some nice features to the object, of which I use only three in this whole program, to
# preserve the 101 simplicity. .prettify() organizes the content into a neat structure, and get_text() extracts
# all of the non-HTML text from the page. (This currently also includes javascript; I'm looking into
# ways to remove the js as well.)
#
# I changed crawled from a list to a dictionary to speed up lookup times. I found that lookup time increased
# dramatically starting at around 2500 pages crawled with the list. The hashed dictionary structure is
# much faster.
def crawl_web(seed, max_depth, max_pages):
    tocrawl = [seed]
    crawled = {}
    graph = {}
    index = {}
    cache = {}
    depth = [0]
    num_pages = 0
    while tocrawl and num_pages < max_pages:
        page = tocrawl.pop(0)
        page = page.lower()
        if page[len(page) - 1] == '/':
            page = page[:len(page) - 1]
        not_twitter = True
        if (page.find('twitter.com') > 0 or page.find('199.59.148.11') > 0):
            if len(page) > 28:
                not_twitter = False
            else:
                not_twitter = True
        current_depth = depth.pop(0)
        robot = robotparser.RobotFileParser()
        robot.set_url(roboize(page))
        try:
            robot.read()
            allowed = robot.can_fetch("*", page)
        except:
            allowed = False
        if page not in crawled and current_depth <= max_depth and num_pages < max_pages and allowed and not_twitter:
            num_pages += 1
            try:
                soup = BeautifulSoup(get_page(page), "html5lib")
                soup.prettify()
            except:
                soup = ''
            if soup:
                content = soup.get_text()
                cache[page] = content
                outlinks = get_all_links(soup, page)
                add_page_to_index(index, page, content)
            else:
                outlinks = []
            for link in outlinks:
                depth.append(current_depth + 1)
            graph[page] = outlinks
            union(tocrawl, outlinks)
            crawled[page] = True
    return index, graph, cache
# This is a fairly simple function that creates the robots.txt url from the page being crawled.
def roboize(page):
    start_page = page.find('/') + 2
    if page.find('/', start_page) == -1:
        return page + '/robots.txt'
    else:
        return page[:page.find('/', start_page)] + '/robots.txt'
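# For illustration (the URLs below are just examples), roboize maps a page URL to the
# robots.txt URL at the root of its host:
#   roboize('http://www.udacity.com/cs101')  ->  'http://www.udacity.com/robots.txt'
#   roboize('http://www.udacity.com')        ->  'http://www.udacity.com/robots.txt'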
# Pretty straightforward. This function contacts a webpage and returns the contents.
def get_page(url):
    try:
        return urllib2.urlopen(url, None, 10).read()
    except:
        return ""
# This is the third feature I use from BeautifulSoup: soup.find_all('a') with link.get('href')
# gets all of the links (internal and external) on a page. I've modified get_all_links from what
# we did in class to convert internal links to their full external counterparts.
def get_all_links(soup, page):
    links = []
    last = page.find('/', 8)
    if last > 0:
        page = page[:last]
    for link in soup.find_all('a'):
        if link.get('href'):
            if link.get('href')[0] == '/':
                links.append(page + link.get('href'))
            else:
                links.append(link.get('href'))
    return links
# The following two functions work in tandem to split all of the content of a web page and
# add each word to the index, along with the url of the page and a list of positions on the page
# where the word appears. The positions are needed for searching multiple terms.
def add_page_to_index(index, url, content):
    words = split_string(content, "!@#$%^&*(),./?><[]}\"{':;=-~`|\\ \n")
    counter = 0
    for word in words:
        counter += 1
        add_to_index(index, regex(word), url, counter)
def add_to_index(index, keyword, url, pos):
    if keyword in index:
        if url in index[keyword]:
            index[keyword][url].append(pos)
        else:
            index[keyword][url] = [pos]
    else:
        index[keyword] = {url: [pos]}
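# A sketch of the resulting structure (the URL and positions are made up): if the word
# 'engine' appears as the 3rd and 40th token of a page, its index entry looks like
#   index['engine'] == {'http://example.com/page': [3, 40]}
# i.e. keyword -> {url: [positions]}, which is the shape multi_lookup relies on below.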
# Logical Set Union, nothing to see here, move along. :)
def union(a, b):
    for e in b:
        if e not in a:
            a.append(e)
# My split_string function. There are more efficient ones, but
# since this is the one I wrote for class, I used it.
def split_string(source, splitlist):
    end_split = []
    if source == "":
        return end_split
    marker = 0
    for pos in range(0, len(source) - 1):
        for j in range(0, len(splitlist)):
            if source[pos] == splitlist[j]:
                if len(source[marker:pos]) > 0:
                    end_split.append(source[marker:pos])
                    marker = pos + 1
                    break
                else:
                    marker = pos + 1
    # Handle the final character separately, since the loop above stops one short.
    pos = len(source) - 1
    flag = False
    for j in range(0, len(splitlist)):
        if source[pos] == splitlist[j]:
            if len(source[marker:pos]) > 0:
                end_split.append(source[marker:pos])
            flag = True
    if not flag and len(source[marker:]) > 0:
        end_split.append(source[marker:])
    return end_split
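# A quick illustration with a made-up input: punctuation and whitespace in splitlist are
# treated as separators and empty pieces are dropped (lower-casing happens later, in regex):
#   split_string("The quick, brown fox!", "!, ")  ->  ['The', 'quick', 'brown', 'fox']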
# I don't have time to write a real regex parser, but this function eliminates
# most possessives and converts all words to lowercase.
def regex(word):
    result = word.lower()
    if result[-2:] == "'s":
        result = result[:-2]
    return result
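# A couple of illustrative calls: regex("David's") -> "david", regex("Engine") -> "engine".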
# This function looks up the given query and returns all of the urls where the query
# appears exactly. I'd considered doing a .find on the cache, but quickly realized that
# would take a very, very long time as the cache increased in size. Essentially, this
# function gets a list of possible urls from the first keyword and eliminates all of
# those that do not fit the criteria (pos + 1) on subsequent keywords.
def multi_lookup(index, query):
    result = []
    possible = []
    not_possible = []
    for keyword in query:
        if keyword not in index:
            return None
    for url in index[query[0]]:
        for pos in index[query[0]][url]:
            possible.append([url, pos])
    for i in range(0, len(query) - 1):
        for url in possible:
            if url[0] not in index[query[i + 1]]:
                not_possible.append(url)
            else:
                flag = False
                for pos in index[query[i + 1]][url[0]]:
                    if url[1] + i + 1 == pos:
                        flag = True
                if not flag:
                    not_possible.append(url)
    for entry in not_possible:
        if entry in possible:
            possible.remove(entry)
    for entry in possible:
        if entry[0] not in result:
            result.append(entry[0])
    return result
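# A small worked example (URLs and positions are hypothetical): for the query
# ['search', 'engine'], suppose
#   index['search'] == {'http://example.com/a': [7]}
#   index['engine'] == {'http://example.com/a': [8, 31]}
# Since 8 == 7 + 1, the two words appear back to back, so multi_lookup returns
# ['http://example.com/a']. If 'engine' only appeared at position 31, the candidate
# would land in not_possible and be filtered out.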
# This is the 'URank' function we defined in class, adapted to the data structure I'm using.
def compute_ranks(graph):
    d = 0.8  # damping factor
    numloops = 10
    ranks = {}
    npages = len(graph)
    for page in graph:
        ranks[page] = 1.0 / npages
    for i in range(0, numloops):
        newranks = {}
        for page in graph:
            newrank = (1 - d) / npages
            for node in graph:
                if page in graph[node]:
                    newrank = newrank + d * (ranks[node] / len(graph[node]))
            newranks[page] = newrank
        ranks = newranks
    return ranks
# This is the magical function that fixes the issues with URank/vanilla PageRank.
# A brief explanation, first. PageRank will order all returned pages in a search
# by their absolute ranks. This is useful, but when you're searching for a certain word,
# say, sushi, if that word happens to appear on twitter.com, you'll likely get twitter as
# the first result, which seems inappropriate to the search itself.
#
# The multi_ranks process simply converts the existing graph and returns a graph of only
# those pages that actually contain the search terms. There is a disadvantage, in that
# the graph has to be built at runtime, but the results are far more relevant to the
# query than the basic URank that we used before.
def multi_ranks(index, graph, query):
    newgraph = {}
    has_query = multi_lookup(index, query)
    if not has_query:
        return newgraph
    for entry in graph:
        url_with_query = []
        if entry in has_query:
            for url in graph[entry]:
                if url in has_query:
                    url_with_query.append(url)
            newgraph[entry] = url_with_query
    return newgraph
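# To illustrate with a made-up graph: if graph == {'A': ['B', 'C'], 'B': ['A'], 'C': []}
# and only pages 'A' and 'B' contain the query, the returned subgraph is
# {'A': ['B'], 'B': ['A']}; 'C' and the link to it simply disappear, so compute_ranks
# only distributes rank among pages that actually match the query.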
# Quicksort, ordering [url, rank] pairs from highest rank to lowest. Not really much
# explanation needed.
def quicksort(rlist):
    if len(rlist) <= 1:
        return rlist
    left = []
    right = []
    pivot = rlist[0]
    for element in rlist:
        if element != pivot:
            if element[1] >= pivot[1]:
                left.append(element)
            else:
                right.append(element)
    return quicksort(left) + [pivot] + quicksort(right)
# This parses the ranks returned by compute_ranks and uses quicksort to order them.
# The last step removes the actual ranks from the list, leaving only the ordered URLs.
def ordered_search(ranks, query):
    for keyword in query:
        if keyword not in index:
            return None
    result = []
    new_rank = []
    for element in ranks:
        new_rank.append([element, ranks[element]])
    result = quicksort(new_rank)
    final = []
    for element in result:
        final.append(element[0])
    return final
# This is the primary search function. It takes the index, graph and cache built by the crawler
# and allows the user to search for the term of their choice. It runs until given the 'quit'
# command. So, no, you can't search for 'quit,' but really, why would you want to?
# It uses the split_string function to split up the query into a list of words.
# In order, the function calls multi_ranks to build the graph of pages containing the search terms,
# then it computes the ranks of those pages, then orders them. Finally, it calls search_result
# to present the results.
def do_search(index, graph, cache):
    print "*******************"
    print "Welcome to FVSearch"
    print "*******************"
    while True:
        query = str(raw_input("What can I help you find? (Type 'quit' to exit) >>"))
        pass_query = query
        if len(query) > 1 or (len(query) == 1 and len(query[0]) > 1):
            query = split_string(query, "!@#$%^&*(),./?><[]}\"{':;=-~`|\\ \n")
        else:
            query = [query]
        # Guard against a query made up entirely of separator characters.
        if not query:
            continue
        for i in range(0, len(query)):
            query[i] = query[i].lower()
        if query[0] == 'quit':
            break
        multi_graph = multi_ranks(index, graph, query)
        if multi_graph:
            ranks = compute_ranks(multi_graph)
        else:
            ranks = []
        if ranks:
            results = ordered_search(ranks, query)
        else:
            results = []
        if results:
            search_result(results, pass_query, cache)
        else:
            print 'No results found. Try another search term.'
# Here are our prettified search results. The function gives the total number of results
# for the query and breaks them up into pages. The action switch uses user input
# to generate either the next page, the previous page or a specific page.
# It also returns to the main search function if 's' or 'S' is typed.
# I inlined a function to change the number of results per page.
def search_result(results, query, cache):
    print '**************'
    print 'SEARCH RESULTS'
    print '**************'
    print 'Found ', len(results), ' matches for ', "'", query, "'"
    page = 1
    results_per_page = 10
    while True:
        if len(results) % results_per_page > 0:
            pages = len(results) / results_per_page + 1
        else:
            pages = len(results) / results_per_page
        show_page(results, page, results_per_page, query, cache)
        action = get_next_action()
        if action in ['s', 'S']:
            return
        if action in ['p', 'P']:
            if page > 1:
                page -= 1
            else:
                print
                print "Can't go back any more!"
                print
        if action in ['n', 'N']:
            if page < pages:
                page += 1
            else:
                print
                print 'No more results!'
                print
        if action in ['j', 'J']:
            page = jump_page(len(results), results_per_page)
        if action in ['o', 'O']:
            print
            print '1. Change number of results per page'
            print '2. Would you like to play a game?'
            print
            choice = raw_input('> ')
            choice = int(choice)
            if choice == 1:
                print
                get_answer = ''
                while not get_answer:
                    print 'How many results per page?'
                    get_answer = raw_input('> ')
                    if get_answer:
                        get_answer = int(get_answer)
                        results_per_page = get_answer
                print
            if choice != 1:
                print
                print "I'm sorry, Dave, but I can't do that."
                print
# This function just prints the results on the current page. I had built a function to
# also display an excerpt of the page, but it wasn't pretty (lots of extra whitespace, newlines
# and javascript inside) and turned out not to be very helpful.
def show_page(results, page, results_per_page, query, cache):
    if len(results) % results_per_page > 0:
        pages = len(results) / results_per_page + 1
    else:
        pages = len(results) / results_per_page
    start = ((page - 1) * results_per_page) + 1
    end = min([page * results_per_page, len(results)])
    print 'Showing results ', start, '-', end, ' ( Page ', page, ' of', pages, ')\n'
    j = 1
    for i in range(start - 1, end):
        print j, ' - ', results[i]
        print
        j += 1
# This function just gets user input to find out what they want to do once they receive a page of results.
def get_next_action():
    action_flag = True
    while action_flag:
        print '\n'
        print 'What would you like to do?'
        print '(N)ext Page - (P)revious Page - (J)ump to Page - New (S)earch - (O)ptions'
        print '\n'
        action = raw_input('> ')
        if action not in ['n', 'N', 'p', 'P', 'j', 'J', 's', 'S', 'o', 'O']:
            print "I don't understand what you're trying to do."
        else:
            action_flag = False
    return action
# This turned out to be more complex than I thought it would be. The process has to make sure that
# the selected page is a valid page and also give an accurate count of the total pages based on the
# number of results and the results per page.
def jump_page(num_results, results_per_page):
    if num_results % results_per_page > 0:
        pages = num_results / results_per_page + 1
    else:
        pages = num_results / results_per_page
    while True:
        print '\n'
        print 'What page? Choose 1 -', pages
        page = ''
        while not page:
            page = raw_input('> ')
        page = int(page)
        print '\n'
        if page < 1 or page > pages:
            print '\n'
            print 'No such page!'
            print '\n'
        else:
            return page
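# The page count above is just integer ceiling division: with 23 results and 10 per page,
# 23 % 10 > 0, so pages = 23 / 10 + 1 = 3 (Python 2 integer division).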
## WARNING ##
# It takes a LONG time to crawl even 5000 pages, probably several hours.
# I don't recommend using numbers any larger. Udacity also seems to be a decent
# seed page, but you can experiment with others.
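# For a quicker test of the engine, a much smaller crawl (commented out below) also works;
# the parameters are only a suggestion, and any reachable seed page should do:
# index, graph, cache = crawl_web('http://www.udacity.com', 2, 50)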
index, graph, cache = crawl_web('http://www.udacity.com', 4, 5000)
do_search(index, graph, cache)