Advertisement
codeadi

PS6-1-3-OnlyALittleLucky-Works.py

Jan 20th, 2013
101
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 7.36 KB | None | 0 0
  1. # Triple Gold Star
  2.  
  3. # Only A Little Lucky
  4.  
  5. # The Feeling Lucky question (from the regular homework) assumed it was enough
  6. # to find the best-ranked page for a given query. For most queries, though, we
  7. # don't just want the best page (according to the page ranking algorithm), we
  8. # want a list of many pages that match the query, ordered from the most likely
  9. # to be useful to the least likely.
  10.  
  11. # Your goal for this question is to define a procedure, ordered_search(index,
  12. # ranks, keyword), that takes the same inputs as lucky_search from Question 5,
  13. # but returns an ordered list of all the URLs that match the query.
  14.  
  15. # To order the pages, use the quicksort algorithm, invented by Sir Tony Hoare in
  16. # 1959. Quicksort provides a way to sort any list of data, using an expected
  17. # number of comparisons that scales as n log n where n is the number of elements
  18. # in the list.
  19.  
  20. # The idea of quicksort is quite simple:
  21.  
  22. # If the list has zero or one elements, it is already sorted.
  23.  
  24. # Otherwise, pick a pivot element, and split the list into two partitions: one
  25. # contains all the elements equal to or lower than the value of the pivot
  26. # element, and the other contains all the elements that are greater than the
  27. # pivot element. Recursively sort each of the sub-lists, and then return the
  28. # result of concatenating the sorted left sub-list, the pivot element, and the
  29. # sorted right sub-list.
  30.  
  31. # For simplicity, use the first element in the list as your pivot element (this
  32. # is not usually a good choice, since it means if the input list is already
  33. # nearly sorted, the actual work will be much worse than expected).
  34.  
  35. def ordered_search(index, ranks, keyword):
  36.     possible_results = lookup(index, keyword)
  37.     return quicksort(possible_results, ranks)
  38.  
  39. def quicksort(pages, ranks):
  40.     if not pages or len(pages) <= 1:
  41.         return pages
  42.     else:
  43.         pivot = ranks[pages[0]]
  44.         worse = []
  45.         better = []
  46.         for page in pages[1:]:
  47.             if ranks[page] <= pivot:
  48.                 worse.append(page)
  49.             else:
  50.                 better.append(page)
  51.         return quicksort(better, ranks) + [pages[0]] + quicksort(worse, ranks)
  52.  
  53.  
  54. cache = {
  55.     'http://udacity.com/cs101x/urank/index.html': """<html>
  56. <body>
  57. <h1>Dave's Cooking Algorithms</h1>
  58. <p>
  59. Here are my favorite recipies:
  60. <ul>
  61. <li> <a href="http://udacity.com/cs101x/urank/hummus.html">Hummus Recipe</a>
  62. <li> <a href="http://udacity.com/cs101x/urank/arsenic.html">World's Best Hummus</a>
  63. <li> <a href="http://udacity.com/cs101x/urank/kathleen.html">Kathleen's Hummus Recipe</a>
  64. </ul>
  65.  
  66. For more expert opinions, check out the
  67. <a href="http://udacity.com/cs101x/urank/nickel.html">Nickel Chef</a>
  68. and <a href="http://udacity.com/cs101x/urank/zinc.html">Zinc Chef</a>.
  69. </body>
  70. </html>
  71.  
  72.  
  73. """,
  74.     'http://udacity.com/cs101x/urank/zinc.html': """<html>
  75. <body>
  76. <h1>The Zinc Chef</h1>
  77. <p>
  78. I learned everything I know from
  79. <a href="http://udacity.com/cs101x/urank/nickel.html">the Nickel Chef</a>.
  80. </p>
  81. <p>
  82. For great hummus, try
  83. <a href="http://udacity.com/cs101x/urank/arsenic.html">this recipe</a>.
  84.  
  85. </body>
  86. </html>
  87.  
  88.  
  89. """,
  90.     'http://udacity.com/cs101x/urank/nickel.html': """<html>
  91. <body>
  92. <h1>The Nickel Chef</h1>
  93. <p>
  94. This is the
  95. <a href="http://udacity.com/cs101x/urank/kathleen.html">
  96. best Hummus recipe!
  97. </a>
  98.  
  99. </body>
  100. </html>
  101.  
  102.  
  103. """,
  104.     'http://udacity.com/cs101x/urank/kathleen.html': """<html>
  105. <body>
  106. <h1>
  107. Kathleen's Hummus Recipe
  108. </h1>
  109. <p>
  110.  
  111. <ol>
  112. <li> Open a can of garbonzo beans.
  113. <li> Crush them in a blender.
  114. <li> Add 3 tablesppons of tahini sauce.
  115. <li> Squeeze in one lemon.
  116. <li> Add salt, pepper, and buttercream frosting to taste.
  117. </ol>
  118.  
  119. </body>
  120. </html>
  121.  
  122.  
  123. """,
  124.     'http://udacity.com/cs101x/urank/arsenic.html': """<html>
  125. <body>
  126. <h1>
  127. The Arsenic Chef's World Famous Hummus Recipe
  128. </h1>
  129. <p>
  130.  
  131. <ol>
  132. <li> Kidnap the <a href="http://udacity.com/cs101x/urank/nickel.html">Nickel Chef</a>.
  133. <li> Force her to make hummus for you.
  134. </ol>
  135.  
  136. </body>
  137. </html>
  138.  
  139.  
  140. """,
  141.     'http://udacity.com/cs101x/urank/hummus.html': """<html>
  142. <body>
  143. <h1>
  144. Hummus Recipe
  145. </h1>
  146. <p>
  147.  
  148. <ol>
  149. <li> Go to the store and buy a container of hummus.
  150. <li> Open it.
  151. </ol>
  152.  
  153. </body>
  154. </html>
  155.  
  156.  
  157. """,
  158.     }
  159.  
  160. def get_page(url):
  161.     if url in cache:
  162.         return cache[url]
  163.     return ""
  164.  
  165.  
  166. def get_next_target(page):
  167.     start_link = page.find('<a href=')
  168.     if start_link == -1:
  169.         return None, 0
  170.     start_quote = page.find('"', start_link)
  171.     end_quote = page.find('"', start_quote + 1)
  172.     url = page[start_quote + 1:end_quote]
  173.     return url, end_quote
  174.  
  175. def get_all_links(page):
  176.     links = []
  177.     while True:
  178.         url, endpos = get_next_target(page)
  179.         if url:
  180.             links.append(url)
  181.             page = page[endpos:]
  182.         else:
  183.             break
  184.     return links
  185.  
  186.  
  187. def union(a, b):
  188.     for e in b:
  189.         if e not in a:
  190.             a.append(e)
  191.  
  192. def add_page_to_index(index, url, content):
  193.     words = content.split()
  194.     for word in words:
  195.         add_to_index(index, word, url)
  196.  
  197. def add_to_index(index, keyword, url):
  198.     if keyword in index:
  199.         index[keyword].append(url)
  200.     else:
  201.         index[keyword] = [url]
  202.  
  203. def lookup(index, keyword):
  204.     if keyword in index:
  205.         return index[keyword]
  206.     else:
  207.         return None
  208.  
  209. def crawl_web(seed): # returns index, graph of inlinks
  210.     tocrawl = [seed]
  211.     crawled = []
  212.     graph = {}  # <url>, [list of pages it links to]
  213.     index = {}
  214.     while tocrawl:
  215.         page = tocrawl.pop()
  216.         if page not in crawled:
  217.             content = get_page(page)
  218.             add_page_to_index(index, page, content)
  219.             outlinks = get_all_links(content)
  220.             graph[page] = outlinks
  221.             union(tocrawl, outlinks)
  222.             crawled.append(page)
  223.     return index, graph
  224.  
  225. def compute_ranks(graph):
  226.     d = 0.8 # damping factor
  227.     numloops = 10
  228.  
  229.     ranks = {}
  230.     npages = len(graph)
  231.     for page in graph:
  232.         ranks[page] = 1.0 / npages
  233.  
  234.     for i in range(0, numloops):
  235.         newranks = {}
  236.         for page in graph:
  237.             newrank = (1 - d) / npages
  238.             for node in graph:
  239.                 if page in graph[node]:
  240.                     newrank = newrank + d * (ranks[node] / len(graph[node]))
  241.             newranks[page] = newrank
  242.         ranks = newranks
  243.     return ranks
  244.  
  245.  
  246. # Here are some example showing what ordered_search should do:
  247.  
  248. # Observe that the result list is sorted so the highest-ranking site is at the
  249. # beginning of the list.
  250.  
  251. # Note: the intent of this question is for students to write their own sorting
  252. # code, not to use the built-in sort procedure.
  253.  
  254. index, graph = crawl_web('http://udacity.com/cs101x/urank/index.html')
  255. ranks = compute_ranks(graph)
  256.  
  257. print ordered_search(index, ranks, 'Hummus')
  258. #>>> ['http://udacity.com/cs101x/urank/kathleen.html',
  259. #    'http://udacity.com/cs101x/urank/nickel.html',
  260. #    'http://udacity.com/cs101x/urank/arsenic.html',
  261. #    'http://udacity.com/cs101x/urank/hummus.html',
  262. #    'http://udacity.com/cs101x/urank/index.html']
  263.  
  264. print ordered_search(index, ranks, 'the')
  265. #>>> ['http://udacity.com/cs101x/urank/nickel.html',
  266. #    'http://udacity.com/cs101x/urank/arsenic.html',
  267. #    'http://udacity.com/cs101x/urank/hummus.html',
  268. #    'http://udacity.com/cs101x/urank/index.html']
  269.  
  270.  
  271. print ordered_search(index, ranks, 'babaganoush')
  272. #>>> None
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement