1. # Triple Gold Star
  2.  
  3. # Only A Little Lucky
  4.  
  5. # The Feeling Lucky question (from the regular homework) assumed it was enough
  6. # to find the best-ranked page for a given query. For most queries, though, we
  7. # don't just want the best page (according to the page ranking algorithm), we
  8. # want a list of many pages that match the query, ordered from the most likely
  9. # to be useful to the least likely.
  10.  
  11. # Your goal for this question is to define a procedure, ordered_search(index,
  12. # ranks, keyword), that takes the same inputs as lucky_search from Question 5,
  13. # but returns an ordered list of all the URLs that match the query.
  14.  
  15. # To order the pages, use the quicksort algorithm, invented by Sir Tony Hoare in
  16. # 1959. Quicksort provides a way to sort any list of data, using an expected
  17. # number of comparisons that scales as n log n where n is the number of elements
  18. # in the list.
  19.  
  20. # The idea of quicksort is quite simple:
  21.  
  22. # If the list has zero or one elements, it is already sorted.
  23.  
  24. # Otherwise, pick a pivot element, and split the list into two partitions: one
  25. # contains all the elements equal to or lower than the value of the pivot
  26. # element, and the other contains all the elements that are greater than the
  27. # pivot element. Recursively sort each of the sub-lists, and then return the
  28. # result of concatenating the sorted left sub-list, the pivot element, and the
  29. # sorted right sub-list.
  30.  
  31. # For simplicity, use the first element in the list as your pivot element (this
  32. # is not usually a good choice, since it means if the input list is already
  33. # nearly sorted, the actual work will be much worse than expected).
  34.  
  35. def ordered_search(index, ranks, keyword):
  36.     # Your code here
  37.  
  38. def quicksort(pages, ranks):
  39.     # Your code here
  40.  
  41. cache = {
  42.     'http://udacity.com/cs101x/urank/index.html': """<html>
  43. <body>
  44. <h1>Dave's Cooking Algorithms</h1>
  45. <p>
  46. Here are my favorite recipies:
  47. <ul>
  48. <li> <a href="http://udacity.com/cs101x/urank/hummus.html">Hummus Recipe</a>
  49. <li> <a href="http://udacity.com/cs101x/urank/arsenic.html">World's Best Hummus</a>
  50. <li> <a href="http://udacity.com/cs101x/urank/kathleen.html">Kathleen's Hummus Recipe</a>
  51. </ul>
  52.  
  53. For more expert opinions, check out the
  54. <a href="http://udacity.com/cs101x/urank/nickel.html">Nickel Chef</a>
  55. and <a href="http://udacity.com/cs101x/urank/zinc.html">Zinc Chef</a>.
  56. </body>
  57. </html>
  58.  
  59.  
  60. """,
  61.     'http://udacity.com/cs101x/urank/zinc.html': """<html>
  62. <body>
  63. <h1>The Zinc Chef</h1>
  64. <p>
  65. I learned everything I know from
  66. <a href="http://udacity.com/cs101x/urank/nickel.html">the Nickel Chef</a>.
  67. </p>
  68. <p>
  69. For great hummus, try
  70. <a href="http://udacity.com/cs101x/urank/arsenic.html">this recipe</a>.
  71.  
  72. </body>
  73. </html>
  74.  
  75.  
  76. """,
  77.     'http://udacity.com/cs101x/urank/nickel.html': """<html>
  78. <body>
  79. <h1>The Nickel Chef</h1>
  80. <p>
  81. This is the
  82. <a href="http://udacity.com/cs101x/urank/kathleen.html">
  83. best Hummus recipe!
  84. </a>
  85.  
  86. </body>
  87. </html>
  88.  
  89.  
  90. """,
  91.     'http://udacity.com/cs101x/urank/kathleen.html': """<html>
  92. <body>
  93. <h1>
  94. Kathleen's Hummus Recipe
  95. </h1>
  96. <p>
  97.  
  98. <ol>
  99. <li> Open a can of garbonzo beans.
  100. <li> Crush them in a blender.
  101. <li> Add 3 tablesppons of tahini sauce.
  102. <li> Squeeze in one lemon.
  103. <li> Add salt, pepper, and buttercream frosting to taste.
  104. </ol>
  105.  
  106. </body>
  107. </html>
  108.  
  109.  
  110. """,
  111.     'http://udacity.com/cs101x/urank/arsenic.html': """<html>
  112. <body>
  113. <h1>
  114. The Arsenic Chef's World Famous Hummus Recipe
  115. </h1>
  116. <p>
  117.  
  118. <ol>
  119. <li> Kidnap the <a href="http://udacity.com/cs101x/urank/nickel.html">Nickel Chef</a>.
  120. <li> Force her to make hummus for you.
  121. </ol>
  122.  
  123. </body>
  124. </html>
  125.  
  126.  
  127. """,
  128.     'http://udacity.com/cs101x/urank/hummus.html': """<html>
  129. <body>
  130. <h1>
  131. Hummus Recipe
  132. </h1>
  133. <p>
  134.  
  135. <ol>
  136. <li> Go to the store and buy a container of hummus.
  137. <li> Open it.
  138. </ol>
  139.  
  140. </body>
  141. </html>
  142.  
  143.  
  144. """,
  145.     }
  146.  
  147. def get_page(url):
  148.     if url in cache:
  149.         return cache[url]
  150.     return ""
  151.  
  152.  
  153. def get_next_target(page):
  154.     start_link = page.find('<a href=')
  155.     if start_link == -1:
  156.         return None, 0
  157.     start_quote = page.find('"', start_link)
  158.     end_quote = page.find('"', start_quote + 1)
  159.     url = page[start_quote + 1:end_quote]
  160.     return url, end_quote
  161.  
  162. def get_all_links(page):
  163.     links = []
  164.     while True:
  165.         url, endpos = get_next_target(page)
  166.         if url:
  167.             links.append(url)
  168.             page = page[endpos:]
  169.         else:
  170.             break
  171.     return links
  172.  
  173.  
  174. def union(a, b):
  175.     for e in b:
  176.         if e not in a:
  177.             a.append(e)
  178.  
  179. def add_page_to_index(index, url, content):
  180.     words = content.split()
  181.     for word in words:
  182.         add_to_index(index, word, url)
  183.  
  184. def add_to_index(index, keyword, url):
  185.     if keyword in index:
  186.         index[keyword].append(url)
  187.     else:
  188.         index[keyword] = [url]
  189.  
  190. def lookup(index, keyword):
  191.     if keyword in index:
  192.         return index[keyword]
  193.     else:
  194.         return None
  195.  
  196. def crawl_web(seed): # returns index, graph of inlinks
  197.     tocrawl = [seed]
  198.     crawled = []
  199.     graph = {}  # <url>, [list of pages it links to]
  200.     index = {}
  201.     while tocrawl:
  202.         page = tocrawl.pop()
  203.         if page not in crawled:
  204.             content = get_page(page)
  205.             add_page_to_index(index, page, content)
  206.             outlinks = get_all_links(content)
  207.             graph[page] = outlinks
  208.             union(tocrawl, outlinks)
  209.             crawled.append(page)
  210.     return index, graph
  211.  
  212. def compute_ranks(graph):
  213.     d = 0.8 # damping factor
  214.     numloops = 10
  215.  
  216.     ranks = {}
  217.     npages = len(graph)
  218.     for page in graph:
  219.         ranks[page] = 1.0 / npages
  220.  
  221.     for i in range(0, numloops):
  222.         newranks = {}
  223.         for page in graph:
  224.             newrank = (1 - d) / npages
  225.             for node in graph:
  226.                 if page in graph[node]:
  227.                     newrank = newrank + d * (ranks[node] / len(graph[node]))
  228.             newranks[page] = newrank
  229.         ranks = newranks
  230.     return ranks
  231.  
  232.  
  233. # Here are some example showing what ordered_search should do:
  234.  
  235. # Observe that the result list is sorted so the highest-ranking site is at the
  236. # beginning of the list.
  237.  
  238. # Note: the intent of this question is for students to write their own sorting
  239. # code, not to use the built-in sort procedure.
  240.  
  241. index, graph = crawl_web('http://udacity.com/cs101x/urank/index.html')
  242. ranks = compute_ranks(graph)
  243.  
  244. print ordered_search(index, ranks, 'Hummus')
  245. #>>> ['http://udacity.com/cs101x/urank/kathleen.html',
  246. #    'http://udacity.com/cs101x/urank/nickel.html',
  247. #    'http://udacity.com/cs101x/urank/arsenic.html',
  248. #    'http://udacity.com/cs101x/urank/hummus.html',
  249. #    'http://udacity.com/cs101x/urank/index.html']
  250.  
  251. print ordered_search(index, ranks, 'the')
  252. #>>> ['http://udacity.com/cs101x/urank/nickel.html',
  253. #    'http://udacity.com/cs101x/urank/arsenic.html',
  254. #    'http://udacity.com/cs101x/urank/hummus.html',
  255. #    'http://udacity.com/cs101x/urank/index.html']
  256.  
  257.  
  258. print ordered_search(index, ranks, 'babaganoush')
  259. #>>> None