# Triple Gold Star

# Only A Little Lucky

# The Feeling Lucky question (from the regular homework) assumed it was enough
# to find the best-ranked page for a given query. For most queries, though, we
# don't just want the best page (according to the page ranking algorithm), we
# want a list of many pages that match the query, ordered from the most likely
# to be useful to the least likely.

# Your goal for this question is to define a procedure, ordered_search(index,
# ranks, keyword), that takes the same inputs as lucky_search from Question 5,
# but returns an ordered list of all the URLs that match the query.

# To order the pages, use the quicksort algorithm, invented by Sir Tony Hoare
# in 1959. Quicksort provides a way to sort any list of data, using an expected
# number of comparisons that scales as n log n, where n is the number of
# elements in the list.

# The idea of quicksort is quite simple:

# If the list has zero or one elements, it is already sorted.

# Otherwise, pick a pivot element and split the list into two partitions: one
# contains all the elements equal to or lower than the value of the pivot
# element, and the other contains all the elements that are greater than the
# pivot element. Recursively sort each sub-list, and then return the result of
# concatenating the sorted left sub-list, the pivot element, and the sorted
# right sub-list.

# For simplicity, use the first element in the list as your pivot element (this
# is not usually a good choice, since it means that if the input list is
# already nearly sorted, the actual work will be much worse than expected).

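# Before the ranked version below, here is a minimal sketch of plain
# quicksort on a list of numbers (ascending order, first element as the
# pivot), just to make the recursion concrete. The quicksort_numbers name
# is illustrative only, not part of the assignment.
def quicksort_numbers(nums):
    if len(nums) <= 1:
        return nums  # zero or one elements: already sorted
    pivot = nums[0]
    lower = [n for n in nums[1:] if n <= pivot]
    higher = [n for n in nums[1:] if n > pivot]
    return quicksort_numbers(lower) + [pivot] + quicksort_numbers(higher)

assert quicksort_numbers([3, 1, 4, 1, 5]) == [1, 1, 3, 4, 5]
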
def ordered_search(index, ranks, keyword):
    urls = lookup(index, keyword)
    if not urls:
        return None
    return quicksort_ranked(urls, ranks)

def quicksort_ranked(urls, ranks):
    # Quicksort the urls from highest rank to lowest, using the first
    # element as the pivot (as the problem statement suggests). Note the
    # recursion must pass url lists, not the index, down to the next level.
    if len(urls) <= 1:
        return urls
    pivot = urls[0]
    higher = []  # urls ranked above the pivot
    lower = []   # urls ranked at or below the pivot
    for url in urls[1:]:
        if ranks[url] > ranks[pivot]:
            higher.append(url)
        else:
            lower.append(url)
    return quicksort_ranked(higher, ranks) + [pivot] + quicksort_ranked(lower, ranks)

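# A quick illustration of ordered_search with hypothetical inputs, shown
# commented out in the style of the examples at the bottom of this file
# (lookup is defined further down, so it cannot run at this point):
#   hypothetical_index = {'q': ['x', 'y', 'z']}
#   hypothetical_ranks = {'x': 0.1, 'y': 0.7, 'z': 0.4}
#print ordered_search(hypothetical_index, hypothetical_ranks, 'q')
#>>> ['y', 'z', 'x']
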
def lucky_search(index, ranks, keyword):
    # Return the single highest-ranked url that matches keyword, or None
    # when there are no matches (lookup returns [] for missing keywords,
    # so test for emptiness rather than comparing against None).
    urls = lookup(index, keyword)
    if not urls:
        return None
    lucky = urls[0]
    for url in urls:
        if ranks[url] >= ranks[lucky]:
            lucky = url
    return lucky

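# A similar hypothetical illustration for lucky_search: 'b' out-ranks
# 'a', so it is the lucky result (commented out for the same reason):
#   hypothetical_index = {'word': ['a', 'b']}
#   hypothetical_ranks = {'a': 0.2, 'b': 0.5}
#print lucky_search(hypothetical_index, hypothetical_ranks, 'word')
#>>> 'b'
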
cache = {
   'http://udacity.com/cs101x/urank/index.html': """<html>
<body>
<h1>Dave's Cooking Algorithms</h1>
<p>
Here are my favorite recipes:
<ul>
<li> <a href="http://udacity.com/cs101x/urank/hummus.html">Hummus Recipe</a>
<li> <a href="http://udacity.com/cs101x/urank/arsenic.html">World's Best Hummus</a>
<li> <a href="http://udacity.com/cs101x/urank/kathleen.html">Kathleen's Hummus Recipe</a>
</ul>

For more expert opinions, check out the
<a href="http://udacity.com/cs101x/urank/nickel.html">Nickel Chef</a>
and <a href="http://udacity.com/cs101x/urank/zinc.html">Zinc Chef</a>.
</body>
</html>

""",
   'http://udacity.com/cs101x/urank/zinc.html': """<html>
<body>
<h1>The Zinc Chef</h1>
<p>
I learned everything I know from
<a href="http://udacity.com/cs101x/urank/nickel.html">the Nickel Chef</a>.
</p>
<p>
For great hummus, try
<a href="http://udacity.com/cs101x/urank/arsenic.html">this recipe</a>.

</body>
</html>

""",
   'http://udacity.com/cs101x/urank/nickel.html': """<html>
<body>
<h1>The Nickel Chef</h1>
<p>
This is the
<a href="http://udacity.com/cs101x/urank/kathleen.html">
best Hummus recipe!
</a>

</body>
</html>

""",
   'http://udacity.com/cs101x/urank/kathleen.html': """<html>
<body>
<h1>
Kathleen's Hummus Recipe
</h1>
<p>

<ol>
<li> Open a can of garbanzo beans.
<li> Crush them in a blender.
<li> Add 3 tablespoons of tahini sauce.
<li> Squeeze in one lemon.
<li> Add salt, pepper, and buttercream frosting to taste.
</ol>

</body>
</html>

""",
   'http://udacity.com/cs101x/urank/arsenic.html': """<html>
<body>
<h1>
The Arsenic Chef's World Famous Hummus Recipe
</h1>
<p>

<ol>
<li> Kidnap the <a href="http://udacity.com/cs101x/urank/nickel.html">Nickel Chef</a>.
<li> Force her to make hummus for you.
</ol>

</body>
</html>

""",
   'http://udacity.com/cs101x/urank/hummus.html': """<html>
<body>
<h1>
Hummus Recipe
</h1>
<p>

<ol>
<li> Go to the store and buy a container of hummus.
<li> Open it.
</ol>

</body>
</html>

""",
}

def get_page(url):
    # Simulated web fetch: return the page contents from the cache, or
    # the empty string for any url that is not cached.
    if url in cache:
        return cache[url]
    return ""

def get_next_target(page):
    # Find the next '<a href=' tag in page and return (url, position just
    # past the url); return (None, 0) when no more links remain.
    start_link = page.find('<a href=')
    if start_link == -1:
        return None, 0
    start_quote = page.find('"', start_link)
    end_quote = page.find('"', start_quote + 1)
    url = page[start_quote + 1:end_quote]
    return url, end_quote

def get_all_links(page):
    # Collect every link target on the page, in document order.
    links = []
    while True:
        url, endpos = get_next_target(page)
        if url:
            links.append(url)
            page = page[endpos:]
        else:
            break
    return links

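# Quick sanity check on a hypothetical two-link snippet (not one of the
# cached pages): the scanner finds '<a href="..."' patterns and returns
# the urls in document order.
assert get_all_links(
    '<a href="http://a.com">A</a> <a href="http://b.com">B</a>'
) == ['http://a.com', 'http://b.com']
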
def union(a, b):
    # Append each element of b that is not already in a; mutates a in
    # place rather than returning a new list.
    for e in b:
        if e not in a:
            a.append(e)

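# A tiny check of that in-place behavior, with hypothetical values:
seen = [1, 2]
union(seen, [2, 3])
assert seen == [1, 2, 3]
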
def add_page_to_index(index, url, content):
    # Index every whitespace-separated word in content under this url.
    words = content.split()
    for word in words:
        add_to_index(index, word, url)

def add_to_index(index, keyword, url):
    if keyword in index:
        index[keyword].append(url)
    else:
        index[keyword] = [url]

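# Quick sanity check with a hypothetical one-line page: every word in the
# content ends up mapped to the page's url.
demo_index = {}
add_page_to_index(demo_index, 'http://example.com', 'hummus recipe')
assert demo_index == {'hummus': ['http://example.com'],
                      'recipe': ['http://example.com']}
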
def lookup(index, keyword):
    # Return the list of urls indexed under keyword, or the empty list
    # when the keyword never appeared on any crawled page.
    if keyword in index:
        return index[keyword]
    else:
        return []

def crawl_web(seed): # returns index, graph of outlinks
    tocrawl = [seed]
    crawled = []
    graph = {}  # <url>, [list of pages it links to]
    index = {}
    while tocrawl:
        page = tocrawl.pop()  # pop from the end: depth-first crawl order
        if page not in crawled:
            content = get_page(page)
            add_page_to_index(index, page, content)
            outlinks = get_all_links(content)
            graph[page] = outlinks
            union(tocrawl, outlinks)
            crawled.append(page)
    return index, graph

def compute_ranks(graph):
    d = 0.8 # damping factor
    numloops = 10

    # Start every page with an equal share of the total rank.
    ranks = {}
    npages = len(graph)
    for page in graph:
        ranks[page] = 1.0 / npages

    # On each loop, a page's new rank is (1 - d) / npages plus a damped
    # share of the rank of every page that links to it.
    for i in range(0, numloops):
        newranks = {}
        for page in graph:
            newrank = (1 - d) / npages
            for node in graph:
                if page in graph[node]:
                    newrank = newrank + d * (ranks[node] / len(graph[node]))
            newranks[page] = newrank
        ranks = newranks
    return ranks

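# Quick sanity check on a hypothetical two-page graph (not part of the
# assignment): each page links only to the other, so by symmetry both
# ranks should settle at 1/2.
tiny_ranks = compute_ranks({'a': ['b'], 'b': ['a']})
assert abs(tiny_ranks['a'] - 0.5) < 1e-6
assert abs(tiny_ranks['b'] - 0.5) < 1e-6
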
# Here are some examples showing what ordered_search should do:

# Observe that the result list is sorted so the highest-ranking site is at the
# beginning of the list.

# Note: the intent of this question is for students to write their own sorting
# code, not to use the built-in sort procedure.

index, graph = crawl_web('http://udacity.com/cs101x/urank/index.html')
ranks = compute_ranks(graph)

print ordered_search(index, ranks, 'Hummus')
#>>> ['http://udacity.com/cs101x/urank/kathleen.html',
#    'http://udacity.com/cs101x/urank/nickel.html',
#    'http://udacity.com/cs101x/urank/arsenic.html',
#    'http://udacity.com/cs101x/urank/hummus.html',
#    'http://udacity.com/cs101x/urank/index.html']

#print ordered_search(index, ranks, 'the')
#>>> ['http://udacity.com/cs101x/urank/nickel.html',
#    'http://udacity.com/cs101x/urank/arsenic.html',
#    'http://udacity.com/cs101x/urank/hummus.html',
#    'http://udacity.com/cs101x/urank/index.html']

#print ordered_search(index, ranks, 'babaganoush')
#>>> None