Advertisement
LevMukoseev

Phil.py

Mar 18th, 2019
114
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.16 KB | None | 0 0
import re
import urllib.error
import urllib.parse
import urllib.request
from difflib import SequenceMatcher
  4.  
  5.  
  6. def get_content(name):
  7.     link = "https://ru.wikipedia.org/wiki/" + urllib.request.quote(name)
  8.     try:
  9.         with urllib.request.urlopen(link) as page:
  10.             content = page.read().decode('utf-8')
  11.             return urllib.request.unquote(content)
  12.     except urllib.request.HTTPError:
  13.         return None
  14.  
  15.  
  16. def extract_content(page):
  17.     true_page = urllib.request.unquote(page)
  18.     pattern = r'<div id="bodyContent"(.*)<div class="visualClear">'
  19.     result = re.search(pattern, true_page, re.DOTALL)
  20.     return result.start(), result.end() if result is not None else (0, 0)
  21.  
  22.  
  23. def extract_links(page, begin, end):
  24.     true_page = urllib.request.unquote(page)
  25.     content = true_page[begin:end]
  26.     pattern = r'[hrefHREF]{4}=[\'"]/wiki/([-\w]+?)[\'"]'
  27.     links = re.findall(pattern, content)
  28.     return list(set(links)) if links is not None else []
  29.  
  30.  
  31. def find_chain(start, finish):
  32.     current_page = start
  33.     passed_pages = set()
  34.     passed_pages.add(start)
  35.     route = []
  36.     try:
  37.         while True:
  38.             route.append(current_page)
  39.             current_links = get_links(current_page)
  40.             if finish in current_links:
  41.                 route.append(finish)
  42.                 return route
  43.             else:
  44.                 current_links = get_deviation_table(current_links, finish)
  45.                 current_page = get_best_link(current_links, passed_pages)
  46.                 passed_pages.add(current_page)
  47.     except Exception:
  48.         return None
  49.  
  50.  
  51. def get_links(name):
  52.     content = get_content(name)
  53.     start, end = extract_content(content)
  54.     return extract_links(content, start, end)
  55.  
  56.  
  57. def get_deviation_table(words, pattern):
  58.     deviation_table = []
  59.     for word in words:
  60.         deviation_table.append((SequenceMatcher(None, word, pattern).ratio(), word))
  61.     deviation_table.sort(reverse=True)
  62.     return deviation_table
  63.  
  64.  
  65. def get_best_link(useful_links, extra_links):
  66.     for propability, link in useful_links:
  67.         if link not in extra_links:
  68.             return link
  69.  
  70.  
  71. def main():
  72.     pass
  73.  
  74.  
  75. if __name__ == '__main__':
  76.     main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement