Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import re
- import urllib.request
- from difflib import SequenceMatcher
def get_content(name):
    """Download the Russian Wikipedia article titled *name*.

    The title is percent-encoded and appended to the ru.wikipedia.org
    article URL. The response body is decoded as UTF-8 and then
    percent-decoded before it is returned.

    Returns:
        The page text as a string, or None if the request fails with an
        HTTP error.
    """
    url = "https://ru.wikipedia.org/wiki/" + urllib.request.quote(name)
    try:
        with urllib.request.urlopen(url) as response:
            raw = response.read()
    except urllib.request.HTTPError:
        return None
    # Decoding is done after the request; it cannot raise HTTPError.
    return urllib.request.unquote(raw.decode('utf-8'))
def extract_content(page):
    """Locate the main article body inside an HTML page.

    The page is percent-decoded first, then searched for the span that
    starts at ``<div id="bodyContent"`` and ends at the last
    ``<div class="visualClear">`` (the match is greedy and spans lines).

    Args:
        page: HTML text of the page.

    Returns:
        A ``(start, end)`` tuple of offsets into the percent-decoded page,
        or ``(0, 0)`` when the markers are not found.
    """
    true_page = urllib.request.unquote(page)
    pattern = r'<div id="bodyContent"(.*)<div class="visualClear">'
    result = re.search(pattern, true_page, re.DOTALL)
    # Check for a failed search before touching the match object; a
    # one-line conditional here would bind only to the second tuple item.
    if result is None:
        return 0, 0
    return result.start(), result.end()
def extract_links(page, begin, end):
    """Collect the distinct ``/wiki/<title>`` link targets in a page slice.

    The page is percent-decoded, sliced to ``[begin:end]`` and scanned for
    ``href`` attributes (any letter case, single or double quotes) that
    point at ``/wiki/<title>``. Only titles made of word characters and
    hyphens are captured, so targets containing other characters such as
    ``:`` are skipped.

    Args:
        page: HTML text of the page.
        begin: Start offset into the percent-decoded page.
        end: End offset into the percent-decoded page.

    Returns:
        A list of unique titles in order of first appearance; empty when
        nothing matches.
    """
    true_page = urllib.request.unquote(page)
    content = true_page[begin:end]
    # Spell the attribute name letter by letter: a character class such as
    # [hrefHREF]{4} would accept any four of those letters (e.g. "fehr").
    pattern = r'[hH][rR][eE][fF]=[\'"]/wiki/([-\w]+?)[\'"]'
    links = re.findall(pattern, content)
    # dict.fromkeys removes duplicates while keeping a deterministic order.
    return list(dict.fromkeys(links))
def find_chain(start, finish):
    """Greedily walk article links from *start* until *finish* is linked.

    At each step the links of the current page are fetched with
    ``get_links``. If *finish* is among them the walk ends. Otherwise the
    links are ranked against *finish* with ``get_deviation_table`` and the
    walk moves to the first ranked link not visited yet, as chosen by
    ``get_best_link``. The walk never backtracks, so the returned route
    lists every page visited along the way.

    Args:
        start: Title of the starting article.
        finish: Title of the target article.

    Returns:
        The list of visited titles ending with *finish*, or None when the
        walk reaches a dead end or any step raises an exception.
    """
    current_page = start
    passed_pages = {start}
    route = []
    try:
        while True:
            route.append(current_page)
            current_links = get_links(current_page)
            if finish in current_links:
                route.append(finish)
                return route
            ranked_links = get_deviation_table(current_links, finish)
            current_page = get_best_link(ranked_links, passed_pages)
            if current_page is None:
                # Dead end: every link on this page was already visited
                # (or the page had none). Stop explicitly instead of
                # feeding None back into get_links and relying on the
                # resulting exception.
                return None
            passed_pages.add(current_page)
    except Exception:
        # Best-effort boundary: a network or parsing failure anywhere in
        # the walk is reported to the caller as "no chain found".
        return None
def get_links(name):
    """Return the article links found in the body of the page *name*.

    The page is fetched with ``get_content``, the article body is located
    with ``extract_content`` and the link titles are pulled out with
    ``extract_links``.

    Args:
        name: Title of the article to fetch.

    Returns:
        A list of linked titles; empty when the page could not be
        downloaded.
    """
    content = get_content(name)
    if content is None:
        # get_content signals an HTTP failure with None; a page that
        # cannot be fetched contributes no links.
        return []
    start, end = extract_content(content)
    return extract_links(content, start, end)
def get_deviation_table(words, pattern):
    """Rank *words* by their similarity to *pattern*.

    Each word is scored with ``SequenceMatcher(None, word, pattern).ratio()``.

    Returns:
        A list of ``(ratio, word)`` tuples sorted in descending order, so
        the highest ratio comes first and ties fall back to descending
        word order.
    """
    scored = (
        (SequenceMatcher(None, candidate, pattern).ratio(), candidate)
        for candidate in words
    )
    return sorted(scored, reverse=True)
def get_best_link(useful_links, extra_links):
    """Pick the first candidate link that is not in *extra_links*.

    Args:
        useful_links: Iterable of ``(score, link)`` pairs, scanned in the
            order given.
        extra_links: Container of links to skip.

    Returns:
        The first link not found in *extra_links*, or None when every
        candidate is excluded or there are no candidates.
    """
    candidates = (
        link for _score, link in useful_links if link not in extra_links
    )
    return next(candidates, None)
def main():
    """Script entry point; currently a placeholder that does nothing."""
    return None
# Run main() only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement