Joker0day

wikiquote #1

Dec 18th, 2017
import urllib.request
import urllib.parse
import json
import lxml.html
import random
from itertools import islice


class NoSuchPageException(Exception):
    pass


class DisambiguationPageException(Exception):
    pass


class UnsupportedLanguageException(Exception):
    pass


W_URL = 'https://{lang}.wikiquote.org/w/api.php'
SRCH_URL = W_URL + '?format=json&action=query&list=search&continue=&srsearch='
PAGE_URL = W_URL + '?format=json&action=parse&prop=text|categories&page='
MAINPAGE_URL = W_URL + '?format=json&action=parse&page=Main%20Page&prop=text'
CTGRY_MMBRS_URL = W_URL + (
    '?action=query&continue=-||&cmcontinue={cmcontinue}'
    '&list=categorymembers&format=json&cmtitle={page}&cmtype={command}')
MIN_QUOTE_LEN = 6
MIN_QUOTE_WORDS = 3
DEFAULT_MAX_QUOTES = 20
WORD_BLACKLIST = ['quoted', 'Variant:', 'Retrieved', 'Notes:']
SUPPORTED_LANGUAGES = ['en', 'fr']


def json_from_url(url):
    '''Fetch a URL and decode the JSON response body.'''
    res = urllib.request.urlopen(url)
    body = res.read().decode()
    return json.loads(body)


def category_members(category, command='subcat', lang='en'):
    '''Generate the members of a category (subcategories or pages).

    Keyword arguments:
    category -- the category to list
    command -- the type of members ('page' or 'subcat', default 'subcat')
    lang -- language of the category (default 'en')
    '''
    if command not in ['subcat', 'page']:
        raise ValueError('Unknown command {}'.format(command))
    category = urllib.parse.quote(category)
    page = CTGRY_MMBRS_URL.format(lang=lang,
                                  cmcontinue='{cmcontinue}',
                                  page=category,
                                  command=command)
    cmcontinue = ''
    has_more = True
    while has_more:
        # Wikiquote paginates category listings: keep requesting
        # pages until no 'cmcontinue' token is returned
        current_page = page.format(cmcontinue=cmcontinue)
        my_json = json_from_url(current_page)
        if 'error' in my_json:
            raise NoSuchPageException(
                    'No category matched the title: ' + category)
        try:
            cmcontinue = my_json['continue']['cmcontinue']
        except KeyError:
            has_more = False
        for item in my_json['query']['categorymembers']:
            yield item['title']


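# Minimal usage sketch (not part of the original paste): list the first few
# pages in a category. Needs network access; 'Category:Philosophers' is an
# illustrative guess, not a value taken from the source.
#
# >>> list(islice(category_members('Category:Philosophers', command='page'), 5))

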
def search(s, lang='en'):
    '''Return the titles of pages matching the search terms s.'''
    if not s:
        return []
    search_terms = urllib.parse.quote(s)
    local_srch_url = SRCH_URL.format(lang=lang)
    data = json_from_url(local_srch_url + search_terms)
    results = [entry['title'] for entry in data['query']['search']]
    return results


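# Usage sketch (hedged; needs network access):
#
# >>> search('Einstein')   # -> a list of matching page titles

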
def is_disambiguation(categories):
    # A page with no categories, or with at least one
    # 'Disambiguation_pages' category, is treated as a disambiguation page
    return not categories or any([
        category['*'] == 'Disambiguation_pages' for category in categories
    ])


def is_cast_credit(txt_split):
    # Checks to see if the text is a cast credit:
    #   <actor name> as <character name>
    #   <actor name> - <character name>
    if not 2 < len(txt_split) < 7:
        return False

    separators = ['as', '-', '–']
    return all([w[0].isupper() or w in separators or w[0] == '"'
               for w in txt_split])


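# Hedged illustration of the heuristic (the names are made up):
#
# >>> is_cast_credit('Mark Hamill as Luke Skywalker'.split())
# True
# >>> is_cast_credit('May the Force be with you.'.split())
# False

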
def is_quote(txt):
    txt_split = txt.split()
    invalid_conditions = [
        not txt or not txt[0].isupper() or len(txt) < MIN_QUOTE_LEN,
        len(txt_split) < MIN_QUOTE_WORDS,
        any(word in WORD_BLACKLIST for word in txt_split),
        txt.endswith(('(', ':', ']')),
        is_cast_credit(txt_split)
    ]

    # Returns False if any invalid condition is true, otherwise returns True.
    return not any(invalid_conditions)


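# Hedged illustration:
#
# >>> is_quote('Imagination is more important than knowledge.')
# True
# >>> is_quote('Retrieved from the archives')   # blacklisted word
# False

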
def extract_quotes(html_content, max_quotes):
    tree = lxml.html.fromstring(html_content)
    quotes_list = []

    # List items inside unordered lists
    node_list = tree.xpath('//div/ul/li')

    # Description tags inside description lists,
    # first one is generally not a quote
    dd_list = tree.xpath('//div/dl/dd')[1:]
    if len(dd_list) > len(node_list):
        node_list += dd_list

    for node in node_list:
        # Remove nested lists (attribution, sub-quotes)
        # before reading the text
        uls = node.xpath('ul')
        for ul in uls:
            ul.getparent().remove(ul)

        txt = node.text_content().strip()
        if is_quote(txt) and max_quotes > len(quotes_list):
            txt_normal = ' '.join(txt.split())
            quotes_list.append(txt_normal)

            if max_quotes == len(quotes_list):
                break

    return quotes_list


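# Self-contained, offline illustration (hedged, not from the original
# paste) of the HTML shape extract_quotes expects:
#
# >>> html = ('<div><ul>'
# ...         '<li>Imagination is more important than knowledge.</li>'
# ...         '</ul></div>')
# >>> extract_quotes(html, 5)
# ['Imagination is more important than knowledge.']

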
def extract_quotes_fr(html_content, max_quotes):
    '''Extract quotes from the French wiki.

    Keyword arguments:
    html_content -- the HTML returned by the wikiquote API
    max_quotes -- max number of quotes to retrieve
    '''
    # French quotes are spans with the 'citation' class inside paragraphs
    tree = lxml.html.fromstring(html_content)
    node_list = tree.xpath('//p/span[@class="citation"]')
    quotes = list(islice((span.text_content()
                          for span in node_list),
                         max_quotes))
    return quotes


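# Offline illustration (hedged):
#
# >>> html_fr = '<p><span class="citation">Une citation en français.</span></p>'
# >>> extract_quotes_fr(html_fr, 5)
# ['Une citation en français.']

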
def quotes(page_title, max_quotes=DEFAULT_MAX_QUOTES, lang='en'):
    '''Return up to max_quotes quotes from the page titled page_title.'''
    if lang not in SUPPORTED_LANGUAGES:
        raise UnsupportedLanguageException('Unsupported language ' + lang)
    local_page_url = PAGE_URL.format(lang=lang)
    data = json_from_url(local_page_url + urllib.parse.quote(page_title))
    if 'error' in data:
        raise NoSuchPageException('No pages matched the title: ' + page_title)

    if is_disambiguation(data['parse']['categories']):
        raise DisambiguationPageException(
            'Title returned a disambiguation page.')

    html_content = data['parse']['text']['*']
    if lang == 'fr':
        return extract_quotes_fr(html_content, max_quotes)
    return extract_quotes(html_content, max_quotes)


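# End-to-end usage sketch (hedged; needs network access and assumes the
# 'Albert Einstein' page still exists on the English wiki):
#
# >>> quotes('Albert Einstein', max_quotes=3)   # -> up to three quote strings

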
def explore_category(category, lang='en', categories=None):
    '''Recursively explore and index quotes
    from a category and its subcategories

    Keyword arguments:
    category -- root category
    lang -- lang of the wiki
    categories -- already-visited categories to skip
    '''

    if categories is None:
        categories = set()
    subs = set(category_members(category, command='subcat', lang=lang))
    new_categories = set(sub for sub in subs if sub not in categories)
    quotes_index = []
    categories.update(new_categories)
    for sub in new_categories:
        # Propagate lang and the visited set so subcategories
        # are not explored twice
        quotes_index += explore_category(sub, lang=lang, categories=categories)
    pages = set(category_members(category, command='page', lang=lang))
    for page in pages:
        try:
            new_quote_index = [(page, index, lang)
                               for index, quote
                               in enumerate(quotes(page, lang=lang))]
        except DisambiguationPageException:
            continue
        else:
            quotes_index += new_quote_index
    return quotes_index


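# Hedged usage sketch: exploring a category tree issues one HTTP request
# per page, so pick a small root. The category name below is an
# illustrative guess, not a value from the source.
#
# >>> index = explore_category('Category:Ancient_philosophers')
# >>> len(index)   # number of (page, quote_index, lang) triples

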
def quote_of_the_day():
    '''Return the quote of the day and its author from the Main Page.'''
    data = json_from_url(MAINPAGE_URL.format(lang='en'))
    tree = lxml.html.fromstring(data['parse']['text']['*'])
    tree = tree.get_element_by_id('mf-qotd')

    # The quote and its author are separated by a '~'
    raw_quote = tree.xpath('div/div/table/tr')[0].text_content().split('~')
    quote = raw_quote[0].strip()
    author = raw_quote[1].strip()
    return quote, author


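# Usage sketch (hedged; depends on the Main Page keeping the 'mf-qotd' id):
#
# >>> qotd, author = quote_of_the_day()
# >>> print(qotd, '~', author)

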
def get_quote(page, index, lang='en'):
    '''Get a quote from a page.

    Keyword arguments:
    page -- page to search
    index -- index of the quote inside the page
    lang -- lang of the wiki
    '''
    quote = quotes(page, lang=lang)[index]
    return quote


def random_quote_from_categories(quote_index):
    '''Return a random quote from an existing index.

    Keyword arguments:
    quote_index -- a list generated by explore_category
    '''
    # randrange avoids the off-by-one of randint(0, len(quote_index)),
    # which could raise an IndexError
    my_id = random.randrange(len(quote_index))
    page = quote_index[my_id][0]
    lang = quote_index[my_id][2]
    quote = get_quote(page, quote_index[my_id][1], lang=lang)
    return quote, page


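# Combining the two steps above (hedged sketch, same category caveat):
#
# >>> index = explore_category('Category:Ancient_philosophers')
# >>> quote, page = random_quote_from_categories(index)

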
def quotes_with_original(page_title, lang='en', max_quotes=DEFAULT_MAX_QUOTES):
    '''Extract quotes from the French wiki along with their originals:
    some quotes translated to French come with the original version.
    This retrieves both as a tuple (french, original);
    original is None for quotes that are French-only.

    Keyword arguments:
    page_title -- title of the page to parse
    lang -- lang of the wiki (only 'fr' returns results)
    max_quotes -- max number of quotes to retrieve
    '''
    if lang not in SUPPORTED_LANGUAGES:
        raise UnsupportedLanguageException('Unsupported language ' + lang)

    if lang == 'en':
        return []
    local_page_url = PAGE_URL.format(lang=lang)
    data = json_from_url(local_page_url + urllib.parse.quote(page_title))
    if 'error' in data:
        raise NoSuchPageException('No pages matched the title: ' + page_title)

    if is_disambiguation(data['parse']['categories']):
        raise DisambiguationPageException(
            'Title returned a disambiguation page.')

    html_content = data['parse']['text']['*']
    tree = lxml.html.fromstring(html_content)
    node_list = tree.xpath('//p/span[@class="citation"]')
    quotes_xpath = [{'span': x, 'vf': x.text_content().replace('\xa0', ' ')}
                    for x in node_list[:max_quotes]]
    quotes_tuple = []
    for quote in quotes_xpath:
        parent = quote['span'].getparent()
        quote['vo'] = None
        try:
            # The original version, when present, is a span with the
            # 'original' class inside one of the next two sibling elements
            xpath = './/span[@class="original"]'
            vo_span = next(filter(lambda x: x is not None,
                                  map(lambda s: s.find(xpath),
                                      islice(parent.itersiblings(), 2))))
        except StopIteration:
            pass
        else:
            quote['vo'] = vo_span.text_content().replace('\xa0', ' ')
        quotes_tuple.append((quote['vf'], quote['vo']))

    return quotes_tuple
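

if __name__ == '__main__':
    # Hedged smoke test, not part of the original paste: it needs network
    # access and assumes the referenced pages still exist on Wikiquote.
    qotd, author = quote_of_the_day()
    print('Quote of the day: {} ~ {}'.format(qotd, author))
    for title in search('Einstein')[:3]:
        print('Search hit:', title)
    for q in quotes('Albert Einstein', max_quotes=3):
        print('-', q)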