Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import urllib.request
- import urllib.parse
- import json
- import lxml.html
- import random
- from itertools import islice
class NoSuchPageException(Exception):
    '''Raised when the API reports an error for the requested title.'''
class DisambiguationPageException(Exception):
    '''Raised when the requested title is a disambiguation page.'''
class UnsupportedLanguageException(Exception):
    '''Raised when a language outside SUPPORTED_LANGUAGES is requested.'''
# API endpoint; '{lang}' is filled in with the wiki language code.
W_URL = 'http://{lang}.wikiquote.org/w/api.php'

# Full-text search; the URL-quoted search terms are appended at the end.
SRCH_URL = W_URL + (
    '?format=json'
    '&action=query'
    '&list=search'
    '&continue='
    '&srsearch='
)

# Rendered text and categories of a page; the quoted title is appended.
PAGE_URL = W_URL + (
    '?format=json'
    '&action=parse'
    '&prop=text|categories'
    '&page='
)

# Rendered text of the main page (used for the quote of the day).
MAINPAGE_URL = W_URL + (
    '?format=json'
    '&action=parse'
    '&page=Main%20Page'
    '&prop=text'
)

# Members of a category; '{cmcontinue}', '{page}' and '{command}' are
# placeholders filled in by category_members.
CTGRY_MMBRS_URL = W_URL + (
    '?action=query'
    '&continue=-||'
    '&cmcontinue={cmcontinue}'
    '&list=categorymembers'
    '&format=json'
    '&cmtitle={page}'
    '&cmtype={command}'
)

# Thresholds below which is_quote rejects a text.
MIN_QUOTE_LEN = 6
MIN_QUOTE_WORDS = 3

# Default cap on the number of quotes returned for one page.
DEFAULT_MAX_QUOTES = 20

# A text containing any of these words is not considered a quote.
WORD_BLACKLIST = ['quoted', 'Variant:', 'Retrieved', 'Notes:']

# Wiki languages for which a quote extractor exists.
SUPPORTED_LANGUAGES = ['en', 'fr']
def json_from_url(url):
    '''Fetch a URL and decode its body as JSON.

    Keyword arguments:
    url -- the address to request; its body must be JSON text

    Returns the decoded JSON value.
    '''
    # The context manager closes the response even when reading or
    # decoding fails, instead of leaving the connection open until the
    # object happens to be garbage collected.
    with urllib.request.urlopen(url) as res:
        body = res.read().decode()
    return json.loads(body)
def category_members(category, command='subcat', lang='en'):
    '''Generate the titles of the members of a category.

    One call yields either the subcategories or the pages of the
    category, never both.

    Keyword arguments:
    category -- the category title
    command -- the type of members ('page' or 'subcat', default 'subcat')
    lang -- lang of your category (default 'en')

    Raises ValueError for an unknown command and NoSuchPageException
    when the API answers with an error. Since this is a generator, both
    are raised when iteration starts, not when the function is called.
    '''
    if command not in ('subcat', 'page'):
        raise ValueError('Unknown command {}'.format(command))
    quoted_category = urllib.parse.quote(category)
    cmcontinue = ''
    while True:
        # Wikiquote doesn't give you all members at once,
        # you need to ask for the next page with the continuation token.
        # The token is escaped like any other query value.
        token = urllib.parse.quote(cmcontinue)
        current_page = CTGRY_MMBRS_URL.format(lang=lang,
                                              cmcontinue=token,
                                              page=quoted_category,
                                              command=command)
        my_json = json_from_url(current_page)
        if 'error' in my_json:
            # Report the title as the caller wrote it, not its
            # percent-encoded form.
            raise NoSuchPageException(
                'No category matched the title: ' + category)
        for item in my_json['query']['categorymembers']:
            yield item['title']
        try:
            cmcontinue = my_json['continue']['cmcontinue']
        except KeyError:
            # No continuation token: this was the last page.
            return
def search(s, lang='en'):
    '''Return the titles of the pages matching a search string.

    Keyword arguments:
    s -- the text to search for; a falsy value yields an empty list
    lang -- lang of the wiki (default 'en')
    '''
    if not s:
        return []
    quoted_terms = urllib.parse.quote(s)
    url = SRCH_URL.format(lang=lang) + quoted_terms
    hits = json_from_url(url)['query']['search']
    return [hit['title'] for hit in hits]
def is_disambiguation(categories):
    '''Tell whether a page's categories mark it as a disambiguation page.

    A page without any category is treated as a disambiguation page
    too; otherwise at least one category must be exactly
    'Disambiguation_pages'.
    '''
    if not categories:
        return True
    return any(entry['*'] == 'Disambiguation_pages' for entry in categories)
def is_cast_credit(txt_split):
    '''Tell whether a list of words looks like a cast credit.

    Cast credits have the shape:
        <actor name> as <character name>
        <actor name> - <character name>
    that is, three to six words, each one capitalised, starting with a
    double quote, or being one of the separators.
    '''
    word_count = len(txt_split)
    if word_count <= 2 or word_count >= 7:
        return False
    separators = ('as', '-', '–')
    for word in txt_split:
        if not (word[0].isupper() or word in separators or word[0] == '"'):
            return False
    return True
def is_quote(txt):
    '''Tell whether a piece of text is plausibly a quote.

    The text is rejected when it is empty, does not start with an
    uppercase letter, is shorter than MIN_QUOTE_LEN characters or
    MIN_QUOTE_WORDS words, contains a word from WORD_BLACKLIST, ends
    with '(', ':' or ']', or looks like a cast credit.
    '''
    words = txt.split()
    if not txt or not txt[0].isupper() or len(txt) < MIN_QUOTE_LEN:
        return False
    if len(words) < MIN_QUOTE_WORDS:
        return False
    if any(word in WORD_BLACKLIST for word in words):
        return False
    if txt.endswith(('(', ':', ']')):
        return False
    return not is_cast_credit(words)
def extract_quotes(html_content, max_quotes):
    '''Extract quotes from the rendered HTML of a page.

    Keyword arguments:
    html_content -- the data returned by the wikiquote api
    max_quotes -- max number of quotes to retrieve
    '''
    root = lxml.html.fromstring(html_content)
    # List items inside unordered lists
    candidates = root.xpath('//div/ul/li')
    # Description tags inside description lists,
    # first one is generally not a quote
    descriptions = root.xpath('//div/dl/dd')[1:]
    if len(descriptions) > len(candidates):
        candidates += descriptions

    collected = []
    for node in candidates:
        # Drop nested lists so their text does not end up in the quote.
        for nested in node.xpath('ul'):
            nested.getparent().remove(nested)
        text = node.text_content().strip()
        if is_quote(text) and len(collected) < max_quotes:
            # Collapse every run of whitespace into a single space.
            collected.append(' '.join(text.split()))
        if len(collected) == max_quotes:
            break
    return collected
def extract_quotes_fr(html_content, max_quotes):
    '''Extract quotes from the french wiki

    Keyword arguments:
    html_content -- the data returned by the wikiquote api
    max_quotes -- max number of quotes to retrieve
    '''
    tree = lxml.html.fromstring(html_content)
    # Spans of class "citation" directly inside a paragraph
    citation_spans = tree.xpath('//p/span[@class="citation"]')
    return [span.text_content()
            for span in islice(citation_spans, max_quotes)]
def quotes(page_title, max_quotes=DEFAULT_MAX_QUOTES, lang='en'):
    '''Return the quotes found on a page.

    Keyword arguments:
    page_title -- title of the page to read
    max_quotes -- max number of quotes to retrieve
    lang -- lang of the wiki (default 'en')

    Raises UnsupportedLanguageException, NoSuchPageException or
    DisambiguationPageException when the page cannot be used.
    '''
    if lang not in SUPPORTED_LANGUAGES:
        raise UnsupportedLanguageException('Unsupported language ' + lang)

    request_url = PAGE_URL.format(lang=lang) + urllib.parse.quote(page_title)
    data = json_from_url(request_url)
    if 'error' in data:
        raise NoSuchPageException('No pages matched the title: ' + page_title)

    if is_disambiguation(data['parse']['categories']):
        raise DisambiguationPageException(
            'Title returned a disambiguation page.')

    # The french wiki marks quotes up differently and has its own parser.
    extractor = extract_quotes_fr if lang == 'fr' else extract_quotes
    return extractor(data['parse']['text']['*'], max_quotes)
def explore_category(category, lang='en', categories=None):
    '''Recursively explore and index quotes
    from a category and its subcategories

    Keyword arguments:
    category -- root category
    lang -- lang of the wiki
    categories -- categories to ignore; the set is updated in place
                  with every category visited during the exploration

    Returns a list of (page, index, lang) tuples, one per quote, where
    index is the position of the quote in the result of quotes(page).
    '''
    if categories is None:
        categories = set()
    # Remember the current category so that a cycle leading back to it
    # is not explored a second time.
    categories.add(category)
    subs = set(category_members(category, command='subcat', lang=lang))
    new_categories = subs - categories
    # Register the subcategories before recursing: the recursive calls
    # share this set, which prevents re-exploring a subcategory that is
    # reachable through several paths.
    categories.update(new_categories)
    quotes_index = []
    for sub in new_categories:
        # Propagate both the language and the visited set.
        quotes_index += explore_category(sub,
                                         lang=lang,
                                         categories=categories)
    pages = set(category_members(category, command='page', lang=lang))
    for page in pages:
        try:
            page_quotes = quotes(page, lang=lang)
        except DisambiguationPageException:
            # Disambiguation pages carry no quotes of their own.
            continue
        quotes_index += [(page, index, lang)
                         for index in range(len(page_quotes))]
    return quotes_index
def quote_of_the_day():
    '''Return the quote of the day of the english wiki.

    Returns a (quote, author) tuple read from the main page.
    '''
    response = json_from_url(MAINPAGE_URL.format(lang='en'))
    main_page = lxml.html.fromstring(response['parse']['text']['*'])
    qotd_box = main_page.get_element_by_id('mf-qotd')
    first_row = qotd_box.xpath('div/div/table/tr')[0]
    # The text before the first '~' is taken as the quote,
    # the text right after it as the author.
    pieces = first_row.text_content().split('~')
    return pieces[0].strip(), pieces[1].strip()
def get_quote(page, index, lang='en'):
    '''Get a single quote from a page

    Keyword arguments:
    page -- page to search
    index -- index of the quote inside the page
    lang -- lang of the wiki
    '''
    page_quotes = quotes(page, lang=lang)
    return page_quotes[index]
def random_quote_from_categories(quote_index):
    '''Returns a random quote from an existing index

    Keyword arguments:
    quote_index -- a list of (page, index, lang) tuples
                   as generated by explore_category

    Returns a (quote, page) tuple.
    Raises IndexError when quote_index is empty.
    '''
    # random.choice picks a valid position only. The previous
    # random.randint(0, len(quote_index)) included the upper bound and
    # therefore sometimes indexed one past the end of the list.
    page, index, lang = random.choice(quote_index)
    quote = get_quote(page, index, lang=lang)
    return quote, page
def quotes_with_original(page_title, lang='en', max_quotes=DEFAULT_MAX_QUOTES):
    '''Extract quotes from the french wiki with their original version.

    Some quotes translated to french come with the original one.
    This retrieves both as a tuple (french, original);
    original is None for quotes that are french only.
    For lang 'en' an empty list is returned.

    Keyword arguments:
    page_title -- title of the page to read
    lang -- lang of the wiki (default 'en')
    max_quotes -- max number of quotes to retrieve
    '''
    if lang not in SUPPORTED_LANGUAGES:
        raise UnsupportedLanguageException('Unsupported language ' + lang)
    if lang == 'en':
        return []

    request_url = PAGE_URL.format(lang=lang) + urllib.parse.quote(page_title)
    data = json_from_url(request_url)
    if 'error' in data:
        raise NoSuchPageException('No pages matched the title: ' + page_title)
    if is_disambiguation(data['parse']['categories']):
        raise DisambiguationPageException(
            'Title returned a disambiguation page.')

    tree = lxml.html.fromstring(data['parse']['text']['*'])
    citation_spans = tree.xpath('//p/span[@class="citation"]')
    original_path = './/span[@class="original"]'

    pairs = []
    for span in citation_spans[:max_quotes]:
        french = span.text_content().replace('\xa0', ' ')
        original = None
        # Look for the original text in the first two elements
        # following the paragraph that holds the citation.
        for sibling in islice(span.getparent().itersiblings(), 2):
            original_span = sibling.find(original_path)
            if original_span is not None:
                original = original_span.text_content().replace('\xa0', ' ')
                break
        pairs.append((french, original))
    return pairs
Advertisement
Add Comment
Please, Sign In to add comment