  1. """
  2. UYD4RSS
  3.  
  4. Python 3.6+
  5.  
  6. Dependencies:
  7. beautifulsoup4
  8. selenium
  9. webdriver_manager
  10.  
  11. --help for options
  12. """

import argparse
import collections
import datetime
import logging
import os
import platform
import sys
import time
from typing import Iterable, List, Set, Tuple

from bs4 import BeautifulSoup
from bs4.element import Tag
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from webdriver_manager.chrome import ChromeDriverManager
from webdriver_manager.firefox import GeckoDriverManager


logging.basicConfig(
    format='%(asctime)s: %(message)s',
    datefmt='%I:%M:%S %p',
    level=logging.INFO
)

DEFAULT_HTML_PATH = os.path.join(os.getcwd(), 'uyd_archive.html')
DEFAULT_RSS_PATH = os.path.join(os.getcwd(), 'uyd_archive.rss')
VAULT_ENTRANCE_URL = 'https://uhhyeahdude.com/archive/'
VAULT_SUCCESS_URL = 'https://uhhyeahdude.com/index.php/archive'
WEBDRIVERS_PATH = os.path.join(os.getcwd(), '.drivers')


class UYDEpisode:

    _EPISODE_XML_TEMPLATE = (  # Note this is one implicitly concatenated string literal, not a tuple
        """<item>"""
        """\n\t<title>{title}</title>"""
        """\n\t<pubDate>{dt} -0000</pubDate>"""
        """\n\t<link>{url}</link>"""
        """\n</item>"""
    )

    @staticmethod
    def _parse_date(date_str: str) -> datetime.datetime:
        parts = date_str.split()
        parts[1] = parts[1].strip("stndrh")  # '1st', '2nd', '3rd', '4th' become 1, 2, 3, 4
        if len(parts[1]) == 1:
            parts[1] = '0' + parts[1]
        fixed_str = " ".join(parts)
        return datetime.datetime.strptime(fixed_str, '%B %d %Y')

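    # For illustration, given a date string in the archive page's format:
    #   _parse_date('February 11th 2006') -> datetime.datetime(2006, 2, 11, 0, 0)
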
    @staticmethod
    def _parse_number(title: str) -> int:
        # TODO: This works for the data on the page as currently formatted, but could break in the future
        n_str = title.split()[1]
        return int(n_str)

    def __init__(self, title: str, date_str: str, url: str, live: bool, marcia: bool):
        self.title = title
        self.number = self._parse_number(title)
        self.dt = self._parse_date(date_str)  # Date parsed from str; time component set to 00:00:00
        self.url = url
        self.supplement = 'Supplement' in title  # TODO: Also works for existing data but is fragile
        self.video = self.number == 100 or 'video' in title.lower()  # TODO: Gets 100, 500 but check URL or something?
        self.live = live
        self.marcia = marcia

    def __lt__(self, other: 'UYDEpisode'):
        return (self.dt, self.number, self.supplement, self.video, self.title) < \
               (other.dt, other.number, other.supplement, other.video, other.title)

    def __str__(self) -> str:
        s = '{}, {}: {}'.format(self.title, self.dt.strftime('%b %d %Y'), self.url)
        return s

    def __repr__(self):
        return str(self)

    def to_xml(self) -> str:
        # Get non-zero-padded day without platform-specific %#d vs. %-d
        date_str = f"{self.dt.strftime('%B')} {self.dt.day}, {self.dt.year}"
        title_for_xml = self.title + ' ' + date_str
        if self.video and 'video' not in self.title.lower():  # 100 not tagged in title, 500 is
            title_for_xml += ' (Video)'
        if self.live:
            title_for_xml += ' (Live)'
        if self.marcia:
            title_for_xml += ' (feat. Marcia)'
        dt_fmt = '%a, %d %b %Y %H:%M:%S'
        episode_xml = UYDEpisode._EPISODE_XML_TEMPLATE.format(
            title=title_for_xml,
            dt=self.dt.strftime(dt_fmt),
            url=self.url
        )
        return episode_xml


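# For illustration, to_xml() for the first episode renders as below (the link is a
# made-up placeholder; real URLs come from the archive page, and the template
# indents child tags with tabs):
# <item>
#     <title>Episode 1 February 11, 2006</title>
#     <pubDate>Sat, 11 Feb 2006 00:00:00 -0000</pubDate>
#     <link>https://example.com/episode-1.mp3</link>
# </item>

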
# Setup
def get_chromedriver(headless: bool, disable_logging: bool, executable: str) -> webdriver.Chrome:
    options = webdriver.ChromeOptions()
    if headless:
        options.add_argument('headless')
    if disable_logging:
        options.add_experimental_option("excludeSwitches", ['enable-logging'])
    if executable:
        executable_path = executable
    else:
        basename = 'chromedriver' + ('.exe' if platform.system() == 'Windows' else '')
        cwd_path = os.path.join(os.getcwd(), basename)
        if os.path.exists(cwd_path):
            executable_path = cwd_path
        else:
            logging.info('Attempting to download Chromedriver to {}'.format(WEBDRIVERS_PATH))
            try:
                executable_path = ChromeDriverManager(path=WEBDRIVERS_PATH).install()
            except Exception:
                msg = 'Failed to download Chromedriver! Download and place in this directory to proceed.'
                logging.exception(msg)
                sys.exit(1)
    logging.info('Initializing webdriver at {}'.format(executable_path))
    driver = webdriver.Chrome(options=options, executable_path=executable_path)
    return driver


def get_geckodriver(headless: bool, disable_logging: bool, executable: str) -> webdriver.Firefox:
    options = webdriver.FirefoxOptions()
    options.headless = headless
    ff_kwargs = {}
    if disable_logging:
        ff_kwargs['service_log_path'] = os.devnull
    if executable:
        ff_kwargs['executable_path'] = executable
    else:
        basename = 'geckodriver' + ('.exe' if platform.system() == 'Windows' else '')
        cwd_path = os.path.join(os.getcwd(), basename)
        if os.path.exists(cwd_path):
            ff_kwargs['executable_path'] = cwd_path
        else:
            logging.info('Attempting to download Geckodriver to {}'.format(WEBDRIVERS_PATH))
            try:
                ff_kwargs['executable_path'] = GeckoDriverManager(path=WEBDRIVERS_PATH).install()
            except Exception:
                msg = 'Failed to download Geckodriver! Download and place in this directory to proceed.'
                logging.exception(msg)
                sys.exit(1)
    logging.info('Initializing webdriver at {}'.format(ff_kwargs['executable_path']))
    driver = webdriver.Firefox(options=options, **ff_kwargs)
    return driver


def get_webdriver(browser: str, headless: bool, disable_logging: bool, executable: str):
    if browser == 'firefox':
        return get_geckodriver(headless, disable_logging, executable)
    else:
        return get_chromedriver(headless, disable_logging, executable)


# User input
def prompt_credentials() -> Tuple[str, str]:
    print('Please enter the username and password for access to the Uhh Yeah Dude archive.')
    time.sleep(1)
    print('These are NOT the credentials to your Patreon account.')
    time.sleep(1)
    print(
        'They are the credentials given to all subscribers to the Uhh Yeah Dude Patreon.',
        '\nCheck https://patreon.com/UHHYEAHDUDE or your old emails; make sure to use the most recent ones.'
    )
    username = input('Username: ')
    password = input('Password: ')
    return username, password


def prompt_int(question: str, min_n: int, max_n: int) -> int:
    if max_n < min_n:
        raise ValueError('Maximum (given {}) must be greater than or equal to minimum (given {})'.format(max_n, min_n))
    question += ' [{}-{}]: '.format(min_n, max_n)
    invalid_msg = 'Please respond with a number from {} to {}'.format(min_n, max_n)
    while True:
        answer_str = input(question)
        try:
            answer = int(answer_str)
        except ValueError:
            print(invalid_msg)
            continue
        else:
            if min_n <= answer <= max_n:
                return answer
            else:
                print(invalid_msg)
                continue


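# Example interaction (illustrative): prompt_int('Start at episode', 1, 500)
# prints 'Start at episode [1-500]: ' and re-prompts until the reply parses as
# an integer between 1 and 500 inclusive.

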
def prompt_yn(question: str, default: bool) -> bool:
    question += ' [Y/n]: ' if default else ' [y/N]: '
    while True:
        answer = input(question)
        if not answer:
            return default
        elif answer.lower() in {'y', 'yes'}:
            return True
        elif answer.lower() in {'n', 'no'}:
            return False
        else:
            print("Please respond with either 'y' or 'n'")
            continue


# Navigate the monsterweb
def enter_credentials(driver: webdriver.Chrome, username: str, password: str):
    driver.get(VAULT_ENTRANCE_URL)
    logging.info('Entrance page loaded')
    username_box = driver.find_element_by_name('username')
    password_box = driver.find_element_by_name('password')
    submit_button = driver.find_element_by_name('submit')
    username_box.send_keys(username)
    password_box.send_keys(password)
    submit_button.click()
    logging.info('Credentials submitted')


def validate_credentials(driver: webdriver.Chrome) -> bool:
    # Give the post-login redirect a moment to complete before checking the URL
    interval = 3
    time.sleep(interval)
    return driver.current_url == VAULT_SUCCESS_URL


def scroll_to_bottom(driver: webdriver.Chrome) -> bool:
    # Title-span text of the final episode; seeing it means the infinite scroll is exhausted
    last_title_span_text = "Episode 1February 11th 2006"
    # Scroll to bottom; cap the number of attempts so we exit if it's not working
    interval = 5
    count = 0
    limit = 15
    while count < limit:
        time.sleep(interval)
        episode_title_spans = driver.find_elements_by_xpath(
            "//div[@id='archive__tab-content-01']//span[@class='archive-ep__title']"
        )
        if episode_title_spans and episode_title_spans[-1].text == last_title_span_text:
            logging.info('All episodes found!')
            return True
        logging.info('{} episodes found, getting more...'.format(len(episode_title_spans)))
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        count += 1  # Count every attempt so an empty result set cannot loop forever
    return False


def scrape_site(browser: str, headless: bool, disable_logging: bool, executable: str) -> str:
    driver = get_webdriver(browser, headless, disable_logging, executable)
    username, password = prompt_credentials()
    error_msg, source, success = '', '', False

    try:
        enter_credentials(driver, username, password)
        success = validate_credentials(driver)
        if not success:
            error_msg = 'Login attempt failed! Check credentials and try again.'
        else:
            success = scroll_to_bottom(driver)
            if not success:
                error_msg = 'Failed to reach end of archive page!'
            else:
                source = driver.page_source
    except WebDriverException as error:
        logging.error('Fatal error!', exc_info=error)
    finally:
        driver.quit()  # quit() shuts down the driver process as well as the browser window
    if not success:
        logging.critical(error_msg)
        sys.exit(1)
    else:
        return source


# Process the data
def parse_episodes(source: str) -> List[UYDEpisode]:
    soup = BeautifulSoup(source, 'html.parser')
    all_episodes_tab_id = "archive__tab-content-01"
    all_episode_divs = get_episode_divs_from_tab(soup, all_episodes_tab_id)
    live_tab_id = "archive__tab-content-02"
    live_episode_divs = get_episode_divs_from_tab(soup, live_tab_id)
    live_episode_urls = get_episode_urls_from_divs(live_episode_divs)
    marcia_tab_id = "archive__tab-content-03"
    marcia_episode_divs = get_episode_divs_from_tab(soup, marcia_tab_id)
    marcia_episode_urls = get_episode_urls_from_divs(marcia_episode_divs)

    episodes = []
    for episode_div in all_episode_divs:
        title_tag = episode_div.find(class_='archive-ep__title')
        date_str = title_tag.span.text
        title_str = title_tag.text[:-len(date_str)]  # Full span text minus the trailing date
        url_str = episode_div.find('a', class_="archive-ep__download")['href']
        live = url_str in live_episode_urls
        marcia = url_str in marcia_episode_urls
        episode = UYDEpisode(title_str, date_str, url_str, live, marcia)
        episodes.append(episode)
    logging.info('{} items retrieved!'.format(len(episodes)))
    fix_datetimes(episodes)
    episodes.sort()
    return episodes


def get_episode_divs_from_tab(soup: BeautifulSoup, tab_id: str) -> Iterable[Tag]:
    return soup.find(id=tab_id).find_all(class_='archive-ep')


def get_episode_urls_from_divs(divs: Iterable[Tag]) -> Set[str]:
    return {div.find('a', class_="archive-ep__download")['href'] for div in divs}


def fix_datetimes(episodes: List[UYDEpisode]) -> None:
    """
    We've only parsed episode dates out of the HTML, not times (these default to 12:00 a.m.). Many episodes
    share the same date. Arbitrarily increment the times of episodes that share a date so they sort properly
    in podcast apps.
    """
    # Group episodes by date
    by_date = collections.defaultdict(list)
    for ep in episodes:
        by_date[ep.dt.date()].append(ep)
    for day_eps in by_date.values():
        if len(day_eps) > 1:
            day_eps.sort()
            for i, ep in enumerate(day_eps):
                offset = datetime.timedelta(minutes=i)
                ep.dt += offset


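# Worked example of fix_datetimes (comment only; the date is made up): if three
# episodes all carry the date 2010-05-04, they are assigned times 00:00, 00:01,
# and 00:02 on that day, so apps sorting by pubDate preserve episode order.

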
def filter_episodes(
        episodes: List[UYDEpisode],
        first_ep: int,
        last_ep: int,
        supplements: bool,
        video: bool,
        live: bool,
        marcia: bool
) -> List[UYDEpisode]:
    episodes = filter(lambda e: first_ep <= e.number <= last_ep, episodes)
    if not supplements:
        episodes = filter(lambda e: not e.supplement, episodes)
    if not video:
        episodes = filter(lambda e: not e.video, episodes)
    if not live:
        episodes = filter(lambda e: not e.live, episodes)
    if not marcia:
        episodes = filter(lambda e: not e.marcia, episodes)
    return list(episodes)


# Output
def build_xml(episodes: List[UYDEpisode]) -> str:
    logging.info('Preparing RSS data...')
    header = '<?xml version="1.0" encoding="UTF-8" ?>\n<rss version="2.0">\n<channel>'
    first_ep_n = episodes[0].number
    last_ep_n = episodes[-1].number
    title = '\n<title>Uhh Yeah Dude Archive Episodes {}-{}</title>'.format(first_ep_n, last_ep_n)
    description = '\n<description>America through the eyes of two American Americans</description>'
    language = '\n<language>en-us</language>\n'
    footer = '\n</channel>\n</rss>'
    episodes_xml = '\n'.join(episode.to_xml() for episode in episodes)
    text = header + title + description + language + episodes_xml + footer
    return text


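# Overall shape of the generated feed, pieced together from the literals above
# (the episode numbers shown are an example range):
# <?xml version="1.0" encoding="UTF-8" ?>
# <rss version="2.0">
# <channel>
# <title>Uhh Yeah Dude Archive Episodes 1-500</title>
# <description>America through the eyes of two American Americans</description>
# <language>en-us</language>
# ...one <item> block per episode...
# </channel>
# </rss>

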
def print_instructions():
    msg = """
## How to use this RSS file?
- Check if your podcast app allows importing RSS files stored locally.
- If not, host it somewhere accessible to you. One easy way is as follows:

### Dropbox
- Log in to Dropbox via the web interface and upload the RSS file generated by this program to a non-public folder.
- Select the uploaded file and find the "Share" button.
- Select "Create link".
- The provided link will end with `?dl=0`. Change `0` to `1` when adding this URL to your podcast app.
- **Ensure that your podcast app is not adding this feed to a public catalog. Most apps that offer such a feature will
ask you to specify if a feed is private when subscribing. Take care to do so.**
- (You can also tell your app not to update this feed automatically, since it won't be changing.)

## Please don't use this for piracy
This script is intended for the convenience of paying patrons, not to facilitate the indiscriminate delivery of episodes
UYD has chosen to put in the vault. Please support the boys!
- If hosting the RSS file on the internet, make the link available only to yourself.
- As noted above, ensure your feed is not being added to a public catalog of podcasts.
- Remain a patron, at whatever level is required for Vault access, for as long as you use these files.
"""
    print(msg)


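# For reference, the Dropbox link tweak described in the printed instructions is
# just a query-string change (illustrative snippet with a made-up share URL):
#   share_url = 'https://www.dropbox.com/s/abc123/uyd_archive.rss?dl=0'
#   feed_url = share_url.replace('?dl=0', '?dl=1')

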
def get_options() -> argparse.Namespace:
    arg_parser = argparse.ArgumentParser(
        description='Create an RSS feed from the UYD archives; can scrape site or process an HTML file'
    )
    scrape_options = arg_parser.add_argument_group(title='Scraping options')
    scrape_options.add_argument(
        '--browser', choices=['chrome', 'firefox'], default='chrome',
        help='Specify which browser to use for scraping'
    )
    scrape_options.add_argument(
        '--webdriver-path', default=None, dest='webdriver_executable',
        help=('Specify path to Chrome/Gecko driver executable; if not given, defaults to checking current directory, '
              'then attempting to download the correct version automatically')
    )
    scrape_options.add_argument(
        '--show-browser', action='store_false', dest='headless',
        help='Make the web browser visible while scraping'
    )
    scrape_options.add_argument(
        '--verbose-browser', action='store_false', dest='disable_browser_log',
        help='Show information logged by web browser while scraping'
    )
    scrape_options.add_argument(
        '--save-source', dest='save_source', default=DEFAULT_HTML_PATH,
        help=('Write archive page source HTML to the given path for future use with the --from-source option; '
              'defaults to uyd_archive.html')
    )
    parse_options = arg_parser.add_argument_group(title='Parsing options')
    parse_options.add_argument(
        '--from-source', nargs='?', dest='from_source', default=None, const=DEFAULT_HTML_PATH,
        help=('Process HTML file found at given path rather than scraping site; '
              'if argument present but path not given, defaults to uyd_archive.html')
    )
    parse_options.add_argument(
        '--out-file', dest='out_file', default=DEFAULT_RSS_PATH,
        help='Write RSS file to the given path; if not present, defaults to uyd_archive.rss'
    )
    return arg_parser.parse_args()


def main() -> None:
    options = get_options()
    if not options.from_source:
        source = scrape_site(
            options.browser, options.headless, options.disable_browser_log, options.webdriver_executable
        )
        with open(options.save_source, 'w', encoding='utf-8') as out_html:
            out_html.write(source)
        logging.info('HTML file written to {}'.format(options.save_source))
    else:
        with open(options.from_source, 'r', encoding='utf-8') as in_html:
            source = in_html.read()

    episodes = parse_episodes(source)
    if not episodes:
        logging.critical('Failed to locate any episodes on page, exiting!')
        sys.exit(1)

    min_ep, max_ep = episodes[0].number, episodes[-1].number
    first_ep = prompt_int('Start at episode', min_ep, max_ep)
    last_ep = prompt_int('End at episode', first_ep, max_ep)
    supplements_prompt = 'Include supplements (a few mini-episodes in the 100s), if any in range?'
    supplements = prompt_yn(supplements_prompt, True)
    video_prompt = 'Include videos, if any in range? (#100: only available as video; #500: audio version also in feed)'
    video = (first_ep <= 100 <= last_ep or first_ep <= 500 <= last_ep) and prompt_yn(video_prompt, False)
    live_prompt = 'Include live episodes?'
    live = prompt_yn(live_prompt, True)
    marcia_prompt = 'Include episodes featuring Marcia Romatelli?'
    marcia = prompt_yn(marcia_prompt, True)
    episodes = filter_episodes(episodes, first_ep, last_ep, supplements, video, live, marcia)

    if not episodes:
        logging.info('No episodes within given parameters, exiting!')
        sys.exit(0)

    logging.info('{} episodes to be included'.format(len(episodes)))
    xml = build_xml(episodes)
    with open(options.out_file, 'w', encoding='utf-8') as f:
        f.write(xml)
    logging.info('RSS file written to {}'.format(options.out_file))
    print_instructions()


if __name__ == '__main__':
    main()