Advertisement
Guest User

supersubtitles.py

a guest
Nov 7th, 2019
62
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 18.52 KB | None | 0 0
  1. # coding=utf-8
  2. import io
  3. import six
  4. import os
  5. from pkg_resources import require
  6. import logging
  7. import re
  8. import os
  9. import time
  10.  
  11. from babelfish import language_converters
  12. from subzero.language import Language
  13. from requests import Session
  14.  
  15. from subliminal.subtitle import fix_line_ending
  16. from subliminal_patch.providers import Provider
  17. from subliminal_patch.providers.mixins import ProviderSubtitleArchiveMixin
  18. from subliminal.providers import ParserBeautifulSoup
  19. from subliminal_patch.exceptions import ProviderError
  20. from subliminal.score import get_equivalent_release_groups
  21. from subliminal_patch.subtitle import Subtitle, guess_matches
  22. from subliminal.utils import sanitize, sanitize_release_group
  23. from subliminal.video import Episode, Movie
  24. from zipfile import ZipFile, is_zipfile
  25. from rarfile import RarFile, is_rarfile
  26. from subliminal_patch.utils import sanitize, fix_inconsistent_naming as _fix_inconsistent_naming
  27. from guessit import guessit
  28.  
  29.  
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Register the 'supersubtitles' language converter with babelfish at import time.
# The provider's get_language() calls Language.fromsupersubtitles(...), which
# presumably relies on this registration -- confirm against the converter module.
language_converters.register('supersubtitles = subliminal_patch.converters.supersubtitles:SuperSubtitlesConverter')
  33.  
  34. def fix_inconsistent_naming(title):
  35.     """Fix titles with inconsistent naming using dictionary and sanitize them.
  36.  
  37.    :param str title: original title.
  38.    :return: new title.
  39.    :rtype: str
  40.  
  41.    """
  42.     return _fix_inconsistent_naming(title, {"For All Mankind": "For All Mankind (2019)",
  43.                                             "See": "See (2019)", "The Morning Show": "The Morning Show (2019)",
  44.                                             "Titans": "Titans (2018)", "The House of Flowers": "The House of Flowers (2018)",
  45.                                             "Animal Kingdom (2016)": "Animal Kingdom (US) (2016)",
  46.                                             "Animal Kingdom": "Animal Kingdom (US) (2016)",
  47.                                             "Ghostwriter": "Ghostwriter (2019)"}, True)
  48.  
  49.  
  50. class SuperSubtitlesSubtitle(Subtitle):
  51.     """SuperSubtitles Subtitle."""
  52.     provider_name = 'supersubtitles'
  53.  
  54.     def __str__(self):
  55.         subtit = "Subtitle id: " + str(self.subtitle_id) \
  56.                  + " Series: " + self.series \
  57.                  + " Season: " + str(self.season) \
  58.                  + " Episode: " + str(self.episode) \
  59.                  + " Version: " + str(self.version) \
  60.                  + " Releases: " + str(self.releases) \
  61.                  + " DownloadLink: " + str(self.page_link) \
  62.                  + " Matches: " + str(self.matches)
  63.         if self.year:
  64.             subtit = subtit + " Year: " + str(self.year)
  65.         return subtit.encode('utf-8')
  66.  
  67.     def __init__(self, language, page_link, subtitle_id, series, season, episode, version,
  68.                  releases, year, imdb_id, asked_for_episode=None, asked_for_release_group=None):
  69.         super(SuperSubtitlesSubtitle, self).__init__(language, page_link=page_link)
  70.         self.subtitle_id = subtitle_id
  71.         self.series = series
  72.         self.season = season
  73.         self.episode = episode
  74.         self.version = version
  75.         self.releases = releases
  76.         self.year = year
  77.         if year:
  78.             self.year = int(year)
  79.  
  80.         self.release_info = u", ".join(releases)
  81.         self.page_link = page_link
  82.         self.asked_for_release_group = asked_for_release_group
  83.         self.asked_for_episode = asked_for_episode
  84.         self.imdb_id = imdb_id
  85.         self.is_pack = True
  86.  
  87.     def numeric_id(self):
  88.         return self.subtitle_id
  89.  
  90.     def __repr__(self):
  91.         ep_addon = (" S%02dE%02d" % (self.season, self.episode)) if self.episode else ""
  92.         return '<%s %r [%s]>' % (
  93.             self.__class__.__name__, u"%s%s%s [%s]" % (self.series, " (%s)" % self.year if self.year else "", ep_addon,
  94.                                                        self.release_info), self.language)
  95.  
  96.     @property
  97.     def id(self):
  98.         return str(self.subtitle_id)
  99.  
  100.     def get_matches(self, video):
  101.         matches = guess_matches(video, guessit(self.release_info.encode("utf-8")))
  102.  
  103.         # episode
  104.         if isinstance(video, Episode):
  105.             # series
  106.             if video.series and (sanitize(self.series) == sanitize(fix_inconsistent_naming(video.series)) or sanitize(self.series) == sanitize(video.series)):
  107.                 matches.add('series')
  108.             # season
  109.             if video.season and self.season == video.season:
  110.                 matches.add('season')
  111.             # episode
  112.             if video.episode and self.episode == video.episode:
  113.                 matches.add('episode')
  114.             # imdb_id
  115.             if video.series_imdb_id and self.imdb_id and str(self.imdb_id) == str(video.series_imdb_id):
  116.                 matches.add('series_imdb_id')
  117.                 matches.add('series')
  118.                 matches.add('year')
  119.             # year
  120.             if ('series' in matches and video.original_series and self.year is None or
  121.                     video.year and video.year == self.year):
  122.                 matches.add('year')
  123.         # movie
  124.         elif isinstance(video, Movie):
  125.             # title
  126.             if video.title and (sanitize(self.series) in (
  127.                     sanitize(name) for name in [video.title] + video.alternative_titles)):
  128.                 matches.add('title')
  129.             # imdb_id
  130.             if video.imdb_id and self.imdb_id == video.imdb_id:
  131.                 matches.add('imdb_id')
  132.                 matches.add('title')
  133.                 matches.add('year')
  134.             # year
  135.             if video.year and self.year == video.year:
  136.                 matches.add('year')
  137.  
  138.         # release_group
  139.         if (video.release_group and self.version and
  140.                 any(r in sanitize_release_group(self.version)
  141.                     for r in get_equivalent_release_groups(sanitize_release_group(video.release_group)))):
  142.             matches.add('release_group')
  143.         # resolution
  144.         if video.resolution and self.version and video.resolution in self.version.lower():
  145.             matches.add('resolution')
  146.         # format
  147.         if video.format and self.version and video.format.lower() in self.version.lower():
  148.             matches.add('format')
  149.  
  150.         self.matches = matches
  151.         return matches
  152.  
  153.  
  154. class SuperSubtitlesProvider(Provider, ProviderSubtitleArchiveMixin):
  155.     """SuperSubtitles Provider."""
  156.     languages = {Language('hun', 'HU')} | {Language(l) for l in [
  157.         'hun', 'eng'
  158.     ]}
  159.     video_types = (Episode, Movie)
  160.     # https://www.feliratok.info/?search=&soriSorszam=&nyelv=&sorozatnev=The+Flash+%282014%29&sid=3212&complexsearch=true&knyelv=0&evad=4&epizod1=1&cimke=0&minoseg=0&rlsr=0&tab=all
  161.     server_url = 'https://www.feliratok.info/'
  162.     subtitle_class = SuperSubtitlesSubtitle
  163.     hearing_impaired_verifiable = False
  164.     multi_result_throttle = 2  # seconds
  165.  
  166.     def initialize(self):
  167.         self.session = Session()
  168.         self.session.headers = {'User-Agent': os.environ.get("SZ_USER_AGENT", "Sub-Zero/2")}
  169.  
  170.     def terminate(self):
  171.         self.session.close()
  172.  
  173.     def get_language(self, text):
  174.         if text == 'Magyar':
  175.             return Language.fromsupersubtitles('hu')
  176.         if text == 'Angol':
  177.             return Language.fromsupersubtitles('en')
  178.         return None
  179.  
  180.     def find_imdb_id(self, sub_id):
  181.         """
  182.  
  183.        """
  184.  
  185.         url = self.server_url + "index.php?tipus=adatlap&azon=a_" + sub_id
  186.         # url = https://www.feliratok.info/index.php?tipus=adatlap&azon=a_1518600916
  187.         logger.info('Get IMDB id from URL %s', url)
  188.         r = self.session.get(url, timeout=10).content
  189.  
  190.         soup = ParserBeautifulSoup(r, ['lxml'])
  191.         links = soup.find_all("a")
  192.  
  193.         for value in links:
  194.             if "imdb.com" in str(value):
  195.                 # <a alt="iMDB" href="http://www.imdb.com/title/tt2357547/" target="_blank"><img alt="iMDB" src="img/adatlap/imdb.png"/></a>
  196.                 imdb_id = re.findall(r'(?<=www\.imdb\.com/title/).*(?=/")', str(value))[0]
  197.                 return imdb_id
  198.  
  199.         return None
  200.  
  201.     def find_id(self, series, year, original_title):
  202.         """
  203.        We need to find the id of the series at the following url:
  204.        https://www.feliratok.info/index.php?term=SERIESNAME&nyelv=0&action=autoname
  205.        Where SERIESNAME is a searchable string.
  206.        The result will be something like this:
  207.        [{"name":"DC\u2019s Legends of Tomorrow (2016)","ID":"3725"},{"name":"Miles from Tomorrowland (2015)","ID":"3789"}
  208.        ,{"name":"No Tomorrow (2016)","ID":"4179"}]
  209.  
  210.        """
  211.  
  212.         # Search for exact name
  213.         logger.info('Search for exact name for %s', series)
  214.         url = self.server_url + "index.php?term=" + series + "&nyelv=0&action=autoname"
  215.         # url = self.server_url + "index.php?term=" + "fla"+ "&nyelv=0&action=autoname"
  216.         logger.info('Get series id from URL %s', url)
  217.         r = self.session.get(url, timeout=10)
  218.  
  219.         # r is something like this:
  220.         # [{"name":"DC\u2019s Legends of Tomorrow (2016)","ID":"3725"},{"name":"Miles from Tomorrowland (2015)","ID":"3789"}
  221.         # ,{"name":"No Tomorrow (2016)","ID":"4179"}]
  222.  
  223.         results = r.json()
  224.  
  225.         # check all of the results:
  226.         for result in results:
  227.             try:
  228.                 # "name":"Miles from Tomorrowland (2015)","ID":"3789"
  229.                 result_year = re.findall(r"(?<=\()\d\d\d\d(?=\))", result['name'])[0]
  230.             except IndexError:
  231.                 result_year = ""
  232.  
  233.             try:
  234.                 # "name":"Miles from Tomorrowland (2015)","ID":"3789"
  235.                 result_title = re.findall(r".*(?=\(\d\d\d\d\))", result['name'])[0]
  236.                 result_id = result['ID']
  237.             except IndexError:
  238.                 continue
  239.  
  240.             result_title = result_title.strip().replace("�", "").replace(" ", ".")
  241.  
  242.             guessable = result_title.strip() + ".s01e01." + result_year
  243.             guess = guessit(guessable, {'type': "episode"})
  244.             is_true = sanitize(original_title) == sanitize(guess['title']) and year and guess['year'] and year == guess['year']
  245.             if sanitize(original_title) == sanitize(guess['title']) and year and guess['year'] and year == guess['year']:
  246.                 # Return the founded id
  247.                 return result_id
  248.  
  249.         return None
  250.  
  251.     def query(self, series, video=None):
  252.         year = video.year
  253.         subtitle = None
  254.         if isinstance(video, Episode):
  255.             series = video.series
  256.             season = video.season
  257.             episode = video.episode
  258.             #seriesa = series.replace(' ', '+')
  259.             if year is None:
  260.                 try:
  261.                     year = int(re.findall(r"(?<=\()\d\d\d\d(?=\))", fix_inconsistent_naming(video.series))[0])
  262.                 except IndexError:
  263.                     year = video.year
  264.  
  265.             # Get ID of series with original name
  266.             series_id = self.find_id(series, year, series)
  267.             if not series_id:
  268.                 # If not founded try without ' char
  269.                 modified_series = series.replace(' ', '+').replace('\'', '')
  270.                 series_id = self.find_id(modified_series, year, series)
  271.                 if not series_id and modified_series:
  272.                     # If still not founded try with the longest word is series title
  273.                     modified_series = modified_series.split('+')
  274.                     modified_series = max(modified_series, key=len)
  275.                     series_id = self.find_id(modified_series, year, series)
  276.  
  277.                     if not series_id:
  278.                         return None
  279.  
  280.             # https://www.feliratok.info/index.php?search=&soriSorszam=&nyelv=&sorozatnev=&sid=2075&complexsearch=true&knyelv=0&evad=6&epizod1=16&cimke=0&minoseg=0&rlsr=0&tab=all
  281.             url = self.server_url + "index.php?search=&soriSorszam=&nyelv=&sorozatnev=&sid=" + \
  282.                   str(series_id) + "&complexsearch=true&knyelv=0&evad=" + str(season) + "&epizod1=" + str(
  283.                 episode) + "&cimke=0&minoseg=0&rlsr=0&tab=all"
  284.             subtitle = self.process_subs(series, video, url)
  285.  
  286.             if not subtitle:
  287.                 # No Subtitle found. Maybe already archived to season pack
  288.                 url = self.server_url + "index.php?search=&soriSorszam=&nyelv=&sorozatnev=&sid=" + \
  289.                       str(series_id) + "&complexsearch=true&knyelv=0&evad=" + str(
  290.                     season) + "&epizod1=&evadpakk=on&cimke=0&minoseg=0&rlsr=0&tab=all"
  291.                 subtitle = self.process_subs(series, video, url)
  292.  
  293.         if isinstance(video, Movie):
  294.             title = series.replace(" ", "+")
  295.  
  296.             # https://www.feliratok.info/index.php?search=The+Hitman%27s+BodyGuard&soriSorszam=&nyelv=&tab=film
  297.             url = self.server_url + "index.php?search=" + title + "&soriSorszam=&nyelv=&tab=film"
  298.             subtitle = self.process_subs(series, video, url)
  299.  
  300.         return subtitle
  301.  
  302.     def process_subs(self, series, video, url):
  303.  
  304.         subtitles = []
  305.  
  306.         logger.info('URL for subtitles %s', url)
  307.         r = self.session.get(url, timeout=10).content
  308.  
  309.         soup = ParserBeautifulSoup(r, ['lxml'])
  310.         tables = soup.find_all("table")
  311.         tables = tables[0].find_all("tr")
  312.         i = 0
  313.         series_imdb_id = None
  314.         for table in tables:
  315.             if "vilagit" in str(table) and i > 1:
  316.                 try:
  317.                     sub_hun_name = table.findAll("div", {"class": "magyar"})[0]
  318.                     if isinstance(video, Episode):
  319.                         if "vad)" not in str(sub_hun_name):
  320.                             # <div class="magyar">A pletykaf�szek (3. �vad)</div>
  321.                             sub_hun_name = re.findall(r'(?<=<div class="magyar">).*(?= -)', str(sub_hun_name))[0]
  322.                         else:
  323.                             # <div class="magyar">A holnap legend�i - 3x11</div>
  324.                             sub_hun_name = re.findall(r'(?<=<div class="magyar">).*(?= \()', str(sub_hun_name))[0]
  325.                     if isinstance(video, Movie):
  326.                         sub_hun_name = re.findall(r'(?<=<div class="magyar">).*(?=</div)', str(sub_hun_name))[0]
  327.                 except IndexError:
  328.                     sub_hun_name = ""
  329.  
  330.                 asked_for_episode = None
  331.                 sub_season = None
  332.                 sub_episode = None
  333.                 sub_english = table.findAll("div", {"class": "eredeti"})
  334.                 if isinstance(video, Episode):
  335.                     asked_for_episode = video.episode
  336.                     if "Season" not in str(sub_english):
  337.                         # [<div class="eredeti">Gossip Girl (Season 3) (DVDRip-REWARD)</div>]
  338.                         sub_english_name = re.findall(r'(?<=<div class="eredeti">).*?(?= -)', str(sub_english))[0]
  339.                         sub_season = int((re.findall(r"(?<=- ).*?(?= - )", str(sub_english))[0].split('x')[0]).strip())
  340.                         sub_episode = int((re.findall(r"(?<=- ).*?(?= - )", str(sub_english))[0].split('x')[1]).strip())
  341.  
  342.                     else:
  343.                         # [<div class="eredeti">DC's Legends of Tomorrow - 3x11 - Here I Go Again (HDTV-AFG, HDTV-RMX, 720p-SVA, 720p-PSA </div>]
  344.                         sub_english_name = \
  345.                             re.findall(r'(?<=<div class="eredeti">).*?(?=\(Season)', str(sub_english))[0]
  346.                         sub_season = int(re.findall(r"(?<=Season )\d+(?=\))", str(sub_english))[0])
  347.                         sub_episode = int(video.episode)
  348.                 if isinstance(video, Movie):
  349.                     sub_english_name = re.findall(r'(?<=<div class="eredeti">).*?(?=\()', str(sub_english))[0]
  350.  
  351.                 sub_version = (str(sub_english).split('(')[len(str(sub_english).split('(')) - 1]).split(')')[0]
  352.                 # <small>Angol</small>
  353.                 lang = table.findAll("small")[0]
  354.                 sub_language = self.get_language(re.findall(r"(?<=<small>).*(?=</small>)", str(lang))[0])
  355.  
  356.                 # <a href="/index.php?action=letolt&amp;fnev=DCs Legends of Tomorrow - 03x11 - Here I Go Again.SVA.English.C.orig.Addic7ed.com.srt&amp;felirat=1519162191">
  357.                 link = str(table.findAll("a")[len(table.findAll("a")) - 1]).replace("amp;", "")
  358.                 sub_downloadlink = self.server_url + re.findall(r'(?<=href="/).*(?=">)', link)[0]
  359.  
  360.                 sub_id = re.findall(r"(?<=felirat\=).*(?=\"\>)", link)[0]
  361.                 sub_year = video.year
  362.                 sub_releases = [s.strip() for s in sub_version.split(',')]
  363.  
  364.                 # For episodes we open the series page so all subtitles imdb_id must be the same. no need to check all
  365.                 if isinstance(video, Episode) and series_imdb_id is not None:
  366.                     sub_imdb_id = series_imdb_id
  367.                 else:
  368.                     sub_imdb_id = self.find_imdb_id(sub_id)
  369.                     series_imdb_id = sub_imdb_id
  370.  
  371.                 subtitle = SuperSubtitlesSubtitle(sub_language, sub_downloadlink, sub_id, sub_english_name.strip(), sub_season,
  372.                                                   sub_episode, sub_version, sub_releases, sub_year, sub_imdb_id,
  373.                                                   asked_for_episode, asked_for_release_group=video.release_group )
  374.                 subtitles.append(subtitle)
  375.             i = i + 1
  376.         return subtitles
  377.  
  378.     def list_subtitles(self, video, languages):
  379.         if isinstance(video, Episode):
  380.             titles = [video.series] + video.alternative_series
  381.         elif isinstance(video, Movie):
  382.             titles = [video.title] + video.alternative_titles
  383.  
  384.         for title in titles:
  385.             subs = self.query(title, video=video)
  386.             if subs:
  387.                 return subs
  388.  
  389.             time.sleep(self.multi_result_throttle)
  390.             return []
  391.  
  392.     def download_subtitle(self, subtitle):
  393.  
  394.         # download as a zip
  395.         logger.info('Downloading subtitle %r', subtitle.subtitle_id)
  396.         r = self.session.get(subtitle.page_link, timeout=10)
  397.         r.raise_for_status()
  398.  
  399.         if ".rar" in subtitle.page_link:
  400.             logger.debug('Archive identified as rar')
  401.             archive_stream = io.BytesIO(r.content)
  402.             archive = RarFile(archive_stream)
  403.             subtitle.content = self.get_subtitle_from_archive(subtitle, archive)
  404.         elif ".zip" in subtitle.page_link:
  405.             logger.debug('Archive identified as zip')
  406.             archive_stream = io.BytesIO(r.content)
  407.             archive = ZipFile(archive_stream)
  408.             subtitle.content = self.get_subtitle_from_archive(subtitle, archive)
  409.         else:
  410.             subtitle.content = fix_line_ending(r.content)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement