Advertisement
Guest User

scraper.py

a guest
Jun 12th, 2020
118
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 6.92 KB | None | 0 0
  1. # -*- coding: utf-8 -*-
  2. # KodiAddon (CBC News)
  3. #
  4. from t1mlib import t1mAddon
  5. import datetime
  6. import json
  7. import re
  8. import urllib
  9. import urllib2
  10. import xbmcplugin
  11. import xbmcgui
  12. import HTMLParser
  13. import sys
  14. import xbmc
  15.  
  16. h = HTMLParser.HTMLParser()
  17. UTF8 = 'utf-8'
  18.  
  class myAddon(t1mAddon):
      """Scraper for the CBC web player (http://www.cbc.ca/player).

      Every ``getAddon*`` listing method receives the list built so far,
      appends entries through ``self.addMenuItem`` and returns the list.
      The mode code handed to ``addMenuItem`` ('GS', 'GC', 'GE', 'GV')
      presumably tells t1mlib which method serves the next level (shows,
      categories, episodes, video) -- confirm against t1mlib.
      """

      # Section headings that link to their own page; groups: (href, title).
      _LINKED_SECTION_RE = re.compile(
          '<h2 class="section-title"[^>]*><a[^>]* href="(.+?)">(.+?)</a>',
          re.DOTALL)
      # Section headings without a link; group: title.
      _PLAIN_SECTION_RE = re.compile(
          '<h2 class="section-title">([^<]+?)</h2>', re.DOTALL)
      # JSON blob the CBC pages embed in a <script> element.
      _INITIAL_STATE_RE = re.compile(
          r'window\.__INITIAL_STATE__ = (.+?);</script>', re.DOTALL)
      # URL (up to the query string) on the line after the 1280x720 variant.
      _STREAM_720P_RE = re.compile(
          r'RESOLUTION=1280x720.+?\n(http.+?)\?', re.DOTALL)

      # Marker stored in the video url when a clip has no captions.
      _NO_CAPTIONS = 'N0NE'

      # Plain section titles that are never listed, to avoid doubling up on
      # some shows (e.g. "Music" under the "Music" section).
      _SKIPPED_SECTIONS = ("More From CBC", "Digital Archives", "Music")

      # Shows appended to the scraped top-level menu.
      # Static strings should be converted to language string ids
      # (self.addon.getLocalizedString(<id>)) for an official Kodi add-on.
      # However, that lookup displayed an empty string on Kodi 17.6 Krypton,
      # perhaps because the language locale was not en_gb, so the titles
      # stay hard-coded.
      _EXTRA_SHOWS = (
          ('/player/news/TV%20Shows/MarketPlace', 'Marketplace'),  # id 30001
          ('/player/news/TV%20Shows/Power%20&%20Politics',
           'Power & Politics'),  # id 30003
          ('/player/news/TV%20Shows/The%20Fifth%20Estate',
           'The Fifth Estate'),  # id 30004
          ('/player/news/TV%20Shows/The%20National/Latest%20Broadcast',
           'The National'),  # id 30005
          ('/player/news/TV%20Shows/The%20Weekly', 'The Weekly'),  # id 30006
      )

      # Manual fixes for multi-word locations: the player path differs from
      # the path derived from the news-site region link.
      _REGION_PATH_FIXES = {
          '/player/news/canada/british-columbia': '/player/news/canada/bc',
          '/player/news/canada/thunder-bay': '/player/news/canada/thunder%20bay',
          '/player/news/canada/new-brunswick': '/player/news/canada/nb',
          '/player/news/canada/prince-edward-island': '/player/news/canada/pei',
          '/player/news/canada/nova-scotia': '/player/news/canada/ns',
          '/player/news/canada/newfoundland-labrador': '/player/news/canada/nl',
      }

      @staticmethod
      def _showInfo(name):
          """Return the info-label dict used for a folder entry titled name."""
          return {'mediatype': 'tvshow', 'Title': name, 'TVShowTitle': name}

      @staticmethod
      def _asciiText(text):
          """Return text as ASCII.

          Curly single quotes become "'", em dashes become "--" and any other
          non-ASCII character becomes an XML character reference.  A missing
          (None/empty) text yields an empty string.
          """
          text = text or u''
          for fancy, plain in ((u"\u2018", "'"), (u"\u2019", "'"),
                               (u"\u2014", "--")):
              text = text.replace(fancy, plain)
          return text.encode('ascii', 'xmlcharrefreplace')

      @classmethod
      def _initialState(cls, html):
          """Return the decoded ``window.__INITIAL_STATE__`` JSON of a page.

          Returns None when the page does not embed the blob.  Malformed JSON
          still raises ValueError from ``json.loads``.
          """
          match = cls._INITIAL_STATE_RE.search(html)
          if match is None:
              return None
          return json.loads(match.group(1))

      def getAddonMenu(self, url, ilist):
          """Build the top-level menu.

          Lists every linked section heading of the player home page followed
          by the hard-coded extra shows, each as a 'GS' folder.

          url   -- not used by this method.
          ilist -- list to extend; the updated list is returned.
          """
          html = self.getRequest('http://www.cbc.ca/player')
          shows = self._LINKED_SECTION_RE.findall(html)
          shows.extend(self._EXTRA_SHOWS)
          for lurl, name in shows:
              ilist = self.addMenuItem(name, 'GS', ilist, lurl,
                                       self.addonIcon, self.addonFanart,
                                       self._showInfo(name), isFolder=True)
          return ilist

      def getAddonCats(self, url, ilist):
          """List the local news regions as 'GE' folders.

          The regions are read from ``regions.regionList`` in the state blob
          of http://www.cbc.ca/news/local; each entry supplies 'title' and
          'link'.

          url   -- not used by this method.
          ilist -- list to extend; the updated list is returned (unchanged
                   when the page carries no state blob).
          """
          state = self._initialState(
              self.getRequest('http://www.cbc.ca/news/local'))
          if state is None:
              return ilist
          regions = state['regions']['regionList']
          # Ottawa is inserted by hand after Toronto, but only when the
          # region list does not already provide it (avoids a duplicate).
          hasOttawa = any(region['link'] == '/news/canada/ottawa'
                          for region in regions)
          for region in regions:
              name = region['title']
              lurl = "/player" + region['link']
              lurl = self._REGION_PATH_FIXES.get(lurl, lurl)
              ilist = self.addMenuItem(name, 'GE', ilist, lurl,
                                       self.addonIcon, self.addonFanart,
                                       self._showInfo(name), isFolder=True)
              # lurl always carries the '/player' prefix, so the comparison
              # has to use the prefixed path to ever match.
              if lurl == '/player/news/canada/toronto' and not hasOttawa:
                  ilist = self.addMenuItem('Ottawa', 'GE', ilist,
                                           '/player/news/canada/ottawa',
                                           self.addonIcon, self.addonFanart,
                                           self._showInfo('Ottawa'),
                                           isFolder=True)
          return ilist

      def getAddonShows(self, url, ilist):
          """List the sections found on the player page at url.

          Headings without a link are listed as 'GE' folders pointing back at
          url (skipped entirely for 'TV%20Shows' urls).  If the page has no
          linked headings, or url is a 'TV%20Shows' url, the page's episodes
          are listed instead; otherwise every linked heading becomes a folder
          ('GC' for '/player/news/canada', 'GE' for the rest, "LIVE" is
          omitted).

          url   -- site-relative path, appended to http://www.cbc.ca.
          ilist -- list to extend; the updated list is returned.
          """
          html = self.getRequest('http://www.cbc.ca%s' % url)
          isTvShows = 'TV%20Shows' in url

          # Parse shows for this section (no href link)
          if not isTvShows:
              for name in self._PLAIN_SECTION_RE.findall(html):
                  if name in self._SKIPPED_SECTIONS:
                      continue
                  ilist = self.addMenuItem(name, 'GE', ilist, url,
                                           self.addonIcon, self.addonFanart,
                                           self._showInfo(name),
                                           isFolder=True)

          # Parse shows on other pages (have href link)
          shows = self._LINKED_SECTION_RE.findall(html)
          if not shows or isTvShows:
              # Keep the list returned by the episode listing instead of
              # relying on ilist having been modified in place.
              return self.getAddonEpisodes(url, ilist)

          for lurl, name in shows:
              name = name.replace("&#x27;", "'")
              lurl = lurl.replace(' ', '%20')
              if lurl == '/player/news/canada':
                  mode = 'GC'
              elif name != "LIVE":
                  mode = 'GE'
              else:
                  continue
              ilist = self.addMenuItem(name, mode, ilist, lurl,
                                       self.addonIcon, self.addonFanart,
                                       self._showInfo(name), isFolder=True)
          return ilist

      def getAddonEpisodes(self, url, ilist):
          """List the clips of the category named by the last segment of url.

          The clips come from ``video.clipsByCategory`` in the page's state
          blob; the category key is the one that ends with the url's last
          path segment ('%20' read as a space, case ignored).  Each clip is
          added as a playable 'GV' item whose url is '<id>|<captions src>',
          or '<id>|N0NE' when the clip has no captions.

          url   -- site-relative path, appended to http://www.cbc.ca.
          ilist -- list to extend; the updated list is returned (unchanged
                   when the page has no state blob or no matching category).
          """
          self.defaultVidStream['width'] = 1280
          self.defaultVidStream['height'] = 720
          cat = url.rstrip('/').rsplit('/', 1)[-1].replace('%20', ' ').lower()
          state = self._initialState(
              self.getRequest('http://www.cbc.ca%s' % url))
          if state is None:
              return ilist
          clipsByCategory = state['video']['clipsByCategory']

          # Locate exact category name: it must be at the end of the key.
          # A plain suffix test is used so that characters such as '&' or
          # '(' in the name are never interpreted as regex syntax.
          idxcat = None
          for key in clipsByCategory:
              if cat and key.lower().endswith(cat):
                  idxcat = key
          if idxcat is None:
              return ilist

          for clip in clipsByCategory[idxcat]['items']:
              name = self._asciiText(clip['title'])
              plot = self._asciiText(clip.get('description'))
              thumb = clip['thumbnail']
              captions = clip.get('captions')
              if captions:
                  captionUrl = captions['src']
              else:
                  captionUrl = self._NO_CAPTIONS
              # mediaID and caption url travel together in one string.
              vurl = str(clip['id']) + '|' + str(captionUrl)
              infoList = self._showInfo(name)
              infoList['Plot'] = plot
              infoList['Duration'] = clip['duration']
              airDate = clip.get('airDate')
              if airDate is not None:
                  # airDate is divided by 1000 before use, i.e. it is
                  # treated as a millisecond timestamp.
                  infoList['Aired'] = datetime.datetime.fromtimestamp(
                      airDate / 1000).strftime('%Y-%m-%d')
              ilist = self.addMenuItem(name, 'GV', ilist, vurl, thumb, thumb,
                                       infoList, isFolder=False)
          return ilist

      def getAddonVideo(self, url):
          """Resolve a clip to its 1280x720 stream and hand it to Kodi.

          url -- '<mediaID>|<captions url>' as built by getAddonEpisodes; the
                 captions part is 'N0NE' (or absent) when there are none.

          Returns None.  When the manifest has no 1280x720 variant the method
          returns without resolving anything.
          """
          mediaId, _, captions = url.partition('|')
          captions = captions or self._NO_CAPTIONS
          u = ('https://link.theplatform.com/s/ExhSPC/media/guid/2655402169/'
               + mediaId
               + '/meta.smil'
               + '?mbr=true&manifest=m3u&feed=Player%20Selector%20-%20Prod')
          html = self.getRequest(u)
          # Test the match itself: calling .group() on a failed search would
          # raise before any "not found" check could run.
          match = self._STREAM_720P_RE.search(html)
          if match is None:
              return
          liz = xbmcgui.ListItem(path=match.group(1).strip())
          if captions != self._NO_CAPTIONS:
              liz.setSubtitles([captions])
          xbmcplugin.setResolvedUrl(int(sys.argv[1]), True, liz)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement