Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #artetv_v1.py
- from amutil import AmUtil #mainly for genericUrlReader
- from enum import Enum
- import json
class Languages(Enum):
    """Languages a consumer can be built for.

    Each member's value is the language code substituted into the
    last-chance URL template of ArteTvConsumer.
    """
    English = "en"
    Portuguese = "pt"
    German = "de"
#class Languages


class ArteTvConsumer:
    """Reads the arte.tv "last chance" listing and stores the videos found.

    The listing page embeds its data as JSON inside a
    ``<script id="__NEXT_DATA__">`` element; this class downloads the page
    (through AmUtil.genericUrlReader), extracts that JSON and offers helpers
    to page through the listing, append the videos to a TSV file and pick
    the longest video.
    """

    # Pristine URL template; "%s" receives a language code. Never overwritten.
    _LAST_CHANCE_URL_TEMPLATE = "https://www.arte.tv/%s/videos/last-chance/"
    # Starts out as the raw template and is replaced by the formatted URL of
    # the most recently constructed instance (kept for backward compatibility
    # with code that reads the class attribute after building a consumer).
    BASE_LAST_CHANCE_URL = _LAST_CHANCE_URL_TEMPLATE
    LAST_CHANCE_JSON_SCRIPT_START = "<script id=\"__NEXT_DATA__\" type=\"application/json\">"
    LAST_CHANCE_JSON_SCRIPT_END = "</script>"
    # Characters that would break the one-record-per-line, tab-separated layout.
    _TSV_UNSAFE = str.maketrans("\t\r\n", "   ")

    def __init__(
        self,
        pLanguage = Languages.English
    ):
        """Build a consumer for the listing in the given language.

        pLanguage: a Languages member; anything else falls back to English.
        """
        if isinstance(pLanguage, Languages):
            # Use the code ("pt"), not the member itself, which would render
            # as "Languages.Portuguese" inside the URL.
            strLanguageCode = pLanguage.value
        else:
            strLanguageCode = Languages.English.value
        #if
        # Always format from the pristine template: applying "%" to an
        # already formatted URL raises TypeError on a second instantiation.
        self.baseLastChanceUrl = ArteTvConsumer._LAST_CHANCE_URL_TEMPLATE % strLanguageCode
        ArteTvConsumer.BASE_LAST_CHANCE_URL = self.baseLastChanceUrl
    #def __init__

    def __str__(self):
        """Return "BASE_URL= <url>" for this consumer's listing URL."""
        return "BASE_URL= {}".format(self.baseLastChanceUrl)
    #def __str__

    @staticmethod
    def _extractNextDataJson(pHtml:str):
        """Return the text between the __NEXT_DATA__ script tags, or None if either tag is missing."""
        _, strStartTag, strAfterStart = pHtml.partition(
            ArteTvConsumer.LAST_CHANCE_JSON_SCRIPT_START
        )
        if (not strStartTag):
            return None
        strPayload, strEndTag, _ = strAfterStart.partition(
            ArteTvConsumer.LAST_CHANCE_JSON_SCRIPT_END
        )
        if (not strEndTag):
            return None
        return strPayload
    #def _extractNextDataJson

    @staticmethod
    def _writeTextFile(pFileName:str, pText:str):
        """Replace the contents of pFileName with pText (UTF-8); the file is closed even on error."""
        with open(file=pFileName, mode="wt", encoding="UTF-8") as fw:
            fw.write(pText)
    #def _writeTextFile

    @staticmethod
    def getDictForLastChanceVideos(
        pUrl:str = BASE_LAST_CHANCE_URL,
        pOnlyVideos = True,
        pDebug = False
    ):
        """Download one listing page and return the data embedded in it.

        pUrl: page to read. The default is bound when the class is defined,
            i.e. it is the raw template; it is completed with the English
            code at call time.
        pOnlyVideos: True returns only
            ["props"]["pageProps"]["initialPage"]["zones"][0]["data"];
            False returns the whole decoded document.
        pDebug: True also writes what is returned to
            "example_videos_only.JSON" or "example_full.JSON".

        Returns None when the page could not be read or the JSON script
        element is not present. Errors from json.loads and from the key
        lookup are not caught.
        """
        if (pUrl == ArteTvConsumer._LAST_CHANCE_URL_TEMPLATE):
            pUrl = pUrl % Languages.English.value
        #if
        # NOTE(review): genericUrlReader presumably returns the page as text
        # and "" on failure - confirm against amutil.
        strHTML = AmUtil.genericUrlReader(pUrl)
        if (not strHTML):
            return None
        #if
        strRelevant = ArteTvConsumer._extractNextDataJson(strHTML)
        if (strRelevant is None):
            return None
        #if
        # strRelevant is JSON text describing the videos about to leave the site
        dictFullVideosLastChance = json.loads(strRelevant)
        if (not pOnlyVideos):
            if (pDebug):
                ArteTvConsumer._writeTextFile("example_full.JSON", strRelevant)
            #if
            return dictFullVideosLastChance
        #if
        dictVideosOnly = dictFullVideosLastChance["props"]["pageProps"]["initialPage"]["zones"][0]["data"]
        if (pDebug):
            ArteTvConsumer._writeTextFile(
                "example_videos_only.JSON",
                json.dumps(dictVideosOnly)
            )
        #if
        return dictVideosOnly
    #def getDictForLastChanceVideos

    def getLastChanceVideosNPagesDeep(
        self,
        pN = 2,
        pDebug = False
    )->list:
        """Collect the videos of listing pages 1..pN into one list.

        pN: number of pages to read; values below 1 read nothing.
        pDebug: True also writes the collected list, as JSON, to
            "list_of_all_videos_found.JSON".

        Pages that yield nothing (unreadable page, no embedded JSON) are
        skipped instead of aborting the whole run.
        """
        listVideos = []
        for iPage in range(1, pN + 1):
            # e.g. "https://www.arte.tv/en/videos/last-chance/?page=1"
            url = "%s?page=%d" % (self.baseLastChanceUrl, iPage)
            videosInCurrentPage = ArteTvConsumer.getDictForLastChanceVideos(url)
            if (videosInCurrentPage):
                listVideos.extend(videosInCurrentPage)
            #if
        #for
        if (pDebug):
            ArteTvConsumer._writeTextFile(
                "list_of_all_videos_found.JSON",
                json.dumps(listVideos)
            )
        #if
        return listVideos
    #def getLastChanceVideosNPagesDeep

    # TSV database layout: a single table, one video per line, five
    # tab-separated columns:
    #   programId <TAB> url <TAB> title <TAB> duration <TAB> ageRating

    @staticmethod
    def _tsvText(pValue)->str:
        """Render a text column; tabs and line breaks become spaces so they cannot split the record."""
        return str(pValue).translate(ArteTvConsumer._TSV_UNSAFE)
    #def _tsvText

    @staticmethod
    def _tsvNumber(pValue)->str:
        """Render a numeric column with "%d"; None becomes an empty cell instead of raising."""
        return "" if pValue is None else "%d" % pValue
    #def _tsvNumber

    @staticmethod
    def addVideosToTSVDatabase(
        pListVideos:list,
        pDatabaseName = "ARTE_TV_DB.TSV"
    ):
        """Append one TSV record per video to pDatabaseName and return that name.

        Each video must provide the keys "programId", "url", "title",
        "duration" and "ageRating" (KeyError otherwise). The file is opened
        in append mode, so existing records are kept.
        """
        # TODO: skip videos whose programId is already stored in the file
        with open(file=pDatabaseName, mode="at", encoding="UTF-8") as fw:
            for video in pListVideos:
                listColumns = [
                    ArteTvConsumer._tsvText(video["programId"]),
                    ArteTvConsumer._tsvText(video["url"]),
                    ArteTvConsumer._tsvText(video["title"]),
                    ArteTvConsumer._tsvNumber(video["duration"]),
                    ArteTvConsumer._tsvNumber(video["ageRating"]),
                ]
                fw.write("\t".join(listColumns) + "\n")
            #for all videos
        #with
        return pDatabaseName
    #def addVideosToTSVDatabase

    @staticmethod
    def findLongestVideo(
        pListOfVideos:list
    ):
        """Return the video with the greatest "duration".

        Videos whose duration is None are ignored; on ties the first one
        wins. Returns None when no video qualifies (e.g. an empty list).
        """
        videosWithDuration = (
            video for video in pListOfVideos if video["duration"] is not None
        )
        return max(
            videosWithDuration,
            key=lambda video: video["duration"],
            default=None
        )
    #def findLongestVideo
#class ArteTvConsumer
# Demo run: build a consumer with the default language and show its base URL.
tv = ArteTvConsumer()
print(tv)

# Collect listing pages 1 and 2; pDebug=True also dumps the collected list
# to "list_of_all_videos_found.JSON".
listAllVideosUpToPageNumber2 = tv.getLastChanceVideosNPagesDeep(pN=2, pDebug=True)

# Append the collected videos to the default TSV database and print its file name.
strDB = ArteTvConsumer.addVideosToTSVDatabase(listAllVideosUpToPageNumber2)
print(strDB)
Add Comment
Please, Sign In to add comment