Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #artetv_v1.py
- from amutil import AmUtil #mainly for genericUrlReader
- from enum import Enum
- import json
- from artetvvideo import ArteTvVideo
class Languages(Enum):
    """Language codes that can be substituted into the arte.tv listing URL."""
    English = "en"
    Portuguese = "pt"
    German = "de"
    French = "fr"
    Spanish = "es"
#class Languages


class ArteTvConsumer:
    """Reads the arte.tv "last chance" listing and keeps videos in a TSV file.

    Each instance is bound to one language; the language-specific listing URL
    is stored on the instance, so any number of consumers can coexist.
    """

    # Template for the listing URL; "%s" receives the language code.
    LAST_CHANCE_URL_TEMPLATE = "https://www.arte.tv/%s/videos/last-chance/"
    # Ready-to-use listing URL for the default language (English). It is no
    # longer rewritten by __init__, which used to destroy the "%s" placeholder
    # and made a second instantiation fail with TypeError.
    BASE_LAST_CHANCE_URL = LAST_CHANCE_URL_TEMPLATE % Languages.English.value
    # Markers that delimit the JSON document embedded in the listing HTML.
    LAST_CHANCE_JSON_SCRIPT_START = "<script id=\"__NEXT_DATA__\" type=\"application/json\">"
    LAST_CHANCE_JSON_SCRIPT_END = "</script>"

    def __init__(
        self,
        pLanguage = Languages.English
    ):
        """Bind this consumer to the listing URL of pLanguage (a Languages member)."""
        self.mBaseLastChanceUrl = ArteTvConsumer.LAST_CHANCE_URL_TEMPLATE % (pLanguage.value)
    #def __init__

    def __str__(self):
        """Return "BASE_URL= <listing URL of this consumer>"."""
        return "BASE_URL= {}".format(self.mBaseLastChanceUrl)
    #def __str__

    @staticmethod
    def _extractEmbeddedJson(pHtml):
        """Return the text between the script start/end markers, or None if absent."""
        if not pHtml:
            return None
        startMarker = ArteTvConsumer.LAST_CHANCE_JSON_SCRIPT_START
        iMarker = pHtml.find(startMarker)
        if iMarker == -1:
            return None
        iStart = iMarker + len(startMarker)
        # Search for the end marker only after the start marker.
        iEnd = pHtml.find(ArteTvConsumer.LAST_CHANCE_JSON_SCRIPT_END, iStart)
        if iEnd == -1:
            return None
        return pHtml[iStart:iEnd]
    #def _extractEmbeddedJson

    @staticmethod
    def _writeTextFile(pFileName, pText):
        """Write pText to pFileName (UTF-8), replacing any previous content."""
        with open(file=pFileName, mode="wt", encoding="UTF-8") as fw:
            fw.write(pText)
    #def _writeTextFile

    @staticmethod
    def _tsvField(pValue):
        """Return pValue as text with tabs/line breaks replaced by spaces.

        A tab or newline inside a field would split the record and make the
        readers below discard it (they require exactly 5 fields per line).
        """
        return str(pValue).replace("\t", " ").replace("\r", " ").replace("\n", " ")
    #def _tsvField

    @staticmethod
    def getDictForLastChanceVideos(
        pUrl:str = BASE_LAST_CHANCE_URL,
        pOnlyVideos = True,
        pDebug = False
    ):
        """Download a listing page and decode the JSON document embedded in it.

        pUrl        -- page to read; defaults to the English listing URL.
        pOnlyVideos -- True: return only
                       ["props"]["pageProps"]["initialPage"]["zones"][0]["data"]
                       (presumably the collection of video entries);
                       False: return the whole decoded document.
        pDebug      -- also dump what is returned to "example_videos_only.JSON"
                       or "example_full.JSON" in the working directory.

        Returns None when the page could not be read or the script markers are
        missing. Errors from json.loads, or KeyError/IndexError when the
        expected structure is absent, are propagated to the caller.
        """
        # An empty result from the reader is treated as a failed download.
        strHTML = AmUtil.genericUrlReader(pUrl)
        strRelevant = ArteTvConsumer._extractEmbeddedJson(strHTML)
        if strRelevant is None:
            return None

        dictFullVideosLastChance = json.loads(strRelevant)
        if pOnlyVideos:
            videosOnly = dictFullVideosLastChance["props"]["pageProps"]["initialPage"]["zones"][0]["data"]
            if pDebug:
                ArteTvConsumer._writeTextFile("example_videos_only.JSON", json.dumps(videosOnly))
            return videosOnly

        if pDebug:
            ArteTvConsumer._writeTextFile("example_full.JSON", strRelevant)
        # The original evaluated this name without returning it.
        return dictFullVideosLastChance
    #def getDictForLastChanceVideos

    def getLastChanceVideosNPagesDeep(
        self,
        pN = 2,
        pDebug = False
    )->list:
        """Collect the videos of listing pages 1..pN for this consumer's language.

        Pages that could not be read or parsed (None result) are skipped.
        With pN < 1 no page is requested and an empty list is returned.
        When pDebug is true the collected list is also written, as JSON, to
        "list_of_all_videos_found.JSON".
        """
        listVideos = []
        for iPage in range(1, pN + 1):
            # e.g. "https://www.arte.tv/en/videos/last-chance/?page=1"
            url = self.mBaseLastChanceUrl + "?page=%d" % (iPage)
            videosInCurrentPage = ArteTvConsumer.getDictForLastChanceVideos(url)
            if videosInCurrentPage:
                listVideos.extend(videosInCurrentPage)
        #for all pages

        if pDebug:
            ArteTvConsumer._writeTextFile("list_of_all_videos_found.JSON", json.dumps(listVideos))
        return listVideos
    #def getLastChanceVideosNPagesDeep

    # TSV "database": a single table (tVideos), one video per line:
    #   programId \t url \t title \t duration \t ageRating \n

    @staticmethod
    def addVideosToTSVDatabase(
        pListVideos:list,
        pDatabaseName = "ARTE_TV_DB.TSV"
    ):
        """Append one TSV record per video to pDatabaseName and return that name.

        Every video must provide "programId", "url", "title", "duration" and
        "ageRating"; the last two are written with "%d", so they must be
        numbers. The file is opened in append mode: existing records are kept
        and no duplicate detection is performed.
        """
        with open(file=pDatabaseName, mode="at", encoding="UTF-8") as fw:
            for video in pListVideos:
                strRecordTSV = "%s\t%s\t%s\t%d\t%d\n" % (
                    ArteTvConsumer._tsvField(video["programId"]),
                    ArteTvConsumer._tsvField(video["url"]),
                    ArteTvConsumer._tsvField(video["title"]),
                    video["duration"],
                    video["ageRating"]
                )
                fw.write(strRecordTSV)
            #for all videos
        return pDatabaseName
    #def addVideosToTSVDatabase

    @staticmethod
    def findLongestVideo(
        pListOfVideos:list
    ):
        """Return the video with the greatest "duration" (first one on ties).

        Returns None for an empty list (the original raised TypeError).
        """
        if not pListOfVideos:
            return None
        # max() keeps the first maximal element, like the original strict ">" scan.
        return max(pListOfVideos, key=lambda video: video["duration"])
    #def findLongestVideo

    @staticmethod
    def _readTsvRecords(pTsvFile:str)->list:
        """Parse pTsvFile into (programId, url, title, duration, ageRating) tuples.

        Lines that do not have exactly 5 tab-separated fields are ignored;
        duration and ageRating are converted to int. If the file cannot be
        opened/decoded, or a number cannot be parsed, a message is printed and
        an empty list is returned (all-or-nothing, as in the original).
        """
        try:
            with open(file=pTsvFile, encoding="UTF-8", mode="rt") as fr:
                strAll = fr.read()
            listRecords = []
            for record in strAll.split("\n"):
                aFields = record.split("\t")
                if len(aFields) == 5:
                    listRecords.append(
                        (aFields[0], aFields[1], aFields[2], int(aFields[3]), int(aFields[4]))
                    )
            return listRecords
        except (OSError, ValueError):
            # OSError: missing/unreadable file; ValueError: bad number or bad encoding.
            print("Could not read file ", pTsvFile)
            return []
    #def _readTsvRecords

    @staticmethod
    def readVideosFromTsvFile(pTsvFile:str)->list:
        """Load the TSV file as a list of dicts.

        Keys: "programId", "url", "title" (str), "duration", "ageRating" (int).
        Returns [] (after printing a message) when the file cannot be read.
        """
        return [
            {
                "programId": programId,
                "url": url,
                "title": title,
                "duration": duration,
                "ageRating": ageRating
            }
            for (programId, url, title, duration, ageRating)
            in ArteTvConsumer._readTsvRecords(pTsvFile)
        ]
    #def readVideosFromTsvFile

    @staticmethod
    def readArteTvVideosFromTsvFile(pTsvFile: str) -> list:
        """Load the TSV file as a list of ArteTvVideo objects.

        Returns [] (after printing a message) when the file cannot be read.
        """
        return [
            ArteTvVideo(programId, url, title, duration, ageRating)
            for (programId, url, title, duration, ageRating)
            in ArteTvConsumer._readTsvRecords(pTsvFile)
        ]
    #def readArteTvVideosFromTsvFile

    @staticmethod
    def sortListOfVideosByDuration(pListOfVideos:list, pReverse=False):
        """Sort pListOfVideos in place by duration (descending when pReverse)."""
        # byDuration is the module-level key function that extracts the sort criterion.
        pListOfVideos.sort(key=byDuration, reverse=pReverse)
    #def sortListOfVideosByDuration

    @staticmethod
    def sortListOfVideosByTitle(pListOfVideos:list, pReverse=False):
        """Sort pListOfVideos in place by title (descending when pReverse)."""
        # byTitle is the module-level key function that extracts the sort criterion.
        pListOfVideos.sort(key=byTitle, reverse=pReverse)
    #def sortListOfVideosByTitle
#class ArteTvConsumer
# Demo: create a consumer for the default language and show its base URL.
tv = ArteTvConsumer()
print(str(tv))

# Further usage examples, kept disabled:
#   dictVideos = ArteTvConsumer.getDictForLastChanceVideos(pDebug=True)  # pDebug also writes an example JSON file
#   dictVideos = ArteTvConsumer.getDictForLastChanceVideos()
#   print(dictVideos)
#   listAllVideosUpToPageNumber2 = tv.getLastChanceVideosNPagesDeep(pN=2, pDebug=True)
#   print(listAllVideosUpToPageNumber2)
#   strDB = ArteTvConsumer.addVideosToTSVDatabase(listAllVideosUpToPageNumber2)
#   print(strDB)
#   oLongestVideo = ArteTvConsumer.findLongestVideo(listAllVideosUpToPageNumber2)
#   print(oLongestVideo)
def byDuration(pAnyVideo):
    """Sort key: return the duration of pAnyVideo as an int.

    Accepts both representations used in this file: objects exposing an
    mDuration attribute (ArteTvVideo) and dict-like videos with a "duration"
    entry. The original only supported the dict form and raised
    AttributeError for ArteTvVideo objects.
    """
    if hasattr(pAnyVideo, "mDuration"):
        return int(pAnyVideo.mDuration)
    return int(pAnyVideo.get("duration"))
#def byDuration
def byTitle(pAnyVideo):
    """Sort key: return the title of pAnyVideo.

    Accepts both representations used in this file: objects exposing an
    mTitle attribute (ArteTvVideo) and dict-like videos with a "title" entry
    (None when the entry is missing). The original only supported the dict
    form and raised AttributeError for ArteTvVideo objects.
    """
    if hasattr(pAnyVideo, "mTitle"):
        return pAnyVideo.mTitle
    return pAnyVideo.get("title")
#def byTitle

# Disabled example: load the saved videos as dicts and sort them in place.
#   listOfVideosPreviouslySaved = ArteTvConsumer.readVideosFromTsvFile("ARTE_TV_DB.TSV")
#   print ("Not sorted: ")
#   print (listOfVideosPreviouslySaved)
#   ArteTvConsumer.sortListOfVideosByDuration(listOfVideosPreviouslySaved, pReverse=True)
#   print ("Sorted by duration: ")
#   print (listOfVideosPreviouslySaved)
#   print ("__"*40)
#   ArteTvConsumer.sortListOfVideosByTitle(listOfVideosPreviouslySaved)
#   print ("Sorted by title: ")
#   print (listOfVideosPreviouslySaved)

# Load the saved videos as ArteTvVideo objects and print the whole list.
listOfArteTvVideos = ArteTvConsumer.readArteTvVideosFromTsvFile(
    "ARTE_TV_DB.TSV"
)
print(listOfArteTvVideos)
- ********************
class ArteTvVideo:
    """One video record: program id, URL, title, duration and age rating.

    Comparison semantics are kept from the original: equality looks only at
    the title, while "<" and ">" look only at the duration.
    """

    def __init__(self, pProgramId, pUrl, pTitle, pDuration, pAgeRating):
        """Store the five fields; duration and age rating must be numbers
        because __str__ formats them with "%d"."""
        self.mProgramId = pProgramId
        self.mUrl = pUrl
        self.mTitle = pTitle
        self.mDuration = pDuration
        self.mAgeRating = pAgeRating
    #def __init__

    #for print of a SINGLE video
    def __str__(self):
        """Return a multi-line description followed by a line of 80 dashes."""
        return "program id: %s\nURL: %s\nTitle: %s\nDuration: %d\nAge rating: %d\n%s\n" % (
            self.mProgramId,
            self.mUrl,
            self.mTitle,
            self.mDuration,
            self.mAgeRating,
            "-" * 80
        )
    #def __str__

    # for print of collections of videos
    def __repr__(self):
        """Same text as __str__, so lists of videos print readably."""
        return self.__str__()

    def __eq__(self, other):
        """Two videos are equal when their titles are equal."""
        if not isinstance(other, ArteTvVideo):
            # Let Python try the reflected comparison / fall back to "not equal"
            # instead of failing with AttributeError on foreign objects.
            return NotImplemented
        return self.mTitle == other.mTitle

    # Defining __eq__ without __hash__ already makes instances unhashable;
    # stated explicitly because the fields are mutable.
    __hash__ = None

    def __gt__(self, other):
        """A video is greater than another when its duration is longer."""
        if not isinstance(other, ArteTvVideo):
            return NotImplemented
        return self.mDuration > other.mDuration

    def __lt__(self, v2):
        """A video is less than another when its duration is shorter."""
        if not isinstance(v2, ArteTvVideo):
            return NotImplemented
        return self.mDuration < v2.mDuration
#class ArteTvVideo
Add Comment
Please, Sign In to add comment