am_dot_com

FP 2021-12-21

Dec 21st, 2021 (edited)
71
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 12.14 KB | None | 0 0
  1. #artetv_v1.py
  2. from amutil import AmUtil #mainly for genericUrlReader
  3.  
  4. from enum import Enum
  5. import json
  6.  
  7. from artetvvideo import ArteTvVideo
  8.  
  9. class Languages(Enum):
  10. English = "en"
  11. Portuguese = "pt"
  12. German = "de"
  13. French = "fr"
  14. Spanish = "es"
  15. #class Languages
  16.  
  17. class ArteTvConsumer:
  18.  
  19. BASE_LAST_CHANCE_URL = "https://www.arte.tv/%s/videos/last-chance/"
  20.  
  21. LAST_CHANCE_JSON_SCRIPT_START = "<script id=\"__NEXT_DATA__\" type=\"application/json\">"
  22. LAST_CHANCE_JSON_SCRIPT_END = "</script>"
  23.  
  24. def __init__(
  25. self,
  26. pLanguage = Languages.English
  27. ):
  28. ArteTvConsumer.BASE_LAST_CHANCE_URL = ArteTvConsumer.BASE_LAST_CHANCE_URL % (pLanguage.value)
  29. #def __init__
  30.  
  31. def __str__(self):
  32. strAll = "BASE_URL= {}".format(ArteTvConsumer.BASE_LAST_CHANCE_URL)
  33. return strAll
  34. #def __str__
  35.  
  36. #classmethod
  37. @staticmethod
  38. def getDictForLastChanceVideos(
  39. pUrl:str = BASE_LAST_CHANCE_URL,
  40. pOnlyVideos = True,
  41. pDebug = False
  42. ):
  43. strHTML = AmUtil.genericUrlReader(pUrl)
  44. bSuccessOnAccess = strHTML!=""
  45. if (bSuccessOnAccess):
  46. iStartRelevantPortion = strHTML.find(ArteTvConsumer.LAST_CHANCE_JSON_SCRIPT_START)
  47. bStartExists = iStartRelevantPortion!=-1
  48. if (bStartExists):
  49. strRelevant = strHTML[
  50. iStartRelevantPortion + len(ArteTvConsumer.LAST_CHANCE_JSON_SCRIPT_START)
  51. :
  52. ]
  53.  
  54. #"banana".find("batata") #-1
  55. iEndExists = strRelevant.find(ArteTvConsumer.LAST_CHANCE_JSON_SCRIPT_END)
  56. bEndExists = iEndExists!=-1
  57. if (bEndExists):
  58. strRelevant = strRelevant[
  59. 0
  60. :
  61. iEndExists
  62. ]
  63. #strRelevant is a string, in the JSON notation, that corresponds to a dict describing videos, about to disappear from the site
  64. dictFullVideosLastChance = json.loads(strRelevant)
  65. dictVideosOnly = dictFullVideosLastChance["props"]["pageProps"]["initialPage"]["zones"][0]["data"]
  66. if (pOnlyVideos):
  67. if (pDebug):
  68. strVideosOnly = json.dumps(dictVideosOnly)
  69. fw = open(
  70. file="example_videos_only.JSON",
  71. mode="wt",
  72. encoding="UTF-8"
  73. )
  74. fw.write(strVideosOnly)
  75. fw.close()
  76. return dictVideosOnly
  77. else:
  78. if (pDebug):
  79. fw = open(
  80. file="example_full.JSON",
  81. mode="wt",
  82. encoding="UTF-8"
  83. )
  84. fw.write(strRelevant)
  85. fw.close()
  86. dictFullVideosLastChance
  87. #if there is an ending
  88. #if there is a proper start
  89. #if there was HTML access
  90. #def getDictForLastChanceVideos
  91.  
  92. def getLastChanceVideosNPagesDeep(
  93. self,
  94. pN = 2,
  95. pDebug = False
  96. )->list:
  97. #listVideos = list()
  98. listVideos = []
  99.  
  100. bCompleted = False
  101. iPage = 1
  102. url = ArteTvConsumer.BASE_LAST_CHANCE_URL+"?page=%d"%(iPage) #"https://www.arte.tv/en/videos/last-chance/?page=1" .. "https://www.arte.tv/en/videos/last-chance/?page=10"
  103. while (not bCompleted):
  104.  
  105. dictVideosInCurrentPage = ArteTvConsumer.getDictForLastChanceVideos(url)
  106.  
  107. for video in dictVideosInCurrentPage:
  108. listVideos.append(video)
  109.  
  110. #TODO listVideos must grow, must incorporate the found videos
  111.  
  112. iPage+=1
  113. url = ArteTvConsumer.BASE_LAST_CHANCE_URL + "?page=%d" % (iPage)
  114. bCompleted = iPage>pN
  115. #while
  116.  
  117. if (pDebug):
  118. fw = open (
  119. file="list_of_all_videos_found.JSON",
  120. mode="wt",
  121. encoding="UTF-8"
  122. )
  123. #fw.write(json.dumps(listVideos))
  124. strJson = json.dumps(listVideos)
  125. fw.write(strJson)
  126. fw.close()
  127. #if
  128.  
  129. return listVideos
  130. #def getLastChanceVideosNPagesDeep
  131.  
  132. """
  133. single table
  134. tVideos
  135. video (programId\turl, title, duration, ageRating)
  136. programId\turl\ttitle\tduration\tageRating\n
  137. ...
  138. record2\n
  139. record3\n
  140. """
  141. @staticmethod
  142. def addVideosToTSVDatabase(
  143. pListVideos:list,
  144. pDatabaseName = "ARTE_TV_DB.TSV"
  145. ):
  146. fw = open(
  147. file=pDatabaseName,
  148. #mode="wt" #destructive write/replace text
  149. mode="at", #non-destructive append text
  150. encoding="UTF-8"
  151. )
  152. for video in pListVideos:
  153. programId = video["programId"]
  154. url = video["url"]
  155. title = video["title"]
  156. duration = video["duration"]
  157. ageRating = video["ageRating"]
  158.  
  159. #bRecordAlreadyExists = findVideoInTSVDatabase(pDatabaseName, programId)
  160. bRecordAlreadyExists = False
  161.  
  162. if (not bRecordAlreadyExists):
  163. strRecordTSV = "%s\t%s\t%s\t%d\t%d\n"%(
  164. programId,
  165. url,
  166. title,
  167. duration,
  168. ageRating
  169. )
  170. fw.write(strRecordTSV)
  171. #if
  172. #for all videos
  173. fw.close()
  174. return pDatabaseName
  175. #def addVideosToTSVDatabase
  176.  
  177. @staticmethod
  178. def findLongestVideo(
  179. pListOfVideos:list
  180. ):
  181. iLongestDuration = None
  182. idxLongestVideo = None
  183.  
  184. #idx 0 .. 39
  185. for idx in range(len(pListOfVideos)):
  186. video = pListOfVideos[idx]
  187. duration = video["duration"]
  188. if (iLongestDuration==None or duration>iLongestDuration):
  189. iLongestDuration = duration
  190. idxLongestVideo = idx
  191. #if
  192. #for
  193.  
  194. oLongestVideo = pListOfVideos[idxLongestVideo]
  195. return oLongestVideo
  196. #def findLongestVideo
  197.  
  198. @staticmethod
  199. def readVideosFromTsvFile(pTsvFile:str)->list:
  200. listRet = []
  201. try:
  202. fr = open(
  203. file=pTsvFile,
  204. encoding="UTF-8",
  205. mode="rt",
  206. )
  207. strAll = fr.read()
  208. fr.close()
  209.  
  210. if (strAll!=""):
  211. aRecords = strAll.split("\n")
  212. for record in aRecords:
  213. aFields = record.split("\t")
  214. bValidRecord:bool = len(aFields)==5
  215. if (bValidRecord):
  216. dictVideo = dict()
  217. dictVideo["programId"] = aFields[0]
  218. dictVideo["url"] = aFields[1]
  219. dictVideo["title"] = aFields[2]
  220. #dictVideo["duration"] = aFields[3] #duration will be str
  221. #dictVideo["ageRating"] = aFields[4] #ageRating will be str
  222. dictVideo["duration"] = int(aFields[3]) #duration will be int
  223. dictVideo["ageRating"] = int(aFields[4]) #ageRating will be int
  224. listRet.append(dictVideo)
  225. #if valid record
  226. #for all records
  227. #if there was content
  228. #try
  229. except:
  230. print("Could not read file ", pTsvFile)
  231. return []
  232. #try-except
  233.  
  234. return listRet
  235. #def readVideosFromTsvFile
  236.  
  237. @staticmethod
  238. def readArteTvVideosFromTsvFile(pTsvFile: str) -> list:
  239. listRet = []
  240. try:
  241. fr = open(
  242. file=pTsvFile,
  243. encoding="UTF-8",
  244. mode="rt",
  245. )
  246. strAll = fr.read()
  247. fr.close()
  248.  
  249. if (strAll != ""):
  250. aRecords = strAll.split("\n")
  251. for record in aRecords:
  252. aFields = record.split("\t")
  253. bValidRecord: bool = len(aFields) == 5
  254. if (bValidRecord):
  255. oArteTvVideo = ArteTvVideo(
  256. aFields[0], #programId
  257. aFields[1], #url
  258. aFields[2], #title
  259. int(aFields[3]), #duration
  260. int(aFields[4]) #age rating
  261. )
  262.  
  263. listRet.append(oArteTvVideo)
  264. # if valid record
  265. # for all records
  266. # if there was content
  267. # try
  268. except:
  269. print("Could not read file ", pTsvFile)
  270. return []
  271. # try-except
  272.  
  273. return listRet
  274. # def readArteTvVideosFromTsvFile
  275.  
  276. @staticmethod
  277. def sortListOfVideosByDuration(pListOfVideos:list, pReverse=False):
  278. pListOfVideos.sort(key=byDuration, reverse=pReverse) #key should be the name of a function that extracts the data that will be used as the criterion (critério) / criteria (critérios) for ascending sorting
  279. #def sortListOfVideosByDuration
  280.  
  281. @staticmethod
  282. def sortListOfVideosByTitle(pListOfVideos:list, pReverse=False):
  283. pListOfVideos.sort(key=byTitle, reverse=pReverse)
  284. #def sortListOfVideosByTitle
  285. #class ArteTvConsumer
  286.  
  287. tv = ArteTvConsumer()
  288. #print (ArteTvConsumer.BASE_URL)
  289. print(tv)
  290.  
  291. #tv.getDictForLastChanceVideos() #would be possible for classmethod
  292. #dictVideos = ArteTvConsumer.getDictForLastChanceVideos(pDebug=True) #will also write an "example.json" file
  293. #dictVideos = ArteTvConsumer.getDictForLastChanceVideos() #will also write an "example.json" file
  294. #print(dictVideos)
  295. #listAllVideosUpToPageNumber2 = tv.getLastChanceVideosNPagesDeep(pN=2,pDebug=True)
  296. #print(listAllVideosUpToPageNumber2)
  297. #strDB = ArteTvConsumer.addVideosToTSVDatabase(listAllVideosUpToPageNumber2)
  298. #print (strDB)
  299. #oLongestVideo = ArteTvConsumer.findLongestVideo(listAllVideosUpToPageNumber2)
  300. #print (oLongestVideo)
  301.  
  302. def byDuration(pAnyVideo):
  303. #return pAnyVideo["duration"]
  304. #return pAnyVideo.get("duration")
  305. return int(pAnyVideo.get("duration"))
  306. #def byDuration
  307.  
  308. def byTitle(pAnyVideo):
  309. return pAnyVideo.get("title")
  310. #def
  311.  
  312. """
  313. listOfVideosPreviouslySaved = ArteTvConsumer.readVideosFromTsvFile("ARTE_TV_DB.TSV")
  314. print ("Not sorted: ")
  315. print (listOfVideosPreviouslySaved)
  316.  
  317. ArteTvConsumer.sortListOfVideosByDuration(listOfVideosPreviouslySaved, pReverse=True)
  318. print ("Sorted by duration: ")
  319. print (listOfVideosPreviouslySaved)
  320.  
  321. print ("__"*40)
  322. ArteTvConsumer.sortListOfVideosByTitle(listOfVideosPreviouslySaved)
  323. print ("Sorted by title: ")
  324. print (listOfVideosPreviouslySaved)
  325. """
  326.  
  327. listOfArteTvVideos = ArteTvConsumer.readArteTvVideosFromTsvFile("ARTE_TV_DB.TSV")
  328. print(listOfArteTvVideos)
  329.  
  330.  
  331.  
  332.  
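# ******************** paste separator: the code below is a separate module (presumably artetvvideo.py, which the code above imports ArteTvVideo from)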
  334.  
  335. class ArteTvVideo:
  336. def __init__(self, pProgramId, pUrl, pTitle, pDuration, pAgeRating):
  337. self.mProgramId = pProgramId
  338. self.mUrl = pUrl
  339. self.mTitle = pTitle
  340. self.mDuration = pDuration
  341. self.mAgeRating = pAgeRating
  342. #def __init__
  343.  
  344. #for print of a SINGLE video
  345. def __str__(self):
  346. strFormat = "program id: %s\nURL: %s\nTitle: %s\nDuration: %d\nAge rating: %d\n%s\n"%(
  347. self.mProgramId,
  348. self.mUrl,
  349. self.mTitle,
  350. self.mDuration,
  351. self.mAgeRating,
  352. "-"*80
  353. )
  354. return strFormat
  355. #def __str__
  356.  
  357. # for print of collections of videos
  358. def __repr__(self):
  359. return self.__str__()
  360.  
  361. def __eq__(self, other):
  362. bIdenticalByTitle = self.mTitle == other.mTitle
  363. return bIdenticalByTitle
  364.  
  365. def __gt__(self, other):
  366. return self.mDuration>other.mDuration
  367.  
  368. def __lt__(self, v2):
  369. return self.mDuration<v2.mDuration
  370. #class ArteTvVideo
Add Comment
Please, Sign In to add comment