am_dot_com

FP 2021-12-20

Dec 20th, 2021 (edited)
111
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 7.14 KB | None | 0 0
#artetv_v1.py
import json
from enum import Enum

from amutil import AmUtil  # mainly for genericUrlReader

  7. class Languages(Enum):
  8. English = "en"
  9. Portuguese = "pt"
  10. German = "de"
  11. #class Languages
  12.  
  13. class ArteTvConsumer:
  14.  
  15. BASE_LAST_CHANCE_URL = "https://www.arte.tv/%s/videos/last-chance/"
  16.  
  17. LAST_CHANCE_JSON_SCRIPT_START = "<script id=\"__NEXT_DATA__\" type=\"application/json\">"
  18. LAST_CHANCE_JSON_SCRIPT_END = "</script>"
  19.  
  20. def __init__(
  21. self,
  22. pLanguage = Languages.English
  23. ):
  24. if(pLanguage==Languages.Portuguese):
  25. ArteTvConsumer.BASE_LAST_CHANCE_URL = ArteTvConsumer.BASE_LAST_CHANCE_URL % (Languages.Portuguese)
  26. else:
  27. ArteTvConsumer.BASE_LAST_CHANCE_URL = ArteTvConsumer.BASE_LAST_CHANCE_URL % ("en")
  28. #if
  29. #def __init__
  30.  
  31. def __str__(self):
  32. strAll = ""
  33. strAll = "BASE_URL= {}".format(ArteTvConsumer.BASE_LAST_CHANCE_URL)
  34. return strAll
  35. #def __str__
  36.  
  37. #classmethod
  38. @staticmethod
  39. def getDictForLastChanceVideos(
  40. pUrl:str = BASE_LAST_CHANCE_URL,
  41. pOnlyVideos = True,
  42. pDebug = False
  43. ):
  44. strHTML = AmUtil.genericUrlReader(pUrl)
  45. bSuccessOnAccess = strHTML!=""
  46. if (bSuccessOnAccess):
  47. iStartRelevantPortion = strHTML.find(ArteTvConsumer.LAST_CHANCE_JSON_SCRIPT_START)
  48. bStartExists = iStartRelevantPortion!=-1
  49. if (bStartExists):
  50. strRelevant = strHTML[
  51. iStartRelevantPortion + len(ArteTvConsumer.LAST_CHANCE_JSON_SCRIPT_START)
  52. :
  53. ]
  54.  
  55. #"banana".find("batata") #-1
  56. iEndExists = strRelevant.find(ArteTvConsumer.LAST_CHANCE_JSON_SCRIPT_END)
  57. bEndExists = iEndExists!=-1
  58. if (bEndExists):
  59. strRelevant = strRelevant[
  60. 0
  61. :
  62. iEndExists
  63. ]
  64. #strRelevant is a string, in the JSON notation, that corresponds to a dict describing videos, about to disappear from the site
  65. dictFullVideosLastChance = json.loads(strRelevant)
  66. dictVideosOnly = dictFullVideosLastChance["props"]["pageProps"]["initialPage"]["zones"][0]["data"]
  67. if (pOnlyVideos):
  68. if (pDebug):
  69. strVideosOnly = json.dumps(dictVideosOnly)
  70. fw = open(
  71. file="example_videos_only.JSON",
  72. mode="wt",
  73. encoding="UTF-8"
  74. )
  75. fw.write(strVideosOnly)
  76. fw.close()
  77. return dictVideosOnly
  78. else:
  79. if (pDebug):
  80. fw = open(
  81. file="example_full.JSON",
  82. mode="wt",
  83. encoding="UTF-8"
  84. )
  85. fw.write(strRelevant)
  86. fw.close()
  87. dictFullVideosLastChance
  88. #if there is an ending
  89. #if there is a proper start
  90. #if there was HTML access
  91. #def getDictForLastChanceVideos
  92.  
  93. def getLastChanceVideosNPagesDeep(
  94. self,
  95. pN = 2,
  96. pDebug = False
  97. )->list:
  98. #listVideos = list()
  99. listVideos = []
  100.  
  101. bCompleted = False
  102. iPage = 1
  103. url = ArteTvConsumer.BASE_LAST_CHANCE_URL+"?page=%d"%(iPage) #"https://www.arte.tv/en/videos/last-chance/?page=1" .. "https://www.arte.tv/en/videos/last-chance/?page=10"
  104. while (not bCompleted):
  105.  
  106. dictVideosInCurrentPage = ArteTvConsumer.getDictForLastChanceVideos(url)
  107.  
  108. for video in dictVideosInCurrentPage:
  109. listVideos.append(video)
  110.  
  111. #TODO listVideos must grow, must incorporate the found videos
  112.  
  113. iPage+=1
  114. url = ArteTvConsumer.BASE_LAST_CHANCE_URL + "?page=%d" % (iPage)
  115. bCompleted = iPage>pN
  116. #while
  117.  
  118. if (pDebug):
  119. fw = open (
  120. file="list_of_all_videos_found.JSON",
  121. mode="wt",
  122. encoding="UTF-8"
  123. )
  124. #fw.write(json.dumps(listVideos))
  125. strJson = json.dumps(listVideos)
  126. fw.write(strJson)
  127. fw.close()
  128. #if
  129.  
  130. return listVideos
  131. #def getLastChanceVideosNPagesDeep
  132.  
  133. """
  134. single table
  135. tVideos
  136. video (programId\turl, title, duration, ageRating)
  137. programId\turl\ttitle\tduration\tageRating\n
  138. ...
  139. record2\n
  140. record3\n
  141. """
  142. @staticmethod
  143. def addVideosToTSVDatabase(
  144. pListVideos:list,
  145. pDatabaseName = "ARTE_TV_DB.TSV"
  146. ):
  147. fw = open(
  148. file=pDatabaseName,
  149. #mode="wt" #destructive write/replace text
  150. mode="at", #non-destructive append text
  151. encoding="UTF-8"
  152. )
  153. for video in pListVideos:
  154. programId = video["programId"]
  155. url = video["url"]
  156. title = video["title"]
  157. duration = video["duration"]
  158. ageRating = video["ageRating"]
  159.  
  160. #bRecordAlreadyExists = findVideoInTSVDatabase(pDatabaseName, programId)
  161. bRecordAlreadyExists = False
  162.  
  163. if (not bRecordAlreadyExists):
  164. strRecordTSV = "%s\t%s\t%s\t%d\t%d\n"%(
  165. programId,
  166. url,
  167. title,
  168. duration,
  169. ageRating
  170. )
  171. fw.write(strRecordTSV)
  172. #if
  173. #for all videos
  174. fw.close()
  175. return pDatabaseName
  176. #def addVideosToTSVDatabase
  177.  
  178. @staticmethod
  179. def findLongestVideo(
  180. pListOfVideos:list
  181. ):
  182. iLongestDuration = None
  183. idxLongestVideo = None
  184.  
  185. #idx 0 .. 39
  186. for idx in range(len(pListOfVideos)):
  187. video = pListOfVideos[idx]
  188. duration = video["duration"]
  189. if (iLongestDuration==None or duration>iLongestDuration):
  190. iLongestDuration = duration
  191. idxLongestVideo = idx
  192. #if
  193. #for
  194.  
  195. oLongestVideo = pListOfVideos[idxLongestVideo]
  196. return oLongestVideo
  197. #def findLongestVideo
  198. #class ArteTvConsumer
  199.  
  200. tv = ArteTvConsumer()
  201. #print (ArteTvConsumer.BASE_URL)
  202. print(tv)
  203.  
  204. #tv.getDictForLastChanceVideos() #would be possible for classmethod
  205. #dictVideos = ArteTvConsumer.getDictForLastChanceVideos(pDebug=True) #will also write an "example.json" file
  206. #dictVideos = ArteTvConsumer.getDictForLastChanceVideos() #will also write an "example.json" file
  207. #print(dictVideos)
  208. listAllVideosUpToPageNumber2 = tv.getLastChanceVideosNPagesDeep(
  209. pN=2,
  210. pDebug=True
  211. )
  212. #print(listAllVideosUpToPageNumber2)
  213.  
  214. strDB = ArteTvConsumer.addVideosToTSVDatabase(listAllVideosUpToPageNumber2)
  215. print (strDB)
Add Comment
Please, Sign In to add comment