FP 2021-12-14
am_dot_com | Dec 14th, 2021 (edited)

from amutil import AmUtil
import datetime
# pip install beautifulsoup4
from bs4 import BeautifulSoup

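# --- Added sketch, not part of the original paste ---
# The amutil module itself is not included here. As an assumption, based only
# on how it is used below, a minimal stand-in for the two helpers could look
# like this class (the name AmUtilSketch is hypothetical, chosen so it does not
# clash with the real import above): genericUrlReader fetches a URL and returns
# the page text, getDateCorrespondingToToday returns today's date.
import urllib.request

class AmUtilSketch:
    @staticmethod
    def genericUrlReader(pUrl):
        # download the page and return it as decoded text
        with urllib.request.urlopen(pUrl) as response:
            return response.read().decode("utf-8", errors="replace")

    @staticmethod
    def getDateCorrespondingToToday():
        # today's date as a datetime.date
        return datetime.date.today()
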
class PublicConsumer:
    URL = "https://www.publico.pt/"
    HOW_MANY_SYMBOLS_FROM_CONTENT_TO_DISPLAY_ON_STR = 64

    def __init__(self):
        # local variable / "scoping"
        strHTML = AmUtil.genericUrlReader(
            PublicConsumer.URL
        )
        # data member
        self.mContent = strHTML

        # local var
        dateToday:datetime.date = \
            AmUtil.getDateCorrespondingToToday()

        # other 3 data members for the instance
        self.mYear = dateToday.year
        self.mMonth = dateToday.month
        self.mDay = dateToday.day

        #self.mAllTheAnchors = None
        #self.getHyperlinks() #would init self.mAllTheAnchors
    #def __init__

    # dunder = double underscore
    def __str__(self):
        iHowManySymbolsInContent = len(self.mContent)

        strAll = "mContent: %s ...\n"
        strAll += "#symbols in mContent: %d\n"
        strAll += "mYear: %d\n"
        strAll += "mMonth: %d\n"
        strAll += "mDay: %d\n"
        strAll = strAll%(
            self.mContent[
                #0
                : #slice operator
                PublicConsumer.HOW_MANY_SYMBOLS_FROM_CONTENT_TO_DISPLAY_ON_STR
            ],
            iHowManySymbolsInContent,
            self.mYear,
            self.mMonth,
            self.mDay
        )
        return strAll
    #def __str__

    def getHyperlinks(self):
        bs = BeautifulSoup(
            self.mContent,
            "html5lib" #pip install html5lib
        )
        # e.g. <a href="http://site.com/noticia/n1">Welcome site.com</a>
        # {href="..." , ... "text"="Welcome ...."}
        allTheAnchors = bs.findAll("a")

        # would init the self.mAllTheAnchors data member
        #self.mAllTheAnchors = allTheAnchors
        return allTheAnchors
    #def getHyperlinks

    def getNoticiasRaw(self):
        listNoticias = []
        allTheAnchors = self.getHyperlinks()
        strFilterForNoticia = "/noticia/"
        for anchor in allTheAnchors:
            # keep only the anchors whose href contains "/noticia/"
            # anchor.attrs is a dictionary of all the element's attributes
            attributesForAnchors = anchor.attrs.keys()
            bWithHref = "href" in attributesForAnchors
            if (bWithHref):
                strHref = anchor.attrs["href"]
                strText = anchor.text
                bNoticia = strHref.find(strFilterForNoticia)!=-1
                #TODO: reject hrefs that end in #comments
                if (bNoticia):
                    dictNoticia = {}
                    dictNoticia["href"] = strHref
                    dictNoticia["text"] = strText
                    listNoticias.append(dictNoticia)
                #if
            #if
        #for
        #self.mNoticias = listNoticias #mNoticias?
        return listNoticias
    #def getNoticiasRaw

    @staticmethod
    def getNoticias(pListNoticias):
        listWithoutComments = []
        listOnlyWithAbsoluteUrls = []
        listProcessedWithoutRepetitions = list() #[]
        #self.mNoticias accessible

        # task 1 - reject hrefs that contain #comments
        for n in pListNoticias: #each n is a dict
            href = n["href"]
            bReject:bool = \
                href.find("#comments")!=-1
            if (not bReject):
                listWithoutComments.append(n)
        #for task 1 (walked over all the news)

        # task 2 - absolute URLs
        for n in listWithoutComments:
            href = n["href"]
            # "http://" or "https://"
            bAbsUrl:bool = \
                href.find("http://")==0 \
                or \
                href.find("https://")==0
            if (bAbsUrl):
                listOnlyWithAbsoluteUrls.append(n)
            else:
                # do not end the prefix with "/" because relative hrefs already start with one
                prefix = "https://www.publico.pt"
                newHref = prefix+href
                n["href"] = newHref
                listOnlyWithAbsoluteUrls.append(n)
            #if-else
        #for (walked over news without comments)

        # task 3 - assure that there are NO repetitions
        for n in listOnlyWithAbsoluteUrls:
            bAlreadyExists:bool = \
                n in listProcessedWithoutRepetitions
            if (not bAlreadyExists):
                listProcessedWithoutRepetitions.append(n)
        #for

        return listProcessedWithoutRepetitions
    #def getNoticias

    @staticmethod
    def viewHrefs(pListOfNoticias:list):
        for noticia in pListOfNoticias:
            href = noticia["href"]
            print(href)
        #for
    #def viewHrefs

    def getNewsForDay(self, pY, pM, pD)->list:
        UrlForTheDay = "https://www.publico.pt/%d/%d/%d"%(pY, pM, pD)
        self.mContent = AmUtil.genericUrlReader(UrlForTheDay)
        self.mYear = pY
        self.mMonth = pM
        self.mDay = pD
        newsForTheDayRaw = self.getNoticiasRaw()
        newsProcessed = PublicConsumer.getNoticias(newsForTheDayRaw)

        listForTheDay = list()
        for n in newsProcessed:
            href = n["href"]
            bIsForTheDay:bool = href.find(UrlForTheDay)==0
            if (bIsForTheDay):
                listForTheDay.append(n)
            #if
        #for

        return listForTheDay
    #def getNewsForDay
#class PublicConsumer

p = PublicConsumer()
allTheNewsProcessed = p.getNewsForDay(2020, 12, 20)
print(allTheNewsProcessed)

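# --- Added usage sketch, not part of the original paste ---
# A minimal follow-up, assuming the lines above ran: the static helper
# viewHrefs prints only the "href" of each processed item, and printing the
# instance itself exercises __str__.
PublicConsumer.viewHrefs(allTheNewsProcessed)
print(p)
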
**********************

import json
from amutil import AmUtil

#URL = "https://sitecomapi.net/api/api.php?user=23892938293&token=92839283"
#respostaJSON = AmUtil.genericUrlReader(URL)

respostaJSON = '{"nome":"Art", "number":123}'
dictResposta = json.loads(respostaJSON)
print(dictResposta)
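
# --- Added sketch, not part of the original paste ---
# dictResposta is a regular Python dict, so its keys can be read directly and
# json.dumps (standard library) serializes it back into a JSON string.
print(dictResposta["nome"], dictResposta["number"])
print(json.dumps(dictResposta))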