am_dot_com

FP 2021-12-13

Dec 13th, 2021 (edited)
75
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.79 KB | None | 0 0
  1. from amutil import AmUtil
  2. import datetime
  3. #pip install beautifulsoup4
  4. from bs4 import BeautifulSoup
  5.  
  6. class PublicConsumer:
  7. URL = "https://www.publico.pt/"
  8. HOW_MANY_SYMBOLS_FROM_CONTENT_TO_DISPLAY_ON_STR = 64
  9.  
  10. def __init__(self):
  11. strHTML = AmUtil.genericUrlReader(
  12. PublicConsumer.URL
  13. )
  14. self.mContent = strHTML
  15.  
  16. dateToday:datetime.date = \
  17. AmUtil.getDateCorrespondingToToday()
  18.  
  19. self.mYear = dateToday.year
  20. self.mMonth = dateToday.month
  21. self.mDay = dateToday.day
  22.  
  23. #self.mAllTheAnchors = None
  24. #self.getHyperlinks() #would init self.mAllTheAnchors
  25. #def __init__
  26.  
  27. def __str__(self):
  28. iHowManySymbolsInContent = len(self.mContent)
  29.  
  30. strAll = "mContent: %s ...\n"
  31. strAll += "#symbols in mContent: %d\n"
  32. strAll +="mYear: %d\n"
  33. strAll += "mMonth: %d\n"
  34. strAll += "mDay: %d\n"
  35. strAll = strAll%(
  36. self.mContent[
  37. #0
  38. : #splice operator
  39. PublicConsumer.HOW_MANY_SYMBOLS_FROM_CONTENT_TO_DISPLAY_ON_STR
  40. ],
  41. iHowManySymbolsInContent,
  42. self.mYear,
  43. self.mMonth,
  44. self.mDay
  45. )
  46. return strAll
  47. #def __str__
  48.  
  49. def getHyperlinks(self):
  50. bs = BeautifulSoup(
  51. self.mContent,
  52. "html5lib" #pip install html5lib
  53. )
  54. allTheAnchors = bs.findAll("a")
  55.  
  56. #inits the self.mAllTheAnchors data member
  57. #self.mAllTheAnchors = allTheAnchors
  58. return allTheAnchors
  59. #def getHyperlinks
  60.  
  61. def getNoticias(self):
  62. listNoticias = []
  63. allTheAnchors = self.getHyperlinks()
  64. strFilterForNoticia = "/noticia/"
  65. for anchor in allTheAnchors:
  66. #filter only those that in href contain "/noticia/"
  67. #anchor.attrs is a dictionary of all the element's attributes
  68. attributesForAnchors = anchor.attrs.keys()
  69. bWithHref = "href" in attributesForAnchors
  70. if (bWithHref):
  71. strHref = anchor.attrs["href"]
  72. strText = anchor.text
  73. bNoticia = strHref.find(strFilterForNoticia)!=-1
  74. #TODO: reject hrefs that end in #comments
  75. if (bNoticia):
  76. dictNoticia = {}
  77. dictNoticia["href"] = strHref
  78. dictNoticia["text"] = strText
  79. listNoticias.append(dictNoticia)
  80. #if
  81. #if
  82. #for
  83. return listNoticias
  84. #def getNoticias
  85. #class PublicConsumer
  86.  
  87. p = PublicConsumer()
  88. print(p)
  89. allTheAnchors = p.getHyperlinks()
  90. allTheNoticias = p.getNoticias()
  91. #print (allTheAnchors)
  92. print(allTheNoticias)
  93.  
  94.  
Add Comment
Please, Sign In to add comment