Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from amutil import AmUtil
- import datetime
- #pip install beautifulsoup4
- from bs4 import BeautifulSoup
class PublicConsumer:
    """Download the page at ``URL`` and extract its news hyperlinks.

    The page is fetched once, by the constructor, and kept in ``mContent``.
    The year, month and day of the date returned by
    ``AmUtil.getDateCorrespondingToToday()`` are stored alongside it.
    """

    # Address of the page the constructor downloads.
    URL = "https://www.publico.pt/"
    # How many leading symbols of mContent __str__ includes in its output.
    HOW_MANY_SYMBOLS_FROM_CONTENT_TO_DISPLAY_ON_STR = 64
    # Hrefs ending with this fragment are skipped by getNoticias
    # (presumably links to an article's comments section, not the article).
    _COMMENTS_SUFFIX = "#comments"

    def __init__(self):
        """Fetch ``URL`` and record the content plus today's date parts."""
        # NOTE(review): assumes genericUrlReader returns the page as text;
        # confirm against AmUtil.
        self.mContent = AmUtil.genericUrlReader(PublicConsumer.URL)

        dateToday: datetime.date = AmUtil.getDateCorrespondingToToday()
        self.mYear = dateToday.year
        self.mMonth = dateToday.month
        self.mDay = dateToday.day

    def __str__(self):
        """Return a multi-line summary of the instance.

        The summary shows the first
        ``HOW_MANY_SYMBOLS_FROM_CONTENT_TO_DISPLAY_ON_STR`` symbols of
        ``mContent``, the total length of ``mContent`` and the stored
        year, month and day, one item per line.
        """
        iPreviewLength = (
            PublicConsumer.HOW_MANY_SYMBOLS_FROM_CONTENT_TO_DISPLAY_ON_STR
        )
        # Slice from the start so only a short preview is displayed.
        strPreview = self.mContent[:iPreviewLength]
        return (
            f"mContent: {strPreview} ...\n"
            f"#symbols in mContent: {len(self.mContent)}\n"
            f"mYear: {self.mYear}\n"
            f"mMonth: {self.mMonth}\n"
            f"mDay: {self.mDay}\n"
        )

    def getHyperlinks(self):
        """Parse ``mContent`` and return every ``<a>`` element found.

        The content is parsed again on each call; nothing is cached.
        """
        bs = BeautifulSoup(
            self.mContent,
            "html5lib",  # pip install html5lib
        )
        # find_all is the current spelling of the legacy findAll alias.
        return bs.find_all("a")

    def getNoticias(self):
        """Return the hyperlinks whose href points to a news item.

        An anchor qualifies when it has an ``href`` attribute that contains
        ``"/noticia/"`` and does not end in ``"#comments"``.

        Returns:
            A list of dictionaries, one per qualifying anchor, in the order
            ``getHyperlinks`` returned them. Each dictionary has the keys
            ``"href"`` (the attribute value) and ``"text"`` (the anchor's
            text).
        """
        strFilterForNoticia = "/noticia/"
        listNoticias = []
        for anchor in self.getHyperlinks():
            # anchor.attrs is a dictionary of all the element's attributes;
            # anchors without an href cannot be news links.
            strHref = anchor.attrs.get("href")
            if strHref is None:
                continue
            if strFilterForNoticia not in strHref:
                continue
            # Skip "#comments" variants so they are not listed as news.
            if strHref.endswith(PublicConsumer._COMMENTS_SUFFIX):
                continue
            listNoticias.append({"href": strHref, "text": anchor.text})
        return listNoticias
#class PublicConsumer
# Demo run, executed whenever this module is loaded: build a consumer
# (which downloads the page), show its summary, then show the news links.
p = PublicConsumer()
print(str(p))

# All anchors of the page; kept as a module-level name although only the
# filtered news list below is printed.
allTheAnchors = p.getHyperlinks()

allTheNoticias = p.getNoticias()
print(allTheNoticias)
Add Comment
Please, Sign In to add comment