from amutil import AmUtil
import datetime
#pip install beautifulsoup4
from bs4 import BeautifulSoup
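#NOTE: amutil is a local helper module, not a PyPI package. The class below
#only needs two static helpers from it; a minimal sketch (an assumption,
#reconstructed from how the helpers are called here) would be:
#
#  import datetime, urllib.request
#  class AmUtil:
#      @staticmethod
#      def genericUrlReader(pUrl:str)->str:
#          #fetch pUrl and return the response body as text
#          with urllib.request.urlopen(pUrl) as r:
#              return r.read().decode("utf-8", errors="replace")
#      @staticmethod
#      def getDateCorrespondingToToday()->datetime.date:
#          return datetime.date.today()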
class PublicConsumer:
    URL = "https://www.publico.pt/"
    HOW_MANY_SYMBOLS_FROM_CONTENT_TO_DISPLAY_ON_STR = 64

    def __init__(self):
        #local variable / "scoping"
        strHTML = AmUtil.genericUrlReader(
            PublicConsumer.URL
        )
        #data member
        self.mContent = strHTML
        #local var
        dateToday:datetime.date = \
            AmUtil.getDateCorrespondingToToday()
        #other 3 data members for the instance
        self.mYear = dateToday.year
        self.mMonth = dateToday.month
        self.mDay = dateToday.day
        #self.mAllTheAnchors = None
        #self.getHyperlinks() #would init self.mAllTheAnchors
    #def __init__

    #dunder = double underscore
    def __str__(self):
        iHowManySymbolsInContent = len(self.mContent)
        strAll = "mContent: %s ...\n"
        strAll += "#symbols in mContent: %d\n"
        strAll += "mYear: %d\n"
        strAll += "mMonth: %d\n"
        strAll += "mDay: %d\n"
        strAll = strAll%(
            self.mContent[
                #0
                : #slice operator
                PublicConsumer.HOW_MANY_SYMBOLS_FROM_CONTENT_TO_DISPLAY_ON_STR
            ],
            iHowManySymbolsInContent,
            self.mYear,
            self.mMonth,
            self.mDay
        )
        return strAll
    #def __str__
    def getHyperlinks(self):
        bs = BeautifulSoup(
            self.mContent,
            "html5lib" #pip install html5lib
        )
        #e.g. <a href="http://site.com/noticia/n1">Welcome to site.com</a>
        #anchor.attrs -> {"href": "..."}; anchor.text -> "Welcome ..."
        allTheAnchors = bs.find_all("a") #findAll is the deprecated alias
        #inits the self.mAllTheAnchors data member
        #self.mAllTheAnchors = allTheAnchors
        return allTheAnchors
    #def getHyperlinks
    def getNoticiasRaw(self):
        listNoticias = []
        allTheAnchors = self.getHyperlinks()
        #"noticia" is Portuguese for "news article"
        strFilterForNoticia = "/noticia/"
        for anchor in allTheAnchors:
            #keep only the anchors whose href contains "/noticia/"
            #anchor.attrs is a dictionary of all the element's attributes
            attributesForAnchors = anchor.attrs.keys()
            bWithHref = "href" in attributesForAnchors
            if (bWithHref):
                strHref = anchor.attrs["href"]
                strText = anchor.text
                bNoticia = strHref.find(strFilterForNoticia)!=-1
                #TODO: reject hrefs that end in #comments
                if (bNoticia):
                    dictNoticia = {}
                    dictNoticia["href"] = strHref
                    dictNoticia["text"] = strText
                    listNoticias.append(dictNoticia)
                #if
            #if
        #for
        #self.mNoticias = listNoticias #mNoticias?
        return listNoticias
    #def getNoticiasRaw
    @staticmethod
    def getNoticias(pListNoticias):
        listWithoutComments = []
        listOnlyWithAbsoluteUrls = []
        listProcessedWithoutRepetitions = list() #[]
        #static method: no self here, so the list arrives as a parameter
        #task 1 - reject hrefs that contain #comments
        for n in pListNoticias: #each n is a dict
            href = n["href"]
            bReject:bool = \
                href.find("#comments")!=-1
            if (not bReject):
                listWithoutComments.append(n)
        #for task 1 (walked over all the news)
        #task 2 - absolute URLs
        for n in listWithoutComments:
            href = n["href"]
            #"http://" or "https://"
            bAbsUrl:bool = \
                href.find("http://")==0 \
                or \
                href.find("https://")==0
            if (bAbsUrl):
                listOnlyWithAbsoluteUrls.append(n)
            else:
                #do not end the prefix with '/' because relative URLs already start with one
                prefix = "https://www.publico.pt"
                newHref = prefix+href
                n["href"] = newHref
                listOnlyWithAbsoluteUrls.append(n)
            #if-else
        #for (walked over news without comments)
        #task 3 - assure that there are NO repetitions
        for n in listOnlyWithAbsoluteUrls:
            bAlreadyExists:bool = \
                n in listProcessedWithoutRepetitions
            if (not bAlreadyExists):
                listProcessedWithoutRepetitions.append(n)
        #for
        return listProcessedWithoutRepetitions
    #def getNoticias
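    #usage sketch for getNoticias, with hypothetical raw dicts shaped like
    #the ones getNoticiasRaw builds:
    #  raw = [{"href": "/2020/12/20/p/noticia/n1", "text": "T"},
    #         {"href": "/2020/12/20/p/noticia/n1#comments", "text": "T"}]
    #  PublicConsumer.getNoticias(raw)
    #  #-> [{"href": "https://www.publico.pt/2020/12/20/p/noticia/n1", "text": "T"}]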
    @staticmethod
    def viewHrefs(pListOfNoticias:list):
        for noticia in pListOfNoticias:
            href = noticia["href"]
            print(href)
        #for
    #def viewHrefs
    def getNewsForDay(self, pY, pM, pD)->list:
        #zero-pad month and day so single-digit dates match publico.pt's URL scheme
        UrlForTheDay = "https://www.publico.pt/%d/%02d/%02d"%(pY, pM, pD)
        self.mContent = AmUtil.genericUrlReader(UrlForTheDay)
        self.mYear = pY
        self.mMonth = pM
        self.mDay = pD
        newsForTheDayRaw = self.getNoticiasRaw()
        newsProcessed = PublicConsumer.getNoticias(newsForTheDayRaw)
        listForTheDay = list()
        for n in newsProcessed:
            href = n["href"]
            bIsForTheDay:bool = href.find(UrlForTheDay)==0
            if (bIsForTheDay):
                listForTheDay.append(n)
            #if
        #for
        return listForTheDay
    #def getNewsForDay
#class PublicConsumer
p = PublicConsumer()
allTheNewsProcessed = p.getNewsForDay(2020, 12, 20)
print(allTheNewsProcessed)
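#e.g. list just the hrefs of the day's news, using the helper above
PublicConsumer.viewHrefs(allTheNewsProcessed)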
#********************** second snippet: parsing a JSON API response **********************
import json
from amutil import AmUtil
#URL = "https://sitecomapi.net/api/api.php?user=23892938293&token=92839283"
#respostaJSON = AmUtil.genericUrlReader(URL)
respostaJSON = '{"nome":"Art", "number":123}' #hard-coded stand-in for the API response
dictResposta = json.loads(respostaJSON)
print(dictResposta)
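#the parsed value is a regular dict; json.dumps goes the other way
print(dictResposta["nome"])     #Art
print(json.dumps(dictResposta)) #{"nome": "Art", "number": 123}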