from amutil import AmUtil
import datetime
#pip install beautifulsoup4
from bs4 import BeautifulSoup
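#NOTE: amutil is a local helper module, not a PyPI package. The class below
#only needs two static helpers from it; a minimal sketch (an assumption,
#reconstructed from how the helpers are called here) would be:
#
#  import datetime, urllib.request
#  class AmUtil:
#      @staticmethod
#      def genericUrlReader(pUrl:str)->str:
#          #fetch pUrl and return the response body as text
#          with urllib.request.urlopen(pUrl) as r:
#              return r.read().decode("utf-8", errors="replace")
#      @staticmethod
#      def getDateCorrespondingToToday()->datetime.date:
#          return datetime.date.today()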
class PublicConsumer:
    URL = "https://www.publico.pt/"
    HOW_MANY_SYMBOLS_FROM_CONTENT_TO_DISPLAY_ON_STR = 64

    def __init__(self):
        #local variable / "scoping"
        strHTML = AmUtil.genericUrlReader(
            PublicConsumer.URL
        )
        #data member
        self.mContent = strHTML
        #local var
        dateToday:datetime.date = \
            AmUtil.getDateCorrespondingToToday()
        #other 3 data members for the instance
        self.mYear = dateToday.year
        self.mMonth = dateToday.month
        self.mDay = dateToday.day
        #self.mAllTheAnchors = None
        #self.getHyperlinks() #would init self.mAllTheAnchors
    #def __init__

    #dunder = double underscore
    def __str__(self):
        iHowManySymbolsInContent = len(self.mContent)
        strAll = "mContent: %s ...\n"
        strAll += "#symbols in mContent: %d\n"
        strAll += "mYear: %d\n"
        strAll += "mMonth: %d\n"
        strAll += "mDay: %d\n"
        strAll = strAll%(
            self.mContent[
                #0
                : #slice operator
                PublicConsumer.HOW_MANY_SYMBOLS_FROM_CONTENT_TO_DISPLAY_ON_STR
            ],
            iHowManySymbolsInContent,
            self.mYear,
            self.mMonth,
            self.mDay
        )
        return strAll
    #def __str__
    def getHyperlinks(self):
        bs = BeautifulSoup(
            self.mContent,
            "html5lib" #pip install html5lib
        )
        #e.g. <a href="http://site.com/noticia/n1">Welcome to site.com</a>
        #anchor.attrs -> {"href": "..."}; anchor.text -> "Welcome ..."
        allTheAnchors = bs.find_all("a") #findAll is the deprecated alias
        #inits the self.mAllTheAnchors data member
        #self.mAllTheAnchors = allTheAnchors
        return allTheAnchors
    #def getHyperlinks
    def getNoticiasRaw(self):
        listNoticias = []
        allTheAnchors = self.getHyperlinks()
        #"noticia" is Portuguese for "news article"
        strFilterForNoticia = "/noticia/"
        for anchor in allTheAnchors:
            #keep only the anchors whose href contains "/noticia/"
            #anchor.attrs is a dictionary of all the element's attributes
            attributesForAnchors = anchor.attrs.keys()
            bWithHref = "href" in attributesForAnchors
            if (bWithHref):
                strHref = anchor.attrs["href"]
                strText = anchor.text
                bNoticia = strHref.find(strFilterForNoticia)!=-1
                #TODO: reject hrefs that end in #comments
                if (bNoticia):
                    dictNoticia = {}
                    dictNoticia["href"] = strHref
                    dictNoticia["text"] = strText
                    listNoticias.append(dictNoticia)
                #if
            #if
        #for
        #self.mNoticias = listNoticias #mNoticias?
        return listNoticias
    #def getNoticiasRaw
    @staticmethod
    def getNoticias(pListNoticias):
        listWithoutComments = []
        listOnlyWithAbsoluteUrls = []
        listProcessedWithoutRepetitions = list() #[]
        #static method: no self here, so the list arrives as a parameter
        #task 1 - reject hrefs that contain #comments
        for n in pListNoticias: #each n is a dict
            href = n["href"]
            bReject:bool = \
                href.find("#comments")!=-1
            if (not bReject):
                listWithoutComments.append(n)
        #for task 1 (walked over all the news)
        #task 2 - absolute URLs
        for n in listWithoutComments:
            href = n["href"]
            #"http://" or "https://"
            bAbsUrl:bool = \
                href.find("http://")==0 \
                or \
                href.find("https://")==0
            if (bAbsUrl):
                listOnlyWithAbsoluteUrls.append(n)
            else:
                #do not end the prefix with '/' because relative URLs already start with one
                prefix = "https://www.publico.pt"
                newHref = prefix+href
                n["href"] = newHref
                listOnlyWithAbsoluteUrls.append(n)
            #if-else
        #for (walked over news without comments)
        #task 3 - assure that there are NO repetitions
        for n in listOnlyWithAbsoluteUrls:
            bAlreadyExists:bool = \
                n in listProcessedWithoutRepetitions
            if (not bAlreadyExists):
                listProcessedWithoutRepetitions.append(n)
        #for
        return listProcessedWithoutRepetitions
    #def getNoticias
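    #usage sketch for getNoticias, with hypothetical raw dicts shaped like
    #the ones getNoticiasRaw builds:
    #  raw = [{"href": "/2020/12/20/p/noticia/n1", "text": "T"},
    #         {"href": "/2020/12/20/p/noticia/n1#comments", "text": "T"}]
    #  PublicConsumer.getNoticias(raw)
    #  #-> [{"href": "https://www.publico.pt/2020/12/20/p/noticia/n1", "text": "T"}]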
    @staticmethod
    def viewHrefs(pListOfNoticias:list):
        for noticia in pListOfNoticias:
            href = noticia["href"]
            print(href)
        #for
    #def viewHrefs
    def getNewsForDay(self, pY, pM, pD)->list:
        #zero-pad month and day so single-digit dates match publico.pt's URL scheme
        UrlForTheDay = "https://www.publico.pt/%d/%02d/%02d"%(pY, pM, pD)
        self.mContent = AmUtil.genericUrlReader(UrlForTheDay)
        self.mYear = pY
        self.mMonth = pM
        self.mDay = pD
        newsForTheDayRaw = self.getNoticiasRaw()
        newsProcessed = PublicConsumer.getNoticias(newsForTheDayRaw)
        listForTheDay = list()
        for n in newsProcessed:
            href = n["href"]
            bIsForTheDay:bool = href.find(UrlForTheDay)==0
            if (bIsForTheDay):
                listForTheDay.append(n)
            #if
        #for
        return listForTheDay
    #def getNewsForDay
#class PublicConsumer
p = PublicConsumer()
allTheNewsProcessed = p.getNewsForDay(2020, 12, 20)
print(allTheNewsProcessed)
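#e.g. list just the hrefs of the day's news, using the helper above
PublicConsumer.viewHrefs(allTheNewsProcessed)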
#********************** second snippet: parsing a JSON API response **********************
import json
from amutil import AmUtil
#URL = "https://sitecomapi.net/api/api.php?user=23892938293&token=92839283"
#respostaJSON = AmUtil.genericUrlReader(URL)
respostaJSON = '{"nome":"Art", "number":123}' #hard-coded stand-in for the API response
dictResposta = json.loads(respostaJSON)
print(dictResposta)
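#the parsed value is a regular dict; json.dumps goes the other way
print(dictResposta["nome"])     #Art
print(json.dumps(dictResposta)) #{"nome": "Art", "number": 123}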