# IA 2022-01-03

#Am4Chan.py
from amutil_tools import AmUtil

import re #regular expressions
from bs4 import BeautifulSoup

class Am4Chan:
    """Consumer for one 4chan board.

    On construction the first page of the board is read, every anchor that
    carries an ``href`` is collected into ``mAllResources`` and the links that
    look like other boards are collected into ``mBoards``.
    """

    # Raw strings so that "\." reaches the regex engine untouched and does not
    # trigger Python's invalid-escape-sequence warning.
    PATTERN_FOR_BOARD_DISCOVERY = r"^//boards\.4channel\.org/[a-zA-Z0-9_]+/$"
    PATTERN_FOR_JPGS = r".*\.jpg$|.*\.jpeg$"
    PATTERN_FOR_PNGS = r".*\.png$"

    def __init__(
        self,
        pBoardName:str
    ):
        """Store the board name, then discover its resources and boards.

        :param pBoardName: short board identifier used to build URLs
            (e.g. "wg").
        """
        self.mBoardName = pBoardName

        # list of {'href': ..., 'anchor': ...} dicts, one per anchor with href
        self.mAllResources = self.discoverAllResources()

        # absolute URLs of the boards linked from the page, no repetitions
        self.mBoards = self.discoverBoards()
        print(self.mBoards)
    #def __init__

    #dunder / double underscore
    def __str__(self)->str:
        """Return a short human-readable description of this consumer."""
        return "board name: %s"%(self.mBoardName)
    #def __str__

    def buildUrlForPageNumber(self, piPageNumber:int)->str:
        """Build the URL for a page of this board.

        :param piPageNumber: page number; page 1 is the bare board URL, any
            other number is appended to it.
        :return: the URL as a string.
        """
        strUrl = "https://boards.4chan.org/%s/"%(self.mBoardName)
        if piPageNumber==1:
            return strUrl
        return "%s%d"%(strUrl, piPageNumber)
    #def buildUrlForPageNumber

    def discoverAllResources(self)->list:
        """Read page 1 of the board and collect every anchor that has an href.

        :return: list of dicts with the keys 'href' (the attribute value) and
            'anchor' (the anchor's text), in document order.
        """
        strUrl = self.buildUrlForPageNumber(1)
        strSourceCode =\
            AmUtil.genericUrlReader(
                pStrUrl=strUrl,
                pUserAgent=AmUtil.USER_AGENT_STRING_MOZ47,
                pReferer="https://boards.4chan.org/",
                pBytesInsteadOfStr=False
            )
        bs = BeautifulSoup(
            markup=strSourceCode,
            features="html5lib"
        )
        #each object is of "Tag" (BeautifulSoup)
        anchors = bs.find_all("a")

        #anchors without an href attribute are skipped
        return [
            {'href': a.attrs['href'], 'anchor': a.text}
            for a in anchors
            if "href" in a.attrs
        ]
    #def discoverAllResources

    def filterAllResourcesByRegexp(
        self,
        pStrRegExp:str #this is the regular expression that will be used to filter results
    )->list:
        """Return the resources whose href matches a regular expression.

        The expression is applied case-insensitively with ``match``, i.e. it
        must match from the start of the href.

        :param pStrRegExp: the regular expression used as filter.
        :return: the matching resource dicts (the same objects held in
            ``mAllResources``), in their original order.
        """
        #compiled once, not once per resource
        reTester = re.compile(
            pStrRegExp,
            re.IGNORECASE
        )
        return [
            dictResource
            for dictResource in self.mAllResources
            if reTester.match(dictResource['href']) is not None
        ]
    #def filterAllResourcesByRegexp

    def discoverBoards(self)->list:
        """Return the absolute URLs of the boards linked from the page.

        The hrefs selected by PATTERN_FOR_BOARD_DISCOVERY start with "//"
        (no scheme), so "https:" is prefixed to make them absolute.

        :return: list of URLs without repetitions, in order of first
            appearance.
        """
        foundBoards = self.filterAllResourcesByRegexp(Am4Chan.PATTERN_FOR_BOARD_DISCOVERY)

        #eliminate repetitions while keeping a stable, first-seen order
        #(a plain set would give an arbitrary order that can vary per run)
        listNoRepetitions = list()
        setSeen = set() #set elements must be hashable; strings are
        for dictBoard in foundBoards:
            strAbsoluteUrl = "https:"+dictBoard['href']
            if strAbsoluteUrl not in setSeen:
                setSeen.add(strAbsoluteUrl)
                listNoRepetitions.append(strAbsoluteUrl)
            #if
        #for
        return listNoRepetitions
    #def discoverBoards
#class Am4Chan

# Run the demo only when this file is executed as a script: constructing
# Am4Chan reads the board's first page through AmUtil.genericUrlReader and
# prints, which should not happen as a side effect of importing the module.
if __name__ == "__main__":
    wg = Am4Chan("wg") #instantiate the consumer
    print (wg)
    #wg.discoverAllResources()
    #wg.discoverBoards() #discover other communities
    #wg.discoverMedia() #media is plural of medium (jpgs, pngs, ...)