# FP 2022-01-03

import os #for building the destination paths of the downloaded files
import re #for regular expressions
from urllib.parse import urljoin #for resolving hrefs against the page URL

from bs4 import BeautifulSoup #for parsing html elements

from amutil_tools import AmUtil #for accessing the helpers in AmUtil

class Am4Chan:
    """
    Consumer of one 4chan board.

    On construction the first page of the board is fetched, every anchor
    carrying an "href" attribute is recorded, and the recorded anchors are
    split into three filtered lists: links to boards (mFoundBoards),
    JPG/JPEG links (mJpgs) and PNG links (mPngs).
    """

    #raw strings, so that "\." reaches the regex engine as an escaped dot
    #without Python warning about an invalid string escape sequence
    PATTERN_FOR_BOARD_SHORT_URLS = r"^//boards\.4chan\.org/[a-zA-Z0-9_]+/$|^//boards\.4channel\.org/[a-zA-Z0-9_]+/$"
    PATTERN_FOR_JPG_URL = r".*\.jpg$|.*\.jpeg$"
    PATTERN_FOR_PNG = r".*\.png$"

    BOARDS_BASE_URL = "https://boards.4chan.org/"

    #shape of the page URLs built by buildUrlForPageNumber:
    #https://boards.4chan.org/wg/
    #https://boards.4chan.org/wg/2
    #...
    #https://boards.4chan.org/wg/10

    def __init__(
        self,
        pBoardName:str
    ):
        """
        @param pBoardName: short name of the board, e.g. "wg"

        NOTE: constructing an instance performs a network read, because
        discoverAllResources is called right away.
        """
        self.mBoardName = pBoardName

        #must be set before any call to filterResourcesByRegexp, which reads it
        self.mAllResources = self.discoverAllResources()

        self.mFoundBoards = self.filterResourcesByRegexp(
            Am4Chan.PATTERN_FOR_BOARD_SHORT_URLS
        )
        self.mJpgs = self.filterResourcesByRegexp(
            Am4Chan.PATTERN_FOR_JPG_URL
        )
        self.mPngs = self.filterResourcesByRegexp(
            Am4Chan.PATTERN_FOR_PNG
        )
    #def __init__

    def __str__(self):
        """
        @return: a one-line, newline-terminated description of the board
        """
        return "board name: %s\n"%(self.mBoardName)
    #def __str__

    def buildUrlForPageNumber(self, piPageNumber:int=1):
        """
        @param piPageNumber: page of the board; page 1 has no numeric suffix
        @return: the URL of the requested page of the board
        """
        strUrl = "%s%s/"%(Am4Chan.BOARDS_BASE_URL, self.mBoardName)
        if (piPageNumber==1):
            return strUrl
        #if
        #append the page number to the board URL, e.g. ".../wg/" -> ".../wg/2"
        return "%s%d"%(strUrl, piPageNumber)
    #def buildUrlForPageNumber

    def discoverAllResources(self)->list:
        """
        Reads the first page of the board and collects its anchors.

        @return: a list of dicts, each with 2 keys ("href", "anchor");
        anchors without an "href" attribute are skipped
        """
        strUrl = self.buildUrlForPageNumber(1)
        strHtml =\
            AmUtil.genericUrlReader(
                pStrUrl=strUrl,
                pUserAgent=AmUtil.USER_AGENT_STRING_MOZ47,
                pReferer=Am4Chan.BOARDS_BASE_URL
            )
        bs = BeautifulSoup(
            markup=strHtml,
            features="html5"
        )
        anchors = bs.find_all("a") #Tag objects (of BeautifulSoup)
        return [
            {'href': a.attrs['href'], 'anchor': a.text}
            for a in anchors
            if "href" in a.attrs #a.attrs is a dictionary
        ]
    #def discoverAllResources

    def filterResourcesByRegexp(self, pRegExp:str)->list:
        """
        @param pRegExp: regular expression, e.g. PATTERN_FOR_BOARD_SHORT_URLS;
        it is matched case-insensitively from the start of each href
        @return: the dicts of self.mAllResources whose "href" matches,
        in their original order
        """
        #compiled once, outside the loop, because it does not change per resource
        reTester = \
            re.compile(
                pattern=pRegExp,
                flags=re.IGNORECASE
            )
        return [
            dictResource
            for dictResource in self.mAllResources
            if reTester.match(dictResource['href']) is not None #None if there is no match
        ]
    #def filterResourcesByRegexp

    def downloadAllImages(
        self,
        pDestinationDir:str="."
    ):
        """
        Downloads every distinct JPG/PNG href found on the board page and
        saves each one under the name that follows its last forward-slash.

        @param pDestinationDir: directory that receives the files; it is
        created when missing. An empty string means the current directory.
        """
        if (pDestinationDir):
            os.makedirs(pDestinationDir, exist_ok=True)
        #if

        #the hrefs were discovered on page 1, so they are resolved against it
        strPageUrl = self.buildUrlForPageNumber(1)

        #a "seen" set drops repeated hrefs while keeping the discovery order
        setSeen = set()
        for dictImage in self.mJpgs + self.mPngs:
            href = dictImage['href'] #e.g. //i.4cdn.org/wg/1639231804800.jpg
            if (href in setSeen):
                continue
            #if
            setSeen.add(href)

            #urljoin gives protocol-relative hrefs ("//host/...") the https
            #scheme of the page and leaves absolute URLs untouched
            fullHref = urljoin(strPageUrl, href)

            #keep only what comes AFTER the last forward-slash (/)
            strFileName = href.rsplit("/", 1)[-1]
            strPath = os.path.join(pDestinationDir, strFileName)

            #presumably bytes, given pBytesInsteadOfStr=True - confirm in AmUtil
            theBytes =\
                AmUtil.genericUrlReader(
                    pStrUrl = fullHref,
                    pUserAgent = AmUtil.USER_AGENT_STRING_MOZ47,
                    pReferer="",
                    pBytesInsteadOfStr=True
                )

            #the with-statement closes the file even if the write fails
            with open(
                file=strPath,
                mode="wb" #write bytes
            ) as fw:
                fw.write(theBytes)
            #with

            strMsg="File %s (%d bytes long) saved OK!"%(strFileName, len(theBytes))
            print(strMsg)
        #for
    #def downloadAllImages
#class Am4Chan

#build a consumer for the "wg" board; the constructor already fetches the board's first page
wgConsumer = Am4Chan(pBoardName="wg")
print(str(wgConsumer))
#save every discovered JPG/PNG into the current working directory
wgConsumer.downloadAllImages(pDestinationDir=".")