Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#class_animals.py - classification of images of animals
import os  # operating system (directory listing)
from amutil_ai import AmUtilAi


def _is_image_file(pStrFileName: str) -> bool:
    """Return True when the file name has a supported image extension.

    Case-insensitive and anchored to the end of the name. This fixes the
    original test, which used find() and therefore also matched names such
    as 'notes.jpg.txt', and missed upper-case extensions like '.JPG'.
    """
    return pStrFileName.lower().endswith((".jpg", ".jpeg", ".png"))


def classify_images_in_dir(pStrDirWithImages: str = ".") -> None:
    """Classify every image in the directory and rename each file so its
    top-1 classification becomes a prefix of the file name.

    pStrDirWithImages -- directory to scan (defaults to the current one).
    """
    for filename in os.listdir(pStrDirWithImages):
        if _is_image_file(filename):
            print("Will classify the image ", filename)
            classificationResult = AmUtilAi.kerasClassify(
                pStrImagePath=filename,
                pbStrClassificationTop1=True,
                pbShowImageWithOverlayedClassificationRightAfterPredicion=False
            )
            AmUtilAi.renameImageWithPrefixingTopClassification(
                pStrImageFileName=filename,
                pStrClassification=classificationResult
            )


if __name__ == "__main__":
    classify_images_in_dir(".")
- ****
- #amutil_tools.py
- import urllib.request
- from urllib.request import urlopen
- from urllib.error import URLError
- from urllib.error import HTTPError
- from bs4 import BeautifulSoup
- import datetime
- from datetime import date, datetime
- import json #for json.dumps
- #pip install certifi
- import certifi
- import ssl
class AmUtil:
    """Static utility helpers: HTTP fetching with spoofed headers, anchor
    scraping via BeautifulSoup, TSV/JSON file writers, and date helpers."""

    USER_AGENT_STRING_MOZ47 = 'Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0'
    USER_AGENT_STRING_GOOGLE_NEWS = 'Googlebot-News'
    REFERER_GOOGLE = "https://www.google.com"

    # _.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-.
    @staticmethod
    def buildDictOfHttpHeadersToFakeUserAgent(
        pStrUserAgentSignature=USER_AGENT_STRING_MOZ47
    ):
        """Return an HTTP headers dict carrying only a User-Agent entry.

        See https://docs.python.org/3/library/urllib.request.html
        """
        return {"User-Agent": pStrUserAgentSignature}
    # def buildDictOfHttpHeadersToFakeUserAgent

    # _.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-.
    @staticmethod
    def buildDictOfHttpHeadersToFakeReferer(
        pStrDefaultReferer=REFERER_GOOGLE
    ):
        """Return an HTTP headers dict carrying only a Referer entry."""
        return {"Referer": pStrDefaultReferer}
    # def buildDictOfHttpHeadersToFakeReferer

    # _.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-.
    @staticmethod
    def combineAllDictionaries(*pDictionaries):
        """Merge any number of dicts into a new one; later dicts win on
        key clashes."""
        dictRet = dict()
        for d in pDictionaries:
            dictRet.update(d)
        return dictRet
    # def combineAllDictionaries

    # _.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-.
    @staticmethod
    def genericUrlReader(
        pStrUrl: str,
        pUserAgent: str = "",
        pReferer: str = "",
        pStrEncoding="UTF-8",
        pbSilent: bool = False,
        pBytesInsteadOfStr=False
    ):
        """Fetch a URL, optionally spoofing User-Agent and/or Referer.

        Returns the decoded body (raw bytes when pBytesInsteadOfStr is
        True); returns "" on any fetch error. BUG FIXES vs. the original:
        pbSilent was ignored (now suppresses the status print), and the
        pUserAgent/pReferer VALUES were ignored (the helpers were always
        called with their defaults).
        """
        strRead = ""
        strErrorMsg = "No error reading %s" % (pStrUrl)
        try:
            # custom SSL context — cafile=/capath= args to urlopen are deprecated
            sslContext = ssl.create_default_context(cafile=certifi.where())
            dictHeadersToFakeUserAgent = dict()
            dictHeadersToFakeReferer = dict()
            if pUserAgent != "":
                dictHeadersToFakeUserAgent = \
                    AmUtil.buildDictOfHttpHeadersToFakeUserAgent(pUserAgent)
            if pReferer != "":
                dictHeadersToFakeReferer = \
                    AmUtil.buildDictOfHttpHeadersToFakeReferer(pReferer)
            dictAllHeaders = AmUtil.combineAllDictionaries(
                dictHeadersToFakeUserAgent,
                dictHeadersToFakeReferer
            )
            # Request object is required to attach custom headers;
            # a bare urlopen(url) cannot carry them.
            oRequest = urllib.request.Request(
                url=pStrUrl,
                headers=dictAllHeaders,
            )
            reader = urllib.request.urlopen(oRequest, context=sslContext)
            try:
                bytesRead = reader.read()
            finally:
                reader.close()  # fix: the response object was never closed
            if pBytesInsteadOfStr:
                strRead = bytesRead
            else:
                strRead = str(bytesRead, pStrEncoding)
        except HTTPError as httpError:
            # Fix: HTTPError subclasses URLError, so this clause must come
            # FIRST — in the original it was unreachable dead code.
            strErrorMsg = "HTTP error %s @URL: %s" % (str(httpError), pStrUrl)
        except URLError as urlError:
            strErrorMsg = "URL error %s opening URL: %s" % (str(urlError), pStrUrl)
        finally:
            if not pbSilent:
                print(strErrorMsg)
        # try/except/finally
        return strRead
    # def genericUrlReader

    # _.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-.
    @staticmethod
    def findAnchorsAtUrl(pStrUrl):
        """Return unique {'href': ..., 'anchor': ...} dicts for every
        <a href=...> found at the URL; anchors without href are reported
        to stdout and skipped."""
        listAnchors = []
        strContents = AmUtil.genericUrlReader(pStrUrl)
        if strContents != "":
            bs = BeautifulSoup(strContents, "html5lib")
            for a in bs.findAll("a"):
                if "href" in a.attrs:
                    dictA = {'href': a.attrs["href"], 'anchor': a.text}
                    if dictA not in listAnchors:  # keep first occurrence only
                        listAnchors.append(dictA)
                else:
                    print("Anchor without href: " + str(a))
            # for every anchor
        # if page fetched
        return listAnchors
    # def findAnchorsAtUrl

    # _.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-.
    @staticmethod
    def findAnchorsAtUrlFilterByHrefWith(
        pStrUrl: str,
        pStrFilterInHref: str = ""
    ):
        """Like findAnchorsAtUrl, but keep only anchors whose href contains
        pStrFilterInHref (empty filter keeps everything)."""
        listAnchors = []
        strContents = AmUtil.genericUrlReader(pStrUrl)
        if strContents != "":
            bs = BeautifulSoup(strContents, "html5lib")
            for a in bs.findAll("a"):
                if "href" in a.attrs:
                    href = a.attrs["href"]
                    dictA = {'href': href, 'anchor': a.text}
                    bHrefSatisfiesFilter = (
                        pStrFilterInHref == "" or href.find(pStrFilterInHref) != -1
                    )
                    if bHrefSatisfiesFilter and dictA not in listAnchors:
                        listAnchors.append(dictA)
                # if anchor contains href
            # for every anchor
        # if page fetched
        return listAnchors
    # def findAnchorsAtUrlFilterByHrefWith

    # _.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-.
    @staticmethod
    def listOfDictsHrefAnchorToString(pListOfDictsHrefAnchor: list):
        """Render a list of {'href', 'anchor'} dicts as a readable multi-line
        string; entries missing either key are silently skipped."""
        strAll = ""
        for el in pListOfDictsHrefAnchor:
            if "href" in el and "anchor" in el:
                strAll += "href: %s\nanchor: %s" % (el['href'], el['anchor']) + "\n"
            # if both keys present
        # for
        return strAll
    # def listOfDictsHrefAnchorToString

    # _.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-.
    @staticmethod
    def listOfDictsHrefAnchorToFile(
        pListOfDictsHrefAnchor: list,
        pStrFileName: str = ""
    ) -> str:
        """Write the rendered href/anchor list to a UTF-8 text file and
        return the file name (auto-generated from the timestamp when
        pStrFileName is empty)."""
        if pStrFileName == "":
            strFileName = "%s_hrefs_anchors.TXT" % (AmUtil.utilNowYMDHMS())
        else:
            strFileName = pStrFileName
        # if-else
        strAll = AmUtil.listOfDictsHrefAnchorToString(pListOfDictsHrefAnchor)
        # with-block fixes the original's handle leak on write errors
        with open(file=strFileName, mode="wt", encoding="UTF-8") as fw:
            fw.write(strAll)
        return strFileName
    # def listOfDictsHrefAnchorToFile

    # _.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-.
    @staticmethod
    def genericDataStructureToJsonFile(
        pData,
        pStrFileName: str = ""
    ) -> str:
        """json.dumps pData to a UTF-8 file and return the file name
        (auto-generated from the timestamp when pStrFileName is empty)."""
        if pStrFileName == "":
            strFileName = "%s_data.JSON" % (AmUtil.utilNowYMDHMS())
        else:
            strFileName = pStrFileName
        # if-else
        with open(file=strFileName, mode="wt", encoding="UTF-8") as fw:
            fw.write(json.dumps(pData))
        return strFileName
    # def genericDataStructureToJsonFile

    # _.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-.
    @staticmethod
    def getDateCorrespondingToToday():
        """Return today's date as a datetime.date object."""
        return date.today()
    # def getDateCorrespondingToToday

    # _.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-.
    @staticmethod
    def getDateCorrespondingToYMD(pY, pM, pD, pHH=0, pMM=0, pSS=0):
        """Return a datetime.date built from the given Y/M/D (H/M/S accepted
        for interface compatibility but discarded by the .date() call).

        BUG FIX: the original called datetime.datetime(...), but the module's
        'from datetime import datetime' rebinds the name 'datetime' to the
        CLASS, so that expression raised AttributeError.
        """
        oYMDHMS = datetime(pY, pM, pD, pHH, pMM, pSS)
        return oYMDHMS.date()
    # def getDateCorrespondingToYMD

    # _.-~^~-._.-~^~-._.-~^~-._.-~^~-._.-~^~-._.-~^~-._.-~^~-._.-~^~-._.-~^~-.
    @staticmethod
    def utilNowYMDHMS():
        """Return the current local timestamp as 'Y-M-D_H-M-S' (fields not
        zero-padded, preserved for compatibility with existing file names)."""
        timeNow = datetime.now()
        strYMD = "%d-%d-%d" % (timeNow.year, timeNow.month, timeNow.day)
        strHMS = "%d-%d-%d" % (timeNow.hour, timeNow.minute, timeNow.second)
        return "%s_%s" % (strYMD, strHMS)
    # def utilNowYMDHMS

    # _.-~^~-._.-~^~-._.-~^~-._.-~^~-._.-~^~-._.-~^~-._.-~^~-._.-~^~-._.-~^~-.
    # 2022-01-01 (was marked untested/unused — several fixes below)
    @staticmethod
    def genericDictToTsvLine(
        pDict: dict
    ) -> str:
        """Render one dict as a tab-separated line ending in '\\n'; nested
        dicts are flattened recursively.

        BUG FIXES: the original read pDict[v] before v was bound (NameError),
        never incremented its key counter (so the last-key test was wrong),
        and fell off the end without returning the built string.
        """
        strRet = ""
        theKeys = list(pDict.keys())
        iLastIndex = len(theKeys) - 1
        for iIdx, k in enumerate(theKeys):
            v = pDict[k]
            if isinstance(v, dict):
                # flatten the nested dict, then attach our own separator
                strCell = AmUtil.genericDictToTsvLine(v).rstrip("\n")
            else:
                strCell = str(v)
            strRet += strCell + ("\n" if iIdx == iLastIndex else "\t")
        # for
        return strRet
    # def genericDictToTsvLine

    # _.-~^~-._.-~^~-._.-~^~-._.-~^~-._.-~^~-._.-~^~-._.-~^~-._.-~^~-._.-~^~-.
    # 2021-12-31 — created to assist the AmSpotify project
    @staticmethod
    def genericListOfDictsToTsvFile(
        pListOfDicts: list,
        pStrFileName: str
    ):
        """Write a list of flat dicts as a TSV file (header row from the
        first dict's keys). Returns the number of characters written, or
        False when the file could not be written.

        Assumes every dict shares the first dict's keys — TODO confirm
        against callers; a missing key would raise KeyError.
        """
        strAll = ""
        if len(pListOfDicts) > 0 and isinstance(pListOfDicts[0], dict):
            keys = list(pListOfDicts[0].keys())
            iLastCol = len(keys) - 1
            # header line
            for idx, k in enumerate(keys):
                strAll += k + ("\n" if idx == iLastCol else "\t")
            # data lines
            for el in pListOfDicts:
                for idxCol, key in enumerate(keys):
                    # TODO: handle a value that is itself a dict
                    strAll += str(el[key]) + ("\n" if idxCol == iLastCol else "\t")
                # for every key of the current element
            # for every element in the list
        # if check
        try:
            with open(file=pStrFileName, mode="wt", encoding="UTF-8") as fw:
                fw.write(strAll)
        except Exception as e:
            print(str(e))
            return False
        # try-except
        return len(strAll)  # the number of characters written
    # def genericListOfDictsToTsvFile
# class AmUtil
- *****
- #Am4Chan.py
- from amutil_tools import AmUtil
- import re #regular expressions
- from bs4 import BeautifulSoup
class Am4Chan:
    """Consumer for one 4chan board: construction fetches the board's first
    page (one HTTP request) and indexes all anchors, sibling boards, JPG
    links and PNG links."""

    # NOTE(review): discovery matches boards.4channel.org links while
    # buildUrlForPageNumber fetches boards.4chan.org — confirm both hosts
    # are intended.
    PATTERN_FOR_BOARD_DISCOVERY = r"^//boards\.4channel\.org/[a-zA-Z0-9_]+/$"
    PATTERN_FOR_JPGS = r".*\.jpg$|.*\.jpeg$"
    PATTERN_FOR_PNGS = r".*\.png$"

    def __init__(
        self,
        pBoardName: str
    ):
        """Store the board name and eagerly run every discovery pass."""
        self.mBoardName = pBoardName
        self.mAllResources = self.discoverAllResources()  # does the HTTP fetch
        self.mBoards = self.discoverBoards()
        self.mJpgs = self.discoverJpgs()
        self.mPngs = self.discoverPngs()
    # def __init__

    def __str__(self):
        """Human-readable summary (currently only the board name)."""
        return "board name: %s" % (self.mBoardName)
    # def __str__

    def buildUrlForPageNumber(self, piPageNumber: int) -> str:
        """Return the board URL for a 1-based page number; page 1 carries
        no numeric suffix."""
        strUrl = "https://boards.4chan.org/%s/" % (self.mBoardName)
        if piPageNumber == 1:
            return strUrl
        return "%s%d" % (strUrl, piPageNumber)
    # def buildUrlForPageNumber

    def discoverAllResources(self):
        """Fetch page 1 of the board and return a list of
        {'href': ..., 'anchor': ...} dicts for every <a> that has an href
        (duplicates are kept here; dedup happens in the discover* methods)."""
        strSourceCode = AmUtil.genericUrlReader(
            pStrUrl=self.buildUrlForPageNumber(1),
            pUserAgent=AmUtil.USER_AGENT_STRING_MOZ47,
            pReferer="https://boards.4chan.org/",
            pBytesInsteadOfStr=False
        )
        bs = BeautifulSoup(
            markup=strSourceCode,
            features="html5lib"
        )
        listResources = list()
        for a in bs.findAll("a"):  # each item is a bs4 Tag
            if "href" in a.attrs:
                listResources.append({'href': a.attrs['href'], 'anchor': a.text})
            # if
        # for
        return listResources  # stored on self.mAllResources by __init__
    # def discoverAllResources

    def filterAllResourcesByRegexp(
        self,
        pStrRegExp: str
    ):
        """Return the resources whose href matches pStrRegExp
        (case-insensitive, anchored at the start via re.match)."""
        # compile ONCE — the original recompiled the pattern per resource
        reTester = re.compile(pattern=pStrRegExp, flags=re.IGNORECASE)
        listFiltered = list()
        for dictResource in self.mAllResources:
            if reTester.match(dictResource['href']) is not None:
                listFiltered.append(dictResource)
            # if the resource's href matches the pattern
        # for every resource
        return listFiltered
    # def filterAllResourcesByRegexp

    def _uniqueAbsoluteHrefsMatching(self, pStrRegExp: str):
        """Shared helper: filter resources by regex, prefix 'https:' to make
        the protocol-relative hrefs absolute, and drop repetitions (set
        round-trip, so output order is not guaranteed)."""
        listHrefs = [
            "https:" + dictResource['href']
            for dictResource in self.filterAllResourcesByRegexp(pStrRegExp)
        ]
        return list(set(listHrefs))
    # def _uniqueAbsoluteHrefsMatching

    def discoverBoards(self):
        """Absolute URLs of other boards linked from the page, no repetitions."""
        return self._uniqueAbsoluteHrefsMatching(Am4Chan.PATTERN_FOR_BOARD_DISCOVERY)
    # def discoverBoards

    # 2022-01-05
    def discoverJpgs(self):
        """Absolute URLs of linked .jpg/.jpeg files, no repetitions."""
        return self._uniqueAbsoluteHrefsMatching(Am4Chan.PATTERN_FOR_JPGS)
    # def discoverJpgs

    def discoverPngs(self):
        """Absolute URLs of linked .png files, no repetitions."""
        return self._uniqueAbsoluteHrefsMatching(Am4Chan.PATTERN_FOR_PNGS)
    # def discoverPngs

    def downloadAllImages(self):
        """Download every discovered PNG and JPG into the current directory,
        naming each file after the last path segment of its URL."""
        for strHref in self.mPngs + self.mJpgs:
            theBytes = AmUtil.genericUrlReader(
                pStrUrl=strHref,
                pUserAgent=AmUtil.USER_AGENT_STRING_MOZ47,
                pReferer="",
                pBytesInsteadOfStr=True
            )
            if not theBytes:
                # fetch failed — genericUrlReader returned its "" error
                # sentinel; writing that in "wb" mode would raise TypeError
                continue
            # rfind -> rightmost "/", slice keeps only the file name
            strNameForImageFile = strHref[strHref.rfind("/") + 1:]
            # with-block fixes the original's handle leak on write errors
            with open(strNameForImageFile, mode="wb") as fw:
                fw.write(theBytes)
        # for
    # def downloadAllImages
# class Am4Chan
# Script entry point: build a consumer for the "wg" board (instantiation
# already performs the page fetch and all discovery passes), report it,
# then pull down every image that was discovered.
wg = Am4Chan("wg")
print(wg)
wg.downloadAllImages()
#wg.discoverAllResources()
#wg.discoverBoards() #discover other communities
#wg.discoverMedia() #media is plural of medium (jpgs, pngs, ...)
Add Comment
Please, Sign In to add comment