Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#class_animals.py - classification of images of animals
import os  # operating system (directory listing)
from amutil_ai import AmUtilAi


def _is_image_file(pStrFileName: str) -> bool:
    """Return True when the file name has a supported image extension.

    Case-insensitive and anchored to the end of the name. This fixes the
    original test, which used find() and therefore also matched names such
    as 'notes.jpg.txt', and missed upper-case extensions like '.JPG'.
    """
    return pStrFileName.lower().endswith((".jpg", ".jpeg", ".png"))


def classify_images_in_dir(pStrDirWithImages: str = ".") -> None:
    """Classify every image in the directory and rename each file so its
    top-1 classification becomes a prefix of the file name.

    pStrDirWithImages -- directory to scan (defaults to the current one).
    """
    for filename in os.listdir(pStrDirWithImages):
        if _is_image_file(filename):
            print("Will classify the image ", filename)
            classificationResult = AmUtilAi.kerasClassify(
                pStrImagePath=filename,
                pbStrClassificationTop1=True,
                pbShowImageWithOverlayedClassificationRightAfterPredicion=False
            )
            AmUtilAi.renameImageWithPrefixingTopClassification(
                pStrImageFileName=filename,
                pStrClassification=classificationResult
            )


if __name__ == "__main__":
    classify_images_in_dir(".")
- ****
- #amutil_tools.py
- import urllib.request
- from urllib.request import urlopen
- from urllib.error import URLError
- from urllib.error import HTTPError
- from bs4 import BeautifulSoup
- import datetime
- from datetime import date, datetime
- import json #for json.dumps
- #pip install certifi
- import certifi
- import ssl
class AmUtil:
    """Static utility helpers: HTTP fetching with spoofed headers, anchor
    scraping via BeautifulSoup, TSV/JSON file writers, and date helpers."""

    USER_AGENT_STRING_MOZ47 = 'Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0'
    USER_AGENT_STRING_GOOGLE_NEWS = 'Googlebot-News'
    REFERER_GOOGLE = "https://www.google.com"

    # _.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-.
    @staticmethod
    def buildDictOfHttpHeadersToFakeUserAgent(
        pStrUserAgentSignature=USER_AGENT_STRING_MOZ47
    ):
        """Return an HTTP headers dict carrying only a User-Agent entry.

        See https://docs.python.org/3/library/urllib.request.html
        """
        return {"User-Agent": pStrUserAgentSignature}
    # def buildDictOfHttpHeadersToFakeUserAgent

    # _.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-.
    @staticmethod
    def buildDictOfHttpHeadersToFakeReferer(
        pStrDefaultReferer=REFERER_GOOGLE
    ):
        """Return an HTTP headers dict carrying only a Referer entry."""
        return {"Referer": pStrDefaultReferer}
    # def buildDictOfHttpHeadersToFakeReferer

    # _.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-.
    @staticmethod
    def combineAllDictionaries(*pDictionaries):
        """Merge any number of dicts into a new one; later dicts win on
        key clashes."""
        dictRet = dict()
        for d in pDictionaries:
            dictRet.update(d)
        return dictRet
    # def combineAllDictionaries

    # _.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-.
    @staticmethod
    def genericUrlReader(
        pStrUrl: str,
        pUserAgent: str = "",
        pReferer: str = "",
        pStrEncoding="UTF-8",
        pbSilent: bool = False,
        pBytesInsteadOfStr=False
    ):
        """Fetch a URL, optionally spoofing User-Agent and/or Referer.

        Returns the decoded body (raw bytes when pBytesInsteadOfStr is
        True); returns "" on any fetch error. BUG FIXES vs. the original:
        pbSilent was ignored (now suppresses the status print), and the
        pUserAgent/pReferer VALUES were ignored (the helpers were always
        called with their defaults).
        """
        strRead = ""
        strErrorMsg = "No error reading %s" % (pStrUrl)
        try:
            # custom SSL context — cafile=/capath= args to urlopen are deprecated
            sslContext = ssl.create_default_context(cafile=certifi.where())
            dictHeadersToFakeUserAgent = dict()
            dictHeadersToFakeReferer = dict()
            if pUserAgent != "":
                dictHeadersToFakeUserAgent = \
                    AmUtil.buildDictOfHttpHeadersToFakeUserAgent(pUserAgent)
            if pReferer != "":
                dictHeadersToFakeReferer = \
                    AmUtil.buildDictOfHttpHeadersToFakeReferer(pReferer)
            dictAllHeaders = AmUtil.combineAllDictionaries(
                dictHeadersToFakeUserAgent,
                dictHeadersToFakeReferer
            )
            # Request object is required to attach custom headers;
            # a bare urlopen(url) cannot carry them.
            oRequest = urllib.request.Request(
                url=pStrUrl,
                headers=dictAllHeaders,
            )
            reader = urllib.request.urlopen(oRequest, context=sslContext)
            try:
                bytesRead = reader.read()
            finally:
                reader.close()  # fix: the response object was never closed
            if pBytesInsteadOfStr:
                strRead = bytesRead
            else:
                strRead = str(bytesRead, pStrEncoding)
        except HTTPError as httpError:
            # Fix: HTTPError subclasses URLError, so this clause must come
            # FIRST — in the original it was unreachable dead code.
            strErrorMsg = "HTTP error %s @URL: %s" % (str(httpError), pStrUrl)
        except URLError as urlError:
            strErrorMsg = "URL error %s opening URL: %s" % (str(urlError), pStrUrl)
        finally:
            if not pbSilent:
                print(strErrorMsg)
        # try/except/finally
        return strRead
    # def genericUrlReader

    # _.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-.
    @staticmethod
    def findAnchorsAtUrl(pStrUrl):
        """Return unique {'href': ..., 'anchor': ...} dicts for every
        <a href=...> found at the URL; anchors without href are reported
        to stdout and skipped."""
        listAnchors = []
        strContents = AmUtil.genericUrlReader(pStrUrl)
        if strContents != "":
            bs = BeautifulSoup(strContents, "html5lib")
            for a in bs.findAll("a"):
                if "href" in a.attrs:
                    dictA = {'href': a.attrs["href"], 'anchor': a.text}
                    if dictA not in listAnchors:  # keep first occurrence only
                        listAnchors.append(dictA)
                else:
                    print("Anchor without href: " + str(a))
            # for every anchor
        # if page fetched
        return listAnchors
    # def findAnchorsAtUrl

    # _.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-.
    @staticmethod
    def findAnchorsAtUrlFilterByHrefWith(
        pStrUrl: str,
        pStrFilterInHref: str = ""
    ):
        """Like findAnchorsAtUrl, but keep only anchors whose href contains
        pStrFilterInHref (empty filter keeps everything)."""
        listAnchors = []
        strContents = AmUtil.genericUrlReader(pStrUrl)
        if strContents != "":
            bs = BeautifulSoup(strContents, "html5lib")
            for a in bs.findAll("a"):
                if "href" in a.attrs:
                    href = a.attrs["href"]
                    dictA = {'href': href, 'anchor': a.text}
                    bHrefSatisfiesFilter = (
                        pStrFilterInHref == "" or href.find(pStrFilterInHref) != -1
                    )
                    if bHrefSatisfiesFilter and dictA not in listAnchors:
                        listAnchors.append(dictA)
                # if anchor contains href
            # for every anchor
        # if page fetched
        return listAnchors
    # def findAnchorsAtUrlFilterByHrefWith

    # _.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-.
    @staticmethod
    def listOfDictsHrefAnchorToString(pListOfDictsHrefAnchor: list):
        """Render a list of {'href', 'anchor'} dicts as a readable multi-line
        string; entries missing either key are silently skipped."""
        strAll = ""
        for el in pListOfDictsHrefAnchor:
            if "href" in el and "anchor" in el:
                strAll += "href: %s\nanchor: %s" % (el['href'], el['anchor']) + "\n"
            # if both keys present
        # for
        return strAll
    # def listOfDictsHrefAnchorToString

    # _.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-.
    @staticmethod
    def listOfDictsHrefAnchorToFile(
        pListOfDictsHrefAnchor: list,
        pStrFileName: str = ""
    ) -> str:
        """Write the rendered href/anchor list to a UTF-8 text file and
        return the file name (auto-generated from the timestamp when
        pStrFileName is empty)."""
        if pStrFileName == "":
            strFileName = "%s_hrefs_anchors.TXT" % (AmUtil.utilNowYMDHMS())
        else:
            strFileName = pStrFileName
        # if-else
        strAll = AmUtil.listOfDictsHrefAnchorToString(pListOfDictsHrefAnchor)
        # with-block fixes the original's handle leak on write errors
        with open(file=strFileName, mode="wt", encoding="UTF-8") as fw:
            fw.write(strAll)
        return strFileName
    # def listOfDictsHrefAnchorToFile

    # _.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-.
    @staticmethod
    def genericDataStructureToJsonFile(
        pData,
        pStrFileName: str = ""
    ) -> str:
        """json.dumps pData to a UTF-8 file and return the file name
        (auto-generated from the timestamp when pStrFileName is empty)."""
        if pStrFileName == "":
            strFileName = "%s_data.JSON" % (AmUtil.utilNowYMDHMS())
        else:
            strFileName = pStrFileName
        # if-else
        with open(file=strFileName, mode="wt", encoding="UTF-8") as fw:
            fw.write(json.dumps(pData))
        return strFileName
    # def genericDataStructureToJsonFile

    # _.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-.
    @staticmethod
    def getDateCorrespondingToToday():
        """Return today's date as a datetime.date object."""
        return date.today()
    # def getDateCorrespondingToToday

    # _.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-.
    @staticmethod
    def getDateCorrespondingToYMD(pY, pM, pD, pHH=0, pMM=0, pSS=0):
        """Return a datetime.date built from the given Y/M/D (H/M/S accepted
        for interface compatibility but discarded by the .date() call).

        BUG FIX: the original called datetime.datetime(...), but the module's
        'from datetime import datetime' rebinds the name 'datetime' to the
        CLASS, so that expression raised AttributeError.
        """
        oYMDHMS = datetime(pY, pM, pD, pHH, pMM, pSS)
        return oYMDHMS.date()
    # def getDateCorrespondingToYMD

    # _.-~^~-._.-~^~-._.-~^~-._.-~^~-._.-~^~-._.-~^~-._.-~^~-._.-~^~-._.-~^~-.
    @staticmethod
    def utilNowYMDHMS():
        """Return the current local timestamp as 'Y-M-D_H-M-S' (fields not
        zero-padded, preserved for compatibility with existing file names)."""
        timeNow = datetime.now()
        strYMD = "%d-%d-%d" % (timeNow.year, timeNow.month, timeNow.day)
        strHMS = "%d-%d-%d" % (timeNow.hour, timeNow.minute, timeNow.second)
        return "%s_%s" % (strYMD, strHMS)
    # def utilNowYMDHMS

    # _.-~^~-._.-~^~-._.-~^~-._.-~^~-._.-~^~-._.-~^~-._.-~^~-._.-~^~-._.-~^~-.
    # 2022-01-01 (was marked untested/unused — several fixes below)
    @staticmethod
    def genericDictToTsvLine(
        pDict: dict
    ) -> str:
        """Render one dict as a tab-separated line ending in '\\n'; nested
        dicts are flattened recursively.

        BUG FIXES: the original read pDict[v] before v was bound (NameError),
        never incremented its key counter (so the last-key test was wrong),
        and fell off the end without returning the built string.
        """
        strRet = ""
        theKeys = list(pDict.keys())
        iLastIndex = len(theKeys) - 1
        for iIdx, k in enumerate(theKeys):
            v = pDict[k]
            if isinstance(v, dict):
                # flatten the nested dict, then attach our own separator
                strCell = AmUtil.genericDictToTsvLine(v).rstrip("\n")
            else:
                strCell = str(v)
            strRet += strCell + ("\n" if iIdx == iLastIndex else "\t")
        # for
        return strRet
    # def genericDictToTsvLine

    # _.-~^~-._.-~^~-._.-~^~-._.-~^~-._.-~^~-._.-~^~-._.-~^~-._.-~^~-._.-~^~-.
    # 2021-12-31 — created to assist the AmSpotify project
    @staticmethod
    def genericListOfDictsToTsvFile(
        pListOfDicts: list,
        pStrFileName: str
    ):
        """Write a list of flat dicts as a TSV file (header row from the
        first dict's keys). Returns the number of characters written, or
        False when the file could not be written.

        Assumes every dict shares the first dict's keys — TODO confirm
        against callers; a missing key would raise KeyError.
        """
        strAll = ""
        if len(pListOfDicts) > 0 and isinstance(pListOfDicts[0], dict):
            keys = list(pListOfDicts[0].keys())
            iLastCol = len(keys) - 1
            # header line
            for idx, k in enumerate(keys):
                strAll += k + ("\n" if idx == iLastCol else "\t")
            # data lines
            for el in pListOfDicts:
                for idxCol, key in enumerate(keys):
                    # TODO: handle a value that is itself a dict
                    strAll += str(el[key]) + ("\n" if idxCol == iLastCol else "\t")
                # for every key of the current element
            # for every element in the list
        # if check
        try:
            with open(file=pStrFileName, mode="wt", encoding="UTF-8") as fw:
                fw.write(strAll)
        except Exception as e:
            print(str(e))
            return False
        # try-except
        return len(strAll)  # the number of characters written
    # def genericListOfDictsToTsvFile
# class AmUtil
- *****
- #Am4Chan.py
- from amutil_tools import AmUtil
- import re #regular expressions
- from bs4 import BeautifulSoup
class Am4Chan:
    """Consumer for one 4chan board: construction fetches the board's first
    page (one HTTP request) and indexes all anchors, sibling boards, JPG
    links and PNG links."""

    # NOTE(review): discovery matches boards.4channel.org links while
    # buildUrlForPageNumber fetches boards.4chan.org — confirm both hosts
    # are intended.
    PATTERN_FOR_BOARD_DISCOVERY = r"^//boards\.4channel\.org/[a-zA-Z0-9_]+/$"
    PATTERN_FOR_JPGS = r".*\.jpg$|.*\.jpeg$"
    PATTERN_FOR_PNGS = r".*\.png$"

    def __init__(
        self,
        pBoardName: str
    ):
        """Store the board name and eagerly run every discovery pass."""
        self.mBoardName = pBoardName
        self.mAllResources = self.discoverAllResources()  # does the HTTP fetch
        self.mBoards = self.discoverBoards()
        self.mJpgs = self.discoverJpgs()
        self.mPngs = self.discoverPngs()
    # def __init__

    def __str__(self):
        """Human-readable summary (currently only the board name)."""
        return "board name: %s" % (self.mBoardName)
    # def __str__

    def buildUrlForPageNumber(self, piPageNumber: int) -> str:
        """Return the board URL for a 1-based page number; page 1 carries
        no numeric suffix."""
        strUrl = "https://boards.4chan.org/%s/" % (self.mBoardName)
        if piPageNumber == 1:
            return strUrl
        return "%s%d" % (strUrl, piPageNumber)
    # def buildUrlForPageNumber

    def discoverAllResources(self):
        """Fetch page 1 of the board and return a list of
        {'href': ..., 'anchor': ...} dicts for every <a> that has an href
        (duplicates are kept here; dedup happens in the discover* methods)."""
        strSourceCode = AmUtil.genericUrlReader(
            pStrUrl=self.buildUrlForPageNumber(1),
            pUserAgent=AmUtil.USER_AGENT_STRING_MOZ47,
            pReferer="https://boards.4chan.org/",
            pBytesInsteadOfStr=False
        )
        bs = BeautifulSoup(
            markup=strSourceCode,
            features="html5lib"
        )
        listResources = list()
        for a in bs.findAll("a"):  # each item is a bs4 Tag
            if "href" in a.attrs:
                listResources.append({'href': a.attrs['href'], 'anchor': a.text})
            # if
        # for
        return listResources  # stored on self.mAllResources by __init__
    # def discoverAllResources

    def filterAllResourcesByRegexp(
        self,
        pStrRegExp: str
    ):
        """Return the resources whose href matches pStrRegExp
        (case-insensitive, anchored at the start via re.match)."""
        # compile ONCE — the original recompiled the pattern per resource
        reTester = re.compile(pattern=pStrRegExp, flags=re.IGNORECASE)
        listFiltered = list()
        for dictResource in self.mAllResources:
            if reTester.match(dictResource['href']) is not None:
                listFiltered.append(dictResource)
            # if the resource's href matches the pattern
        # for every resource
        return listFiltered
    # def filterAllResourcesByRegexp

    def _uniqueAbsoluteHrefsMatching(self, pStrRegExp: str):
        """Shared helper: filter resources by regex, prefix 'https:' to make
        the protocol-relative hrefs absolute, and drop repetitions (set
        round-trip, so output order is not guaranteed)."""
        listHrefs = [
            "https:" + dictResource['href']
            for dictResource in self.filterAllResourcesByRegexp(pStrRegExp)
        ]
        return list(set(listHrefs))
    # def _uniqueAbsoluteHrefsMatching

    def discoverBoards(self):
        """Absolute URLs of other boards linked from the page, no repetitions."""
        return self._uniqueAbsoluteHrefsMatching(Am4Chan.PATTERN_FOR_BOARD_DISCOVERY)
    # def discoverBoards

    # 2022-01-05
    def discoverJpgs(self):
        """Absolute URLs of linked .jpg/.jpeg files, no repetitions."""
        return self._uniqueAbsoluteHrefsMatching(Am4Chan.PATTERN_FOR_JPGS)
    # def discoverJpgs

    def discoverPngs(self):
        """Absolute URLs of linked .png files, no repetitions."""
        return self._uniqueAbsoluteHrefsMatching(Am4Chan.PATTERN_FOR_PNGS)
    # def discoverPngs

    def downloadAllImages(self):
        """Download every discovered PNG and JPG into the current directory,
        naming each file after the last path segment of its URL."""
        for strHref in self.mPngs + self.mJpgs:
            theBytes = AmUtil.genericUrlReader(
                pStrUrl=strHref,
                pUserAgent=AmUtil.USER_AGENT_STRING_MOZ47,
                pReferer="",
                pBytesInsteadOfStr=True
            )
            if not theBytes:
                # fetch failed — genericUrlReader returned its "" error
                # sentinel; writing that in "wb" mode would raise TypeError
                continue
            # rfind -> rightmost "/", slice keeps only the file name
            strNameForImageFile = strHref[strHref.rfind("/") + 1:]
            # with-block fixes the original's handle leak on write errors
            with open(strNameForImageFile, mode="wb") as fw:
                fw.write(theBytes)
        # for
    # def downloadAllImages
# class Am4Chan
# Script entry point: build a consumer for the "wg" board (instantiation
# already performs the page fetch and all discovery passes), report it,
# then pull down every image that was discovered.
wg = Am4Chan("wg")
print(wg)
wg.downloadAllImages()
#wg.discoverAllResources()
#wg.discoverBoards() #discover other communities
#wg.discoverMedia() #media is plural of medium (jpgs, pngs, ...)
Add Comment
Please, Sign In to add comment