Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from amutil_tools import AmUtil #for accessing the helpers in AmUtil
- from bs4 import BeautifulSoup #for parsing html elements
- import re #for regular expressions
class Am4Chan:
    """
    Consumer for one 4chan board: collects every anchor found on the
    board's first page and can download the JPG/PNG files they link to.

    Instance attributes (all set by __init__):
      mBoardName    - the board's short name, e.g. "wg"
      mAllResources - list of dicts with keys "href" and "anchor"
      mFoundBoards  - resources whose href matches PATTERN_FOR_BOARD_SHORT_URLS
      mJpgs         - resources whose href matches PATTERN_FOR_JPG_URL
      mPngs         - resources whose href matches PATTERN_FOR_PNG
    """

    # Raw strings, so that "\." reaches the regex engine untouched and does
    # not trigger Python's "invalid escape sequence" warning.
    PATTERN_FOR_BOARD_SHORT_URLS = r"^//boards\.4chan\.org/[a-zA-Z0-9_]+/$|^//boards\.4channel\.org/[a-zA-Z0-9_]+/$"
    PATTERN_FOR_JPG_URL = r".*\.jpg$|.*\.jpeg$"
    PATTERN_FOR_PNG = r".*\.png$"

    BOARDS_BASE_URL = "https://boards.4chan.org/"
    #https://boards.4chan.org/wg/
    #https://boards.4chan.org/wg/2
    #...
    #https://boards.4chan.org/wg/10

    def __init__(
        self,
        pBoardName:str
    ):
        """
        Discover the anchors of the board's first page and pre-filter them.

        Note: constructing an instance fetches the board page, because it
        calls discoverAllResources().

        @param pBoardName: short board name as used in the URL, e.g. "wg"
        """
        self.mBoardName = pBoardName
        self.mAllResources = self.discoverAllResources()
        self.mFoundBoards = self.filterResourcesByRegexp(
            Am4Chan.PATTERN_FOR_BOARD_SHORT_URLS
        )
        self.mJpgs = self.filterResourcesByRegexp(
            Am4Chan.PATTERN_FOR_JPG_URL
        )
        self.mPngs = self.filterResourcesByRegexp(
            Am4Chan.PATTERN_FOR_PNG
        )
    #def __init__

    def __str__(self):
        """
        @return: a one-line, newline-terminated description of the board
        """
        return "board name: %s\n"%(self.mBoardName)
    #def __str__

    def buildUrlForPageNumber(self, piPageNumber:int=1):
        """
        Build the URL of one page of the board.

        Page 1 is the bare board URL (e.g. https://boards.4chan.org/wg/);
        any other page has its number appended
        (e.g. https://boards.4chan.org/wg/2).

        @param piPageNumber: the page number; defaults to 1
        @return: the URL as a str
        """
        strUrl = "%s%s/"%(Am4Chan.BOARDS_BASE_URL, self.mBoardName)
        if (piPageNumber==1):
            return strUrl
        #if
        # Append the number to the URL itself; the previous code formatted
        # the literal text "strUrl" and so returned e.g. "strUrl2".
        return "%s%d"%(strUrl, piPageNumber)
    #def buildUrlForPageNumber

    def discoverAllResources(self)->list:
        """
        Fetch the first page of the board and collect all of its anchors
        that have an href attribute. Only page 1 is read.

        @return: a list of dicts, each with 2 keys ("href", "anchor")
        """
        strUrl = self.buildUrlForPageNumber(1)
        # NOTE(review): genericUrlReader presumably returns the page's HTML
        # as a str here - confirm against amutil_tools.
        strHtml =\
            AmUtil.genericUrlReader(
                pStrUrl=strUrl,
                pUserAgent=AmUtil.USER_AGENT_STRING_MOZ47,
                pReferer=Am4Chan.BOARDS_BASE_URL
            )
        bs = BeautifulSoup(
            markup=strHtml,
            features="html5"
        )
        listRet = list()
        # find_all is the current bs4 name for the legacy findAll alias
        for a in bs.find_all("a"): #Tag objects (of BeautifulSoup)
            if ("href" in a.attrs): #a.attrs is a dictionary
                listRet.append(
                    {
                        'href': a.attrs['href'],
                        'anchor': a.text
                    }
                )
            #if
        #for
        return listRet
    #def discoverAllResources

    def filterResourcesByRegexp(self, pRegExp:str)->list:
        """
        Select the discovered resources whose href matches a pattern.
        Matching is case-insensitive and anchored at the start of the href
        (re.match semantics).

        @param pRegExp: the regular expression, e.g. PATTERN_FOR_JPG_URL
        @return: the matching dicts of self.mAllResources, in original order
        """
        # compiled once, outside the loop: the pattern does not change
        reTester = re.compile(pRegExp, flags=re.IGNORECASE)
        listRet = list()
        for dictResource in self.mAllResources:
            href = dictResource['href'] #e.g. "//www.4chan.org/contact"
            #match returns None if there is no match
            if (reTester.match(href) is not None):
                listRet.append(dictResource)
            #if
        #for
        return listRet
    #def filterResourcesByRegexp

    def downloadAllImages(
        self,
        pDestinationDir:str="."
    ):
        """
        Download every distinct JPG/PNG discovered on the board page and
        save each one under its original file name, printing one
        confirmation line per saved file.

        @param pDestinationDir: directory that receives the files; it is
        created when missing. Defaults to the current directory.
        """
        import os #local import: only this method needs it

        if (pDestinationDir):
            os.makedirs(pDestinationDir, exist_ok=True)
        #if

        listOfAllHref = [
            dictImage['href'] for dictImage in self.mJpgs + self.mPngs
        ]
        #dict keys are unique AND keep insertion order, so this drops the
        #repetitions while keeping a deterministic download order
        listNoRepetitions = list(dict.fromkeys(listOfAllHref))

        for href in listNoRepetitions:
            #e.g. //i.4cdn.org/wg/1639231804800.jpg
            #only protocol-relative hrefs need a scheme in front of them
            if (href.startswith("//")):
                fullHref = "https:"+href
            else:
                fullHref = href
            #if-else

            #rfind gives the position of the LAST forward-slash (/);
            #the file name is everything after it
            strFileName = href[href.rfind("/")+1:]
            strFilePath = os.path.join(pDestinationDir, strFileName)

            # NOTE(review): pBytesInsteadOfStr=True presumably makes the
            # helper return bytes - confirm against amutil_tools.
            theBytes =\
                AmUtil.genericUrlReader(
                    pStrUrl = fullHref,
                    pUserAgent = AmUtil.USER_AGENT_STRING_MOZ47,
                    pReferer="",
                    pBytesInsteadOfStr=True
                )

            #"with" closes the file even if the write fails
            with open(file=strFilePath, mode="wb") as fw: #write bytes
                fw.write(theBytes)
            #with

            strMsg="File %s (%d bytes long) saved OK!"%(strFileName, len(theBytes))
            print(strMsg)
        #for
    #def downloadAllImages
#class Am4Chan
# Script entry point: build a consumer for the "wg" board (the constructor
# itself discovers the board's resources), print its description, then
# download the discovered images into the current directory.
wgConsumer = Am4Chan(pBoardName="wg")
print(str(wgConsumer))
wgConsumer.downloadAllImages(pDestinationDir=".")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement