Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #Am4Chan.py
- from amutil_tools import AmUtil
- import re #regular expressions
- from bs4 import BeautifulSoup
class Am4Chan:
    """Consumer for a single 4chan board.

    On construction the first page of the board is fetched, every anchor
    that carries an ``href`` is collected into ``mAllResources`` and the
    links pointing at other boards are collected into ``mBoards``.
    """

    # Raw strings: "\." in a plain string literal is an invalid escape
    # sequence (it only works by accident and warns on modern Python).
    # The runtime values of the patterns are unchanged.
    PATTERN_FOR_BOARD_DISCOVERY = r"^//boards\.4channel\.org/[a-zA-Z0-9_]+/$"
    PATTERN_FOR_JPGS = r".*\.jpg$|.*\.jpeg$"
    PATTERN_FOR_PNGS = r".*\.png$"

    def __init__(
        self,
        pBoardName:str
    ):
        """Store the board name, then discover its resources and boards.

        :param pBoardName: short board identifier used in the URL, e.g. "wg".

        NOTE: construction performs a network read (through
        discoverAllResources) and prints the discovered board list.
        """
        self.mBoardName = pBoardName
        self.mAllResources = self.discoverAllResources()
        self.mBoards = self.discoverBoards()
        print(self.mBoards)

    def __str__(self):
        """Return a short human-readable description of this consumer."""
        return "board name: %s"%(self.mBoardName)

    def buildUrlForPageNumber(self, piPageNumber:int)->str:
        """Return the absolute URL of a page of this board.

        Page 1 is the board root; any other page number is appended to
        the board root.

        :param piPageNumber: page number.
        :return: the page URL as a string.
        """
        strBoardRoot = "https://boards.4chan.org/%s/"%(self.mBoardName)
        if piPageNumber == 1:
            return strBoardRoot
        return "%s%d"%(strBoardRoot, piPageNumber)

    def discoverAllResources(self):
        """Fetch page 1 of the board and collect every anchor with an href.

        :return: list of dicts, one per ``<a>`` tag that has an ``href``
            attribute, with keys ``'href'`` (the attribute value) and
            ``'anchor'`` (the tag's text).
        """
        strUrl = self.buildUrlForPageNumber(1)
        # NOTE(review): AmUtil.genericUrlReader is defined outside this file;
        # presumably it returns the page source as str when
        # pBytesInsteadOfStr is False - confirm against amutil_tools.
        strSourceCode = AmUtil.genericUrlReader(
            pStrUrl=strUrl,
            pUserAgent=AmUtil.USER_AGENT_STRING_MOZ47,
            pReferer="https://boards.4chan.org/",
            pBytesInsteadOfStr=False
        )
        bs = BeautifulSoup(
            markup=strSourceCode,
            features="html5lib"
        )
        # find_all is the current bs4 spelling; findAll is a deprecated alias.
        return [
            {'href': a.attrs['href'], 'anchor': a.text}
            for a in bs.find_all("a")
            if "href" in a.attrs
        ]

    def filterAllResourcesByRegexp(
        self,
        pStrRegExp:str
    ):
        """Return the resources whose href matches a regular expression.

        The expression is applied case-insensitively with ``match`` (i.e.
        anchored at the start of the href).

        :param pStrRegExp: the regular expression used to filter results.
        :return: list of the matching resource dicts, in their original
            order (the same dict objects held in ``mAllResources``).
        """
        # Compile once: the pattern does not change between resources.
        reTester = re.compile(pStrRegExp, re.IGNORECASE)
        return [
            dictResource
            for dictResource in self.mAllResources
            if reTester.match(dictResource['href']) is not None
        ]

    def discoverBoards(self):
        """Return absolute URLs of the boards linked from the page.

        The matching hrefs are protocol-relative ("//boards..."), so
        "https:" is prepended. Duplicates are removed while keeping the
        order of first appearance, so the result is deterministic (a plain
        ``set`` round-trip would return the URLs in arbitrary order).

        :return: list of unique absolute board URLs.
        """
        foundBoards = self.filterAllResourcesByRegexp(
            Am4Chan.PATTERN_FOR_BOARD_DISCOVERY
        )
        setSeen = set()
        listBoards = list()
        for dictBoard in foundBoards:
            strBoardUrl = "https:" + dictBoard['href']
            if strBoardUrl not in setSeen:
                setSeen.add(strBoardUrl)
                listBoards.append(strBoardUrl)
        return listBoards
# Run the demo only when this file is executed as a script, so that merely
# importing the module does not trigger a network fetch and console output.
if __name__ == "__main__":
    wg = Am4Chan("wg") #instantiate the consumer
    print (wg)
    # Resource and board discovery already happen inside the constructor.
    # TODO: wg.discoverMedia() is not implemented yet
    # (media is the plural of medium: jpgs, pngs, ...)
Add Comment
Please, Sign In to add comment