am_dot_com

IA 2022-01-03

Jan 3rd, 2022 (edited)
312
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.75 KB | None | 0 0
  1. #Am4Chan.py
  2. from amutil_tools import AmUtil
  3.  
  4. import re #regular expressions
  5. from bs4 import BeautifulSoup
  6.  
  7. class Am4Chan:
  8.     PATTERN_FOR_BOARD_DISCOVERY = "^//boards\.4channel\.org/[a-zA-Z0-9_]+/$"
  9.     PATTERN_FOR_JPGS = ".*\.jpg$|.*\.jpeg$"
  10.     PATTERN_FOR_PNGS = ".*\.png$"
  11.  
  12.     def __init__(
  13.         self,
  14.         pBoardName:str
  15.     ):
  16.         self.mBoardName = pBoardName
  17.  
  18.         self.mAllResources = self.discoverAllResources()
  19.         #print(self.mAllResources)
  20.  
  21.         self.mBoards = self.discoverBoards()
  22.         print(self.mBoards)
  23.     #def __init__
  24.  
  25.     #dunder / double underscore
  26.     def __str__(self):
  27.         strAll = "board name: %s"%(self.mBoardName)
  28.         return strAll
  29.     #def __str__
  30.  
  31.     def buildUrlForPageNumber(self, piPageNumber:int)->str:
  32.         strUrl = "https://boards.4chan.org/%s/"%(self.mBoardName)
  33.         if(piPageNumber==1):
  34.             return strUrl
  35.         else:
  36.             #strUrl = strUrl+str(piPageNumber)
  37.             strUrl = "%s%d"%(strUrl, piPageNumber)
  38.             return strUrl
  39.         #if-else
  40.     #def buildUrlForPageNumber
  41.  
  42.     def discoverAllResources(self):
  43.         listResources = list()
  44.         strUrl = self.buildUrlForPageNumber(1)
  45.         strSourceCode =\
  46.             AmUtil.genericUrlReader(
  47.                 pStrUrl=strUrl,
  48.                 pUserAgent=AmUtil.USER_AGENT_STRING_MOZ47,
  49.                 pReferer="https://boards.4chan.org/",
  50.                 pBytesInsteadOfStr=False
  51.             )
  52.         #print(strSourceCode)
  53.         bs = BeautifulSoup(
  54.             markup=strSourceCode,
  55.             features="html5lib"
  56.         )
  57.         anchors = bs.findAll("a") #each object is of "Tag" (BeautifulSoup), non-hashable
  58.         for a in anchors:
  59.             bHrefExists = "href" in a.attrs.keys()
  60.             if (bHrefExists):
  61.                 href = a.attrs['href']
  62.                 anchor = a.text
  63.                 myDict = dict()
  64.                 myDict['href'] = href
  65.                 myDict['anchor'] = anchor
  66.                 listResources.append(myDict)
  67.             #if
  68.         #for
  69.  
  70.         #self.mAllResources = listResources #this being is done at __init__
  71.         return listResources
  72.     #def discoverAllResources
  73.  
  74.     def filterAllResourcesByRegexp(
  75.         self,
  76.         pStrRegExp:str #this is the regular expression that will be used to filter results
  77.     ):
  78.         listFiltered = list()
  79.         for dictResource in self.mAllResources:
  80.             href = dictResource['href']
  81.  
  82.             reTester = re.compile(
  83.                 pStrRegExp,
  84.                 re.IGNORECASE
  85.             )
  86.             matchResult = reTester.match(href)
  87.             bNoMatch = matchResult==None
  88.             bMatch = not bNoMatch
  89.             if (bMatch):
  90.                 listFiltered.append(dictResource)
  91.             #if the resource's href matches the pattern
  92.         #for every resource
  93.         return listFiltered
  94.     #def filterAllResourcesByRegexp
  95.  
  96.     def discoverBoards(self):
  97.         listBoards = list()
  98.         #incomplete URLs, may have repetitions
  99.         foundBoards = self.filterAllResourcesByRegexp(Am4Chan.PATTERN_FOR_BOARD_DISCOVERY)
  100.         for dictBoard in foundBoards:
  101.             href = dictBoard['href']
  102.             listBoards.append("https:"+href)
  103.         #for
  104.         #list for absolute URLs for other communities
  105.  
  106.         #now, elimininate repititions, if there are any
  107.         setNoRepitions = set(listBoards) #set elements are non-hashable
  108.         listNoReptitions = list(setNoRepitions)
  109.         return listNoReptitions
  110.     #def discoverBoards
  111. #class Am4Chan
  112.  
  113. wg = Am4Chan("wg") #instantiate the consumer
  114. print (wg)
  115. #wg.discoverAllResources()
  116. #wg.discoverBoards() #discover other communities
  117. #wg.discoverMedia() #media is plural of medium (jpgs, pngs, ...)
Add Comment
Please, Sign In to add comment