Advertisement
am_dot_com

FP 2022-01-03

Jan 3rd, 2022 (edited)
603
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 5.03 KB | None | 0 0
  1. from amutil_tools import AmUtil #for accessing the helpers in AmUtil
  2.  
  3. from bs4 import BeautifulSoup #for parsing html elements
  4.  
  5. import re #for regular expressions
  6.  
  7. class Am4Chan:
  8.     PATTERN_FOR_BOARD_SHORT_URLS = "^//boards\.4chan\.org/[a-zA-Z0-9_]+/$|^//boards\.4channel\.org/[a-zA-Z0-9_]+/$"
  9.     PATTERN_FOR_JPG_URL = ".*\.jpg$|.*\.jpeg$"
  10.     PATTERN_FOR_PNG = ".*\.png$"
  11.  
  12.     BOARDS_BASE_URL = "https://boards.4chan.org/"
  13.  
  14.     #https://boards.4chan.org/wg/
  15.     #https://boards.4chan.org/wg/2
  16.     #...
  17.     #https://boards.4chan.org/wg/10
  18.  
  19.     def __init__(
  20.         self,
  21.         pBoardName:str
  22.     ):
  23.         self.mBoardName = pBoardName
  24.  
  25.         self.mAllResources = self.discoverAllResources()
  26.  
  27.         self.mFoundBoards = self.filterResourcesByRegexp(
  28.             Am4Chan.PATTERN_FOR_BOARD_SHORT_URLS
  29.         )
  30.         #print(self.mFoundBoards)
  31.  
  32.         self.mJpgs = self.filterResourcesByRegexp(
  33.             Am4Chan.PATTERN_FOR_JPG_URL
  34.         )
  35.         #print (self.mJpgs)
  36.  
  37.         self.mPngs = self.filterResourcesByRegexp(
  38.             Am4Chan.PATTERN_FOR_PNG
  39.         )
  40.         #print(self.mPngs)
  41.     #def __init__
  42.  
  43.     def __str__(self):
  44.         strAll = "board name: %s\n"%(self.mBoardName)
  45.         return strAll
  46.     #def __str__
  47.  
  48.     def buildUrlForPageNumber(self, piPageNumber:int=1):
  49.         strUrl="https://boards.4chan.org/%s/"%(self.mBoardName)
  50.         if (piPageNumber==1):
  51.             return strUrl
  52.         else:
  53.             strUrl="strUrl%d"%(piPageNumber)
  54.             return strUrl
  55.         #if-else
  56.     #def buildUrlForPageNumber
  57.  
  58.     def discoverAllResources(self)->list:
  59.         """
  60.        @return: a list of dicts, each with 2 keys ("href", "anchor")
  61.        """
  62.         listRet = list()
  63.         strUrl = self.buildUrlForPageNumber(1)
  64.         strHtml =\
  65.             AmUtil.genericUrlReader(
  66.                 pStrUrl=strUrl,
  67.                 pUserAgent=AmUtil.USER_AGENT_STRING_MOZ47,
  68.                 pReferer=Am4Chan.BOARDS_BASE_URL
  69.             )
  70.         bs = BeautifulSoup(
  71.             markup=strHtml,
  72.             features="html5"
  73.         )
  74.         anchors = bs.findAll("a") #Tag objects (of BeautifulSoup)
  75.         for a in anchors:
  76.             bHrefExists = "href" in a.attrs.keys() #a.attrs is a dictionary
  77.             if (bHrefExists):
  78.                 href = a.attrs['href']
  79.                 text = a.text
  80.                 myDict = dict()
  81.                 myDict['href'] = href
  82.                 myDict['anchor'] = text
  83.                 listRet.append(myDict)
  84.             #if
  85.         #for
  86.  
  87.         return listRet
  88.     #def discoverAllResources
  89.  
  90.     def filterResourcesByRegexp(self, pRegExp:str)->list:
  91.         listRet = list()
  92.         for dictResource in self.mAllResources:
  93.             href = dictResource['href'] #e.g. "//www.4chan.org/contact"
  94.             anchor = dictResource['anchor']
  95.  
  96.             reTester = \
  97.                 re.compile(
  98.                     pattern=pRegExp, #e.g. PATTERN_FOR_BOARD_SHORT_URLS
  99.                     flags=re.IGNORECASE
  100.                 )
  101.  
  102.             resultMatch = reTester.match(href) #None is there is no match
  103.  
  104.             if(resultMatch!=None):
  105.                 listRet.append(dictResource)
  106.             #if
  107.         #for
  108.         return listRet
  109.     #def filterResourcesByRegexp
  110.  
  111.     def downloadAllImages(
  112.         self,
  113.         pDestinationDir:str="."
  114.     ):
  115.         #L1=[1, 2]; L2=[2, 3]; L3=L1+L2=[1,2,2,3]
  116.         listOfAllImages = self.mJpgs + self.mPngs
  117.         listOfAllHref = list()
  118.         for dictImage in listOfAllImages:
  119.             href = dictImage['href']
  120.             listOfAllHref.append(href)
  121.         #for
  122.  
  123.         setNoRepetitions = set(listOfAllHref) #the set constructor will eliminate all repetitions, because sets do NOT allow repeated elements
  124.         listNoRepetitions = list(setNoRepetitions) #serializable (representable as str) and with no repetitions
  125.  
  126.         for href in listNoRepetitions:
  127.             fullHref = "https:"+href
  128.             #//i.4cdn.org/wg/1639231804800.jpg
  129.             #href.find("/") #find is the "left-find" - find pos of th FIRST occurence
  130.             #href.rfind("/") #15
  131.             strFileName = href[
  132.               href.rfind("/")+1 #start AFTER the last forward-slash (/)
  133.               : #go to the end
  134.             ]
  135.  
  136.             theBytes =\
  137.                 AmUtil.genericUrlReader(
  138.                     pStrUrl = fullHref,
  139.                     pUserAgent = AmUtil.USER_AGENT_STRING_MOZ47,
  140.                     pReferer="",
  141.                     pBytesInsteadOfStr=True
  142.                 )
  143.  
  144.             fw = open(
  145.                 file=strFileName,
  146.                 mode="wb" #write bytes
  147.             )
  148.             fw.write(theBytes)
  149.             fw.close()
  150.  
  151.             strMsg="File %s (%d bytes long) saved OK!"%(strFileName, len(theBytes))
  152.             print(strMsg)
  153.         #for
  154.     #def downloadAllImages
  155. #def Am4Chan
  156.  
  157. wgConsumer = Am4Chan("wg") #instantiation
  158. print (wgConsumer)
  159. #listAllAnchors = wgConsumer.discoverAllResources()
  160. #print(listAllAnchors)
  161. wgConsumer.downloadAllImages()
  162.  
  163.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement