Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # AmBotTools.py
- import certifi
- import ssl
- from urllib.request import urlopen, Request
- from http.client import HTTPResponse
- import bs4
- from bs4 import BeautifulSoup
class AmBotTools:
    """Small HTTP helper: fetches pages over a certifi-backed TLS
    context, extracts anchors with BeautifulSoup, and downloads
    binary content to disk."""

    # Dictionary keys used in the anchor dicts returned by getAnchors.
    KEY_ANCHOR = "anchor"
    KEY_HREF = "href"
    # User-Agent signatures for request headers.
    CHROME_SIGNATURE = "Chrome"
    FF78_SIGNATURE =\
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:78.0) Gecko/20100101 Firefox/78.0"

    def __init__(self):
        # TLS context that validates servers against certifi's CA bundle.
        self.mCAFile = certifi.where()
        self.mSSLContext =\
            ssl.create_default_context(
                cafile=self.mCAFile
            )
    # def __init__

    def consumeUrl(
        self,
        pUrl: str,
        pbPreferBytes: bool = False,
        pStrEncoding: str = "UTF-8",
        pHeaders: dict = None
    ):
        """Fetch pUrl and return its body.

        pUrl           -- address to fetch.
        pbPreferBytes  -- when True return the raw bytes (for images
                          etc.); otherwise decode to str.
        pStrEncoding   -- encoding used when decoding to str.
        pHeaders       -- optional extra HTTP headers. Defaults to None
                          instead of {} to avoid the shared
                          mutable-default-argument pitfall.
        Returns bytes or str depending on pbPreferBytes.
        """
        theRequest: Request = Request(
            url=pUrl,
            # custom http-headers (fresh dict when none supplied)
            headers=pHeaders if pHeaders is not None else {}
        )
        # Context manager guarantees the HTTP response is closed,
        # even if read() raises.
        with urlopen(
            theRequest,
            context=self.mSSLContext
        ) as response:
            theBytes = response.read()
        if pbPreferBytes:
            return theBytes
        else:
            strResponse = str(
                theBytes,
                pStrEncoding
            )
            return strResponse
        # if-else
    # def consumeUrl

    def getAnchors(
        self,
        pUrl: str
    ):
        """Download pUrl and return a list of dicts, one per <a> tag:
        {KEY_ANCHOR: anchor text, KEY_HREF: href or "" when absent}."""
        listOfFoundAnchors = list()
        strConteudo = \
            self.consumeUrl(
                pUrl=pUrl
            )
        if strConteudo:
            bs = BeautifulSoup(
                strConteudo,
                "html5lib"
            )
            # findAll returns [] when there are no anchors, so the
            # loop is naturally a no-op in that case.
            for anchor in bs.findAll("a"):
                texto: str = anchor.text
                # Not every anchor carries an href attribute;
                # dict.get gives "" instead of raising KeyError.
                href = anchor.attrs.get("href", "")
                listOfFoundAnchors.append(
                    {
                        AmBotTools.KEY_ANCHOR: texto,
                        AmBotTools.KEY_HREF: href
                    }
                )
            # for every anchor
        # if there is content
        return listOfFoundAnchors
    # def getAnchors

    @staticmethod
    def getFilteredByHrefAnchors(
        pListOfAnchors: list,
        pStrHrefFilter: str
    ):
        """Filter a list of anchor dicts, keeping only those whose
        href attribute contains pStrHrefFilter.
        Example:
            AmBotTools.getFilteredByHrefAnchors(
                theAnchors,   # a list
                "4cdn.org"    # a filter phrase
            )
        """
        return [
            thing for thing in pListOfAnchors
            if pStrHrefFilter in thing[AmBotTools.KEY_HREF]
        ]
    # def getFilteredByHrefAnchors

    # use for jpg, png, webm, etc, NOT for text
    def downloadBin(
        self,
        pUrlBin: str,
        pDestinationName: str,
        pHeaders: dict
    ):
        """Download the binary at pUrlBin into pDestinationName.
        Returns True on success, False on a file-system error
        (best-effort: the error is printed, not raised). Network
        errors from consumeUrl still propagate, as before."""
        theBytes =\
            self.consumeUrl(
                pUrl=pUrlBin,
                pbPreferBytes=True,
                pHeaders=pHeaders
            )
        try:
            # "wb" = write binary; the with-statement closes the file
            # even if write() raises, fixing the handle leak.
            with open(pDestinationName, "wb") as fw:
                fw.write(theBytes)
            return True
        except OSError as e:
            # narrowed from a blanket Exception: only file-system
            # failures are expected here
            print(str(e))
        # try-except
        return False
    # def downloadBin
# class AmBotTools
- *****************************
- # bot4chan.py
- # um bot para consumir a rede social 4chan.org
- from AmBotTools import AmBotTools
- import bs4
- from bs4 import BeautifulSoup
- """
- em HTML todo o conteúdo está na forma
- <marca a1=v1 a2=v2>conteúdo</marca>
- <p class="info">A taxa juro XPTO é 5.6%</p>
- <a href="endereço">Texto do âncora</a>
- """
class Bot4Chan:
    """A bot that consumes the 4chan.org image board named at
    construction time: it lists the board's anchors, keeps the
    content ones, and downloads the binaries they point at."""

    # Only hrefs containing this expression point at board content.
    CONTENT_FILTER = "4cdn.org"

    def __init__(
        self,
        pStrBoard: str
    ):
        # Name of the board to consume, e.g. "an".
        self.mBoard = pStrBoard
    # def __init__

    # https://boards.4channel.org/<board name>/
    def getUrlForBoard(self):
        """Build the entry-page URL for the configured board."""
        return f"https://boards.4channel.org/{self.mBoard}/"
    # def getUrlForBoard

    def getBoardAnchors(self):
        """Fetch the board page and return every anchor found on it,
        as a list of dicts (see AmBotTools.getAnchors)."""
        helper = AmBotTools()
        return helper.getAnchors(
            pUrl=self.getUrlForBoard()
        )
    # def getBoardAnchors

    def getAllContentAnchors(
        self,
        pbPrefixWithHttps: bool = True
    ):
        """Return only the board anchors whose href matches
        CONTENT_FILTER. When pbPrefixWithHttps is True (default),
        each href is made absolute by prefixing it with "https:"
        (board hrefs are protocol-relative, i.e. start with //)."""
        contentFiltered =\
            AmBotTools.getFilteredByHrefAnchors(
                self.getBoardAnchors(),
                Bot4Chan.CONTENT_FILTER
            )
        if not pbPrefixWithHttps:
            return contentFiltered
        prefixed = list()
        for entry in contentFiltered:
            # mutate the dict in place, as callers may share it
            entry[AmBotTools.KEY_HREF] =\
                f"https:{entry[AmBotTools.KEY_HREF]}"
            prefixed.append(entry)
        return prefixed
    # def getAllContentAnchors

    def getHrefsForDownload(
        self,
        pListAnchorsAsDicts
    ):
        """Given the return of getAllContentAnchors, keep only the
        hrefs and drop duplicates (a set does the de-duplication);
        returns them as a list."""
        uniqueHrefs: set = {
            entry[AmBotTools.KEY_HREF]
            for entry in pListAnchorsAsDicts
        }
        return list(uniqueHrefs)
    # def getHrefsForDownload

    @staticmethod
    def headersFor4Chan(
        #pUA:str=AmBotTools.FF78_SIGNATURE,
        pUA: str = AmBotTools.CHROME_SIGNATURE,
        pReferer: str = ""
    ):
        """HTTP headers (user agent + referer) sent to 4chan."""
        return {
            'User-Agent': pUA,
            'referer': pReferer,
        }
    # def headersFor4Chan

    def dlBin(
        self,
        pUrl: str,
        pFileName: str
    ):
        """Download one binary url into pFileName.
        Returns True on success, False otherwise."""
        helper = AmBotTools()
        return helper.downloadBin(
            pUrlBin=pUrl,
            pHeaders=Bot4Chan.headersFor4Chan(),
            pDestinationName=pFileName
        )
    # def dlBin

    @staticmethod
    def getFileNameFromUrl(pUrl: str):
        """Last path segment of pUrl, e.g.
        "https://i.4cdn.org/an/1670916779952287.jpg" -> the jpg name."""
        *_, lastPart = pUrl.split("/")
        return lastPart
    # def getFileNameFromUrl

    def dlAllBins(self, pListUrlsToDl):
        """Download every url in the list; returns one bool per url,
        in the same order, telling whether that download succeeded."""
        return [
            self.dlBin(
                pUrl=oneUrl,
                pFileName=Bot4Chan.getFileNameFromUrl(oneUrl)
            )
            for oneUrl in pListUrlsToDl
        ]
    # def dlAllBins
# class Bot4Chan
# The End - successfully consumed the binaries from the entry page
# Script entry point: scrape the configured board ("an") and download
# every content binary linked from its front page. NOTE(review): runs
# at import time with live network I/O.
bot = Bot4Chan("an")
#theAnchors = bot.getBoardAnchors()
# Content anchors, possibly containing repeated hrefs.
theAnchorsWithReps = bot.getAllContentAnchors()
# De-duplicated list of downloadable hrefs.
hrefsOnlyAndNoReps =\
    bot.getHrefsForDownload(
        theAnchorsWithReps
    )
# One bool per url: True when that download succeeded.
listResults = bot.dlAllBins(hrefsOnlyAndNoReps)
print(listResults)
Advertisement
Add Comment
Please, Sign In to add comment