am_dot_com

FP 2022-12-13

Dec 13th, 2022 (edited)
95
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 8.56 KB | None | 0 0
  1. # AmBotTools.py
  2. import certifi
  3. import ssl
  4. from urllib.request import urlopen, Request
  5. from http.client import HTTPResponse
  6.  
  7. import bs4
  8. from bs4 import BeautifulSoup
  9.  
  10. class AmBotTools:
  11. KEY_ANCHOR = "anchor"
  12. KEY_HREF = "href"
  13.  
  14. CHROME_SIGNATURE = "Chrome"
  15.  
  16. FF78_SIGNATURE =\
  17. "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:78.0) Gecko/20100101 Firefox/78.0"
  18.  
  19. def __init__(self):
  20. self.mCAFile = certifi.where()
  21. self.mSSLContext =\
  22. ssl.create_default_context(
  23. cafile=self.mCAFile
  24. )
  25. # def __init__
  26.  
  27. def consumeUrl(
  28. self,
  29. pUrl:str,
  30. pbPreferBytes:bool=False,
  31. pStrEncoding="UTF-8",
  32. pHeaders:dict={}
  33. ):
  34. theRequest:Request = Request(
  35. url=pUrl,
  36. # custom http-headers
  37. headers=pHeaders
  38. )
  39.  
  40. response:HTTPResponse =\
  41. urlopen(
  42. theRequest,
  43. #url = pUrl,
  44. context=self.mSSLContext
  45. )
  46. theBytes = response.read()
  47. if(pbPreferBytes):
  48. return theBytes
  49. else:
  50. strResponse = str(
  51. theBytes,
  52. pStrEncoding
  53. )
  54. return strResponse
  55. # if-else
  56. # def consumeUrl
  57.  
  58. def getAnchors(
  59. self,
  60. pUrl:str
  61. ):
  62. listOfFoundAnchors = list() #
  63.  
  64. strConteudo = \
  65. self.consumeUrl(
  66. pUrl=pUrl
  67. )
  68. if (strConteudo):
  69. bs = BeautifulSoup(
  70. strConteudo,
  71. "html5lib"
  72. )
  73. if (bs):
  74. theAs = bs.findAll("a")
  75.  
  76. if (theAs):
  77. for anchor in theAs:
  78. #texto:str = anchor.title
  79. texto: str = anchor.text
  80. bThereIsHref = "href" in anchor.attrs.keys()
  81. if (bThereIsHref):
  82. href = anchor.attrs["href"]
  83. else:
  84. href = ""
  85. # if-else
  86.  
  87. listOfFoundAnchors.append(
  88. {
  89. AmBotTools.KEY_ANCHOR:texto,
  90. AmBotTools.KEY_HREF:href
  91. }
  92. )
  93. # for every anchor
  94. # if there are anchors
  95. # if it was possible to get a bs object
  96. # if there is content
  97.  
  98. return listOfFoundAnchors
  99. # def getAnchors
  100.  
  101. """
  102. escreva um método que permita filtrar
  103. uma lista de anchors
  104. incluindo no retorno apenas aquelas
  105. cujo atributo href
  106. contenha certa expressão.
  107. Por exemplo:
  108. AmBotTools.getFilteredByHrefAnchors(
  109. theAnchors, # uma lista
  110. "4cdn.org" # uma frase de filtro
  111. )
  112. """
  113. @staticmethod
  114. def getFilteredByHrefAnchors(
  115. pListOfAnchors:list,
  116. pStrHrefFilter:str
  117. ):
  118. filtered = list()
  119.  
  120. for thing in pListOfAnchors:
  121. #anchor = thing[AmBotTools.KEY_ANCHOR]
  122.  
  123. href = thing[AmBotTools.KEY_HREF]
  124. bSatisfiesFilter =\
  125. pStrHrefFilter in href
  126. if(bSatisfiesFilter):
  127. filtered.append(thing)
  128. # if
  129. # for
  130.  
  131. return filtered
  132. # def getFilteredByHrefAnchors
  133.  
  134. # use for jpg, png, webm, etc, NOT for text
  135. def downloadBin(
  136. self,
  137. pUrlBin:str,
  138. pDestinationName:str,
  139. pHeaders:dict
  140. ):
  141. theBytes =\
  142. self.consumeUrl(
  143. pUrl=pUrlBin,
  144. pbPreferBytes=True,
  145. pHeaders=pHeaders
  146. )
  147. try:
  148. fw = open(
  149. pDestinationName,
  150. "wb" # write binary
  151. )
  152. if(fw):
  153. fw.write(theBytes)
  154. fw.close()
  155. return True
  156. #if
  157. except Exception as e:
  158. print(str(e))
  159. #try-except
  160.  
  161. return False
  162. # def downloadBin
  163. # class AmBotTools
  164.  
  165.  
  166. *****************************
  167.  
  168.  
# bot4chan.py
# A bot for consuming the 4chan.org social network
  171. from AmBotTools import AmBotTools
  172.  
  173. import bs4
  174. from bs4 import BeautifulSoup
  175.  
"""
In HTML, all content takes the form
<tag a1=v1 a2=v2>content</tag>
<p class="info">The XPTO interest rate is 5.6%</p>

<a href="address">Anchor text</a>
"""
  183.  
  184. class Bot4Chan:
  185. CONTENT_FILTER = "4cdn.org"
  186.  
  187. def __init__(
  188. self,
  189. pStrBoard:str
  190. ):
  191. self.mBoard = pStrBoard
  192. # def __init__
  193.  
  194. # https://boards.4channel.org/<nome da board/
  195. def getUrlForBoard(self):
  196. strUrl =\
  197. f"https://boards.4channel.org/{self.mBoard}/"
  198. return strUrl
  199. # def getUrlForBoard
  200.  
  201. def getBoardAnchors(self):
  202. bot = AmBotTools()
  203. listOfDictsEachOneIsAnAnchor =\
  204. bot.getAnchors(
  205. pUrl = self.getUrlForBoard()
  206. )
  207. return listOfDictsEachOneIsAnAnchor
  208. # def getBoardAnchors
  209.  
  210. """
  211. modificar
  212. getAllContentAnchors
  213. para que o retorno tenha apenas
  214. âncoras cujos hrefs correspondam a
  215. endereços absolutos, começados por https
  216. """
  217. def getAllContentAnchors(
  218. self,
  219. pbPrefixWithHttps:bool=True
  220. ):
  221. ret = list()
  222.  
  223. unfiltered = self.getBoardAnchors()
  224. contentFiltered =\
  225. AmBotTools.getFilteredByHrefAnchors(
  226. unfiltered,
  227. Bot4Chan.CONTENT_FILTER
  228. )
  229. if (not pbPrefixWithHttps):
  230. return contentFiltered
  231. else:
  232. for thing in contentFiltered:
  233. old = thing[AmBotTools.KEY_HREF]
  234. thing[AmBotTools.KEY_HREF] =\
  235. f"https:{old}"
  236.  
  237. ret.append(thing)
  238. # for
  239. # if-else
  240.  
  241. return ret
  242. # def getAllContentAnchors
  243.  
  244. """
  245. escrever um método que, recebida uma lista que é
  246. o retorno de getAllContentAnchors,
  247. elimina hrefs duplicados e retorna apenas a
  248. list dos hrefs sem repetições.
  249. Sugestão: utilize Set
  250. """
  251. def getHrefsForDownload(
  252. self,
  253. pListAnchorsAsDicts
  254. ):
  255. apenasHrefs = list()
  256. #1 - fazer uma lista APENAS dos hrefs
  257. for coisa in pListAnchorsAsDicts:
  258. href = coisa[AmBotTools.KEY_HREF]
  259. apenasHrefs.append(href)
  260. # for
  261.  
  262. #2 - eliminar repetições da lista
  263. setApenasHrefs:set =\
  264. set(apenasHrefs)
  265.  
  266. #3 - retornar o resultado enquanto lista
  267. return list(setApenasHrefs)
  268. # def getHrefsForDownload
  269.  
  270. @staticmethod
  271. def headersFor4Chan(
  272. #pUA:str=AmBotTools.FF78_SIGNATURE,
  273. pUA:str=AmBotTools.CHROME_SIGNATURE,
  274. pReferer:str=""
  275. ):
  276. headers = dict()
  277. headers['User-Agent']=pUA
  278. headers['referer']=pReferer
  279. return headers
  280. # def headersFor4Chan
  281.  
  282. def dlBin(
  283. self,
  284. pUrl:str,
  285. pFileName:str
  286. ):
  287. bot = AmBotTools()
  288. bSuccessOrNot =\
  289. bot.downloadBin(
  290. pUrlBin=pUrl,
  291. pHeaders=Bot4Chan.headersFor4Chan(),
  292. pDestinationName=pFileName
  293. )
  294. return bSuccessOrNot
  295. # def dlBin
  296.  
  297. @staticmethod
  298. def getFileNameFromUrl(pUrl:str):
  299. #return pUrl.split("/")[-1]
  300. # exemplo:
  301. # pUrl = "https://i.4cdn.org/an/1670916779952287.jpg"
  302. aPartes = pUrl.split("/")
  303. iQuantasPartes = len(aPartes)
  304. #nome = aPartes[iQuantasPartes-1]
  305. nome = aPartes[-1]
  306. return nome
  307. # def getFileNameFromUrl
  308.  
  309. def dlAllBins(self, pListUrlsToDl):
  310. listResults = list()
  311. for url in pListUrlsToDl:
  312. destName =\
  313. Bot4Chan.getFileNameFromUrl(url)
  314. bOK = self.dlBin(
  315. pUrl=url,
  316. pFileName=destName
  317. )
  318. listResults.append(bOK)
  319. # for
  320. return listResults
  321. # def dlAllBins
  322. # class Bot4Chan
  323.  
# The End - the binaries on the board's front page were downloaded successfully
  325.  
  326. bot = Bot4Chan("an")
  327. #theAnchors = bot.getBoardAnchors()
  328. theAnchorsWithReps = bot.getAllContentAnchors()
  329. hrefsOnlyAndNoReps =\
  330. bot.getHrefsForDownload(
  331. theAnchorsWithReps
  332. )
  333. listResults = bot.dlAllBins(hrefsOnlyAndNoReps)
  334. print(listResults)
Advertisement
Add Comment
Please, Sign In to add comment