am_dot_com

FP 2022-12-20

Dec 20th, 2022 (edited)
127
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 12.15 KB | None | 0 0
  1. # AmBotTools.py
  2. import certifi
  3. import ssl
  4. from urllib.request import urlopen, Request
  5. from http.client import HTTPResponse
  6.  
  7. import bs4
  8. from bs4 import BeautifulSoup
  9.  
  10. class AmBotTools:
  11. KEY_ANCHOR = "anchor"
  12. KEY_HREF = "href"
  13.  
  14. CHROME_SIGNATURE = "Chrome"
  15.  
  16. FF78_SIGNATURE =\
  17. "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:78.0) Gecko/20100101 Firefox/78.0"
  18.  
  19. def __init__(self):
  20. self.mCAFile = certifi.where()
  21. self.mSSLContext =\
  22. ssl.create_default_context(
  23. cafile=self.mCAFile
  24. )
  25. # def __init__
  26.  
  27. def consumeUrl(
  28. self,
  29. pUrl:str,
  30. pbPreferBytes:bool=False,
  31. pStrEncoding="UTF-8",
  32. pHeaders:dict={}
  33. ):
  34. theRequest:Request = Request(
  35. url=pUrl,
  36. # custom http-headers
  37. headers=pHeaders
  38. )
  39.  
  40. response:HTTPResponse =\
  41. urlopen(
  42. theRequest,
  43. #url = pUrl,
  44. context=self.mSSLContext
  45. )
  46. theBytes = response.read()
  47. if(pbPreferBytes):
  48. return theBytes
  49. else:
  50. strResponse = str(
  51. theBytes,
  52. pStrEncoding
  53. )
  54. return strResponse
  55. # if-else
  56. # def consumeUrl
  57.  
  58. def getImgs(
  59. self,
  60. pUrl:str
  61. ):
  62. listOfImgSrcs = list()
  63.  
  64. strContent = self.consumeUrl(pUrl=pUrl)
  65. if(strContent):
  66. bs = BeautifulSoup(
  67. strContent,
  68. "html5lib" # parser => python -m pip install html5lib
  69. )
  70.  
  71. if(bs):
  72. theImgs = bs.findAll("img") #<img src="1.jpg" alt="bla bla">
  73. for img in theImgs:
  74. if("src" in img.attrs.keys()): # é um dict
  75. src = img.attrs['src']
  76. listOfImgSrcs.append(src)
  77. # if
  78. # for
  79. # if
  80. # if
  81.  
  82. return listOfImgSrcs
  83. # def getImgs
  84.  
  85. def getAnchors(
  86. self,
  87. pUrl:str
  88. ):
  89. listOfFoundAnchors = list() #
  90.  
  91. strConteudo = \
  92. self.consumeUrl(
  93. pUrl=pUrl
  94. )
  95. if (strConteudo):
  96. bs = BeautifulSoup(
  97. strConteudo,
  98. "html5lib"
  99. )
  100. if (bs):
  101. theAs = bs.findAll("a")
  102.  
  103. if (theAs):
  104. for anchor in theAs:
  105. #texto:str = anchor.title
  106. texto: str = anchor.text
  107. bThereIsHref = "href" in anchor.attrs.keys()
  108. if (bThereIsHref):
  109. href = anchor.attrs["href"]
  110. else:
  111. href = ""
  112. # if-else
  113.  
  114. listOfFoundAnchors.append(
  115. {
  116. AmBotTools.KEY_ANCHOR:texto,
  117. AmBotTools.KEY_HREF:href
  118. }
  119. )
  120. # for every anchor
  121. # if there are anchors
  122. # if it was possible to get a bs object
  123. # if there is content
  124.  
  125. return listOfFoundAnchors
  126. # def getAnchors
  127.  
  128. """
  129. escreva um método que permita filtrar
  130. uma lista de anchors
  131. incluindo no retorno apenas aquelas
  132. cujo atributo href
  133. contenha certa expressão.
  134. Por exemplo:
  135. AmBotTools.getFilteredByHrefAnchors(
  136. theAnchors, # uma lista
  137. "4cdn.org" # uma frase de filtro
  138. )
  139. """
  140. @staticmethod
  141. def getFilteredByHrefAnchors(
  142. pListOfAnchors:list,
  143. pStrHrefFilter:str
  144. ):
  145. filtered = list()
  146.  
  147. for thing in pListOfAnchors:
  148. #anchor = thing[AmBotTools.KEY_ANCHOR]
  149.  
  150. href = thing[AmBotTools.KEY_HREF]
  151. bSatisfiesFilter =\
  152. pStrHrefFilter in href
  153. if(bSatisfiesFilter):
  154. filtered.append(thing)
  155. # if
  156. # for
  157.  
  158. return filtered
  159. # def getFilteredByHrefAnchors
  160.  
  161. # use for jpg, png, webm, etc, NOT for text
  162. def downloadBin(
  163. self,
  164. pUrlBin:str,
  165. pDestinationName:str,
  166. pHeaders:dict={
  167. "user-agent":FF78_SIGNATURE,
  168. "referer":""
  169. }
  170. ):
  171. theBytes =\
  172. self.consumeUrl(
  173. pUrl=pUrlBin,
  174. pbPreferBytes=True,
  175. pHeaders=pHeaders
  176. )
  177. try:
  178. fw = open(
  179. pDestinationName,
  180. "wb" # write binary
  181. )
  182. if(fw):
  183. fw.write(theBytes)
  184. fw.close()
  185. return True
  186. #if
  187. except Exception as e:
  188. print(str(e))
  189. #try-except
  190.  
  191. return False
  192. # def downloadBin
  193. # class AmBotTools
  194.  
  195.  
  196. *****
  197.  
  198.  
  199. from AmBotTools import AmBotTools
  200. import json
  201.  
  202. class BotFlickr:
  203. MEMORY = "memory.DB"
  204.  
  205. BASE_URL = "https://www.flickr.com/photos"
  206. FILTER_FOR_IMG_SRC = "//live.staticflickr.com"
  207. MARK_IMG_DICT_START = "name: 'photo-page-scrappy-view',"
  208. MARK_IMG_DICT_END = "modelExport:"
  209.  
  210. def __init__(self, pUserName):
  211. self.mUserName = pUserName
  212. self.mBot = AmBotTools()
  213. self.mMemory = self.readFromMemory()
  214. # def __init__
  215.  
  216. def getUserUrl(self):
  217. return f"{BotFlickr.BASE_URL}/{self.mUserName}"
  218. # def getUserUrl
  219.  
  220. def getUserUrlForUserPage(self, pN):
  221. # https://www.flickr.com/photos/projectapolloarchive/page2
  222. return f"{BotFlickr.BASE_URL}/{self.mUserName}/page{pN}"
  223. # def getUserUrlForUserPage
  224.  
  225. def getPhotoIdsAtUrl(self, pUrl:str):
  226. theIds = list()
  227. theImgSrcs = self.mBot.getImgs(pUrl)
  228. # filter the images
  229. # '//live.staticflickr.com/65535/51357978988_cee7be2c65_z.jpg'
  230. for src in theImgSrcs:
  231. # example: src='//live.staticflickr.com/65535/51357978988_cee7be2c65_z.jpg'
  232. bSatisfiesFilter:bool = src.find(BotFlickr.FILTER_FOR_IMG_SRC)!=-1
  233. """
  234. "artur".find("z") # -1
  235. "artur".find("r") # 1
  236. """
  237. if(bSatisfiesFilter):
  238. theImgFileName = src.split("/")[-1] # '51357978988_cee7be2c65_z.jpg'
  239. theImgId = theImgFileName.split("_")[0]
  240. theIds.append(theImgId)
  241. # if
  242. return theIds
  243. # def getPhotoIdsAtUrl
  244.  
  245. def getPhotoUrlFromPhotoId(self, pPhotoId):
  246. # https://www.flickr.com/photos/<user name>/<photo id>/
  247. return self.getUserUrl()+f"/{pPhotoId}/"
  248. # def getPhotoUrlFromPhotoId
  249.  
  250. def getPhotoDictFromPhotoId(self, pPhotoId):
  251. url = self.getPhotoUrlFromPhotoId(pPhotoId)
  252. src = self.mBot.consumeUrl(url)
  253.  
  254. iWhereDictStarts = src.find(BotFlickr.MARK_IMG_DICT_START)
  255. if(iWhereDictStarts!=-1):
  256. iWhereDictStarts+=len(BotFlickr.MARK_IMG_DICT_START)
  257. srcDict = src[iWhereDictStarts:]
  258.  
  259. iWhereDictEnds = srcDict.find(BotFlickr.MARK_IMG_DICT_END)
  260. if(iWhereDictEnds!=-1):
  261. # add the starting curly-brace
  262. srcDict = "{"+ (srcDict[0:iWhereDictEnds]).strip()
  263.  
  264. # fix the 1st key
  265. srcDict = srcDict.replace(
  266. "params:",
  267. "\"params\":"
  268. )
  269.  
  270. srcDict:str = (srcDict.split("\n")[0])+"}"
  271.  
  272. realDict:dict = json.loads(srcDict)
  273.  
  274. return realDict
  275. # if
  276. # if
  277. return False
  278. # def getPhotoDictFromPhotoId
  279.  
  280. def getDescriptionAndOriginalUrlFromPhotoId(self, pPhotoId):
  281. theDict = self.getPhotoDictFromPhotoId(pPhotoId)
  282. theDescription = theDict['params']['photoModel']['description']
  283. o = "https:" + theDict['params']['photoModel']['sizes']['o']['src']
  284.  
  285. return theDescription, o
  286. # def getDescriptionAndOriginalUrlFromPhotoId
  287.  
  288. def getDescriptionAndSmallUrlFromPhotoId(self, pPhotoId):
  289. theDict = self.getPhotoDictFromPhotoId(pPhotoId)
  290. theDescription = theDict['params']['photoModel']['description']
  291. sq = "https:" + theDict['params']['photoModel']['sizes']['sq']['src']
  292.  
  293. return theDescription, sq
  294. # def getDescriptionAndSmallUrlFromPhotoId
  295.  
  296. def downloadPhotoOriginalIfNotAlreadyDownloaded(self, pPhotoId):
  297. desc, url = self.getDescriptionAndOriginalUrlFromPhotoId(pPhotoId)
  298.  
  299. #https://live.staticflickr.com/65535/51357005762_3f48a89897_o.jpg
  300. #filename:str = url.split("/")[len(url.split("/"))-1]
  301. filename: str = url.split("/")[-1]
  302.  
  303. bAlreadyDownloaded = self.alreadyDownloaded(url)
  304.  
  305. if (not bAlreadyDownloaded):
  306. bSuccess:bool = self.mBot.downloadBin(
  307. pUrlBin=url,
  308. pDestinationName=filename
  309. )
  310.  
  311. return self.writeToMemory(desc, url, bSuccess)
  312. else:
  313. return False
  314. # def downloadPhotoOriginalIfNotAlreadyDownloaded
  315.  
  316. def writeToMemory(self, desc, url, bSuccessOnDownload):
  317. fw = open(BotFlickr.MEMORY, "a")
  318. if(fw):
  319. record:str = f"{desc}\t{url}\t{bSuccessOnDownload}\n" # TSV = Tab Separated Values
  320. fw.write(record)
  321. fw.close()
  322.  
  323. return True
  324. # if
  325. return False
  326. # def writeToMemory
  327.  
  328. def alreadyDownloaded(self, url):
  329. for record in self.mMemory:
  330. urlInMemory = record["url"]
  331. if(urlInMemory==url):
  332. return record["dl"]
  333. # if
  334. # for
  335.  
  336. return False
  337. # def alreadyDownloaded
  338.  
  339. def readFromMemory(self):
  340. listOfRecords = list()
  341. try:
  342. fr = open(BotFlickr.MEMORY, "r")
  343. if(fr):
  344. strContent = fr.read()
  345. theLines = strContent.split("\n")
  346. for line in theLines:
  347. processedLine = line.strip()
  348. lineParts = processedLine.split("\t")
  349. bCaution = len(lineParts)==3
  350. if(bCaution):
  351. record = dict()
  352. record["desc"] = lineParts[0]
  353. record["url"] = lineParts[1]
  354. record["dl"] = lineParts[2]
  355.  
  356. listOfRecords.append(record)
  357. # if
  358. # for
  359. # if
  360. except Exception as e:
  361. pass
  362.  
  363. return listOfRecords
  364. # def readFromMemory
  365. # class BotFlickr
  366.  
  367.  
  368. *******************
  369.  
  370.  
  371. from bot_flickr import BotFlickr
  372.  
  373. FLICKR_USER = "projectapolloarchive"
  374.  
  375. whatToSearchFor = input("What do you want to search for?")
  376. whatToSearchFor = whatToSearchFor.lower() # para procura case-insensitive
  377.  
  378. bot = BotFlickr(FLICKR_USER)
  379. listOfFindings = list()
  380. for record in bot.mMemory:
  381. desc = record["desc"]
  382. desc = desc.lower() # para procura case-insensitive
  383.  
  384. bFound = desc.find(whatToSearchFor)!=-1
  385. if (bFound):
  386. listOfFindings.append(record)
  387. # if
  388. # for
  389.  
  390. if (len(listOfFindings)==0):
  391. print ("Nothing found.")
  392. else:
  393. print(listOfFindings)
  394.  
  395. ****************************
  396.  
  397.  
  398. from bot_flickr import BotFlickr
  399.  
  400. TEST_USER = "projectapolloarchive"
  401.  
  402. bot = BotFlickr(TEST_USER)
  403. url = bot.getUserUrl()
  404. print(url)
  405. #print (bot.getUserUrlForUserPage(159))
  406.  
  407. #theSrcs = bot.mBot.getImgs(url)
  408. #print(theSrcs)
  409. theIds = bot.getPhotoIdsAtUrl(url)
  410. print(theIds)
  411.  
  412. urlFor1stPhoto = bot.getPhotoUrlFromPhotoId(theIds[0])
  413. print(urlFor1stPhoto)
  414.  
  415. #print(bot.getPhotoDictFromPhotoId(theIds[0]))
  416.  
  417. """
  418. desc, urlOriginal = bot.getDescriptionAndOriginalUrlFromPhotoId(theIds[2])
  419. print(desc)
  420. print(urlOriginal)
  421. """
  422.  
  423. """
  424. TEST_URL = "https://live.staticflickr.com/65535/51358764550_c5ae627240_o.jpg"
  425. respostaSeJaDescarregado = bot.alreadyDownloaded(TEST_URL)
  426. print (f"Já descarregado? {respostaSeJaDescarregado}")
  427. exit()
  428. """
  429.  
  430. r = bot.downloadPhotoOriginalIfNotAlreadyDownloaded(theIds[3])
  431. print(r)
  432.  
Advertisement
Add Comment
Please, Sign In to add comment