am_dot_com

FP 2022-12-19

Dec 19th, 2022 (edited)
# AmBotTools.py
import certifi
import ssl
from urllib.request import urlopen, Request
from http.client import HTTPResponse

import bs4
from bs4 import BeautifulSoup

class AmBotTools:
    KEY_ANCHOR = "anchor"
    KEY_HREF = "href"

    CHROME_SIGNATURE = "Chrome"

    FF78_SIGNATURE =\
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:78.0) Gecko/20100101 Firefox/78.0"

    def __init__(self):
        self.mCAFile = certifi.where()
        self.mSSLContext =\
            ssl.create_default_context(
                cafile=self.mCAFile
            )
    # def __init__

    def consumeUrl(
        self,
        pUrl:str,
        pbPreferBytes:bool=False,
        pStrEncoding="UTF-8",
        pHeaders:dict={}
    ):
        theRequest:Request = Request(
            url=pUrl,
            # custom http-headers
            headers=pHeaders
        )

        response:HTTPResponse =\
            urlopen(
                theRequest,
                context=self.mSSLContext
            )
        theBytes = response.read()
        if(pbPreferBytes):
            return theBytes
        else:
            strResponse = str(
                theBytes,
                pStrEncoding
            )
            return strResponse
        # if-else
    # def consumeUrl

    def getImgs(
        self,
        pUrl:str
    ):
        listOfImgSrcs = list()

        strContent = self.consumeUrl(pUrl=pUrl)
        if(strContent):
            bs = BeautifulSoup(
                strContent,
                "html5lib" # parser => python -m pip install html5lib
            )

            if(bs):
                theImgs = bs.findAll("img") # <img src="1.jpg" alt="bla bla">
                for img in theImgs:
                    if("src" in img.attrs.keys()): # attrs is a dict
                        src = img.attrs['src']
                        listOfImgSrcs.append(src)
                    # if
                # for
            # if
        # if

        return listOfImgSrcs
    # def getImgs

    def getAnchors(
        self,
        pUrl:str
    ):
        listOfFoundAnchors = list()

        strConteudo = \
            self.consumeUrl(
                pUrl=pUrl
            )
        if (strConteudo):
            bs = BeautifulSoup(
                strConteudo,
                "html5lib"
            )
            if (bs):
                theAs = bs.findAll("a")

                if (theAs):
                    for anchor in theAs:
                        #texto:str = anchor.title
                        texto: str = anchor.text
                        bThereIsHref = "href" in anchor.attrs.keys()
                        if (bThereIsHref):
                            href = anchor.attrs["href"]
                        else:
                            href = ""
                        # if-else

                        listOfFoundAnchors.append(
                            {
                                AmBotTools.KEY_ANCHOR:texto,
                                AmBotTools.KEY_HREF:href
                            }
                        )
                    # for every anchor
                # if there are anchors
            # if it was possible to get a bs object
        # if there is content

        return listOfFoundAnchors
    # def getAnchors

  128. """
  129. escreva um método que permita filtrar
  130. uma lista de anchors
  131. incluindo no retorno apenas aquelas
  132. cujo atributo href
  133. contenha certa expressão.
  134. Por exemplo:
  135. AmBotTools.getFilteredByHrefAnchors(
  136. theAnchors, # uma lista
  137. "4cdn.org" # uma frase de filtro
  138. )
  139. """
    @staticmethod
    def getFilteredByHrefAnchors(
        pListOfAnchors:list,
        pStrHrefFilter:str
    ):
        filtered = list()

        for thing in pListOfAnchors:
            #anchor = thing[AmBotTools.KEY_ANCHOR]

            href = thing[AmBotTools.KEY_HREF]
            bSatisfiesFilter =\
                pStrHrefFilter in href
            if(bSatisfiesFilter):
                filtered.append(thing)
            # if
        # for

        return filtered
    # def getFilteredByHrefAnchors

    # use for jpg, png, webm, etc., NOT for text
    def downloadBin(
        self,
        pUrlBin:str,
        pDestinationName:str,
        pHeaders:dict
    ):
        theBytes =\
            self.consumeUrl(
                pUrl=pUrlBin,
                pbPreferBytes=True,
                pHeaders=pHeaders
            )
        try:
            fw = open(
                pDestinationName,
                "wb" # write binary
            )
            if(fw):
                fw.write(theBytes)
                fw.close()
                return True
            # if
        except Exception as e:
            print(str(e))
        # try-except

        return False
    # def downloadBin
# class AmBotTools
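
# A minimal usage sketch (not part of the original paste); example.com and the
# "example" filter string are placeholders, and any real run depends on the
# target page actually containing anchors and images.
if __name__ == "__main__":
    tools = AmBotTools()
    headers = {"User-Agent": AmBotTools.FF78_SIGNATURE}

    # fetch a page as text, list its anchors, and filter them by href
    html = tools.consumeUrl("https://example.com/", pHeaders=headers)
    anchors = tools.getAnchors("https://example.com/")
    onlyExample = AmBotTools.getFilteredByHrefAnchors(anchors, "example")
    print(len(html), len(anchors), len(onlyExample))

    # fetch binary content and save it to disk (hypothetical file name)
    # tools.downloadBin("https://example.com/some.jpg", "some.jpg", headers)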

************************

# bot_flickr

# general form of a flickr.com user URL
# https://www.flickr.com/photos/<user name>/
# example:
# https://www.flickr.com/photos/projectapolloarchive/

# on a user's page, the photo ids can be reached by extracting the img elements
# example:
# <img loading="lazy" src="//live.staticflickr.com/65535/51357005462_97135e04be.jpg"
# the img elements of interest are the ones that have a src attribute
# whose value starts with //live.staticflickr.com/
# the ending of that value hides the photo id
# for example: 51357005462_97135e04be.jpg
# the photo id is the expression BEFORE the underscore
# for example: 51357005462
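
# A small worked example (added for illustration, using the sample src above)
# of how the photo id falls out of an img src value:
_exampleSrc = "//live.staticflickr.com/65535/51357005462_97135e04be.jpg"
_fileName = _exampleSrc.split("/")[-1]   # '51357005462_97135e04be.jpg'
_photoId = _fileName.split("_")[0]       # '51357005462'
print(_photoId)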

# general form of a photo URL
# https://www.flickr.com/photos/<user name>/<photo id>/

# example of a direct URL to a photo
# https://www.flickr.com/photos/projectapolloarchive/51357005462/
# without JS the photo is not visible
# in the source code of this URL
# there is a dict of interest, starting at the key "params"
# somewhere inside that dict there is a key "o" which hides
# the address of the photo in its original quality
# for example: "o":{"displayUrl":"\/\/live.staticflickr.com\/65535\/51357005462_c179c4dfff_o.jpg"

  226. """
  227. importantes
  228. MARCA DE início do dict de interesse
  229. name: 'photo-page-scrappy-view',
  230. params:
  231.  
  232. MARCA de fim do dict de interesse
  233. },
  234. modelExport:
  235. """
  236.  
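# A minimal, self-contained sketch (added for illustration; the sample text is a
# hypothetical stand-in for real Flickr page source) of the mark-based slicing
# idea used below in getPhotoDictFromPhotoId: find the start mark, skip past it,
# then keep only what comes before the end mark.
_sampleSrc = (
    "name: 'photo-page-scrappy-view',\n"
    "params: {\"photoModel\": {\"description\": \"demo\"}}\n"
    "},\n"
    "modelExport:"
)
_i = _sampleSrc.find("name: 'photo-page-scrappy-view',") + len("name: 'photo-page-scrappy-view',")
_j = _sampleSrc.find("modelExport:")
_between = _sampleSrc[_i:_j].strip()
print(_between.split("\n")[0])  # params: {"photoModel": {"description": "demo"}}
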
from AmBotTools import AmBotTools
import json

TEST_USER = "projectapolloarchive"

class BotFlickr:
    BASE_URL = "https://www.flickr.com/photos"
    FILTER_FOR_IMG_SRC = "//live.staticflickr.com"
    MARK_IMG_DICT_START = "name: 'photo-page-scrappy-view',"
    MARK_IMG_DICT_END = "modelExport:"

    def __init__(self, pUserName):
        self.mUserName = pUserName
        self.mBot = AmBotTools()
    # def __init__

    def getUserUrl(self):
        return f"{BotFlickr.BASE_URL}/{self.mUserName}"
    # def getUserUrl

    def getUserUrlForUserPage(self, pN):
        # https://www.flickr.com/photos/projectapolloarchive/page2
        return f"{BotFlickr.BASE_URL}/{self.mUserName}/page{pN}"
    # def getUserUrlForUserPage

    def getPhotoIdsAtUrl(self, pUrl:str):
        theIds = list()
        theImgSrcs = self.mBot.getImgs(pUrl)
        # filter the images
        # '//live.staticflickr.com/65535/51357978988_cee7be2c65_z.jpg'
        for src in theImgSrcs:
            # example: src='//live.staticflickr.com/65535/51357978988_cee7be2c65_z.jpg'
            bSatisfiesFilter:bool = src.find(BotFlickr.FILTER_FOR_IMG_SRC)!=-1
            """
            "artur".find("z") # -1
            "artur".find("r") # 1
            """
            if(bSatisfiesFilter):
                theImgFileName = src.split("/")[-1] # '51357978988_cee7be2c65_z.jpg'
                theImgId = theImgFileName.split("_")[0]
                theIds.append(theImgId)
            # if
        return theIds
    # def getPhotoIdsAtUrl

    def getPhotoUrlFromPhotoId(self, pPhotoId):
        # https://www.flickr.com/photos/<user name>/<photo id>/
        return self.getUserUrl()+f"/{pPhotoId}/"
    # def getPhotoUrlFromPhotoId

    def getPhotoDictFromPhotoId(self, pPhotoId):
        url = self.getPhotoUrlFromPhotoId(pPhotoId)
        src = self.mBot.consumeUrl(url)

        iWhereDictStarts = src.find(BotFlickr.MARK_IMG_DICT_START)
        if(iWhereDictStarts!=-1):
            iWhereDictStarts+=len(BotFlickr.MARK_IMG_DICT_START)
            srcDict = src[iWhereDictStarts:]

            iWhereDictEnds = srcDict.find(BotFlickr.MARK_IMG_DICT_END)
            if(iWhereDictEnds!=-1):
                # add the starting curly-brace
                srcDict = "{"+ (srcDict[0:iWhereDictEnds]).strip()

                # fix the 1st key
                srcDict = srcDict.replace(
                    "params:",
                    "\"params\":"
                )

                srcDict:str = (srcDict.split("\n")[0])+"}"

                realDict:dict = json.loads(srcDict)

                return realDict
            # if
        # if
        return False
    # def getPhotoDictFromPhotoId

    def getDescriptionAndOriginalUrlFromPhotoId(self, pPhotoId):
        theDict = self.getPhotoDictFromPhotoId(pPhotoId)
        theDescription = theDict['params']['photoModel']['description']
        o = "https:" + theDict['params']['photoModel']['sizes']['o']['src']

        return theDescription, o
    # def getDescriptionAndOriginalUrlFromPhotoId
# class BotFlickr

bot = BotFlickr(TEST_USER)
url = bot.getUserUrl()
print(url)
#print (bot.getUserUrlForUserPage(159))

#theSrcs = bot.mBot.getImgs(url)
#print(theSrcs)
theIds = bot.getPhotoIdsAtUrl(url)
print(theIds)

urlFor1stPhoto = bot.getPhotoUrlFromPhotoId(theIds[0])
print(urlFor1stPhoto)

#print(bot.getPhotoDictFromPhotoId(theIds[0]))

desc, urlOriginal = bot.getDescriptionAndOriginalUrlFromPhotoId(theIds[1])
print(desc)
print(urlOriginal)
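
# A possible follow-up (a sketch, not in the original paste): save the
# original-quality photo to disk with AmBotTools.downloadBin; the file name
# "original.jpg" and the Firefox User-Agent header are assumptions.
bOk = bot.mBot.downloadBin(
    urlOriginal,
    "original.jpg",
    {"User-Agent": AmBotTools.FF78_SIGNATURE}
)
print(bOk)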