Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # AmBotTools.py
- import certifi
- import ssl
- from urllib.request import urlopen, Request
- from http.client import HTTPResponse
- import bs4
- from bs4 import BeautifulSoup
class AmBotTools:
    """Helpers for fetching URLs and scraping <img> / <a> elements.

    HTTPS requests are validated against the CA bundle shipped with
    ``certifi``; HTML is parsed with BeautifulSoup using the ``html5lib``
    parser (install with ``python -m pip install html5lib``).
    """

    # Keys of the dicts produced by getAnchors / consumed by
    # getFilteredByHrefAnchors.
    KEY_ANCHOR = "anchor"
    KEY_HREF = "href"

    # Not referenced inside this class; presumably browser signatures that
    # callers can send as a User-Agent header value -- confirm with callers.
    CHROME_SIGNATURE = "Chrome"
    FF78_SIGNATURE = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:78.0) "
        "Gecko/20100101 Firefox/78.0"
    )

    def __init__(self):
        """Build the SSL context used by every request made by this object."""
        self.mCAFile = certifi.where()
        self.mSSLContext = ssl.create_default_context(cafile=self.mCAFile)

    def consumeUrl(
        self,
        pUrl: str,
        pbPreferBytes: bool = False,
        pStrEncoding="UTF-8",
        pHeaders: "dict | None" = None,
    ):
        """Fetch ``pUrl`` and return its body.

        Args:
            pUrl: the URL to request.
            pbPreferBytes: when true, return the raw bytes; otherwise return
                the body decoded with ``pStrEncoding``.
            pStrEncoding: encoding used to decode the body into ``str``.
            pHeaders: custom HTTP headers; ``None`` means no custom headers.

        Returns:
            ``bytes`` when ``pbPreferBytes`` is true, ``str`` otherwise.

        Raises:
            Whatever ``urllib.request.urlopen`` raises for a failed request,
            and ``UnicodeDecodeError`` when the body cannot be decoded.
        """
        theRequest = Request(
            url=pUrl,
            # None (rather than a shared mutable {}) is the "no headers" default
            headers={} if pHeaders is None else pHeaders,
        )
        # The context manager closes the response even if read() fails.
        with urlopen(theRequest, context=self.mSSLContext) as response:
            theBytes = response.read()

        if pbPreferBytes:
            return theBytes
        return theBytes.decode(pStrEncoding)

    def getImgs(self, pUrl: str):
        """Return the ``src`` value of every <img> at ``pUrl`` that has one.

        Returns an empty list when the page has no content.
        """
        strContent = self.consumeUrl(pUrl=pUrl)
        if not strContent:
            return []

        bs = BeautifulSoup(strContent, "html5lib")
        # e.g. <img src="1.jpg" alt="bla bla">; img.attrs is a dict
        return [
            img.attrs["src"]
            for img in bs.find_all("img")
            if "src" in img.attrs
        ]

    def getAnchors(self, pUrl: str):
        """Return one dict per <a> element found at ``pUrl``.

        Each dict maps ``KEY_ANCHOR`` to the anchor's text and ``KEY_HREF``
        to its ``href`` attribute ("" when the anchor has no ``href``).
        Returns an empty list when the page has no content.
        """
        strContent = self.consumeUrl(pUrl=pUrl)
        if not strContent:
            return []

        bs = BeautifulSoup(strContent, "html5lib")
        return [
            {
                AmBotTools.KEY_ANCHOR: anchor.text,
                AmBotTools.KEY_HREF: anchor.attrs.get("href", ""),
            }
            for anchor in bs.find_all("a")
        ]

    @staticmethod
    def getFilteredByHrefAnchors(
        pListOfAnchors: list,
        pStrHrefFilter: str,
    ):
        """Keep only the anchors whose href contains ``pStrHrefFilter``.

        Example::

            AmBotTools.getFilteredByHrefAnchors(
                theAnchors,  # a list, as returned by getAnchors
                "4cdn.org"   # the filter expression
            )

        Args:
            pListOfAnchors: dicts holding at least the ``KEY_HREF`` key.
            pStrHrefFilter: substring that the href must contain.

        Returns:
            A new list with the matching dicts, in their original order.
        """
        return [
            anchor
            for anchor in pListOfAnchors
            if pStrHrefFilter in anchor[AmBotTools.KEY_HREF]
        ]

    # use for jpg, png, webm, etc, NOT for text
    def downloadBin(
        self,
        pUrlBin: str,
        pDestinationName: str,
        pHeaders: "dict | None" = None,
    ):
        """Download ``pUrlBin`` and write its bytes to ``pDestinationName``.

        Args:
            pUrlBin: URL of the binary resource.
            pDestinationName: path of the file to (over)write.
            pHeaders: custom HTTP headers; ``None`` means no custom headers.

        Returns:
            True when the file was written; False when opening or writing
            the destination failed (the error is printed).

        Raises:
            Whatever consumeUrl raises for a failed download -- network
            errors are not swallowed here.
        """
        theBytes = self.consumeUrl(
            pUrl=pUrlBin,
            pbPreferBytes=True,
            pHeaders=pHeaders,
        )
        try:
            # "wb" = write binary; the with-block closes the file even when
            # write() fails.
            with open(pDestinationName, "wb") as fw:
                fw.write(theBytes)
        except (OSError, ValueError) as e:
            # ValueError covers malformed paths (e.g. an embedded NUL byte)
            print(str(e))
            return False
        return True
- ************************
- # bot_flickr
- # general form of a flickr.com user URL:
- # https://www.flickr.com/photos/<user name>/
- # example:
- # https://www.flickr.com/photos/projectapolloarchive/
- # on a user's page, the photo ids can be obtained by extracting the img elements
- # example:
- # <img loading="lazy" src="//live.staticflickr.com/65535/51357005462_97135e04be.jpg"
- # the img elements of interest are those that have a src attribute
- # whose value starts with //live.staticflickr.com/
- # the last segment of the value hides the photo id
- # for example: 51357005462_97135e04be.jpg
- # the photo id is the expression BEFORE the underscore
- # for example: 51357005462
- # general form of a photo URL:
- # https://www.flickr.com/photos/<user name>/<photo id>/
- # example of a direct URL to a photo:
- # https://www.flickr.com/photos/projectapolloarchive/51357005462/
- # without JS the photo is not shown
- # in the source code of this URL
- # there is a dict, of interest from the "params" key onwards
- # somewhere inside the dict there is an "o" key that hides
- # the address of the photo in its original quality
- # for example: "o":{"displayUrl":"\/\/live.staticflickr.com\/65535\/51357005462_c179c4dfff_o.jpg"
- """
- important
- START MARK of the dict of interest
- name: 'photo-page-scrappy-view',
- params:
- END MARK of the dict of interest
- },
- modelExport:
- """
- from AmBotTools import AmBotTools
- import json
# Flickr user name used by the demo code that follows the class.
TEST_USER = "projectapolloarchive"


class BotFlickr:
    """Scrape photo ids, descriptions and original-size URLs of a Flickr user.

    All network access goes through an ``AmBotTools`` instance (``mBot``).
    """

    BASE_URL = "https://www.flickr.com/photos"
    # <img> elements of interest have a src containing this expression
    FILTER_FOR_IMG_SRC = "//live.staticflickr.com"
    # Text marks that delimit the "params" dict in a photo page's source
    MARK_IMG_DICT_START = "name: 'photo-page-scrappy-view',"
    MARK_IMG_DICT_END = "modelExport:"

    def __init__(self, pUserName):
        """Remember the Flickr user name and create the HTTP/scraping helper."""
        self.mUserName = pUserName
        self.mBot = AmBotTools()

    def getUserUrl(self):
        """Return the URL of the user's photo stream (no trailing slash)."""
        return f"{BotFlickr.BASE_URL}/{self.mUserName}"

    def getUserUrlForUserPage(self, pN):
        """Return the URL of page ``pN`` of the user's photo stream.

        Example: https://www.flickr.com/photos/projectapolloarchive/page2
        """
        return f"{BotFlickr.BASE_URL}/{self.mUserName}/page{pN}"

    def getPhotoIdsAtUrl(self, pUrl: str):
        """Return the photo ids found in the <img> sources at ``pUrl``.

        Only sources containing ``FILTER_FOR_IMG_SRC`` are considered, e.g.
        '//live.staticflickr.com/65535/51357978988_cee7be2c65_z.jpg'; the id
        is the part of the file name before the first underscore
        ('51357978988').
        """
        return [
            # last path segment -> '51357978988_cee7be2c65_z.jpg' -> id
            src.split("/")[-1].split("_")[0]
            for src in self.mBot.getImgs(pUrl)
            if BotFlickr.FILTER_FOR_IMG_SRC in src
        ]

    def getPhotoUrlFromPhotoId(self, pPhotoId):
        """Return https://www.flickr.com/photos/<user name>/<photo id>/."""
        return self.getUserUrl() + f"/{pPhotoId}/"

    def getPhotoDictFromPhotoId(self, pPhotoId):
        """Extract the "params" dict embedded in a photo page's source.

        The text between ``MARK_IMG_DICT_START`` and ``MARK_IMG_DICT_END`` is
        located, its first line (``params: {...}``) is turned into a JSON
        object and parsed.

        Returns:
            A dict of the form ``{"params": {...}}``, or ``False`` when
            either mark is missing from the page.

        Raises:
            json.JSONDecodeError: when the extracted line is not valid JSON.
        """
        url = self.getPhotoUrlFromPhotoId(pPhotoId)
        src = self.mBot.consumeUrl(url)

        iWhereDictStarts = src.find(BotFlickr.MARK_IMG_DICT_START)
        if iWhereDictStarts == -1:
            return False
        srcDict = src[iWhereDictStarts + len(BotFlickr.MARK_IMG_DICT_START):]

        iWhereDictEnds = srcDict.find(BotFlickr.MARK_IMG_DICT_END)
        if iWhereDictEnds == -1:
            return False

        # Only the first line after the start mark holds the params object.
        firstLine = srcDict[:iWhereDictEnds].strip().split("\n", 1)[0]
        # Quote the leading key only: replacing every "params:" would also
        # rewrite matches inside JSON string values and break the document.
        firstLine = firstLine.replace("params:", "\"params\":", 1)
        # A trailing comma (JS object syntax) is not valid JSON.
        firstLine = firstLine.rstrip().rstrip(",")
        # Add the enclosing curly braces.
        return json.loads("{" + firstLine + "}")

    def getDescriptionAndOriginalUrlFromPhotoId(self, pPhotoId):
        """Return ``(description, original-size URL)`` for a photo id.

        Raises:
            ValueError: when the photo dict could not be located in the page
                (json.JSONDecodeError, also a ValueError, signals a dict that
                was located but could not be parsed).
            KeyError: when the parsed dict lacks the expected keys.
        """
        theDict = self.getPhotoDictFromPhotoId(pPhotoId)
        if theDict is False:
            raise ValueError(
                f"could not find the photo dict in the page of photo id {pPhotoId!r}"
            )

        photoModel = theDict['params']['photoModel']
        theDescription = photoModel['description']
        src = photoModel['sizes']['o']['src']
        # The src is expected to be protocol-relative ("//host/..."); do not
        # prefix a second scheme when it is already an absolute URL.
        if src.startswith(("http://", "https://")):
            o = src
        else:
            o = "https:" + src
        return theDescription, o
# Demo run against TEST_USER: print the user's URL, the photo ids found on
# the first page, the URL of the first photo, and the description plus the
# original-size URL of the second photo.
bot = BotFlickr(TEST_USER)
url = bot.getUserUrl()
print(url)

theIds = bot.getPhotoIdsAtUrl(url)
print(theIds)

# The page may yield fewer ids than the demo needs; only index what exists
# instead of failing with IndexError.
if theIds:
    urlFor1stPhoto = bot.getPhotoUrlFromPhotoId(theIds[0])
    print(urlFor1stPhoto)

if len(theIds) > 1:
    desc, urlOriginal = bot.getDescriptionAndOriginalUrlFromPhotoId(theIds[1])
    print(desc)
    print(urlOriginal)
Advertisement
Add Comment
Please, Sign In to add comment