Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from time import sleep
- from json import dumps
- from requests import get
- import os
- from PIL import Image
# Request headers passed to every photo download in func(); the values
# describe desktop Chrome 96 on Windows.
SEARCH_HEAD = {
    'accept': (
        'text/html,application/xhtml+xml,application/xml;q=0.9,'
        'image/avif,image/webp,image/apng,*/*;q=0.8,'
        'application/signed-exchange;v=b3;q=0.9'
    ),
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'ru,en;q=0.9',
    # Ask intermediaries not to serve cached responses.
    'cache-control': 'no-cache',
    'dnt': '1',
    'pragma': 'no-cache',
    # Client-hint and fetch-metadata headers as sent by the browser.
    'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="96", "Google Chrome";v="96"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'none',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/96.0.4664.93 Safari/537.36'
    ),
}
- import requests
- from requests.adapters import HTTPAdapter
- from requests.packages.urllib3.util.retry import Retry
def func(index, photo, id_advert, timeout=30):
    """Download one advert photo into ``photos/<id_advert>/`` and report the outcome.

    The file is saved as ``<index><extension>``. The extension comes from the
    URL path (query string and fragment are ignored) and defaults to ``.webp``
    when the URL has none. A ``.webp`` download is re-encoded as JPEG and the
    WebP file is removed afterwards.

    Args:
        index: Position of the photo within its advert; used as the file stem.
        photo: URL of the image to download.
        id_advert: Advert identifier as a string; names the target directory,
            which must already exist.
        timeout: Seconds to wait on the server before the request is
            abandoned and reported as ``'ERR'``.

    Returns:
        On success, a dict with keys ``'index'``, ``'extent'`` (final file
        extension), ``'url'`` and an empty ``'err'``.
        On failure, ``{'url': photo, 'err': code}`` where ``code`` is
        ``'ERR'`` (request failed), the HTTP status as a string (non-200
        response), ``'content_type'`` (not PNG/JPEG/WebP) or ``'down'``
        (the file could not be written).
    """
    # Look only at the URL path so "?size=big" or "#frag" never ends up in
    # the file name or in the reported extension.
    url_path = photo.split('#', 1)[0].split('?', 1)[0]
    extent = os.path.splitext(os.path.basename(url_path))[1] or '.webp'

    try:
        # The context manager releases pooled connections once the response
        # body has been read (requests loads it eagerly when not streaming).
        with requests.Session() as session:
            adapter = HTTPAdapter(max_retries=Retry(connect=3, backoff_factor=0.5))
            session.mount('http://', adapter)
            session.mount('https://', adapter)
            # Without a timeout a stalled server would block this worker forever.
            photoPage = session.get(photo, headers=SEARCH_HEAD, timeout=timeout)
    except Exception as e:
        print("ERROR ID_ADVERT", str(e), id_advert, 'photo', photo)
        return {'url': photo, 'err': 'ERR'}

    if photoPage.status_code != 200:
        return {'url': photo, 'err': str(photoPage.status_code)}

    # Compare only the media type: the header may be absent, differently
    # cased, or carry parameters such as "; charset=binary".
    content_type = photoPage.headers.get('Content-Type', '').split(';', 1)[0].strip().lower()
    if content_type not in ("image/png", "image/jpeg", "image/jpg", 'image/webp'):
        return {'url': photo, 'err': 'content_type'}

    name_file = "photos/" + id_advert + '/' + str(index) + extent
    try:
        with open(name_file, 'wb') as f:
            f.write(photoPage.content)
    except Exception as e:
        print("Ошибка скачивания 1", str(e), id_advert, photo, '\n_____')
        return {'url': photo, 'err': 'down'}

    if extent == '.webp':
        # Build the JPEG name from its parts instead of str.replace, which
        # would also rewrite a ".webp" occurring inside the advert id.
        name_file_new = "photos/" + id_advert + '/' + str(index) + '.jpg'
        # Close the source image before deleting the file it was read from.
        with Image.open(name_file) as source:
            source.convert("RGB").save(name_file_new, "jpeg")
        os.remove(name_file)
        extent = '.jpg'

    return {
        'index': index,
        'extent': extent,
        'url': photo,
        'err': '',
    }
- from joblib import Parallel, delayed
- import json
- import pandas
# Build the lookup used by myfun(): advert id -> one-element list holding the
# row's 'photos' cell. The first 11000 CSV rows are skipped (presumably
# handled by an earlier run -- confirm before changing the offset).
a = (
    pandas.read_csv('photos.csv', sep=';', usecols=['id', 'photos'])
    .iloc[11000:]
    .set_index('id')
    .T
    .to_dict('list')
)
def myfun(id_advert):
    """Download every photo of one advert and write a per-advert report.

    The advert's photo URLs are read from the module-level mapping ``a``,
    whose value for ``id_advert`` is a list with a JSON array string as its
    first element. Each URL is passed to ``func`` with a 1-based index, and
    the collected result dicts are written as JSON to
    ``photos/<id_advert>/info.json``.

    Args:
        id_advert: Key of the advert in ``a``; also names the output directory.
    """
    print(id_advert)
    photos = json.loads(a[id_advert][0])
    advert_dir = "photos/" + str(id_advert)
    os.makedirs(advert_dir, exist_ok=True)

    # Photos are numbered from 1; that number becomes the file stem in func().
    answer = [
        func(index, photo, str(id_advert))
        for index, photo in enumerate(photos, start=1)
    ]

    # The context manager closes the report even if serialising or writing fails.
    with open(advert_dir + '/info.json', 'w') as f:
        f.write(dumps(answer))
# Run myfun once per advert id in `a`, spread over 12 joblib workers.
results = Parallel(n_jobs=12)(map(delayed(myfun), a))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement