Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import json
import os
import random
import threading
import time

import pika
import requests
from bs4 import BeautifulSoup

from ParserModels.AvitoSubjectModel import AvitoSubject
from ParserModels.CategoryModel import Category
from ParserModels.CityModel import City
from ParserModels.ProxyModel import Proxy
# Run the process in Samara local time; time.strftime()/time.localtime()
# calls made later in this module pick the zone up from the TZ variable.
os.environ['TZ'] = 'Europe/Samara'
# time.tzset() is only available on Unix. Skip it on other platforms instead
# of failing at import time with AttributeError.
if hasattr(time, 'tzset'):
    time.tzset()
class AvitoDaemon(object):
    """RabbitMQ consumer that scrapes Avito catalogue pages and listings.

    Each instance opens its own blocking RabbitMQ connection and consumes JSON
    tasks from the durable ``pages`` queue. For tasks whose ``source`` is
    ``'avito'`` it downloads the catalogue page through a proxy, stores every
    listing URL found on it as an ``AvitoSubject`` and then fills in the
    listing details (phone, author, title, description, price, photos and
    parameters).
    """

    connection = None
    pages_channel = None

    # Desktop browser User-Agent strings; one is chosen at random per request.
    _DESKTOP_USER_AGENTS = (
        'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:53.0) Gecko/20100101 Firefox/53.0',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.137 YaBrowser/17.4.1.758 Yowser/2.5 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:45.0) Gecko/20100101 Firefox/45.0',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    )

    # Mobile browser User-Agent strings, used for requests to the mobile site.
    _MOBILE_USER_AGENTS = (
        'Mozilla/5.0 (Linux; Android 6.0.1; SM-G920V Build/MMB29K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.98 Mobile Safari/537.36',
        'Mozilla/5.0 (Windows Phone 10.0; Android 4.2.1; Microsoft; Lumia 950) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2486.0 Mobile Safari/537.36 Edge/13.10586',
        'Mozilla/5.0 (Linux; Android 5.1.1; SM-G928X Build/LMY47X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.83 Mobile Safari/537.36',
        'Mozilla/5.0 (Linux; Android 6.0.1; Nexus 6P Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.83 Mobile Safari/537.36',
        'Mozilla/5.0 (Linux; Android 6.0.1; E6653 Build/32.2.A.0.253) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.98 Mobile Safari/537.36',
        'Mozilla/5.0 (Linux; Android 6.0; HTC One M9 Build/MRA58K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.98 Mobile Safari/537.36',
    )

    # Headers sent with every request; the User-Agent is added per request.
    _BASE_HEADERS = {
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "accept-language": "ru-RU,ru;q=0.8,en-US;q=0.6,en;q=0.4",
        "cache-control": "no-cache",
        "upgrade-insecure-requests": "1",
    }

    # Prefix of the inline <script> that carries the listing data as JSON.
    _INITIAL_DATA_MARKER = 'window.__initialData__ ='

    def __init__(self):
        """Open the RabbitMQ connection and subscribe to the ``pages`` queue.

        Connection settings (``user``, ``password``, ``host``, ``port``) are
        read from ``config/rabbit.json`` located next to this file.
        """
        config_filename = os.path.join(
            os.path.dirname(os.path.abspath(__file__)), 'config', 'rabbit.json')
        # Context manager so the config file handle is closed right away.
        with open(config_filename, 'r') as config_file:
            rabbit_config = json.load(config_file)

        # Connection parameters.
        credentials = pika.PlainCredentials(rabbit_config['user'], rabbit_config['password'])
        parameters = pika.ConnectionParameters(
            rabbit_config['host'], rabbit_config['port'], '/', credentials, socket_timeout=10)
        self.connection = pika.BlockingConnection(parameters)

        # Channel and queue for page URLs.
        self.pages_channel = self.connection.channel()
        # Receive at most one unacknowledged message at a time.
        self.pages_channel.basic_qos(prefetch_count=1)
        # Declare the queue (creates it if it does not exist yet).
        self.pages_channel.queue_declare(queue='pages', durable=True)
        # Attach the message callback to the queue.
        # NOTE(review): this is the pika < 1.0 argument order (callback first);
        # pika 1.x expects basic_consume(queue, on_message_callback) - confirm
        # the installed version before upgrading.
        self.pages_channel.basic_consume(self.proc, queue='pages')

    def start(self):
        """Run the blocking consume loop.

        If the loop fails, the error is printed, consuming is stopped and the
        connection is closed.
        """
        try:
            self.pages_channel.start_consuming()
        except Exception as error_msg:
            print('start_consuming(): ', error_msg)
            try:
                self.pages_channel.stop_consuming()
            finally:
                # Close the connection even if stop_consuming() itself fails.
                self.connection.close()

    def proc(self, channel, method_frame, header_frame, message_json):
        """Handle one message from the ``pages`` queue.

        The message is always acknowledged, whatever the outcome, so a task
        is never redelivered.

        :param channel: channel the message arrived on; used to acknowledge it.
        :param method_frame: delivery metadata; ``delivery_tag`` is read from it.
        :param header_frame: message properties (unused).
        :param message_json: UTF-8 encoded JSON body (bytes).
        :return: ``False`` when the body has no ``source`` key or processing
            raised; ``True`` otherwise (including sources other than
            ``'avito'``, which are ignored).
        """
        try:
            page_dict = json.loads(str(message_json, 'utf-8'))
            if 'source' not in page_dict:
                return False
            if page_dict['source'] == 'avito':
                self.proc_page(page_dict)
            return True
        except Exception as error:
            print('proc(): ', error)
            return False
        finally:
            channel.basic_ack(delivery_tag=method_frame.delivery_tag)

    def get_via_proxy(self, url, **kwargs):
        """Perform a GET request through a proxy obtained via ``Proxy.get_proxy``.

        :param url: URL to fetch.
        :param kwargs: ``mobile`` - when truthy, a mobile User-Agent is sent
            instead of a desktop one.
        :return: the ``requests`` response, or ``None`` when the request failed.
        """
        user_agents = self._MOBILE_USER_AGENTS if kwargs.get('mobile') else self._DESKTOP_USER_AGENTS
        request_headers = dict(self._BASE_HEADERS)
        # Random User-Agent for every request.
        request_headers['user-agent'] = random.choice(user_agents)

        # Take a free proxy using this daemon's RabbitMQ connection.
        proxy = Proxy.get_proxy(self.connection)
        proxy_string = 'http://%s:%s@%s:%s' % (
            proxy['login'], proxy['password'], proxy['ip'], str(proxy['port']))
        # Route both schemes through the proxy so an http:// URL cannot
        # silently bypass it.
        proxies = {'http': proxy_string, 'https': proxy_string}

        try:
            return requests.get(url, proxies=proxies, headers=request_headers, timeout=15.0)
        except Exception as error:
            # Best effort: callers treat None as "page not available".
            print('get_via_proxy(): ', error)
            return None
        finally:
            # Release the proxy whether or not the request succeeded.
            Proxy.exempt_proxy(proxy['ip'])

    def proc_page(self, page_dict):
        """Collect listing URLs from one catalogue page and process each of them.

        :param page_dict: task dict; the keys ``url``, ``category_id`` and
            ``avito_city_alias`` are read.
        :return: ``False`` when the category/city lookup fails or no listing
            URL could be extracted from the page; ``True`` once all extracted
            URLs have been attempted.
        """
        try:
            url = page_dict['url']
            category_id = str(Category.objects.get(id=page_dict['category_id']).id)
            # Looked up once per page: the alias is the same for every listing.
            city_alias = City.objects.get(avito_alias=page_dict['avito_city_alias']).avito_alias

            page_soup = BeautifulSoup(self.get_via_proxy(url).text, 'lxml')
            item_soups = page_soup.find('div', class_='catalog-list').find_all('div', class_='item_table')
            subject_urls = []
            for item_soup in item_soups:
                try:
                    href = item_soup.find('div', class_='description').find('h3').find('a').get('href')
                except Exception:
                    # Item without the expected markup - skip it.
                    continue
                # An <a> without href yields None, which cannot be processed.
                if href:
                    subject_urls.append(href)
            if not subject_urls:
                raise Exception('subject_urls is empty!')
        except Exception as error:
            print('proc_page(): ', error)
            # Without a URL list there is nothing to process.
            return False

        for full_subject_url in subject_urls:
            subject_url = full_subject_url.split('?')[0]
            # A listing URL that does not contain the requested city alias
            # belongs to another city, so it is skipped.
            if city_alias not in subject_url:
                continue
            try:
                new_subject = AvitoSubject()
                new_subject.url = subject_url
                new_subject.category_id = category_id
                new_subject.log = [time.strftime("%Y.%m.%d %H:%M:%S") + ' URL добавлен']
                new_subject.save()
            except Exception as error:
                print('save subject: ', error)
                continue
            try:
                self.proc_subject(new_subject)
            except Exception as error:
                print('proc_subject(): ', error)
        return True

    def proc_subject(self, subject):
        """Download one listing (desktop and mobile page) and store its details.

        The phone number is mandatory: if either page or the phone cannot be
        obtained, processing stops. All other fields are optional; a failure
        to extract one is printed and the remaining fields are still tried.

        :param subject: ``AvitoSubject`` whose ``url`` is already set; it is
            saved after every successfully extracted field.
        :return: ``False`` when a page or the phone could not be obtained,
            ``True`` otherwise.
        """
        # Full (desktop) HTML.
        try:
            full_response = self.get_via_proxy('https://www.avito.ru' + subject.url)
            full_soup = BeautifulSoup(full_response.text, 'lxml')
        except Exception as error:
            print('get full soup: ', error)
            return False

        # Mobile HTML.
        try:
            mobile_response = self.get_via_proxy('https://m.avito.ru' + subject.url, mobile=True)
            mobile_soup = BeautifulSoup(mobile_response.text, 'lxml')
        except Exception as error:
            print('get mobile soup: ', error)
            return False

        # Phone number (mandatory).
        try:
            tel_href = mobile_soup.find('a', {'data-marker': 'item-contact-bar/call'}).get('href')
            subject.phone = self._extract_phone(tel_href)
            subject.save()
        except Exception as error:
            print('get phone: ', error)
            return False

        # Optional text fields: (attribute, log label, extractor).
        # One shared handler binds the exception for every field, so a
        # missing author name or title no longer aborts the listing.
        optional_fields = (
            ('author_name', 'get author name: ',
             lambda: mobile_soup.find('span', {'data-marker': 'seller-info/name'}).text.strip()),
            ('title', 'get title: ',
             lambda: mobile_soup.find('h1', {'data-marker': 'item-description/title'}).find('span').text.strip()),
            ('description', 'get description: ',
             lambda: mobile_soup.find('meta', {'property': 'og:description'}).get('content').strip()),
            ('price', 'get price: ',
             lambda: mobile_soup.find('span', {'data-marker': 'item-description/price'}).text.strip()),
        )
        for attribute, label, extract in optional_fields:
            try:
                setattr(subject, attribute, extract())
                subject.save()
            except Exception as error:
                print(label, error)

        # Photos.
        try:
            script_texts = [script.text for script in mobile_soup.find('body').find_all('script')]
            photos = self._extract_photos(script_texts)
            if not photos:
                raise Exception('photos are empty!')
            subject.photos = photos
            subject.save()
        except Exception as error:
            print('get photos: ', error)

        # Parameter fields.
        try:
            param_soups = full_soup.find('ul', class_='item-params-list').find_all(
                'li', class_='item-params-list-item')
            param_fields = self._parse_params(param_soup.text for param_soup in param_soups)
            if not param_fields:
                raise Exception('param_fields are empty!')
            subject.fields = param_fields
            subject.save()
        except Exception as error:
            print('get fields: ', error)
        return True

    @staticmethod
    def _extract_phone(tel_href):
        """Return only the digits of the part after the first ``:`` in *tel_href*.

        Raises ``IndexError`` when *tel_href* contains no colon.
        """
        phone = tel_href.strip().split(':')[1]
        return ''.join(char for char in phone if char.isdigit())

    @staticmethod
    def _extract_photos(script_texts):
        """Collect photo URLs from the ``window.__initialData__`` script(s).

        For every image the ``640x480`` variant is preferred and ``1280x960``
        is the fallback; images with neither size are ignored.

        :param script_texts: iterable of inline ``<script>`` contents.
        :return: list of photo URLs (empty when no script carries the data).
        :raises Exception: when the data has neither ``currentItem`` nor ``item``.
        """
        marker = AvitoDaemon._INITIAL_DATA_MARKER
        decoder = json.JSONDecoder()
        photos = []
        for script_text in script_texts:
            marker_pos = script_text.find(marker)
            if marker_pos < 0:
                continue
            # Decode the first JSON value after the marker and ignore whatever
            # follows it, instead of relying on fixed character offsets.
            payload = script_text[marker_pos + len(marker):].lstrip()
            initial_data, _ = decoder.raw_decode(payload)
            item = initial_data['item']
            if 'currentItem' in item:
                current_item = item['currentItem']
            elif 'item' in item:
                current_item = item['item']
            else:
                raise Exception('Фотки не получены (блок currentItem отсутствует)')
            for image in current_item['images']:
                if '640x480' in image:
                    photos.append(image['640x480'])
                elif '1280x960' in image:
                    photos.append(image['1280x960'])
        return photos

    @staticmethod
    def _parse_params(param_texts):
        """Turn ``"name: value"`` strings into a dict.

        Each text is stripped and split on the first colon only, so values
        that contain colons stay intact; texts without a colon are skipped.
        The value keeps any whitespace that followed the colon.
        """
        fields = {}
        for text in param_texts:
            name, separator, value = text.strip().partition(':')
            if separator:
                fields[name] = value
        return fields
def start_new_thread():
    """Thread entry point: build an AvitoDaemon and run its consume loop."""
    AvitoDaemon().start()
if __name__ == '__main__':
    # Launch five worker threads; each one runs start_new_thread().
    for _ in range(5):
        worker = threading.Thread(target=start_new_thread)
        worker.start()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement