Untitled

#!/usr/bin/python3
# -*- Coding: utf-8 -*-

import threading
import requests
import random
from bs4 import BeautifulSoup
import pika
import time
import json
import os
from ParserModels.AvitoSubjectModel import AvitoSubject
from ParserModels.ProxyModel import Proxy
from ParserModels.CategoryModel import Category
from ParserModels.CityModel import City

# Pin the process-wide timezone so that every time.localtime()/strftime()
# call below formats timestamps in Samara local time.
os.environ.update(TZ='Europe/Samara')
# Make the C runtime re-read TZ (time.tzset is available on Unix only).
time.tzset()

class AvitoDaemon(object):
    """RabbitMQ consumer that scrapes Avito catalogue pages and their listings.

    Each instance opens its own blocking RabbitMQ connection and consumes the
    durable 'pages' queue with a prefetch of one message. For every message
    whose 'source' is 'avito' the catalogue page is downloaded through a
    proxy, the listing URLs found on it are saved as AvitoSubject records and
    every listing is then downloaded and parsed.
    """

    # Class-level placeholders; __init__ replaces them with live objects.
    connection = None
    pages_channel = None

    # User-Agent pools; one entry is picked at random for every request.
    _DESKTOP_USER_AGENTS = (
        'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:53.0) Gecko/20100101 Firefox/53.0',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.137 YaBrowser/17.4.1.758 Yowser/2.5 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:45.0) Gecko/20100101 Firefox/45.0',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    )
    _MOBILE_USER_AGENTS = (
        'Mozilla/5.0 (Linux; Android 6.0.1; SM-G920V Build/MMB29K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.98 Mobile Safari/537.36',
        'Mozilla/5.0 (Windows Phone 10.0; Android 4.2.1; Microsoft; Lumia 950) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2486.0 Mobile Safari/537.36 Edge/13.10586',
        'Mozilla/5.0 (Linux; Android 5.1.1; SM-G928X Build/LMY47X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.83 Mobile Safari/537.36',
        'Mozilla/5.0 (Linux; Android 6.0.1; Nexus 6P Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.83 Mobile Safari/537.36',
        'Mozilla/5.0 (Linux; Android 6.0.1; E6653 Build/32.2.A.0.253) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.98 Mobile Safari/537.36',
        'Mozilla/5.0 (Linux; Android 6.0; HTC One M9 Build/MRA58K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.98 Mobile Safari/537.36',
    )

    def __init__(self):
        """Open the RabbitMQ connection and subscribe to the 'pages' queue.

        Connection settings are read from ``config/rabbit.json`` located next
        to this file; the keys used are 'user', 'password', 'host' and 'port'.
        """

        # load the config; the context manager closes the file handle
        config_filename = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'config', 'rabbit.json')
        with open(config_filename, 'r') as config_file:
            rabbit_config = json.load(config_file)

        # connection parameters
        credentials = pika.PlainCredentials(rabbit_config['user'], rabbit_config['password'])
        parameters = pika.ConnectionParameters(rabbit_config['host'], rabbit_config['port'], '/', credentials, socket_timeout=10)
        self.connection = pika.BlockingConnection(parameters)

        # channel and queue that carry the page URLs
        self.pages_channel = self.connection.channel()
        # number of messages delivered to this consumer at a time
        self.pages_channel.basic_qos(prefetch_count=1)
        # declare the queue (creates it if it does not exist yet)
        self.pages_channel.queue_declare(queue='pages', durable=True)
        # attach the message callback to the queue
        self.pages_channel.basic_consume(self.proc, queue='pages')

    def start(self):
        """Run the consume loop until it fails, then close the connection.

        Blocks in ``start_consuming()``. If that raises, the error is printed
        and consuming is stopped. The connection is closed in every case,
        including when ``stop_consuming()`` itself raises.
        """

        try:
            self.pages_channel.start_consuming()
        except Exception as error_msg:
            print('start_consuming(): ', error_msg)
            self.pages_channel.stop_consuming()
        finally:
            # previously skipped when stop_consuming() raised
            self.connection.close()

    def proc(self, channel, method_frame, header_frame, message_json):
        """Handle one message from the 'pages' queue.

        The message is always acknowledged, whether or not processing
        succeeded, so a failing message is never redelivered.

        :param channel: channel the message arrived on; used for the ack
        :param method_frame: delivery metadata; supplies ``delivery_tag``
        :param header_frame: message properties (unused)
        :param message_json: UTF-8 encoded JSON body
        :return: False when the body cannot be decoded, has no 'source' key
            or processing raised; True otherwise (also for non-avito sources)
        """
        handled = True
        try:
            page_dict = json.loads(str(message_json, 'utf-8'))

            if 'source' not in page_dict:
                handled = False
            elif page_dict['source'] == 'avito':
                self.proc_page(page_dict)
        except Exception as error:
            print('proc(): ', error)
            handled = False

        channel.basic_ack(delivery_tag=method_frame.delivery_tag)
        return handled

    def get_via_proxy(self, url, **kwargs):
        """Perform a GET request through a proxy obtained from ``Proxy``.

        :param url: URL to download
        :param kwargs: ``mobile`` - when truthy, a mobile User-Agent is sent
        :return: the ``requests`` response, or None when the request failed
        """

        # use the mobile User-Agent pool for requests to the mobile site
        user_agents = self._MOBILE_USER_AGENTS if kwargs.get('mobile') else self._DESKTOP_USER_AGENTS

        # request headers with a randomly chosen User-Agent
        request_headers = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "accept-language": "ru-RU,ru;q=0.8,en-US;q=0.6,en;q=0.4",
            "cache-control": "no-cache",
            "upgrade-insecure-requests": "1",
            "user-agent": random.choice(user_agents),
        }

        # take a proxy, handing our own connection to the Proxy model
        proxy = Proxy.get_proxy(self.connection)

        response = None
        try:
            proxy_string = 'http://%s:%s@%s:%s' % (proxy['login'], proxy['password'], proxy['ip'], str(proxy['port']))
            # register the proxy for both schemes so that a plain-http URL
            # cannot silently bypass it
            proxies = {'http': proxy_string, 'https': proxy_string}

            try:
                response = requests.get(url, proxies=proxies, headers=request_headers, timeout=15.0)
            except Exception as error:
                # best effort: callers treat None as a failed download
                print('get_via_proxy(): ', error)
        finally:
            # always release the proxy, even when building the request failed
            Proxy.exempt_proxy(proxy['ip'])

        return response

    def proc_page(self, page_dict):
        """Collect listing URLs from a catalogue page and process each one.

        :param page_dict: decoded queue message; the keys read are 'url',
            'category_id' and 'avito_city_alias'
        :return: False when the category or city cannot be loaded or no
            listing URL could be extracted from the page; True otherwise
        """
        subject_urls = []
        try:
            url = page_dict['url']
            category_id = str(Category.objects.get(id=page_dict['category_id']).id)
            # the city is the same for every listing on the page, so it is
            # looked up once here instead of once per listing
            city_alias = City.objects.get(avito_alias=page_dict['avito_city_alias']).avito_alias

            page_response = self.get_via_proxy(url)
            page_html = page_response.text
            page_soup = BeautifulSoup(page_html, 'lxml')

            subject_url_soups = page_soup.find('div', class_='catalog-list').find_all('div', class_='item_table')

            for subject_url_soup in subject_url_soups:
                try:
                    subject_url = subject_url_soup.find('div', class_='description').find('h3').find('a').get('href')
                except AttributeError:
                    # the card lacks the expected description/h3/a structure
                    continue
                # a link without href yields None, which cannot be processed
                if subject_url:
                    subject_urls.append(subject_url)

            if not subject_urls:
                raise Exception('subject_urls is empty!')

        except Exception as error:
            print('proc_page(): ', error)
            # without a list of URLs there is nothing to process
            return False

        # drop the page markup before the long per-listing loop
        del page_response, page_html, page_soup, subject_url_soups

        for full_subject_url in subject_urls:

            # strip the query string
            subject_url = full_subject_url.split('?')[0]

            try:
                # a listing whose URL does not contain the requested city
                # alias belongs to another city and is skipped
                if city_alias not in subject_url:
                    continue

                new_subject = AvitoSubject()
                new_subject.url = subject_url
                new_subject.category_id = category_id
                new_subject.log = [time.strftime("%Y.%m.%d %H:%M:%S") + ' URL добавлен']
                new_subject.save()
            except Exception as error:
                print('proc_page(): ', error)
                continue

            try:
                self.proc_subject(new_subject)
            except Exception as error:
                # one broken listing must not stop the rest of the page
                print('proc_page(): ', error)

        return True

    def proc_subject(self, subject):
        """Download one listing and fill the record with the parsed data.

        The phone number is mandatory; author name, title, description,
        price, photos and parameter fields are optional and a failure to
        parse any of them is only printed. Every parsed value is saved
        immediately, so partial results survive later failures.

        :param subject: saved AvitoSubject whose ``url`` is the site-relative
            listing path
        :return: False when the desktop page, the mobile page or the phone
            number cannot be obtained; True otherwise
        """
        # desktop page
        try:
            full_response = self.get_via_proxy('https://www.avito.ru' + subject.url)
            full_soup = BeautifulSoup(full_response.text, 'lxml')
        except Exception as error:
            print('get full soup: ', error)
            return False

        del full_response

        # mobile page
        try:
            mobile_response = self.get_via_proxy('https://m.avito.ru' + subject.url, mobile=True)
            mobile_soup = BeautifulSoup(mobile_response.text, 'lxml')
        except Exception as error:
            print('get mobile soup: ', error)
            return False

        del mobile_response

        # phone: taken from the call link, keeping only the digits
        try:
            tel_tag = mobile_soup.find('a', {'data-marker': 'item-contact-bar/call'}).get('href').strip()
            phone = tel_tag.split(':')[1]
            subject.phone = ''.join(char for char in phone if char.isdigit())
            subject.save()
        except Exception as error:
            print('get phone: ', error)
            return False

        # author name (the exception is bound here; the original printed an
        # unbound name and crashed instead of reporting the problem)
        try:
            subject.author_name = mobile_soup.find('span', {'data-marker': 'seller-info/name'}).text.strip()
            subject.save()
        except Exception as error:
            print('get author name: ', error)

        # title (same fix as for the author name)
        try:
            subject.title = mobile_soup.find('h1', {'data-marker': 'item-description/title'}).find('span').text.strip()
            subject.save()
        except Exception as error:
            print('get title: ', error)

        # description
        try:
            subject.description = mobile_soup.find('meta', {'property': 'og:description'}).get('content').strip()
            subject.save()
        except Exception as error:
            print('get description: ', error)

        # price
        try:
            subject.price = mobile_soup.find('span', {'data-marker': 'item-description/price'}).text.strip()
            subject.save()
        except Exception as error:
            print('get price: ', error)

        # photos: read from the JSON assigned to window.__initialData__
        try:
            photos = []
            for script in mobile_soup.find('body').findAll('script'):
                script_text = script.text

                if 'window.__initialData__ =' not in script_text:
                    continue

                # NOTE(review): the fixed offsets presumably cut a 25-char
                # 'window.__initialData__ = ' prefix and a 6-char suffix -
                # confirm against the current page markup
                json_object = json.loads(script_text[25:-6])

                item = json_object['item']

                if 'currentItem' in item:
                    current_item = item['currentItem']
                elif 'item' in item:
                    current_item = item['item']
                else:
                    raise Exception('Фотки не получены (блок currentItem отсутствует)')

                # prefer the 640x480 rendition, fall back to 1280x960
                for image in current_item['images']:
                    if '640x480' in image:
                        photos.append(image['640x480'])
                    elif '1280x960' in image:
                        photos.append(image['1280x960'])

            if not photos:
                raise Exception('photos are empty!')

            subject.photos = photos
            subject.save()
        except Exception as error:
            print('get photos: ', error)

        # parameter fields from the desktop page ("name: value" items)
        try:
            param_fields = {}

            param_fields_soups = full_soup.find('ul', class_='item-params-list').find_all('li', class_='item-params-list-item')

            for param_fields_soup in param_fields_soups:
                # split on the first colon only, so values that contain a
                # colon are kept whole; items without a colon are skipped
                name, separator, value = param_fields_soup.text.strip().partition(':')
                if not separator:
                    continue
                param_fields[name] = value

            if not param_fields:
                raise Exception('param_fields are empty!')

            subject.fields = param_fields
            subject.save()
        except Exception as error:
            print('get fields: ', error)

        return True

def start_new_thread():
    """Thread entry point: build a daemon and block in its consume loop."""
    AvitoDaemon().start()

if __name__ == '__main__':
    # Launch five consumer threads; each one creates its own AvitoDaemon
    # (and therefore its own RabbitMQ connection) inside start_new_thread.
    workers = [threading.Thread(target=start_new_thread) for _ in range(5)]
    for worker in workers:
        worker.start()