Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # coding: utf-8
- import requests
- from lxml import html
- from threading import Thread, Lock
- from Queue import Queue
- from abc import abstractmethod
class MarketSearcher(object):
    """Base class for searching a product in an online shop.

    The class holds the generic search workflow only. A concrete subclass
    must define the shop-specific attributes read by the methods below:
    ``url``, ``_request_params``, ``_not_found_xpath``, ``_code_phrase``,
    ``_pages_xpath``, ``_one_product_url``, ``_one_name_xpath``,
    ``_one_price_xpath``, ``_many_url``, ``_many_name_xpath`` and
    ``_many_price_xpath``.
    """

    # Upper bound on the number of page-fetching threads alive at once.
    _MAX_THREADS = 12

    def __init__(self, product=''):
        """Store the product name and reset the search state.

        :param product: product name; a byte string is decoded as UTF-8,
            a text (unicode) string is stored unchanged.
        """
        # Decode byte strings only: calling ``.decode`` on a non-ASCII
        # unicode object raises UnicodeEncodeError on Python 2.
        if isinstance(product, bytes):
            product = product.decode('utf-8')
        self.product = product
        self._success = None  # parsed first result page, or None
        self.page = None      # number of result pages, once known
        self.result = []      # list of (names, prices) tuples, one per page

    def _check_input(self):
        """Prompt on the console until a non-empty product name is set."""
        while not self.product:
            self.product = raw_input(
                'Product (type "exit" to exit): ').decode('utf-8')

    def _found_product(self):
        """Query the shop and report whether anything was found.

        On success the parsed page is kept in ``_success`` and the final
        response URL in ``url``.

        :return: True when the "not found" phrase is absent from the page.
        """
        respond = requests.get(self._request_params,
                               {'find': self.product,
                                'sidx': 'price', 'sord': 'asc'})
        parsed = html.fromstring(respond.text)
        message = ''.join(parsed.xpath(self._not_found_xpath))
        if self._code_phrase in message:
            print('Nothing found')
            self._success = None
            return False
        print('Product found')
        self._success = parsed
        self.url = respond.url
        return True

    def _get_pages_num(self):
        """Determine how many result pages the shop returned.

        Pager labels that are not integers (for example an ellipsis) are
        skipped instead of discarding every label; with no numeric label
        at all the result is a single page.

        :return: the page count, also stored in ``self.page``.
        """
        numbers = []
        for label in self._success.xpath(self._pages_xpath):
            try:
                numbers.append(int(label))
            except ValueError:
                continue
        self.page = max(numbers) if numbers else 1
        return self.page

    def _parse(self, pages, mutex):
        """Fetch one result page and store its names and prices.

        Intended to run as a thread target: takes exactly one page number
        from ``pages`` and appends a ``(names, prices)`` tuple to
        ``self.result`` while holding ``mutex``.

        :param pages: queue of page numbers still to be fetched.
        :param mutex: lock guarding ``self.result``.
        """
        page = pages.get()
        respond = requests.get(self._many_url,
                               {'find': self.product,
                                'page': page, 'sidx': 'price',
                                'sord': 'asc'})
        parsed = html.fromstring(respond.text)
        name = parsed.xpath(self._many_name_xpath)
        price = parsed.xpath(self._many_price_xpath)
        with mutex:
            self.result.append((name, price))

    def _get_result(self):
        """Collect names and prices from the product page or all result pages.

        :return: ``self.result``, a list of ``(names, prices)`` tuples.
        """
        if self._one_product_url in self.url:
            # The search was redirected straight to a single product page.
            name = self._success.xpath(self._one_name_xpath)
            price = self._success.xpath(self._one_price_xpath)
            self.result = [(name, price)]
            return self.result

        pages = Queue()
        for num in range(1, self.page + 1):
            pages.put(num)
        lock = Lock()
        # Each _parse call consumes exactly one page, so run batches of at
        # most _MAX_THREADS threads until every queued page is processed
        # (a single batch of 12 threads would silently drop pages 13+).
        # NOTE: results are appended in thread completion order, so their
        # position in self.result need not match the site's page number.
        remaining = self.page
        while remaining > 0:
            batch = min(remaining, self._MAX_THREADS)
            threads = [Thread(target=self._parse, args=(pages, lock))
                       for _ in range(batch)]
            for thread in threads:
                thread.start()
            for thread in threads:
                thread.join()
            remaining -= batch
        return self.result

    def _output(self):
        """Write the collected results to ``Result.txt`` encoded as cp1251."""
        with open('Result.txt', 'w') as out:
            for number, page in enumerate(self.result):
                out.write('\n\nPage %-100s\t\n\n' % (number + 1))
                for name, price in zip(*page):
                    line = '%-100s\t%s\n' % (name, price)
                    # 'replace' keeps one character that cp1251 cannot
                    # represent from aborting the whole report.
                    out.write(line.encode('cp1251', 'replace'))

    def _end_search(self):
        """Reset the per-search state so the next query starts clean."""
        self.result = []
        self.page = None
        self._success = None
        self.product = ''

    # The original @abstractmethod decorator was dropped: it has no effect
    # without an ABCMeta metaclass, and this method is a concrete
    # implementation that subclasses inherit and call.
    def start(self):
        """Run the interactive search loop until the user types ``exit``."""
        while True:
            self._check_input()
            if self.product == 'exit':
                break
            if self._found_product():
                self._get_pages_num()
                self._get_result()
                self._output()
            self._end_search()
class TopShopSearch(MarketSearcher):
    """Product searcher configured for the Top-Shop online store."""

    def __init__(self, product=''):
        """Set the URLs, XPath expressions and marker phrase for Top-Shop.

        :param product: product name, passed on to ``MarketSearcher``.
        """
        super(TopShopSearch, self).__init__(product)

        site = 'http://www.top-shop.ru/'
        search_url = site + 'search/'

        # Shop root; ``url`` starts at the root until a search sets it.
        self.market = site
        self.url = site

        # Search endpoint, used for the first request and for page requests.
        self._request_params = search_url
        self._many_url = search_url

        # Where the "nothing found" message lives and the phrase it contains.
        self._not_found_xpath = '//div[@class="result_text"]/text()'
        self._code_phrase = 'мы не нашли товаров'.decode('utf-8')

        # Pager entries holding the page numbers.
        self._pages_xpath = '//li[@class=" js_page"]/@data-num'

        # Single-product page: URL marker plus name and price locations.
        self._one_product_url = '/product/'
        self._one_name_xpath = '//body/div[6]/div[2]/div/div/div/div/h1/text()'
        self._one_price_xpath = '//body/div[6]/div[3]/div[3]/div/div[2]/div/div[1]/text()'

        # Result-list page: name and price locations.
        self._many_name_xpath = '//span[@class="hidden js_ectrack"]/@data-name'
        self._many_price_xpath = '//span[@class="hidden js_ectrack"]/@data-price'
class EldoShopSearch(MarketSearcher):
    """Product searcher configured for the Eldorado online store."""

    def __init__(self, product=''):
        """Set the URLs, XPath expressions and marker phrase for Eldorado.

        :param product: product name, passed on to ``MarketSearcher``.
        """
        super(EldoShopSearch, self).__init__(product)

        site = 'http://www.eldorado.ru/'
        catalog_url = site + 'search/catalog.php'

        # Shop root; ``url`` starts at the root until a search sets it.
        self.market = site
        self.url = site

        # Search endpoint, used for the first request and for page requests.
        self._request_params = catalog_url
        self._many_url = catalog_url

        # Where the "nothing found" message lives and the phrase it contains.
        self._not_found_xpath = '//p[@class="paragraph searchInfoTitle"]/text()'
        self._code_phrase = 'ничего не найдено'.decode('utf-8')

        # Pager links holding the page numbers.
        self._pages_xpath = '//div[@class="pages"]/a/text()'

        # Single-product page: URL marker plus name and price locations.
        self._one_product_url = '/cat/detail'
        self._one_name_xpath = '//h1[@itemprop="name"]/text()'
        self._one_price_xpath = '//td/span[@itemprop="price"]/text()'

        # Result-list page: name and price locations.
        self._many_name_xpath = '//div[@class="itemDescription"]/div[@class="itemTitle"]/a/text()'
        self._many_price_xpath = '//div[@class="priceContainer"]/div/span[@class="discountPrice itemPrice"]/text()'

    def _search_params(self, page=None):
        """Build the query parameters for an Eldorado search request.

        :param page: page number to request; omitted from the query when None.
        :return: dict of query parameters.
        """
        params = {'q': self.product}
        if page is not None:
            params['page'] = page
        params['sort'] = 'price'
        params['type'] = 'asc'
        params["list_num"] = 50
        return params

    def _parse(self, pages, mutex):
        """Fetch one result page using Eldorado's query parameters.

        Takes one page number from ``pages`` and appends the page's
        ``(names, prices)`` tuple to ``self.result`` while holding ``mutex``.

        :param pages: queue of page numbers still to be fetched.
        :param mutex: lock guarding ``self.result``.
        """
        number = pages.get()
        response = requests.get(self._many_url, self._search_params(number))
        tree = html.fromstring(response.text)
        names = tree.xpath(self._many_name_xpath)
        prices = tree.xpath(self._many_price_xpath)
        with mutex:
            self.result.append((names, prices))

    def _found_product(self):
        """Run the search using Eldorado's query parameters.

        On success the parsed page is kept in ``_success`` and the final
        response URL in ``url``.

        :return: True when the "not found" phrase is absent from the page.
        """
        response = requests.get(self._request_params, self._search_params())
        tree = html.fromstring(response.text)
        message = ''.join(tree.xpath(self._not_found_xpath))
        if self._code_phrase not in message:
            print('Product found')
            self._success = tree
            self.url = response.url
            return True
        print('Nothing found')
        self._success = None
        return False
if __name__ == '__main__':
    # Script entry point: run an interactive Top-Shop search session.
    TopShopSearch().start()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement