Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import requests
- from bs4 import BeautifulSoup
- import lxml.html
- import sqlite3
- import threading
- import time
- import datetime
- import re
- from itertools import zip_longest
- import logging
- from fake_useragent import UserAgent
# Bug fix: `UserAgent` without parentheses bound the *class*, not an instance,
# so attribute access like `ua.chrome` would fail at call sites.
ua = UserAgent()
logging.basicConfig(level='DEBUG')
# Database connection parameters.
# Raw strings keep the Windows backslashes literal: '\Z' etc. only work today
# because Python leaves unknown escapes alone, which is deprecated behaviour.
# The resulting path values are byte-identical to the originals.
conn_proxy = sqlite3.connect(r'D:\ZP_Template\PROXY-CHECKER\PROXY.sqlite3')
cursor_proxy = conn_proxy.cursor()
conn = sqlite3.connect(r'D:\ZP_Template\СБОР-КОНТАКТОВ\orgpage_ru\Orpage.sqlite3')
cursor = conn.cursor()
def get_item_url_in_db():
    """Fetch the next unprocessed (url_item, rowid) pair, or None when done.

    Reads from the module-level `cursor`; a row qualifies while its status
    column is still NULL.
    """
    row = cursor.execute(
        "SELECT url_item, rowid FROM urls WHERE status IS NULL LIMIT 1"
    ).fetchone()  # fetchone() yields a tuple or None
    return row
# Pull the next unprocessed URL from the queue.
# Bug fix: fetchone() returns None once the queue is empty, which previously
# crashed here with `TypeError: 'NoneType' object is not subscriptable`.
raw_url = get_item_url_in_db()
if raw_url is None:
    raise SystemExit('No unprocessed URLs left in the database.')
url = raw_url[0]
rowid = raw_url[1]
print(url)
print(rowid)
def update_item_status_in_db(num):
    """Set the status column of the row currently being processed.

    Uses the module-level `cursor`, `conn` and `rowid`.
    Bug fix: the original interpolated values with str.format inside quoted
    SQL, which is injection-prone and quoting-fragile; bound parameters are
    used instead.
    """
    cursor.execute("UPDATE urls SET status = ? WHERE rowid = ?", (num, rowid))
    conn.commit()  # persist the change
# region Fetch a proxy from the DB
def get_and_update_proxy_in_db():
    """Return the least-recently-used proxy as a 1-tuple and touch its timestamp.

    Picks the row with the oldest lastUpdate, stamps it with the current time
    so rotation cycles through all proxies, and commits the change.
    """
    row = cursor_proxy.execute(
        "SELECT proxy FROM PROXY ORDER BY lastUpdate ASC LIMIT 1"
    ).fetchone()  # fetchone() yields a tuple
    stamp = (str(datetime.datetime.now()), str(row[0]))
    cursor_proxy.execute("UPDATE PROXY SET lastUpdate = ? WHERE proxy = ?", stamp)
    conn_proxy.commit()
    return row
def combine_proxy():
    """Build a requests-style proxies mapping from the next proxy in the DB."""
    proxy = get_and_update_proxy_in_db()[0]
    return {'http': proxy, 'https': proxy}
# region Fetch the page
def get_page(url, retries=5):
    """Download *url* through a rotating proxy; return the raw body bytes.

    Returns None when the response status is not 200 or once *retries*
    retry attempts are exhausted.  On a network error a fresh proxy is taken
    from the DB and the request is retried.

    Bug fixes: the recursive retry previously discarded its result (missing
    ``return``), so even a successful retry yielded None to the caller, and
    the recursion was unbounded; `retries` caps it backward-compatibly.
    """
    headers = {"Host": "www.orgpage.ru",
               "Connection": "keep-alive",
               "Pragma": "no-cache",
               "Cache-Control": "no-cache",
               "DNT": "1",
               "Upgrade-Insecure-Requests": "1",
               "User-Agent": UserAgent().chrome,
               "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
               "Accept-Encoding": "gzip, deflate",
               "Accept-Language": "ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7"}
    try:
        print('5) грузим страницу')
        print('6) УРЛ в гет page ' + url)
        proxy_dict = combine_proxy()
        print('IN GET' + str(proxy_dict))
        # NOTE(review): headers are built but deliberately not sent, matching
        # the original call — confirm whether they should be enabled.
        response = requests.get(url, proxies=proxy_dict, timeout=10)  # headers=headers,
        if response.status_code == 200:
            print('7) Страница загружена')
            print('Тип ответа')
            print(response.encoding)
            print(response.content)
            return response.content
        return None  # non-200: nothing usable, same as the original fall-through
    except requests.exceptions.RequestException as e:
        print("Ошибка: " + str(e))
        if retries <= 0:
            return None  # retry budget exhausted
        print('-- -- Меняем прокси')
        print('ОБНОВЛЯЕМ')
        combine_proxy()  # original behaviour: rotate an extra proxy before retrying
        print('-- -- Новый get_page')
        return get_page(url, retries - 1)  # fix: propagate the retry's result
# endregion
# Fetch the page for the URL pulled from the queue and show what came back.
doc_source2 = get_page(url)
print(f'PRRR {doc_source2}')
- from bs4 import UnicodeDammit
def decode_html(doc_source):
    """Decode raw HTML bytes to str using bs4's charset detection.

    Raises ValueError when UnicodeDammit fails to determine an encoding.
    Bug fix: the original raised UnicodeDecodeError with the wrong
    constructor arguments (it needs encoding/object/start/end/reason), which
    itself blew up with TypeError, and the '%s' placeholder was never
    interpolated.  ValueError is the superclass of UnicodeDecodeError, so
    existing broad handlers still match.
    """
    print('Функция - конвертации')
    converted = UnicodeDammit(doc_source)
    if not converted.unicode_markup:
        raise ValueError(
            "Failed to detect encoding, tried [%s]"
            % ', '.join(converted.tried_encodings)
        )
    print(converted.unicode_markup)
    return converted.unicode_markup
def parsing_page(doc_source2):
    """Parse a company page and return a dict of extracted fields.

    Only the fields the live code actually extracted (url, title) are
    returned.  The large commented-out extraction section has been removed:
    it was dead code and called ``doc.xpath(...)`` on a BeautifulSoup object,
    which is an lxml API that BeautifulSoup does not provide, so it could
    never have run as written.

    Bug fixes: the function previously returned None (no return statement),
    so `import_in_db` could never receive data; a missing <h1> previously
    crashed with AttributeError on `.text`.
    """
    print('4)Парсим данные:')
    doc = BeautifulSoup(doc_source2, 'lxml')
    print(type(doc))
    h1 = doc.find('h1')
    title = h1.text.strip() if h1 is not None else ''
    print(title)
    # `url` is the module-level URL currently being processed.
    return {'url': url, 'Title': title}
def import_in_db(parsing_data):
    """Insert one parsed record (field-name -> value dict) into table `data`.

    Column and named-placeholder lists are generated from the dict keys; the
    values themselves are bound as parameters, so they are safely escaped.
    """
    print('3 импорт в БД')
    values = parsing_data
    columns = ', '.join(values.keys())
    placeholders = ':' + ', :'.join(values.keys())
    query = 'INSERT INTO data (%s) VALUES (%s)' % (columns, placeholders)
    # Bug fix: str() each value so the debug print cannot crash on
    # non-string fields (the original ' '.join raised TypeError for those).
    print(' '.join(str(v) for v in values.values()))
    cursor.execute(query, values)
    conn.commit()
# Guard: get_page returns None on failure; BeautifulSoup would crash on it.
if doc_source2 is not None:
    parsing_page(doc_source2)
- #
- # def main() :
- #
- # get_item_url_in_db()
- # update_item_status_in_db(num=0)
- # print ( 'Обновили статус - 0' )
- #
- # #get_and_update_proxy_in_db()
- #
- # doc_source2 = get_page(url)
- # parsing_data = parsing_page(doc_source2)
- #
- # import_in_db(parsing_data)
- #
- # update_item_status_in_db(num=1)
- # print( 'Обновили статус - 1' )
- #
- #
- # if __name__ == "__main__" :
- # main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement