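# -----------------------------------------------------------------------------
# Scraper for orgpage.ru company pages.
# Flow: pull an unprocessed URL from a local SQLite queue, fetch the page
# through a rotating proxy taken from a second SQLite DB, parse the company
# card with BeautifulSoup, and write the result into the `data` table.
# The table layouts sketched in comments below are assumptions inferred from
# the queries in this script, not a schema shipped with it.
# -----------------------------------------------------------------------------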
import requests
from bs4 import BeautifulSoup, UnicodeDammit
import lxml.html
import sqlite3
import threading
import time
import datetime
import re
from itertools import zip_longest
import logging
from fake_useragent import UserAgent


# UserAgent is a class and must be instantiated; the original bare
# `ua = UserAgent` assigned the class itself, not an instance.
ua = UserAgent()

logging.basicConfig(level='DEBUG')

# Database connection settings (raw strings keep the backslashes in the
# Windows paths from being read as escape sequences)
conn_proxy = sqlite3.connect(r'D:\ZP_Template\PROXY-CHECKER\PROXY.sqlite3')
cursor_proxy = conn_proxy.cursor()

conn = sqlite3.connect(r'D:\ZP_Template\СБОР-КОНТАКТОВ\orgpage_ru\Orpage.sqlite3')
cursor = conn.cursor()

def get_item_url_in_db():
    # returns the row as a tuple (url_item, rowid)
    url_item = cursor.execute("SELECT url_item, rowid FROM urls WHERE status IS NULL LIMIT 1").fetchone()
    return url_item


raw_url = get_item_url_in_db()
url = raw_url[0]
rowid = raw_url[1]
print(url)
print(rowid)
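
# The queries above assume a queue table roughly like the following; this
# schema is inferred from the SELECT/UPDATE statements, not taken from the
# original project:
#
#   CREATE TABLE urls (
#       url_item TEXT,    -- page URL to scrape
#       status   INTEGER  -- NULL = pending, 0 = in progress, 1 = done
#   );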


def update_item_status_in_db(num):
    # parameterized query instead of str.format, which is open to SQL
    # injection and breaks on quotes in the values
    cursor.execute("UPDATE urls SET status = ? WHERE rowid = ?", (num, rowid))
    conn.commit()  # commit the change to the DB


# region Fetch a proxy from the DB
def get_and_update_proxy_in_db():
    # returns the row as a tuple; ORDER BY lastUpdate picks the least
    # recently used proxy
    proxy_in_db = cursor_proxy.execute("SELECT proxy FROM PROXY ORDER BY lastUpdate ASC LIMIT 1").fetchone()
    data_t_proxy = (str(datetime.datetime.now()), str(proxy_in_db[0]))
    cursor_proxy.execute("UPDATE PROXY SET lastUpdate = ? WHERE proxy = ?", data_t_proxy)
    conn_proxy.commit()
    return proxy_in_db

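# The proxy rotation above assumes a table along these lines; again this is
# inferred from the queries, not from the project's actual DDL:
#
#   CREATE TABLE PROXY (
#       proxy      TEXT,  -- e.g. 'http://1.2.3.4:8080'
#       lastUpdate TEXT   -- ISO timestamp of last use
#   );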

def combine_proxy():
    proxy = get_and_update_proxy_in_db()[0]
    proxy_dict = {'http': proxy, 'https': proxy}
    # print('PROXY ' + proxy)
    # print(proxy_dict)
    return proxy_dict

# proxy_dict = combine_proxy()
# print('PP ' + str(proxy_dict))
# region Fetch the page

def get_page(url):

    headers = {"Host": "www.orgpage.ru",
               "Connection": "keep-alive",
               "Pragma": "no-cache",
               "Cache-Control": "no-cache",
               "DNT": "1",
               "Upgrade-Insecure-Requests": "1",
               "User-Agent": UserAgent().chrome,
               "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
               "Accept-Encoding": "gzip, deflate",
               "Accept-Language": "ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7"}

    try:
        print('5) loading the page')
        print('6) URL in get_page ' + url)
        proxy_dict = combine_proxy()
        print('IN GET ' + str(proxy_dict))
        response = requests.get(url, proxies=proxy_dict, timeout=10)  # headers=headers,
        if response.status_code == 200:
            print('7) Page loaded')
            print('Response type')
            # print(type(response.text))
            print(response.encoding)
            print(response.content)

            return response.content
    except requests.exceptions.RequestException as e:
        print("Error: " + str(e))
        print('-- -- Switching proxy')
        print('REFRESHING')
        print('-- -- New get_page')
        # the retry itself rotates the proxy via combine_proxy(); `return`
        # is needed here so the result propagates to the caller (the
        # original fell through and returned None)
        return get_page(url)
# endregion

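# Unbounded recursion in get_page can hit Python's recursion limit if many
# proxies fail in a row. A bounded retry loop is one possible alternative
# (a sketch, not part of the original script; `max_retries` is made up here):
#
#   def get_page_with_retries(url, max_retries=10):
#       for _ in range(max_retries):
#           try:
#               response = requests.get(url, proxies=combine_proxy(), timeout=10)
#               if response.status_code == 200:
#                   return response.content
#           except requests.exceptions.RequestException:
#               continue  # rotate to the next proxy and try again
#       return None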
doc_source2 = get_page(url)
print('PRRR ' + str(doc_source2))

# UnicodeDammit is imported from bs4 at the top of the file
def decode_html(doc_source):
    print('Conversion function')
    converted = UnicodeDammit(doc_source)
    if not converted.unicode_markup:
        # UnicodeDecodeError needs five positional arguments, so raise a
        # plain UnicodeError with the message formatted in
        raise UnicodeError(
            "Failed to detect encoding, tried [%s]" %
            ', '.join(map(str, converted.tried_encodings)))
    # print(converted.original_encoding)
    print(converted.unicode_markup)
    return converted.unicode_markup


def parsing_page(doc_source2):
    print('4) Parsing the data:')

    # print(type(doc_source))
    # print(doc_source)
    # if doc_source is None:
    #     print("no value")
    #     print(doc_source)
    #     get_page(url)
    #     parsing_page(doc_source)
    #     # doc_source = get_page(url)

    doc = BeautifulSoup(doc_source2, 'lxml')
    print(type(doc))
    # doc = doc_source

    title = doc.find('h1')
    # title.span.decompose()
    # title = re.sub(r'\s+', ' ', title)
    title = title.text.strip()
    print(title)

    # # Subtitle
    # try:
    #     sub_header = doc.find('div', class_='company-header__text')
    #     sub_header = sub_header.text.strip()
    # except:
    #     sub_header = ''
    # print(sub_header)

    # NOTE: most of the commented-out block below uses lxml-style .xpath()
    # calls, which do not exist on a BeautifulSoup object; it would need
    # `doc` to be an lxml.html document (e.g. lxml.html.fromstring(...))
    # to run as written.

    # # region Legal form
    # try:
    #     prav_form = doc.xpath('//h1/span')[0]
    #     prav_form = prav_form.text.strip()
    # except:
    #     prav_form = ''
    # # endregion

    # # region Website
    # try:
    #     site = doc.find('a', class_='nofol-link')
    #     site = site.text.strip()
    # except:
    #     site = ''
    # # endregion

    # # region Phones
    # try:
    #     phone = []
    #     phone_n = doc.xpath(
    #         '//ul[@class="company-information__phone-list"]/li/span[@class="company-information__phone"]')
    #     phone_comm = doc.xpath(
    #         '//ul[@class="company-information__phone-list"]/li/span[@class="company-information__phone-name"]')
    #
    #     for a_, b_ in zip_longest(phone_n, phone_comm, fillvalue=" "):
    #         if len(phone_comm) > 0:
    #             phone_all = '{}{}'.format(a_.text.strip(), b_.text.strip())
    #             phone.append(phone_all)
    #         else:
    #             phone.append(a_.text.strip())
    #     phones = ', '.join(phone)
    # except:
    #     phones = ''
    # # print(phones)
    # # endregion

    # # region social
    # try:
    #     social = []
    #     for s in doc.xpath('//ul[@class="company-information__social"]/li/a'):
    #         social.append(s.get('href'))
    #     socials = ', '.join(social)
    # except:
    #     social = ''
    # # endregion

    # # region Activity areas
    # rubriks = []
    # try:
    #     for i in range(len(doc.xpath('//div[@class="about rubrics-list"]/div'))):
    #
    #         try:
    #             r_root = doc.xpath(f'//div[@class="about rubrics-list"]/div[{i + 1}]/div[@class="rubrics-content rubrics-content--title"]/a')[0]
    #             rub1 = r_root.text.strip()
    #         except:
    #             rub1 = ""
    #             # print('Root - not found')
    #
    #         try:
    #             r2 = doc.xpath(f'//div[@class="about rubrics-list"]/div[{i + 1}]/div[2]/div/a')[0]
    #             rub2 = r2.text
    #             rub2 = re.sub(r' в.*', '', rub2).strip()
    #         except:
    #             rub2 = ""
    #             # print('Subrubric 2 - not found')
    #
    #         try:
    #             r3 = doc.xpath(f'//div[@class="about rubrics-list"]/div[{i + 1}]/div[3]/div/a')[0]
    #             rub3 = r3.text
    #             rub3 = re.sub(r' в.*', '', rub3).strip()
    #         except:
    #             rub3 = ""
    #             # print('Subrubric 3 - not found')
    #
    #         try:
    #             r4 = doc.xpath(f'//div[@class="about rubrics-list"]/div[{i + 1}]/div[4]/div/a')[0]
    #             rub4 = r4.text
    #             rub4 = re.sub(r' в.*', '', rub4).strip()
    #             # print('Subrubric: ' + rr2)
    #         except:
    #             rub4 = ""
    #             # print('Subrubric - not found')
    #
    #         rub = str(rub1) + '|' + str(rub2) + '|' + str(rub3) + '|' + str(rub4)
    #         rub = re.sub(r'\|+$', '', rub).strip()
    #         rubriks.append(rub)
    #     rubrika = ', '.join(rubriks)
    # except:
    #     rubrika = ''
    # # endregion

    # # region Address
    # try:
    #     country = doc.xpath('//span[@itemprop="addressCountry"]')[0]
    #     country = country.text.strip()
    # except:
    #     country = ''
    #
    # # City
    # try:
    #     city = doc.xpath('//div[@class="company-information__address-text"]/span')[0]
    #     city = city.text.strip()
    # except:
    #     city = ''
    #
    # try:
    #     postal_code = doc.xpath('//span[@itemprop="postalCode"]')[0]
    #     postal_code = postal_code.text.strip()
    # except:
    #     postal_code = ''
    #
    # try:
    #     address = doc.xpath('//span[@itemprop="streetAddress"]')[0]
    #     address = address.text.strip()
    # except:
    #     address = ''
    #
    # # Metro
    # try:
    #     metro_list = []
    #     metro_s = doc.xpath('//span[@class="metro-item__name"]')
    #     for m in metro_s:
    #         # print(m.text.strip())
    #         station = m.text.strip()
    #         metro_list.append(station)
    #     metro = ', '.join(metro_list)
    # except:
    #     metro = ''
    # # endregion

    # # region Working hours
    # try:
    #     work_time = doc.xpath('//div[@class="period"]/p')[0]
    #     work_time = work_time.text.strip()
    # except:
    #     work_time = ''
    # # endregion

    # # region Coordinates
    # try:
    #     coordinate_latitude = doc.xpath('//meta[@itemprop="latitude"]')[0].get('content')
    #     coordinate_longitude = doc.xpath('//meta[@itemprop="longitude"]')[0].get('content')
    #     coordinate = f'{coordinate_latitude},{coordinate_longitude}'
    # except:
    #     coordinate = ''
    # # endregion

    # # 'Подзаголовок': sub_header,
    # # 'Email': email,

    # out_data = {'url': url,
    #             'Title': title,
    #             'Телефон': phones,
    #             'Рубрика': rubrika,
    #             'Страна': country,
    #             'Город': city,
    #             'Индекс': postal_code,
    #             'Адрес': address,
    #             'Удаленность_от_остановки': metro,
    #             'GPS_координаты': coordinate,
    #             'Время_работы': work_time}
    #
    # return out_data


def import_in_db(parsing_data):
    print('3) import into the DB')
    # try:
    values = parsing_data
    # build "INSERT INTO data (col1, ...) VALUES (:col1, ...)" with named
    # placeholders so sqlite3 binds the values safely
    columns = ', '.join(values.keys())
    placeholders = ':' + ', :'.join(values.keys())
    query = 'INSERT INTO data (%s) VALUES (%s)' % (columns, placeholders)
    # print(query)
    print(' '.join(str(v) for v in values.values()))
    cursor.execute(query, values)
    conn.commit()

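# import_in_db builds its column list from the dict keys returned by
# parsing_page, so the `data` table is assumed to have matching columns
# (url, Title, Телефон, Рубрика, ...). This is inferred from the code,
# not from any schema included in the original.
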
parsing_page(doc_source2)

# def main():
#
#     get_item_url_in_db()
#     update_item_status_in_db(num=0)
#     print('Status updated - 0')
#
#     # get_and_update_proxy_in_db()
#
#     doc_source2 = get_page(url)
#     parsing_data = parsing_page(doc_source2)
#
#     import_in_db(parsing_data)
#
#     update_item_status_in_db(num=1)
#     print('Status updated - 1')
#
#
# if __name__ == "__main__":
#     main()