Advertisement
Not a member of Pastebin yet? Sign up — it unlocks many cool features!
- import asyncio
- import nest_asyncio
- import pyppeteer
- from pyppeteer import launch
- from pprint import pprint
- import requests
- import base64
- import json
- import os
- import aiofiles
- import random
- from pyquery import PyQuery as pq
- import aiosqlite
- from datetime import datetime
- import regex
- from urllib.parse import urlparse
- import idna
- import configparser
- from flask import Flask, flash, redirect, render_template, request, session, abort, url_for
- from multiprocessing import Process
- import sqlite3
- import tkinter as tk
- from tkinter import filedialog
- from tkinter.ttk import Progressbar
- from tkinter import OptionMenu
- from tkinter import StringVar
- from tkinter import messagebox
- from tkinter import Label
- from tkinter import Entry
- from tkinter import Button
- import tkinter.font as font
- import psutil
# Directory that contains this script; the data files named in settings.ini
# are resolved against it.
BASEDIR = os.path.dirname(os.path.realpath(__file__))

# settings.ini is looked up next to the script first and then in the current
# working directory. ConfigParser.read() skips files that do not exist and
# lets later files override earlier ones, so a settings.ini in the working
# directory still takes effect exactly as before, while launching the program
# from another directory no longer yields an empty configuration.
config = configparser.ConfigParser()
config.read([os.path.join(BASEDIR, 'settings.ini'), 'settings.ini'])
def get_as_base64(url, timeout=30):
    """Download *url* and return the response body encoded as base64 text.

    Args:
        url: Address of the resource to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely on a stalled connection.

    Returns:
        The base64 encoding of the raw response bytes, as a ``str``.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    response = requests.get(url, timeout=timeout)
    return base64.b64encode(response.content).decode('utf8')
def dict_factory(cursor, row):
    """Row factory that turns a result row into a ``{column_name: value}`` dict.

    Args:
        cursor: Cursor whose ``description`` holds one entry per selected
            column, the column name being the first item of each entry.
        row: Sequence of values for a single result row.

    Returns:
        A dict mapping each column name to the value at the same position.
    """
    return {column[0]: value for column, value in zip(cursor.description, row)}
def _read_nonblank_lines(path):
    """Return the stripped, non-empty lines of the UTF-8 text file at *path*."""
    with open(path, mode='r', encoding="utf-8") as handle:
        stripped = (line.strip() for line in handle)
        # Lines read from a file keep their newline, so blank lines have to be
        # filtered after stripping; otherwise they survive as empty entries.
        return [line for line in stripped if line]


# Proxy addresses, one per line in the file named by the "Proxies" setting.
proxies = _read_nonblank_lines(os.path.join(BASEDIR, config['PARSER']['Proxies']))

# Search queries, one per line in the file named by the "Queries" setting.
# They are shuffled so that each run visits them in a different order.
queries = _read_nonblank_lines(os.path.join(BASEDIR, config['PARSER']['Queries']))
random.shuffle(queries)
# Text of the label that marks a search result as an advertisement.
_AD_LABEL = 'реклама'

# Upper bound on how many times the captcha result is requested.
_CAPTCHA_POLL_ATTEMPTS = 10


async def _solve_captcha(image_url):
    """Ask the CapMonster service from settings.ini to read a captcha image.

    Args:
        image_url: Address of the captcha picture shown on the page.

    Returns:
        The recognised text, or None when the picture cannot be downloaded,
        the service cannot be reached, or it returns no solution.
    """
    # NOTE(review): the client key is hard-coded in the source; consider
    # moving it into settings.ini.
    client_key = "dce6bcbb1a728ea8d563de6d169a2057"
    endpoint = 'http://' + config['PARSER']['CapMonster']
    try:
        b64pic = get_as_base64(image_url)
        url = endpoint + '/createTask'
        print(url)
        task = {
            "clientKey": client_key,
            "task": {
                "type": "ImageToTextTask",
                "body": b64pic,
                "phrase": False,
                "case": False,
                "numeric": False,
                "math": 0,
                "minLength": 0,
                "maxLength": 0
            }
        }
        # NOTE(review): requests is synchronous, so these calls pause the
        # event loop (and every other running fetch) while they wait.
        created = requests.post(url, data=json.dumps(task)).json()
        print(created)
        task_id = created.get('taskId')
        if task_id is None:
            # The service rejected the task; there is nothing to poll for.
            return None
        poll_body = json.dumps({"clientKey": client_key, "taskId": task_id})
        for _ in range(_CAPTCHA_POLL_ATTEMPTS):
            answer = requests.post(endpoint + '/getTaskResult', data=poll_body).json()
            print(answer)
            # Presumably the service answers with status "processing" until
            # the solution is ready (TODO confirm against the CapMonster
            # API); any other answer ends the polling.
            if answer.get('status') != 'processing':
                break
            await asyncio.sleep(1)
    except (requests.RequestException, ValueError) as e:
        # Network failure or a reply that is not valid JSON.
        print(e)
        return None
    solution = answer.get('solution')
    if not isinstance(solution, dict):
        return None
    return solution.get('text')


def _resolve_ad_site(link):
    """Follow an ad link and return the host name it redirects to.

    The landing page is searched for a ``;URL='...'`` fragment and the host
    name of that address is returned, decoded with ``idna``.

    Returns:
        The decoded host name, or None when the page cannot be fetched or
        contains no such redirect.
    """
    try:
        content = requests.get(link, allow_redirects=True).text
    except requests.RequestException as e:
        print(e)
        return None
    match = regex.search(r'\;URL=\'.*\'', content)
    if match is None:
        return None
    # Drop the leading ";URL='" (six characters) and the closing quote.
    target = match.group(0)[6:-1]
    hostname = urlparse(target).hostname
    if hostname is None:
        return None
    try:
        return idna.decode(hostname)
    except idna.IDNAError:
        return hostname


def _collect_ad_sites(html, query):
    """Extract the advertiser sites from a search result page.

    NOTE(review): the original indentation of this logic was lost; the
    nesting below (every advertisement contributes one site, names containing
    Cyrillic are resolved through their link) is the most plausible reading.

    Args:
        html: Markup of the result page.
        query: Search query, used only for log output.

    Returns:
        A list with one site per advertisement, in page order. It is empty
        when the page has no result items or no advertisements.
    """
    items = pq(html)("li.serp-item")
    print('len', len(items))
    sites = []
    for item in items:
        entry = pq(item)
        label = entry("div.label")
        if not len(label) or label.text() != _AD_LABEL:
            continue
        site = entry('a.link_theme_outer > b').text()
        if regex.search(r'\p{IsCyrillic}', site) is not None:
            link = entry('a.link_theme_outer').attr('href')
            if link:
                if regex.search(r'market\.yandex\.ru', link) is not None:
                    # Advertisements pointing at market.yandex.ru are skipped.
                    continue
                print(query, link)
                resolved = _resolve_ad_site(link)
                if resolved is not None:
                    site = resolved
        sites.append(site)
    return sites


async def _store_result(query, sites):
    """Insert or update the row for *query* in the results table."""
    now = int(datetime.now().timestamp())
    # Bound parameters keep quotes inside the query text or the site names
    # from breaking the statement.
    sql = (
        "INSERT INTO results (`text`, sites, updated, busy, file) "
        "VALUES(?, ?, ?, ?, ?) "
        "ON CONFLICT(`text`) DO UPDATE SET sites=excluded.sites, "
        "updated=excluded.updated, busy=excluded.busy, file=excluded.file"
    )
    values = (query, json.dumps(sites), now, 1, config['PARSER']['Queries'])
    async with aiosqlite.connect(os.path.join(BASEDIR, config['PARSER']['Database'])) as db:
        await db.execute(sql, values)
        await db.commit()


async def _close_browser(browser):
    """Close *browser*, logging instead of raising if the connection is gone."""
    try:
        await browser.close()
    except pyppeteer.errors.NetworkError as e:
        print(e)


async def fetch(query, proxy):
    """Search Yandex for *query* through *proxy* and store the ad sites found.

    A headless browser is launched with the given proxy, the query is typed
    into the search box on https://yandex.ru, and a captcha, when one is
    shown, is passed to the CapMonster service. The sites of the
    advertisements on the result page are written to the results table.

    Args:
        query: Search text.
        proxy: Proxy address passed to the browser as ``--proxy-server``.

    Returns:
        ``(query, True)`` when at least one site was stored, ``(query, False)``
        when the page could not be loaded, the captcha was not solved, or no
        advertisement was found.
    """
    print('START PROCESS')
    browser = await launch(headless=True, args=['--no-sandbox', '--proxy-server=' + proxy])
    try:
        page = await browser.newPage()
        await page.authenticate({
            'username': config['PARSER']['Login'],
            'password': config['PARSER']['Password'],
        })
        await page.setJavaScriptEnabled(False)
        await page.setRequestInterception(True)

        async def intercept(request):
            # Images and scripts are not needed for parsing, so they are
            # aborted instead of downloaded.
            try:
                if request.resourceType in ('image', 'script'):
                    await request.abort()
                else:
                    await request.continue_()
            except pyppeteer.errors.NetworkError as e:
                print(e)

        page.on('request', lambda req: asyncio.ensure_future(intercept(req)))
        await page.goto('https://yandex.ru', {'timeout': 100000})
        print("AFTER GOTO")
        await page.focus('input[name=text]')
        await page.keyboard.type(query)
        await page.keyboard.press("Enter")
        await page.waitForNavigation({'timeout': 100000})
        html = await page.content()

        # An input named "rep" means a captcha is shown instead of results.
        if len(pq(html)("input[name=rep]")):
            captcha_lnk = pq(html)('div.captcha__image img').attr('src')
            print(captcha_lnk)
            rep = await _solve_captcha(captcha_lnk)
            if rep is None:
                return (query, False)
            await page.focus('input[name=rep]')
            await page.keyboard.type(rep)
            await page.keyboard.press("Enter")
            await page.waitForNavigation({'timeout': 100000})
            html = await page.content()
            if len(pq(html)("input[name=rep]")):
                print('wrong capcha')
                return (query, False)

        sites = _collect_ad_sites(html, query)
        print(sites)
        if not sites:
            return (query, False)
        await _store_result(query, sites)
        print('DONE COMPLETE')
        return (query, True)
    except (pyppeteer.errors.TimeoutError, pyppeteer.errors.PageError,
            pyppeteer.errors.NetworkError, pyppeteer.errors.ElementHandleError,
            requests.RequestException) as e:
        print(e)
        return (query, False)
    finally:
        # Single cleanup point: the browser is closed on every exit path,
        # including exceptions that are not handled above.
        await _close_browser(browser)
async def run(proxies, *, pending=None, worker=None, concurrency=5, max_attempts=3):
    """Process every query with a bounded number of concurrent workers.

    Each query is handed to *worker* together with a randomly chosen proxy.
    A query whose worker reports failure (second item of its result is
    falsy), raises, or is cancelled is queued again with a new random proxy
    until it succeeds or has been tried *max_attempts* times.

    Args:
        proxies: Non-empty sequence of proxy addresses to choose from.
        pending: Queries to process; defaults to the module-level ``queries``.
        worker: Coroutine function called as ``worker(query, proxy)`` that
            returns ``(query, succeeded)``; defaults to the module-level
            ``fetch``.
        concurrency: Maximum number of workers running at the same time.
        max_attempts: Maximum number of tries per query.

    Returns:
        None.

    Raises:
        ValueError: If *concurrency* or *max_attempts* is smaller than 1.
        IndexError: If *proxies* is empty while there are queries to process.
    """
    if pending is None:
        pending = queries
    if worker is None:
        worker = fetch
    if concurrency < 1 or max_attempts < 1:
        raise ValueError('concurrency and max_attempts must be at least 1')

    # Entries are (query, attempt number). The list is used as a queue whose
    # front is its last element, so retries inserted at index 0 run after all
    # queries that have not been tried yet.
    backlog = [(query, 1) for query in pending]
    backlog.reverse()
    in_flight = {}  # task -> (query, attempt number)

    while backlog or in_flight:
        while backlog and len(in_flight) < concurrency:
            query, attempt = backlog.pop()
            proxy = random.choice(proxies)
            task = asyncio.ensure_future(worker(query, proxy))
            in_flight[task] = (query, attempt)

        finished, _still_running = await asyncio.wait(
            set(in_flight), return_when=asyncio.FIRST_COMPLETED)

        for task in finished:
            query, attempt = in_flight.pop(task)
            if task.cancelled():
                succeeded = False
            elif task.exception() is not None:
                # A crashed worker counts as a failed attempt instead of
                # aborting the whole run.
                print(task.exception())
                succeeded = False
            else:
                print(task.result())
                succeeded = bool(task.result()[1])
            if not succeeded and attempt < max_attempts:
                backlog.insert(0, (query, attempt + 1))
def yaparser():
    """Run the parser forever, one full pass over the queries after another.

    Every pass gets its own event loop, which is closed when the pass ends so
    that loops do not accumulate over the lifetime of the process.
    """
    while True:
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        try:
            loop.run_until_complete(run(proxies))
        finally:
            loop.close()
def yaserver():
    """Serve the stored results over HTTP.

    Two routes are registered, both accepting GET and POST:

    * ``/json`` returns all rows of the results table as a JSON document.
    * ``/text`` returns the same rows as the ``str()`` of the Python list.

    The server listens on all interfaces on the port given by the
    ``HTTP_port`` setting and blocks until it is stopped.
    """
    app = Flask(__name__)
    db_path = os.path.join(BASEDIR, config['PARSER']['Database'])

    def load_rows():
        """Read all results and shape them for the responses."""
        db = sqlite3.connect(db_path)
        try:
            db.row_factory = dict_factory
            sql = "SELECT `text`, sites, updated, busy, file AS id FROM results"
            rows = db.execute(sql).fetchall()
        finally:
            # Using the connection as a context manager does not close it,
            # so it is closed explicitly.
            db.close()
        for row in rows:
            row['busy'] = bool(row['busy'])
            # The sites column holds a JSON list; each entry is exposed
            # together with its position in that list.
            row['sites'] = [
                {'position': position, 'link': link}
                for position, link in enumerate(json.loads(row['sites']))
            ]
        return rows

    @app.route('/json', methods=['POST', 'GET'])
    def do_json():
        return app.response_class(
            response=json.dumps(load_rows()),
            mimetype='application/json'
        )

    @app.route('/text', methods=['POST', 'GET'])
    def do_text():
        response = app.response_class(
            response=str(load_rows()),
            mimetype='text/html',
        )
        response.headers["Content-Type"] = "text/html; charset=utf-8"
        return response

    app.secret_key = os.urandom(12)
    app.run(debug=False, use_reloader=False, host='0.0.0.0', port=int(config['PARSER']['HTTP_port']))
# Handles of the HTTP server process and the parser process; both stay None
# until start() has launched them.
proc1 = None
proc2 = None


def start():
    """Launch the HTTP server and the parser in separate processes.

    Nothing happens when the current process already has child processes,
    which keeps a second click on START from launching the workers twice.
    """
    # Without this declaration the assignments below would only create
    # locals and the module-level handles would stay None.
    global proc1, proc2
    current_process = psutil.Process()
    if current_process.children(recursive=True):
        return
    proc1 = Process(target=yaserver, name='yaserver')
    proc1.start()
    proc2 = Process(target=yaparser, name='yaparser')
    proc2.start()
def stop():
    """Terminate every child process of the current process, recursively."""
    for child in psutil.Process().children(recursive=True):
        print('Child pid {} terminate'.format(child.pid))
        try:
            child.terminate()
        except psutil.NoSuchProcess:
            # The child exited on its own (for example because its parent
            # was terminated a moment ago); there is nothing left to stop.
            pass
def on_closing():
    """Stop the worker processes and close the main window."""
    try:
        stop()
    finally:
        # The window is destroyed even if stopping the workers fails, so the
        # application can always be closed.
        window.destroy()
if __name__ == "__main__":
    # Main window with a START and a STOP button stacked in one column.
    window = tk.Tk()
    window.title("Yandex Parser")
    # window.iconbitmap("ya.ico")
    window.geometry('480x240')

    arial36 = font.Font(family='Arial', size=12, weight=font.BOLD)
    button_shape = {'width': 45, 'height': 5, 'font': arial36}
    cell_padding = {'padx': 5, 'pady': 5}

    btn_run = Button(window, text="START", command=start, **button_shape)
    btn_run.grid(row=1, column=2, **cell_padding)

    btn_stop = Button(window, text="STOP", command=stop, **button_shape)
    btn_stop.grid(row=3, column=2, **cell_padding)

    # Closing the window also stops the worker processes.
    window.protocol("WM_DELETE_WINDOW", on_closing)
    window.mainloop()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement