# -*- coding:UTF-8 -*-
# license: WTFPL
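#
# Scrapes a Google web search restricted to get.google.com/albumarchive
# (Google's Album Archive) for a given query, walks the result pages via
# their &start offsets, and appends every matching link to "<query>.txt".
# A final pass merges all .txt files into a deduplicated links_merged.txt.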
import os
import sys
import time
import random
import requests
import argparse
from urllib import parse
from bs4 import BeautifulSoup
parser = argparse.ArgumentParser(description="get.google.com finder.")
# nargs='*' rather than '+': with '+', argparse itself rejects a missing
# query, so the explicit empty-query check below could never fire.
parser.add_argument("query", help="search query", nargs='*', default=[])
parser.add_argument("-l", "--lang", help="google domain suffix (TLD): fr, com, br...", default='com')
args = parser.parse_args()
if not args.query:
    print('missing query')
    sys.exit(1)
query = ' '.join(args.query)
short_country = args.lang
# Restrict results to the Album Archive with Google's site: operator.
parsed_query = parse.quote('{} site:http://get.google.com/albumarchive'.format(query))
start = '0'
parsed_start = '&start={}'.format(start)
url = "https://www.google.{}/search?q={}{}{}".format(
    short_country,
    parsed_query,
    '&num=100',
    parsed_start
)
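# num=100 asks Google for 100 results per page; the loop below advances the
# start offset through whatever pagination links each result page exposes.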
s = requests.Session()
while True:
    # Random pause between requests to stay under Google's rate limits.
    time.sleep(random.randint(9, 12))
    page = s.get(url)
    content = page.text
    soup = BeautifulSoup(content, "html.parser")
    links = soup.find_all("a")
    if 'Our systems have detected unusual traffic from your computer network' in content:
        print('Too many requests!')
        break
    next_page_links = []
    saved_links = []
    print('Offset: {}'.format(start))
    for link in links:
        # Anchors without an href would raise KeyError with link['href'].
        href = link.get('href', '')
        # Pagination links carry both the query and a &start offset.
        if 'search?q' in href and '&start' in href:
            next_page_links.append(href)
        # Result links look like /url?q=<target>&...; keep only Album Archive
        # hits and skip Google's cached copies.
        if href.startswith('/url?q=') and 'get.google.com' in href and 'webcache.googleusercontent.com' not in href:
            saved_links.append(href[7:].split('&')[0])
    with open('{}.txt'.format(query), 'a') as f:
        f.writelines('%s\n' % l for l in saved_links)
    if not saved_links:
        print('no results!')
        break
    if saved_links and not next_page_links:
        break
    # Offset of the last pagination link (normally the furthest page); if it
    # is lower than the current offset we have looped back, so stop.
    new_start = next_page_links[-1].split('&start=')[1].split('&')[0].split('%')[0]
    if int(start) > int(new_start):
        break
    else:
        start = new_start
        url = 'https://www.google.{}'.format(short_country) + next_page_links[-1]
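# Final pass: merge every .txt results file in the working directory into a
# single deduplicated links_merged.txt.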
print('updating links_merged.txt')
if os.path.isfile('links_merged.txt'):
    os.remove('links_merged.txt')
lines_seen = set()
with open('links_merged.txt', 'a') as fw:
    for file in os.listdir("."):
        # Skip the output file itself, which the open() above just re-created.
        if file.endswith(".txt") and file != 'links_merged.txt':
            with open(file) as fr:
                for line in fr:
                    if line not in lines_seen:
                        fw.write(line)
                        lines_seen.add(line)
print('done')
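# Example usage (the filename is arbitrary; assuming the script is saved as
# albumfinder.py):
#   python albumfinder.py john doe -l fr
# Hits are appended to "john doe.txt" and merged into links_merged.txt.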