Advertisement
Guest User

Untitled

a guest
Mar 8th, 2017
2,318
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.63 KB | None | 0 0
# -*- coding:UTF-8 -*-
# license: WTFPL

import argparse
import os
import random
import re
import sys
import time
from urllib import parse

import requests
from bs4 import BeautifulSoup

  14. parser = argparse.ArgumentParser(description="get.google.com finder.")
  15. parser.add_argument("query", help="search query", nargs='+', default=[])
  16. parser.add_argument("-l", "--lang", help="google language in form: fr, com, br...", default='com')
  17. args = parser.parse_args()
  18.  
  19. if not args.query:
  20. print('missing query')
  21. sys.exit(1)
  22. query = ' '.join(args.query)
  23.  
  24. short_country = args.lang
  25. parsed_query = parse.quote('{} site:http://get.google.com/albumarchive'.format(query))
  26. start = '0'
  27. parsed_start = '&start={}'.format(start)
  28.  
  29. url = "https://www.google.{}/search?q={}{}{}".format(
  30. short_country,
  31. parsed_query,
  32. '&num=100',
  33. parsed_start
  34. )
  35.  
  36. s = requests.Session()
  37.  
  38. while True:
  39. time.sleep(random.randint(9, 12))
  40. page = s.get(url)
  41. content = page.content
  42. soup = BeautifulSoup(content, "html.parser")
  43. links = soup.findAll("a")
  44.  
  45. if 'Our systems have detected unusual traffic from your computer network' in str(content):
  46. print('To many requests!')
  47. break
  48.  
  49. next_page_links = []
  50. saved_links = []
  51.  
  52. print('Offset: {}'.format(start))
  53.  
  54. for link in links:
  55. if 'search?q' in link['href'] and '&start' in link['href']:
  56. next_page_links.append(link['href'])
  57.  
  58. for link in links:
  59. if link['href'].startswith('/url?q=') and 'get.google.com' in link['href'] and not 'webcache.googleusercontent.com' in link['href']:
  60. saved_links.append(link['href'][7:].split('&')[0])
  61.  
  62. with open('{}.txt'.format(query), 'a') as f:
  63. f.writelines('%s\n' % l for l in saved_links)
  64.  
  65. if not saved_links:
  66. print('no results!')
  67. break
  68. if saved_links and not next_page_links:
  69. break
  70.  
  71. new_start = next_page_links[-1].split('&start=')[1].split('&')[0].split('%')[0]
  72. if int(start) > int(new_start):
  73. break
  74. else:
  75. start = new_start
  76. url = 'https://www.google.{}'.format(short_country) + next_page_links[-1]
  77. print('update links_merged.txt')
  78. if os.path.isfile('links_merged.txt'):
  79. os.remove('links_merged.txt')
  80. lines_seen = set()
  81. for file in os.listdir("."):
  82. if file.endswith(".txt"):
  83. with open(file) as fr:
  84. with open('links_merged.txt', 'a') as fw:
  85. for line in fr.readlines():
  86. if line not in lines_seen:
  87. fw.writelines([line])
  88. lines_seen.add(line)
  89. print('done')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement