Advertisement
goebelmasse

Google.com-Ergebnisse nicht in .de ermitteln

May 31st, 2014
377
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 4.24 KB | None | 0 0
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. ########################################################################
  4. #
  5. # google-hidden.py
  6. # Find results hidden by google, but accessible by other search engines.
  7. #
  8. # Copy? Right! 2014 Elias Schwerdtfeger, http://www.tamagothi.de/
  9. #
  10. # This program is free software, licensed under the terms of the pirate's
  11. # license. You can do with it whatever you want, as long as you do not
  12. # sue me. If you want to use this program and to sue me for it, please
  13. # buy a commercial license. You can read the full terms of the license
  14. # (in German) at http://www.tamagothi.de/impressum/lizenz/
  15. #
  16. # Share and enjoy!
  17. #
  18. # $Id: google-hidden.py,v 1.2 2014/05/31 15:50:55 elias Exp $
  19. #
  20. # (All helpful comments are intentionally removed.)
  21. #
  22. ########################################################################
  23.  
  24. RESULTS = 200
  25.  
  26. import sys
  27. import urllib.parse
  28. import urllib.request
  29. import html.parser
  30.  
  31.  
  32. class LinkExtractor(html.parser.HTMLParser):
  33.     def __init__(self, htmldoc):
  34.         super().__init__()
  35.         self.links = []
  36.         self.feed(htmldoc)
  37.  
  38.     def handle_starttag(self, tag, attrs):
  39.         if tag == 'a':
  40.             for attr, content in attrs:
  41.                 if attr == 'href':
  42.                     self.links.append(content)
  43.  
  44.  
  45. class BaseSearchResult(object):
  46.     def __init__(self, search_term):
  47.         super().__init__()
  48.         self.result_links = []
  49.         for uri in self.perform_search(search_term):
  50.             if self.filter_link(uri) and uri not in self.result_links:
  51.                 self.result_links.append(uri)
  52.         self.result_links = self.postprocess_links(self.result_links)
  53.         self.result_links.sort()
  54.  
  55.     def filter_link(self, uri):
  56.         return true
  57.    
  58.     def perform_search(self, search_term):
  59.         raise NotImplemented()
  60.  
  61.     def postprocess_links(self, linklist):
  62.         return linklist
  63.    
  64.     def get_links_from_uri(self, uri):
  65.         req = urllib.request.Request(uri)
  66.         req.add_header('User-agent', 'Mozilla/5.0')
  67.         httpdocument = urllib.request.urlopen(req)
  68.         link_extractor = LinkExtractor(httpdocument.read().decode('utf-8'))
  69.         return link_extractor.links
  70.  
  71.  
  72. class GoogleCommon(BaseSearchResult):
  73.     def common_search_part(self, domain, search_term):
  74.         params = urllib.parse.urlencode({'q': search_term, 'num': RESULTS})
  75.         uri = 'http://{}/search?{}'.format(domain, params)
  76.         return self.get_links_from_uri(uri)
  77.    
  78.     def postprocess_links(self, linklist):
  79.         newlist = []
  80.         for link in linklist:
  81.             parse_res = urllib.parse.urlparse(link)
  82.             params = urllib.parse.parse_qs(parse_res.query)
  83.             # // ist ein Hack, um nur vollständige URIs zu haben...
  84.             if 'q' in params and '//' in params['q'][0]:
  85.                 newlist.append(params['q'][0])
  86.         return newlist
  87.    
  88.  
  89. class GoogleDe(GoogleCommon):
  90.     def perform_search(self, search_term):
  91.         return self.common_search_part('www.google.de', search_term)
  92.  
  93.     def filter_link(self, uri):
  94.         return ('google.de' not in uri and
  95.                 'google.com' not in uri and
  96.                 'googleusercontent' not in uri and
  97.                 not uri.startswith('/search'))
  98.  
  99.  
  100. class GoogleCom(GoogleCommon):
  101.     def perform_search(self, search_term):
  102.         return self.common_search_part('www.google.com', search_term)
  103.  
  104.     def filter_link(self, uri):
  105.         return ('google.com' not in uri and
  106.                 'googleusercontent' not in uri and
  107.                 not uri.startswith('/search'))
  108.  
  109.  
  110. class Yahoo(BaseSearchResult):
  111.     pass
  112.  
  113.  
  114. class Bing(BaseSearchResult):
  115.     pass
  116.  
  117.  
  118. def not_found_in_google_de(term, engines=(GoogleCom, )):
  119.     not_found = []
  120.     google_de = GoogleDe(term)
  121.     for engine in engines:
  122.         other_engine = engine(term)
  123.         for uri in other_engine.result_links:
  124.             if uri not in google_de.result_links and uri not in not_found:
  125.                 not_found.append(uri)
  126.     not_found.sort()
  127.     return not_found
  128.  
  129.  
  130. def main():
  131.     search = '+'.join([urllib.parse.quote(i.lower()) for i in sys.argv[1:]])
  132.     for i in not_found_in_google_de(search):
  133.         print(i)
  134.  
  135.  
  136. if __name__ == '__main__':
  137.     main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement