#!/usr/bin/python
# coding: utf8


import selenium
from selenium import webdriver
from sys import platform
import json
import feedparser
import time
from random import randint
import os
import datetime
import sys
from pyvirtualdisplay import Display

rssfeeds = [
            "http://derStandard.at/?page=rss&ressort=Seite1",
            "http://derstandard.at/?page=rss&ressort=International",
            "http://derstandard.at/?page=rss&ressort=Inland",
            "http://derStandard.at/?page=rss&ressort=Wirtschaft",
            "http://derStandard.at/?page=rss&ressort=Web",
            "http://derStandard.at/?page=rss&ressort=Sport",
            "http://derStandard.at/?page=rss&ressort=Panorama",
            "http://derStandard.at/?page=rss&ressort=Etat",
            "http://derStandard.at/?page=rss&ressort=Kultur",
            "http://derStandard.at/?page=rss&ressort=Wissenschaft",
            "http://derStandard.at/?page=rss&ressort=Gesundheit",
            "http://derStandard.at/?page=rss&ressort=Bildung",
            "http://derStandard.at/?page=rss&ressort=Lifestyle",
            "http://derStandard.at/?page=rss&ressort=Reisen",
            "http://derStandard.at/?page=rss&ressort=Familie",
            "http://derStandard.at/?page=rss&ressort=Meinung",
            "http://derStandard.at/?page=rss&ressort=User",
            "http://derStandard.at/?page=rss&ressort=Karriere",
            "http://derstandard.at/?page=rss&ressort=Immobilien",
            "http://derstandard.at/?page=rss&ressort=Automobil",
            "http://derStandard.at/?page=rss&ressort=diestandard"
]

rssfeeds_test = ["http://derStandard.at/?page=rss&ressort=Kultur"]

if platform == "win32":
    driver = webdriver.Chrome('PATH_TO_CHROMEDRIVER')
elif platform == "linux" or platform == "linux2":
    # on headless Linux boxes, run Chrome inside a virtual X display
    display = Display(visible=0, size=(1024, 860))
    display.start()
    driver = webdriver.Chrome('PATH_TO_CHROMEDRIVER')

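# The calls above assume the Selenium 3 API (positional driver path,
# find_elements_by_xpath below). On Selenium 4+ the path is passed via a
# Service object instead, roughly:
#   from selenium.webdriver.chrome.service import Service
#   driver = webdriver.Chrome(service=Service('PATH_TO_CHROMEDRIVER'))
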
def main():
    # First collect all article URLs from the feeds, then remove duplicates
    todaysurls = []
    for url in rssfeeds:
        feed = feedparser.parse(url)
        #time.sleep(2)
        for entry in feed["entries"]:
            # only keep entries that were published today
            time_struct = entry["published_parsed"]
            dateofarticle = datetime.datetime.fromtimestamp(time.mktime(time_struct))
            print(str(dateofarticle.date()))
            #heute = (datetime.datetime.today()-datetime.timedelta(1)).date()
            heute = datetime.datetime.today().date()
            print(str(heute))
            if dateofarticle.date() == heute:
                print("Found an article from today")
                todaysurls.append(entry["link"])
            else:
                print("Found old article")


    uniqueurls = list(set(todaysurls))

    # filter out the live-ticker URLs
    uniqueurlswithoutliveurls = [x for x in uniqueurls if not x.startswith('http://derstandard.at/jetzt')]

    print("Total unique urls from today: " + str(len(uniqueurlswithoutliveurls)))

    todayscomments = []

    for url in uniqueurlswithoutliveurls:
        todayscomments.extend(getCommentsFromArticle(url))
        time.sleep(10)

    dumpJson(todayscomments)

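# main() only relies on two fields of each feedparser entry; a trimmed entry
# looks roughly like this (values are illustrative):
#   {"published_parsed": time.struct_time(...), "link": "http://derstandard.at/..."}
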
def getCommentsFromArticle(url):
    """
    Input: url as string
    Output: Opens the url in the browser, fetches the top-rated and the most
            negatively rated top-level postings and returns them as a
            list of dictionaries (headline, body, likes, url)
    """
    # open the url and wait until everything is loaded
    driver.get(url)
    time.sleep(10)
    driver.execute_script('scroll(0, document.body.scrollHeight);')
    time.sleep(3)
    filterbutton = driver.find_elements_by_xpath('//*[@id="forum-tb-sorting-button"]/span')
    try:
        filterbutton[0].click()

        # sort the forum by positive votes first
        bestbutton = driver.find_elements_by_xpath('//*[@for="VoteCountsPositiveFirst"]')
        bestbutton[0].click()
        time.sleep(3)
        print("Best Posts:")

        articles = []
        bestposts = driver.find_elements_by_xpath('//div[contains(@class, "posting upost upost-is-expanded") and not(contains(@class, "upost-is-reply"))]')

        for post in bestposts:
            headline = post.find_elements_by_xpath('.//*[@class="upost-title"]')
            body = post.find_elements_by_xpath('.//*[@class="upost-text"]')
            urlcontainer = post.find_elements_by_xpath('.//*[@class="upost-usercontainer js-usercontainer"]')
            url = urlcontainer[0].get_attribute('data-closable-target')[9:]
            likes = post.find_elements_by_xpath('.//*[@class="js-ratings-positive-count ratings-positive-count"]')
            encoding = "utf-8"
            try:
                print(headline[0].text.encode(encoding, errors='replace') + "|" + body[0].text.encode(encoding, errors='replace') + "|" + likes[0].text.encode(encoding, errors='replace') + "|" + url.encode(encoding, errors='replace') + "\n")
            except TypeError:
                print("Failed to display comment")
            article = {}
            article['headline'] = headline[0].text
            article['body'] = body[0].text
            if likes[0].text == "":
                article['likes'] = "0"
            else:
                article['likes'] = likes[0].text
            article['url'] = url
            articles.append(article.copy())

        # switch the sorting to negative votes first
        filterbutton = driver.find_elements_by_xpath('//*[@id="forum-tb-sorting-button"]/span')
        filterbutton[0].click()
        time.sleep(1)
        worstbutton = driver.find_elements_by_xpath('//*[@for="VoteCountsNegativeFirst"]')
        worstbutton[0].click()
        time.sleep(3)

        worstposts = driver.find_elements_by_xpath('//div[contains(@class, "posting upost upost-is-expanded") and not(contains(@class, "upost-is-reply"))]')

        for post in worstposts:
            headline = post.find_elements_by_xpath('.//*[@class="upost-title"]')
            body = post.find_elements_by_xpath('.//*[@class="upost-text"]')
            urlcontainer = post.find_elements_by_xpath('.//*[@class="upost-usercontainer js-usercontainer"]')
            url = urlcontainer[0].get_attribute('data-closable-target')[9:]
            likes = post.find_elements_by_xpath('.//*[@class="js-ratings-negative-count ratings-negative-count"]')
            encoding = 'utf-8'
            try:
                print(headline[0].text.encode(encoding, errors='replace') + "|" + body[0].text.encode(encoding, errors='replace') + "|" + likes[0].text.encode(encoding, errors='replace') + "|" + url.encode(encoding, errors='replace') + "\n")
            except TypeError:
                print("Couldn't display comment")
            article = {}
            article['headline'] = headline[0].text
            article['body'] = body[0].text
            if likes[0].text == "":
                article['likes'] = "0"
            else:
                article['likes'] = "-" + likes[0].text
            article['url'] = url
            articles.append(article.copy())


        return articles
    except selenium.common.exceptions.ElementNotVisibleException as e:
        print("Error: " + str(e))
        return []

    except IndexError as e:
        print("No sorting button found: " + str(e))
        return []


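# A single entry returned by getCommentsFromArticle() looks roughly like this
# (values are illustrative, not taken from a real page):
#   {"headline": "...", "body": "...", "likes": "42", "url": "..."}
# Postings collected under the negative sorting get a "-" prefix on "likes".

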
def dumpJson(dictionaryofcomments):
    """
    Input: a list of comment dictionaries
    Output: writes the list as a JSON file and returns void
    """

    # the current date is used as the file name
    ts = time.time()
    date = datetime.datetime.fromtimestamp(ts).strftime('%Y%m%d')

    # change to the directory the script lives in
    os.chdir(os.path.dirname(os.path.realpath(__file__)))

    try:
        os.mkdir("data")

    except OSError as e:
        print("data directory already exists")

    os.chdir("data")

    with open(date + ".json", "a+") as file:
        json.dump(dictionaryofcomments, file)

    os.chdir("..")

if __name__ == "__main__":
    main()
    driver.close()
    if platform == "linux" or platform == "linux2":
        display.stop()
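
# The result ends up in data/YYYYMMDD.json next to the script. Assuming it is
# run at most once per day (the file is opened in append mode, so a second run
# on the same day would produce concatenated JSON), the dump can be read back
# with something like (filename is illustrative):
#   with open("data/20170823.json") as f:
#       comments = json.load(f)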