Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/python
- # coding: utf8
- import selenium
- from selenium import webdriver
- from sys import platform
- import json
- import feedparser
- import time
- from random import randint
- import os
- import datetime
- import time
- import sys
- from pyvirtualdisplay import Display
# RSS feeds for the individual sections ("Ressorts") of derStandard.at.
# NOTE(review): hostname capitalization is inconsistent (derStandard.at vs
# derstandard.at); main() later filters live-ticker links with a lowercase
# "http://derstandard.at/jetzt" prefix check — keep that in mind when editing.
rssfeeds = [
"http://derStandard.at/?page=rss&ressort=Seite1",
"http://derstandard.at/?page=rss&ressort=International",
"http://derstandard.at/?page=rss&ressort=Inland",
"http://derStandard.at/?page=rss&ressort=Wirtschaft",
"http://derStandard.at/?page=rss&ressort=Web",
"http://derStandard.at/?page=rss&ressort=Sport",
"http://derStandard.at/?page=rss&ressort=Panorama",
"http://derStandard.at/?page=rss&ressort=Etat",
"http://derStandard.at/?page=rss&ressort=Kultur",
"http://derStandard.at/?page=rss&ressort=Wissenschaft",
"http://derStandard.at/?page=rss&ressort=Gesundheit",
"http://derStandard.at/?page=rss&ressort=Bildung",
"http://derStandard.at/?page=rss&ressort=Lifestyle",
"http://derStandard.at/?page=rss&ressort=Reisen",
"http://derStandard.at/?page=rss&ressort=Familie",
"http://derStandard.at/?page=rss&ressort=Meinung",
"http://derStandard.at/?page=rss&ressort=User",
"http://derStandard.at/?page=rss&ressort=Karriere",
"http://derstandard.at/?page=rss&ressort=Immobilien",
"http://derstandard.at/?page=rss&ressort=Automobil",
"http://derStandard.at/?page=rss&ressort=diestandard"
]
# Smaller feed list for quick manual testing runs.
rssfeeds_test = ["http://derStandard.at/?page=rss&ressort=Kultur"]
# Platform-specific browser setup. On Linux the scraper runs "headless" by
# wrapping Chrome in a virtual X display (Xvfb via pyvirtualdisplay).
# 'PATH_TO_CHROMEDRIVER' is a placeholder the operator must replace with the
# real chromedriver path before running.
if platform == "win32":
    driver = webdriver.Chrome('PATH_TO_CHROMEDRIVER')
elif platform == "linux" or platform == "linux2":
    # Display is already imported at the top of the file; the redundant
    # `import pyvirtualdisplay` that used to live here was removed.
    display = Display(visible=0, size=(1024, 860))
    display.start()
    driver = webdriver.Chrome('PATH_TO_CHROMEDRIVER')
else:
    # Fail fast on unsupported platforms instead of leaving `driver`
    # undefined and crashing later with a confusing NameError.
    raise RuntimeError("Unsupported platform: " + platform)
def main():
    """Collect today's article URLs from all RSS feeds, scrape each
    article's forum comments and dump everything to a JSON file.
    """
    # Today's date, computed ONCE. The old code built `heute` per entry but
    # then compared against a fresh datetime.datetime.today().date() anyway
    # (dead variable, plus a midnight race if the run crosses a day boundary).
    heute = datetime.datetime.today().date()
    todaysurls = []
    for feedurl in rssfeeds:
        feed = feedparser.parse(feedurl)
        for entry in feed["entries"]:
            # Keep only articles published today.
            time_struct = entry["published_parsed"]
            dateofarticle = datetime.datetime.fromtimestamp(time.mktime(time_struct))
            if dateofarticle.date() == heute:
                todaysurls.append(entry["link"])
    # De-duplicate (articles appear in several section feeds) and drop
    # live-ticker pages, whose comment forum has a different structure.
    uniqueurls = [
        u for u in set(todaysurls)
        if not u.startswith('http://derstandard.at/jetzt')
    ]
    print("Total unique urls from today: " + str(len(uniqueurls)))
    todayscomments = []
    for url in uniqueurls:
        todayscomments.extend(getCommentsFromArticle(url))
        # Be polite to the server between article loads.
        time.sleep(10)
    dumpJson(todayscomments)
def _clicksortoption(optionid):
    """Open the forum sort dropdown and pick the given sorting option.

    Raises IndexError if either element is missing (caught by the caller).
    """
    filterbutton = driver.find_elements_by_xpath('//*[@id="forum-tb-sorting-button"]/span')
    filterbutton[0].click()
    time.sleep(1)
    option = driver.find_elements_by_xpath('//*[@for="' + optionid + '"]')
    option[0].click()
    time.sleep(3)  # give the forum JS time to re-sort the postings

def _extractposts(likesclass, likessign):
    """Scrape all visible expanded top-level postings from the open page.

    likesclass: class attribute of the vote-count element to read.
    likessign:  prefix stored with the count ("" for positive, "-" for negative).
    Returns a list of dicts with keys headline, body, likes, url.
    """
    articles = []
    posts = driver.find_elements_by_xpath(
        '//div[contains(@class, "posting upost upost-is-expanded") and not(contains(@class, "upost-is-reply"))]')
    for post in posts:
        headline = post.find_elements_by_xpath('.//*[@class="upost-title"]')
        body = post.find_elements_by_xpath('.//*[@class="upost-text"]')
        urlcontainer = post.find_elements_by_xpath('.//*[@class="upost-usercontainer js-usercontainer"]')
        # data-closable-target carries a 9-character prefix before the
        # per-posting permalink fragment; strip it.
        posturl = urlcontainer[0].get_attribute('data-closable-target')[9:]
        likes = post.find_elements_by_xpath('.//*[@class="' + likesclass + '"]')
        try:
            # BUGFIX: the old code concatenated .encode(...) bytes with "|"
            # str separators — on Python 3 that raised TypeError for EVERY
            # post (silently caught). Print plain text instead.
            print(headline[0].text + "|" + body[0].text + "|" + likes[0].text + "|" + posturl + "\n")
        except (TypeError, IndexError):
            print("Failed to display comment")
        articles.append({
            'headline': headline[0].text,
            'body': body[0].text,
            # An empty counter element means zero votes.
            'likes': "0" if likes[0].text == "" else likessign + likes[0].text,
            'url': posturl,
        })
    return articles

def getCommentsFromArticle(url):
    """
    Input: url as string
    Output: Opens the url in browser, sorts the forum by most positive and
    then most negative votes, and returns the visible top-level comments as a
    list of dictionaries (headline, text, likes, permalink). Returns [] when
    the sort controls are missing or not clickable (e.g. no forum on page).
    """
    # Open the url and crudely wait until everything (incl. forum JS) loaded.
    driver.get(url)
    time.sleep(10)
    driver.execute_script('scroll(0, document.body.scrollHeight);')
    time.sleep(3)
    try:
        # Best-rated posts first.
        _clicksortoption('VoteCountsPositiveFirst')
        print("Best Posts:")
        articles = _extractposts("js-ratings-positive-count ratings-positive-count", "")
        # Then the most negatively rated posts; their counts are stored
        # with a leading "-" to distinguish them downstream.
        _clicksortoption('VoteCountsNegativeFirst')
        articles.extend(_extractposts("js-ratings-negative-count ratings-negative-count", "-"))
        return articles
    except selenium.common.exceptions.ElementNotVisibleException as e:
        print("Error :" + str(e))
        return []
    except IndexError as e:
        print("Kein Button gefunden: " + str(e))
        return []
def dumpJson(dictionaryofcomments):
    """
    Input: a list of comment dictionaries
    Output: writes them as JSON to data/<YYYYMMDD>.json next to this script
    and returns None.
    """
    # Use the current date for the file name.
    ts = time.time()
    date = datetime.datetime.fromtimestamp(ts).strftime('%Y%m%d')
    # BUGFIX: build the target path explicitly instead of os.chdir()ing
    # around — the old code mutated the process-wide working directory and
    # never restored the ORIGINAL cwd (it ended up in the script directory).
    datadir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "data")
    try:
        os.mkdir(datadir)
    except OSError:
        print("data directory already existing")
    # NOTE(review): append mode means a second run on the same day produces
    # two concatenated JSON documents, which json.load can no longer parse.
    # Kept for backward compatibility — consider one file per run instead.
    with open(os.path.join(datadir, date + ".json"), "a+") as outfile:
        json.dump(dictionaryofcomments, outfile)
- if __name__ == "__main__":
- main()
- driver.close()
- if platform == "linux" or platform =="linux2":
- display.stop()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement