Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # coding=utf-8
- from selenium import webdriver
- from selenium.webdriver.common.keys import Keys
- from selenium.webdriver.common.by import By
- from selenium.webdriver.support.ui import WebDriverWait
- from selenium.webdriver.support import expected_conditions as EC
- import itertools
- import csv
- import codecs
- import sys
# Python 2 only: make UTF-8 the interpreter-wide default codec.
# sys.setdefaultencoding is removed from the sys namespace at startup, and
# reload(sys) brings it back.  Presumably this is what lets the unicode text
# scraped from the page be printed and written by csv without
# UnicodeEncodeError -- confirm before removing.
# Python 3 already defaults to UTF-8 and has neither builtin `reload` nor
# sys.setdefaultencoding, so the hack is skipped there.
if sys.version_info[0] == 2:
    reload(sys)
    sys.setdefaultencoding('utf-8')
def crawl(url, chromedriverPath=r"C:\Python27\Scripts\chromedriver.exe"):
    """Open ``url`` in Chrome, keep scrolling, and dump the tweets after each scroll.

    Each iteration scrolls to the bottom of the page, waits up to 20 seconds
    for the next expected tweet to become visible, and then calls
    ``printTweet`` to print all tweets on the page and rewrite the CSV.

    Args:
        url: address of the Twitter search page to load.
        chromedriverPath: location of the chromedriver executable handed to
            ``webdriver.Chrome``; defaults to the path originally hard-coded.

    Raises:
        Whatever ``WebDriverWait.until`` raises when the expected tweet does
        not become visible in time.  The loop is unbounded, so this exception
        is the only way the function ends; the browser is shut down first.
    """
    # Twitter adds 13 tweets on every refresh.
    eachCount = 13
    driver = webdriver.Chrome(chromedriverPath)
    try:
        # Navigation is inside the try block so that a failed page load
        # still releases the browser.
        driver.get(url)
        for i in itertools.count():
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            # Selector for the tweet text of list item number 13 * (i + 2) + 1;
            # its appearance is the signal that the scroll loaded more items.
            lastTweetCss = ("#stream-items-id >li:nth-of-type("
                            + str(eachCount * (i + 2) + 1) + ") .tweet-text")
            print(lastTweetCss)
            WebDriverWait(driver, 20).until(
                EC.visibility_of_element_located((By.CSS_SELECTOR, lastTweetCss)))
            printTweet(driver)
    finally:
        # quit() ends the whole WebDriver session (all windows plus the
        # chromedriver process); close() would only close the current window.
        driver.quit()
def printTweet(driver):
    """Print every tweet currently on the page and save them via ``printToCsv``.

    For each element matching ``[data-item-type=tweet]`` the author name,
    timestamp and text are read from its ``.fullname``, ``._timestamp`` and
    ``.tweet-text`` children, printed, and collected as a
    ``[name, time, content]`` row.  All collected rows are then passed to
    ``printToCsv``.

    Extraction is best-effort: an item that raises (for example because one
    of the child elements is missing) is reported on stderr and the loop
    moves on, so one bad item never aborts the dump.

    Args:
        driver: the Selenium WebDriver showing the search-results page.
    """
    tweetCss = "[data-item-type=tweet]"
    nameCss = ".fullname"
    timeCss = "._timestamp"
    contentCss = ".tweet-text"

    resultArr = []
    for i, item in enumerate(driver.find_elements_by_css_selector(tweetCss)):
        try:
            name = item.find_element_by_css_selector(nameCss).text
            timestamp = item.find_element_by_css_selector(timeCss).text
            content = item.find_element_by_css_selector(contentCss).text
            resultArr.append([name, timestamp, content])
            print("%d: name=%s time=%s content=%s" % (i, name, timestamp, content))
        except Exception as e:
            # Deliberately broad: this is the per-item best-effort boundary.
            # Report instead of silently dropping so failures stay visible;
            # %r keeps the message safe to write whatever the error contains.
            sys.stderr.write("problem with tweet %d: %r\n" % (i, e))
    printToCsv(resultArr)
def printToCsv(data):
    """Write ``data`` to ``result.csv`` in the current working directory.

    The file is overwritten on every call.  It starts with the header row
    ``name,time,content`` followed by one row per item of ``data``.

    Args:
        data: iterable of rows, each an iterable of cell values
            (``[name, time, content]`` as built by the caller).
    """
    if sys.version_info[0] >= 3:
        # Python 3's csv module writes text: the file must be opened in text
        # mode with newline='' so the writer controls the line endings.
        out = open('result.csv', 'w', newline='', encoding='utf-8')
    else:
        # Python 2's csv module writes bytes and needs a binary-mode file.
        out = open('result.csv', 'wb')
    # The with block guarantees the rows are flushed and the handle closed,
    # even if a row fails to serialise.
    with out:
        writer = csv.writer(out)
        writer.writerow(['name', 'time', 'content'])
        writer.writerows(data)
# Script entry point: runs only when the file is executed directly.
if __name__ == '__main__':
    # Twitter search URL; the percent-encoded query decodes to
    #   #$aapl lang:en since:2015-12-30 until:2016-01-01
    url = "https://twitter.com/search?q=%23%24aapl%20lang%3Aen%20since%3A2015-12-30%20until%3A2016-01-01&src=typd"
    crawl(url)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement