Advertisement
Guest User

Untitled

a guest
Feb 9th, 2016
252
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.40 KB | None | 0 0
  1. # coding=utf-8
  2. from selenium import webdriver
  3. from selenium.webdriver.common.keys import Keys
  4. from selenium.webdriver.common.by import By
  5. from selenium.webdriver.support.ui import WebDriverWait
  6. from selenium.webdriver.support import expected_conditions as EC
  7.  
  8. import itertools
  9. import csv
  10. import codecs
  11. import sys
  12. import time
  13. reload(sys)
  14. sys.setdefaultencoding('utf-8')
  15. unicode('gbk', errors='ignore')
  16.  
  17. def crawl(url):
  18. chromedriverPath = "C:\Python27\Scripts\chromedriver.exe"
  19.  
  20. # twitter includes 13 tweets every refresh
  21. eachCount = 13
  22.  
  23. #scroll times
  24. #scrollTimes = 99
  25.  
  26. #driver = webdriver.Firefox()
  27. driver = webdriver.Chrome(chromedriverPath)
  28. driver.get(url)
  29.  
  30. try:
  31. for i in itertools.count():##range(scrollTimes):
  32. try:
  33. driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
  34.  
  35. lastTweetCss = "#stream-items-id >li:nth-of-type(" + str(eachCount*(i+2)+1) + ") .tweet-text"
  36. print lastTweetCss
  37.  
  38. elem = WebDriverWait(driver,10).until(EC.visibility_of_element_located((By.CSS_SELECTOR,lastTweetCss)))
  39. except elem.TimeoutException:
  40. pass
  41. printTweet(driver)
  42. finally:
  43. driver.close()
  44.  
  45.  
  46. def printTweet(driver):
  47. tweetCss = "[data-item-type=tweet]"
  48. nameCss = ".fullname"
  49. timeCss = "._timestamp"
  50. contentCss = ".tweet-text"
  51.  
  52. items = driver.find_elements_by_css_selector(tweetCss)
  53. resultArr = []
  54. for i,item in enumerate(items):
  55. try:
  56. nameElem = item.find_element_by_css_selector(nameCss)
  57. timeElem = item.find_element_by_css_selector(timeCss)
  58. contentElem = item.find_element_by_css_selector(contentCss)
  59.  
  60. name = nameElem.text
  61. time = timeElem.text
  62. content = contentElem.text
  63.  
  64. resultArr.append([name,time,content])
  65. print("%d: name=%s time=%s content=%s" % (i,name,time,content))
  66. except Exception, e:
  67. pass
  68. #print("error index=%d" % (i))
  69. #print e
  70.  
  71. printToCsv(resultArr)
  72.  
  73.  
  74. def printToCsv(data):
  75. writer = csv.writer(codecs.open('aaplejuneweektest.csv','wb'))
  76. writer.writerow(['name','time','content'])
  77.  
  78. for item in data:
  79. writer.writerow(item)
  80.  
  81.  
  82. if __name__ == '__main__':
  83. url = "https://twitter.com/search?q=%23%24aapl%20since%3A2015-06-20%20until%3A2015-06-21&src=typd&lang=en"
  84. crawl(url)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement