Advertisement
Guest User

Untitled

a guest
Feb 3rd, 2016
394
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.37 KB | None | 0 0
  1. # coding=utf-8
  2. from selenium import webdriver
  3. from selenium.webdriver.common.keys import Keys
  4. from selenium.webdriver.common.by import By
  5. from selenium.webdriver.support.ui import WebDriverWait
  6. from selenium.webdriver.support import expected_conditions as EC
  7.  
  8. import itertools
  9. import csv
  10. import codecs
  11. import sys
  12. reload(sys)
  13. sys.setdefaultencoding('utf-8')
  14. unicode('gbk', errors='ignore')
  15.  
  16. def crawl(url):
  17. chromedriverPath = "C:\Python27\Scripts\chromedriver.exe"
  18.  
  19. # twitter includes 13 tweets every refresh
  20. eachCount = 13
  21.  
  22. #scroll times
  23. #scrollTimes = 1
  24.  
  25. #driver = webdriver.Chrome()
  26. driver = webdriver.Chrome(chromedriverPath)
  27. driver.get(url)
  28.  
  29. try:
  30. for i in itertools.count():##range(scrollTimes):
  31. driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
  32.  
  33. lastTweetCss = "#stream-items-id >li:nth-of-type(" + str(eachCount*(i+2)+1) + ") .tweet-text"
  34. print lastTweetCss
  35. elem = WebDriverWait(driver,20).until(EC.visibility_of_element_located((By.CSS_SELECTOR,lastTweetCss)))
  36. printTweet(driver)
  37. finally:
  38. driver.close()
  39.  
  40.  
  41. def printTweet(driver):
  42. tweetCss = "[data-item-type=tweet]"
  43. nameCss = ".fullname"
  44. timeCss = "._timestamp"
  45. contentCss = ".tweet-text"
  46.  
  47. items = driver.find_elements_by_css_selector(tweetCss)
  48. resultArr = []
  49. for i,item in enumerate(items):
  50. try:
  51. nameElem = item.find_element_by_css_selector(nameCss)
  52. timeElem = item.find_element_by_css_selector(timeCss)
  53. contentElem = item.find_element_by_css_selector(contentCss)
  54.  
  55. name = nameElem.text
  56. time = timeElem.text
  57. content = contentElem.text
  58.  
  59. resultArr.append([name,time,content])
  60. print("%d: name=%s time=%s content=%s" % (i,name,time,content))
  61. except Exception, e:
  62. pass
  63. #print("error index=%d" % (i))
  64. #print e
  65.  
  66. printToCsv(resultArr)
  67.  
  68.  
  69. def printToCsv(data):
  70. writer = csv.writer(codecs.open('result.csv','wb'))
  71. writer.writerow(['name','time','content'])
  72.  
  73. for item in data:
  74. writer.writerow(item)
  75.  
  76.  
  77. if __name__ == '__main__':
  78. url = "https://twitter.com/search?q=%23%24aapl%20lang%3Aen%20since%3A2015-12-30%20until%3A2016-01-01&src=typd"
  79. crawl(url)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement