Quora Scraper

a guest, Jun 25th, 2017
#!/usr/bin/env python

# Requires selenium, beautifulsoup4, and a ChromeDriver binary on your PATH
# (the script drives a visible Chrome window so the sign-in can be completed).
import selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import csv
import time

# Replace this with whatever topic page you'd like to scrape
quoraTopicPage = 'https://www.quora.com/topic/Wanting-and-Making-Money/all_questions'

# Set a maximum number of questions to scrape
numberOfQuestionsToScrape = 1000

def filterSearchResults(resultArray, minViewVolume, minRatio=20):
    '''Takes in an array of questions with stats, returns an array that's filtered.

    Each entry is [question text, answers, views, views-to-answers ratio].'''
    filteredArray = []

    for result in resultArray:
        if (result[2] > minViewVolume) and (result[3] > minRatio):
            filteredArray.append(result)
    return filteredArray

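# Example of the filter above (assumed values, not real Quora data): a row like
# ['How do I start saving?', 2.0, 500.0, 250.0] passes
# filterSearchResults([row], 100) because 500 > 100 views and 250.0 > 20 ratio.
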
def HTMLNumberToPlain(numberText):
    '''Converts an abbreviated count such as "8.4k" into a plain integer (8400).'''
    if '.' in numberText:
        # "k" means thousands, so the digits shift three places left
        periodIndex = numberText.index('.') + 3
        numberText = numberText.replace('.', '')
        numberText = numberText.replace('k', '')

        if len(numberText) > periodIndex:
            # More than three digits followed the decimal point: re-insert it
            newNumberText = ''
            i = 0
            for ch in numberText:
                if i == periodIndex:
                    newNumberText += '.'
                newNumberText += ch
                i += 1
            return int(float(newNumberText))

        else:
            # Pad with zeros until the implied magnitude is reached
            while len(numberText) < periodIndex:
                numberText += '0'
            return int(numberText)
    else:
        # Plain counts; strip separators and a bare "k" suffix defensively
        numberText = numberText.replace(',', '')
        if 'k' in numberText:
            return int(numberText.replace('k', '')) * 1000
        return int(numberText)

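# Quick sanity checks for the conversion above (assumed inputs):
#   HTMLNumberToPlain('8.4k')  -> 8400
#   HTMLNumberToPlain('12.7k') -> 12700
#   HTMLNumberToPlain('847')   -> 847
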
# Initialize webdriver
driver = webdriver.Chrome()
driver.get("https://www.quora.com/")
wait = WebDriverWait(driver, 30)

# Find sign-in by Google button and click it
elem = driver.find_element_by_class_name("google_button")
elem.click()
time.sleep(2)
window_before = driver.window_handles[0]
window_after = driver.window_handles[1]

# Switch to login popup
time.sleep(15)
driver.switch_to.window(window_after)

# NOTE: the steps that type the Google account email and password into the
# popup are not included in this paste; enter them manually (or add them here)
# before the "Next" button below is clicked.
pwSubmit = driver.find_element_by_id("passwordNext").click()

time.sleep(2)
# Manually put in 2FA code here

# Need to switch to the first window again
driver.switch_to.window(window_before)

time.sleep(2)

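# The fixed sleeps in the sign-in flow above are generous; if they prove flaky,
# the WebDriverWait/EC imports at the top could be used instead (sketch only,
# not part of the original script), e.g. waiting for the Google popup to open:
#   wait.until(EC.number_of_windows_to_be(2))
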
# Navigate to topic page
driver.get(quoraTopicPage)

# Find the total number of questions listed for the topic
numberOfQuestionsDiv = driver.find_element_by_class_name('TopicQuestionsStatsRow').get_attribute("innerHTML")
numberOfQuestionsSoup = BeautifulSoup(numberOfQuestionsDiv, 'html.parser').strong.text
numberOfQuestions = HTMLNumberToPlain(numberOfQuestionsSoup)

# Get the div with all questions
questionDiv = driver.find_element_by_class_name('layout_2col_main')
questionHTML = questionDiv.get_attribute("innerHTML")
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
# Allow time to update the page
time.sleep(3)

# Get the questions again
questionDiv = driver.find_element_by_class_name('layout_2col_main')
newQuestionHTML = questionDiv.get_attribute("innerHTML")

if newQuestionHTML == questionHTML:
    # Scrolling added nothing, so the first page already held every question
    questionsScrapedSoFar = numberOfQuestions
else:
    # Count the question links loaded so far
    soup = BeautifulSoup(newQuestionHTML.encode("utf-8"), 'html.parser')
    questionsScrapedSoFarSoup = soup.find_all('a', class_='question_link')
    questionsScrapedSoFar = 0
    for q in questionsScrapedSoFarSoup:
        questionsScrapedSoFar += 1

repeatCount = 0
# Keep checking if there are new questions after scrolling down
while questionsScrapedSoFar < int(0.9 * numberOfQuestions):
    questionHTML = newQuestionHTML
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(5)
    questionDiv = driver.find_element_by_class_name('layout_2col_main')
    newQuestionHTML = questionDiv.get_attribute("innerHTML")

    if newQuestionHTML != questionHTML:
        # Each time you scroll down, 20 more are added
        questionsScrapedSoFar += 20
        repeatCount = 0
    else:
        repeatCount += 1

    if repeatCount > 10:
        print("Quora stalled after scraping " + str(questionsScrapedSoFar) + " questions")
        break

    if questionsScrapedSoFar > numberOfQuestionsToScrape:
        break

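# The fixed time.sleep(5) in the loop above is the original approach. A possible
# alternative (sketch only; previous_count is a hypothetical variable holding the
# link count before the scroll) is to wait until more question links have loaded:
#   wait.until(lambda d: len(d.find_elements_by_class_name('question_link')) > previous_count)
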
finalQuestions = questionDiv.get_attribute("innerHTML").encode("utf-8")

# Get questions as strings
soup = BeautifulSoup(finalQuestions, 'html.parser')
questions = soup.find_all('a', class_='question_link')
questionLinks = []
for q in questions:
    questionLinks.append(q['href'])

# Visit each question page to get stats
questionStats = []

# The try/except keeps the loop going if Quora serves a broken or missing page
for qLink in questionLinks:
    try:
        driver.get('https://www.quora.com' + qLink)

        # Get question text
        questionsText = driver.find_element_by_class_name('rendered_qtext').text.encode("utf-8")

        # Get number of answers (fall back to 1 if the count element is absent)
        try:
            numberOfAnswersText = driver.find_element_by_class_name('answer_count').text.split(" ")[0].replace(',', '').replace('+', '')
        except:
            numberOfAnswersText = 1

        # Get number of views
        numberOfViewsText = driver.find_element_by_class_name('ViewsRow').text.split(" ")[0].replace(',', '')

        # Calculate the views-to-answers ratio for sorting
        viewsToAnswersRatio = float(numberOfViewsText) / float(numberOfAnswersText)

        questionStats.append([questionsText, float(numberOfAnswersText), float(numberOfViewsText), viewsToAnswersRatio])
    except:
        pass

# Keep questions with more than 100 views and a views-to-answers ratio above 20,
# then sort by that ratio, highest first
sortedQuestionStats = sorted(filterSearchResults(questionStats, 100), key=lambda question: question[3], reverse=True)

# Close the window
driver.close()

# Export data to a CSV file in the same location as this file
with open('results.csv', 'w') as csvfile:
    fieldnames = ['Question', 'Views', 'Answers', 'Views to Answers Ratio']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for line in sortedQuestionStats:
        writer.writerow({'Question': line[0].decode('utf-8'), 'Answers': str(line[1]), 'Views': str(line[2]), 'Views to Answers Ratio': str(line[3])})
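# Quick way to eyeball the export afterwards (separate snippet, not part of the
# scraper run; assumes results.csv was written by the block above):
#   import csv
#   with open('results.csv') as f:
#       for row in csv.DictReader(f):
#           print(row['Question'], row['Views'], row['Answers'])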