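# Crawls GitHub repository search results and Stack Overflow questions for
# "tensorflow", writes each result set to a CSV file (github.csv, stack.csv),
# and displays each as a Plotly table.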
import csv
import time

import requests
from bs4 import BeautifulSoup
import plotly.graph_objects as go

# Running totals of scraped rows, used to size the Plotly table grids.
git_total_count = 0
stack_total_count = 0

# Progress-bar state: total pages requested and pages completed so far.
git_total_progress = 0
git_current_progress = 0

stack_total_progress = 0
stack_current_progress = 0

# Sequential row ids for the output tables.
git_id = 1
stack_id = 1

def printProgressBar(iteration, total, prefix='', suffix='', decimals=1, length=100, fill='█'):
    """
    Call in a loop to create a terminal progress bar.
    @params:
        iteration - Required : current iteration (Int)
        total     - Required : total iterations (Int)
        prefix    - Optional : prefix string (Str)
        suffix    - Optional : suffix string (Str)
        decimals  - Optional : positive number of decimals in percent complete (Int)
        length    - Optional : character length of bar (Int)
        fill      - Optional : bar fill character (Str)
    """
    percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total)))
    filledLength = int(length * iteration // total)
    bar = fill * filledLength + '-' * (length - filledLength)
    print('\r%s |%s| %s%% %s' % (prefix, bar, percent, suffix), end='\r')
    # Print a new line on completion:
    # if iteration == total:
    #     print()

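# Example (hypothetical, not used by the crawl below): a 10-step bar.
# for step in range(1, 11):
#     printProgressBar(step, 10, prefix='Progress:', length=40)
#     time.sleep(0.1)
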
def write_csv(data_matrix, header, csv_output_name):
    # newline='' keeps the csv module from inserting blank rows on Windows.
    with open(csv_output_name, mode='w', newline='', encoding='utf-8') as csv_file:
        data_writer = csv.writer(csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        data_writer.writerow(header)
        for row in data_matrix:
            data_writer.writerow(row)

def run_x_times_stack(x):
    if x <= 0:
        return

    global stack_total_progress
    global stack_current_progress

    stack_total_progress = x
    stack_current_progress = 0

    data_matrix = []

    # Crawl one search-results page at a time, pausing between requests
    # so Stack Overflow does not rate-limit us.
    for i in range(1, x + 1):
        data_matrix = sof_spider(i, data_matrix)
        time.sleep(3)

    if not data_matrix:
        return

    header = ['Stack id', 'Question Title', 'Question Link', 'Number of Upvotes',
              'Date Created', 'Views', 'Last Active']
    write_csv(data_matrix, header, 'stack.csv')

    # Plotly tables are column-major: transpose the row-major data_matrix
    # into one list per column.
    tempList = []
    filler_data_matrix = initialize_twodlist(7, stack_total_count)

    for j in range(len(data_matrix[0])):
        for i in range(len(data_matrix)):
            tempList.append(data_matrix[i][j])

    count = 0
    for i in range(len(filler_data_matrix)):
        for j in range(len(filler_data_matrix[i])):
            filler_data_matrix[i][j] = tempList[count]
            count += 1

    fig = go.Figure(data=[go.Table(
        header=dict(values=header),
        cells=dict(values=filler_data_matrix))
    ])
    fig.show()

def run_x_times_git(x):
    if x <= 0:
        return

    global git_total_progress
    global git_current_progress

    git_total_progress = x
    git_current_progress = 0

    data_matrix = []

    # Crawl one search-results page at a time, pausing between requests
    # so GitHub does not rate-limit us.
    for i in range(1, x + 1):
        data_matrix = github_crawler(i, data_matrix)
        time.sleep(3)

    if not data_matrix:
        return

    header = ['Project id', 'Project Name', 'Project Link', 'Project Issues',
              'Pull Requests', 'Watch', 'Stars', 'Fork']
    write_csv(data_matrix, header, 'github.csv')

    # Transpose row-major data into the column-major layout Plotly expects.
    tempList = []
    filler_data_matrix = initialize_twodlist(8, git_total_count)
    for j in range(len(data_matrix[0])):
        for i in range(len(data_matrix)):
            tempList.append(data_matrix[i][j])

    print("")
    count = 0
    for i in range(len(filler_data_matrix)):
        for j in range(len(filler_data_matrix[i])):
            filler_data_matrix[i][j] = tempList[count]
            count += 1

    fig = go.Figure(data=[go.Table(
        header=dict(values=header),
        cells=dict(values=filler_data_matrix))
    ])
    fig.show()

def initialize_twodlist(rows, cols):
    # Build a rows x cols grid of "0" strings (placeholder cell values).
    return [["0" for _ in range(cols)] for _ in range(rows)]

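# e.g. initialize_twodlist(2, 3) -> [['0', '0', '0'], ['0', '0', '0']]
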
def github_crawler(page, data_matrix):
    global git_total_progress
    global git_current_progress
    global git_id
    global git_total_count

    super_url = ("https://github.com/search?p=" + str(page) +
                 "&q=tensorflow&ref=simplesearch&type=Repositories&utf8=✓")

    source_code = requests.get(super_url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, 'html5lib')

    result_divs = soup.findAll('div', {'class': 'col-12 col-md-8 pr-md-3'})
    project_count = len(result_divs)

    if project_count == 0:
        print("You have accessed the website too many times")
        return data_matrix  # bail out before the division below

    increase = 1 / project_count
    git_total_count += project_count

    for link in result_divs:
        for link2 in link.findAll('a', {'class': 'v-align-middle'}):
            pretty_issues = ""
            pretty_pull_requests = ""
            pretty_watch = ""
            pretty_stars = ""
            pretty_fork = ""

            # link2.get('href') already starts with '/', e.g. '/tensorflow/tensorflow'.
            href = "https://github.com" + link2.get('href')
            pretty_name = ' '.join(link2.findAll(text=True))
            pretty_link = href
            project_soup = BeautifulSoup(requests.get(href).text, 'html5lib')

            # The first two Counter badges on a repository page are the issue
            # count and the open-pull-request count, in that order.
            counter = 0  # 0 for issues, 1 for pull requests
            for link3 in project_soup.findAll('span', {'class': 'Counter'}):
                if counter == 0:
                    pretty_issues = ' '.join(link3.findAll(text=True))
                if counter == 1:
                    pretty_pull_requests = ' '.join(link3.findAll(text=True))
                    break
                counter += 1

            # The social-count links appear as watch, stars, fork, in that order.
            counter = 0  # 0 == watch, 1 == stars, 2 == fork
            for link3 in project_soup.findAll('a', {'class': 'social-count'}):
                if counter == 0:
                    pretty_watch = ' '.join(link3.findAll(text=True)).strip()
                if counter == 1:
                    pretty_stars = ' '.join(link3.findAll(text=True)).strip()
                if counter == 2:
                    pretty_fork = ' '.join(link3.findAll(text=True)).strip()
                counter += 1

            data_matrix.append([git_id, pretty_name, pretty_link, pretty_issues,
                                pretty_pull_requests, pretty_watch, pretty_stars, pretty_fork])

            git_current_progress += increase
            git_id += 1
            printProgressBar(git_current_progress, git_total_progress)

    return data_matrix

def sof_spider(page, data_matrix):
    global stack_total_progress
    global stack_current_progress
    global stack_id
    global stack_total_count

    super_url = ("https://stackoverflow.com/search?page=" + str(page) +
                 "&tab=Relevance&q=%5btensorflow%5d%20tensorflow")

    source_code = requests.get(super_url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, 'html5lib')

    # Keep only anchors whose class is exactly 'question-hyperlink';
    # related and answer links carry extra classes and are skipped.
    question_links = [link for link in soup.findAll('a', {'class': 'question-hyperlink'})
                      if len(link.get('class')) == 1]
    question_count = len(question_links)

    if question_count == 0:
        print("You have accessed the website too many times")
        return data_matrix  # bail out before the division below

    increase = 1 / question_count
    stack_total_count += question_count

    for link in question_links:
        href = link.get('href')
        data_question_title = ' '.join(link.findAll(text=True)).strip()
        data_question_link = "https://stackoverflow.com" + href
        data_question_upvotes = ""
        data_question_date_created = ""
        data_question_views = ""
        data_question_last_active = ""

        question_soup = BeautifulSoup(requests.get(data_question_link).text, 'html5lib')

        for vote_div in question_soup.findAll('div', {'class': "js-vote-count grid--cell fc-black-500 fs-title grid fd-column ai-center"}):
            data_question_upvotes = ' '.join(vote_div.findAll(text=True))
            break

        # The question header lists "Asked ...", "Active ...", "Viewed ... times",
        # in that order.
        counter = 0
        for header_div in question_soup.findAll('div', {'class': "grid fw-wrap pb8 mb16 bb bc-black-2"}):
            for field_div in header_div.findAll('div'):
                if counter == 0:
                    data_question_date_created = ' '.join(field_div.findAll(text=True)).replace("Asked", "").strip()
                if counter == 1:
                    data_question_last_active = ' '.join(field_div.findAll(text=True)).replace("Active", "").strip()
                if counter == 2:
                    data_question_views = ' '.join(field_div.findAll(text=True)).replace("Viewed", "").replace("times", "").strip()
                counter += 1

        data_matrix.append([stack_id, data_question_title, data_question_link,
                            data_question_upvotes, data_question_date_created,
                            data_question_views, data_question_last_active])

        stack_current_progress += increase
        stack_id += 1
        printProgressBar(stack_current_progress, stack_total_progress)

    return data_matrix

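# --- Usage sketch (not part of the original script) -------------------------
# A minimal, hypothetical helper showing how the CSVs written above can be
# read back with the standard library; read_results is an assumed name and
# nothing in the crawl calls it.
def read_results(csv_name):
    """Return the rows of a crawl CSV (e.g. 'github.csv' or 'stack.csv')
    as a list of dicts keyed by the header row."""
    with open(csv_name, newline='', encoding='utf-8') as csv_file:
        return list(csv.DictReader(csv_file))
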
if __name__ == "__main__":
    run_x_times_git(int(input("Please enter the number of pages that you want to crawl on GitHub: ")))
    run_x_times_stack(int(input("Please enter the number of pages that you want to crawl on Stack Overflow: ")))