Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # scrape_cpp_stackoverflow.py ZZZ rewritten for py3, but now has somewhat a faulty output
- # to gather C++ content
- import re
- import os
- from urllib.request import urlopen
- import time
- # import tkFileDialog
- # from Tkinter import *
- from tkinter import filedialog
- from tkinter import *
# Replacement tables used when post-processing scraped HTML.
# Each entry in `misc` is an "old new" pair split on a single space;
# entries in `br` become paragraph breaks; entries in `no` are stripped.
# The leading '' element mirrors .splitlines() on a leading newline in the
# original triple-quoted form — downstream loops skip/ignore falsy entries.
# NOTE(review): these three names are re-bound later in this script with
# slightly different contents; the later definitions are the ones used.
misc = ['', '</ <', '> >', '< <']
br = ['', '<a>', '<p>', '<blockquote>']
no = ['', '<code>', '<li>', '<ul>', '<h1>', '<em>', '<pre>', '<strong>']
def write_data(f, data=''):
    """Write `data` to the file at path `f`, plus a trailing newline.

    The newline is added only when `data` is non-empty, so calling with the
    default empty string merely creates/touches the file.  The module-level
    `pagemark` file is rewritten ('w') on every call; every other path is
    appended to ('a').

    Fixes vs. the original: the path parameter is no longer shadowed by the
    file object, and a context manager replaces the manual open/close pair
    so the handle is released even if the write raises.
    """
    mode = 'w' if f == pagemark else 'a'
    suffix = '\n' if data else ''
    with open(f, mode) as fh:
        fh.write(data + suffix)
#
# Base site and the tag-listing URL; the {} slot receives "page=N&" for
# pages past the first (see pg() below).
web = "https://stackoverflow.com"
url = web+"/questions/tagged/cpp?{}sort=votes&pagesize=50"
ttt = "stackoverflow/"  # output subdirectory name appended to the chosen dir
targetA = 'questions/'
targetB = '" class=' # brackets capture string
# Regex capturing each answer body between its answercell div and the
# closing </div> (applied after newlines are stripped from the page).
ans = '<div class="answercell.+?<p>(.+?)</div>'
r10 = [10-z for z in range(10)]  # [10, 9, ..., 1]; drives space-run collapsing below
root = Tk()
# Ask the user to pick any file; only its directory portion is kept.
file = filedialog.askopenfilename(filetypes=[("txt files","*.txt"),("all files","*.*")])
file = file[:file.rfind('/')+1]
if ttt not in file: file+=ttt
logs = os.path.join(file,'cpp_checked.txt')       # appended: one scraped question path per line
pagemark = os.path.join(file,'cpp_pagemark.txt')  # overwritten: progress/resume marker
if not os.path.exists(file):
    os.mkdir(file)
# Touch both state files so the unconditional reads below cannot fail.
if not os.path.isfile(logs):
    write_data(logs, '')
if not os.path.isfile(pagemark):
    write_data(pagemark, '')
#
root.destroy()
# Second (duplicate) definition of the replacement tables.  These rebind
# the names created near the top of the script, so THESE values are the
# ones the scrape loop actually uses.
# NOTE(review): unlike the earlier definition, this `no` list omits
# '<em>', so <em> tags survive into the output — confirm whether that is
# intended before deduplicating.
misc = ['', '</ <', '> >', '< <']
br = ['', '<a>', '<p>', '<blockquote>']
no = ['', '<code>', '<li>', '<ul>', '<h1>', '<pre>', '<strong>']
# Load persisted state: `checked` holds the question paths already scraped
# (one per line of cpp_checked.txt); `mark` is the human-readable progress
# line(s) written to cpp_pagemark.txt by the previous run.
f = open(logs)
checked = f.read().splitlines()
f.close()
f = open(pagemark)
mark = f.read().splitlines()
f.close()
print(mark)
# Let the user resume from a specific listing page; blank input means page 1.
p = input('Start From Which Page? ')
if not p: p = 1
p = int(p)
def pg(p, template=None):
    """Return the tag-listing URL for 1-based page number `p`.

    Page 1 omits the "page=N&" query fragment entirely; later pages
    inject it into the URL's single `{}` slot.

    `template` (new, backward-compatible) lets callers supply their own
    format string with one `{}` slot; it defaults to the module-level
    `url`, preserving the original behavior.
    """
    tmpl = url if template is None else template
    if p > 1:
        return tmpl.format("page={}&".format(p))
    return tmpl.format('')
#
prev = ''  # last question link seen on the previous listing page (end-of-listing guard)
def scrape(z=0):
    """Fetch URL `z` and return the response body as text.

    Retries forever on failure, waiting 5 seconds more on each attempt.

    Fixes vs. the original py3 port:
    - the body is decoded instead of calling str() on the bytes object,
      which produced "b'...'" repr text full of literal \\n escapes — the
      "faulty output" admitted in the file's header comment;
    - time.sleep() replaces a CPU-burning busy-wait loop;
    - only Exception is caught (the bare except also swallowed
      KeyboardInterrupt, making the retry loop unkillable).
    """
    delay = 5
    while 1:
        try:
            # errors='replace' keeps going if the page has stray bad bytes.
            return urlopen(z).read().decode('utf-8', errors='replace')
        except Exception:
            print('*** Unable to access data, retrying...')
            print(delay)
            time.sleep(delay)
            delay += 5
#
# Main loop: one iteration per listing page of the cpp tag.  Each page's
# question links are extracted; every question not already in `checked`
# is fetched, reduced from HTML to plain-ish text, and written to its own
# .txt file.  Progress (checked list, page marker) is persisted as it goes.
# NOTE(review): indentation below was reconstructed from a whitespace-
# mangled paste — in particular, confirm the `if z == links[-1]` marker
# write really belongs inside the `len(L) > 100` branch.
while 1:
    links = []
    L = scrape(pg(p))
    # Mark the boundaries of question hrefs with sentinel strings, then
    # split on them; only pieces containing '<!!!>' began with 'questions/'.
    # Tag links are neutralized first so targetA cannot match them.
    L = L.replace('questions/tagged/','')
    L = L.replace(targetA,'@+++@<!!!>')
    L = L.replace(targetB,'@+++@')
    L = L.split('@+++@')
    # s[5:] drops the 5-char '<!!!>' sentinel; [1:] discards the first
    # hit (presumably not a real question link — TODO confirm).
    L = [s[5:] for s in L if '<!!!>' in s][1:]
    # Same last link as the previous page -> walked past the end of the listing.
    if prev == L[-1]:
        break
    print('\n\n'.join(L))
    print()
    prev = L[-1]
    links.extend(L)
    for z in links:
        if z not in checked:
            print()
            # Fetch the question page and pull out every answer body.
            L = scrape(web+'/questions/'+z)
            L = L.replace('\n','')
            L = re.findall(ans, L)
            L = '\n\n###@###\n'.join(L)  # visible separator between answers
            # Short pages are skipped and NOT marked checked, so they are
            # re-fetched on every run — presumably intentional; confirm.
            if len(L) > 100:
                # Collapse runs of 30, 27, ..., 3 spaces (longest first)
                # into newline+tab to approximate the original layout.
                for i in r10:
                    tabs = ' '*(3*i)
                    L = L.replace(tabs,'\n'+'\t')
                # 'old new' pairs; entries without a space (e.g. the
                # leading '') fail the unpack and are silently ignored.
                for zzz in misc:
                    try:
                        a,b = zzz.split(' ')
                        L = L.replace(a,b)
                    except:
                        0
                # Block-level tags become paragraph breaks...
                for zzz in br:
                    if zzz: L = L.replace(zzz,'\n\n')
                # ...and these tags are stripped entirely.
                for zzz in no:
                    if zzz: L = L.replace(zzz,'')
                # Whitespace cleanup repeated 10 times to approximate a
                # fixed point; t is just a throwaway loop counter.
                for t in 'z'*10:
                    L = L.replace('\n\n\n','\n\n')
                    L = L.replace('\t\n','\n\n')
                    L = L.replace(' \n','\n\n')
                    L = L.replace('\n.','\n')
                    L = L.replace('\t ','\t')
                    L = L.replace(' \t','\t')
                # One output file per question; '/' made filename-safe.
                # (Rebinding f here clobbers the earlier file-handle name,
                # not the write_data function.)
                f = z.replace('/','-')
                new_file=os.path.join(file,f+'.txt')
                write_data(new_file, L)
                checked.append(z)
                write_data(logs, z)  # persist progress immediately
                print(L)
                # After the page's last link, rewrite the resume marker.
                if z == links[-1]:
                    log = '{} files ***$$$$$*** pg: {}'.format(len(checked), p)
                    write_data(pagemark, log)
                    print(log)
                    print([file])
                    print()
        else:
            # One dot per already-scraped question, no newline.
            print('.',end='')
    print()
    print(p)
    p+=1
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement