Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Loop over the saved wiki html files and parse their text
# to get a raw list of contributions.
from re import sub
from os import listdir

# Directory with the html files
str_inputDir = 'wiki_education/'
# Directory where the user lists will go
str_outputDir = 'rawPosts/'

# Scan the input directory and collect every wiki html file
# (entries whose names start with 'index.php') into a list.
list_htmlPages = [
    str_eachFile
    for str_eachFile in listdir(str_inputDir)
    if str_eachFile.startswith('index.php')
]
# Function that extracts the page name from the html file name
# so it can be used as the name of the new output file.
def getName(str_eachFile):
    """Return the output file name derived from an html file name.

    The input is reduced in four regex passes:

    1. everything up to and including the last ``title=`` is removed;
    2. everything from ``&offset`` to the end is removed;
    3. spaces are replaced by ``-``;
    4. every character that is not a letter, a digit or ``-`` is removed.

    The result is wrapped as ``'edu_' + <name> + '.txt'``.  If the input
    contains no ``title=`` the first pass is a no-op and the whole
    (cleaned) file name is used.
    """
    # '^.*' is greedy, so the *last* 'title=' in the name wins.
    str_title = sub('^.*title=', '', str_eachFile)
    # Drop the paging part of the query string, if present.
    str_title = sub('&offset.*$', '', str_title)
    # Spaces become dashes before the final clean-up so they survive it.
    str_title = sub(' ', '-', str_title)
    # Keep only characters that are safe in a file name.
    str_title = sub('[^a-zA-Z0-9-]', '', str_title)
    return 'edu_' + str_title + '.txt'
# Link prefixes that introduce a contributor, in order of precedence:
# a user-page link is preferred, otherwise the contributions link
# (which carries the user id) is used.
_TUPLE_USER_MARKERS = ('wiki/User:', 'wiki/Special:Contributions/')


def _extractAfterMarker(str_line, str_marker):
    """Return the cleaned token that follows the last `str_marker` in a line.

    Everything up to and including the marker is dropped, then everything
    from the first '" ' (closing quote followed by a space) onwards, and
    finally every character that is not a letter, a digit, '_' or '-'
    (so, for instance, the dots of an IP-style id are removed too).
    """
    str_tail = sub('^.*' + str_marker, '', str_line)
    str_token = sub('" .*$', '', str_tail)
    return sub('[^a-zA-Z0-9_-]', '', str_token)


# This is where all the magic happens.
# Loop through the lines and get either
# the user name or the user ID for each edit.
def getUserNames(list_fileLines, file_wOutput):
    """Write one contributor per relevant line of an html page.

    list_fileLines -- iterable of text lines of the html page.
    file_wOutput   -- object with a ``write`` method; receives one
                      name or id per line, each followed by a newline.

    Only lines starting with '<li><span' are considered.  For such a
    line the user name is written if the line contains 'wiki/User:';
    otherwise the id is written if it contains
    'wiki/Special:Contributions/'.  Lines with neither are skipped.
    Nothing is returned.
    """
    for str_eachLine in list_fileLines:
        # Only lines that are relevant.
        if not str_eachLine.startswith('<li><span'):
            continue
        for str_marker in _TUPLE_USER_MARKERS:
            if str_marker in str_eachLine:
                str_user = _extractAfterMarker(str_eachLine, str_marker)
                file_wOutput.write(str_user + '\n')
                # First matching marker wins (same as the if/elif chain).
                break
# Start a counter to count down the files that are still to be done.
int_fileCounter = len(list_htmlPages)

# Loop through the list of relevant files and parse each one.
for str_eachFile in list_htmlPages:
    # Open the html file and place its lines in a list; the ``with``
    # block closes the file even if reading fails.
    with open(str_inputDir + str_eachFile) as file_rTheHtml:
        list_fileLines = file_rTheHtml.readlines()

    # Use the function to get the new file name.
    str_fileName = getName(str_eachFile)

    # Open the new file for writing and get the user names or ids
    # using the function; the file is closed when the block exits.
    with open(str_outputDir + str_fileName, 'w') as file_wOutput:
        getUserNames(list_fileLines, file_wOutput)

    # Print the counter to see how many files are left, along with the
    # file name.  A single parenthesised argument prints the same text
    # under both Python 2 and Python 3.
    print(' '.join([str(int_fileCounter), str_fileName]))
    int_fileCounter -= 1
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement