# wiki_userNamesParser

# Loop and read wiki html files and then parse the text
# to get a raw list of contributions.

from re import sub
from os import listdir

# Input directory: where the saved wiki html files are read from
str_inputDir = 'wiki_education/'

# Output directory: where the generated user lists are written
str_outputDir = 'rawPosts/'

# Collect every entry of the input directory whose name
# starts with 'index.php'; only those files are parsed later
list_htmlPages = [
    str_candidate
    for str_candidate in listdir(str_inputDir)
    if str_candidate.startswith('index.php')
]

def getName(str_eachFile):
    """Build the output file name for one saved wiki html file.

    The substitutions below are applied in order to the html file name:
    everything up to and including 'title=' is dropped, everything from
    '&offset' onward is dropped, spaces become hyphens, and any character
    that is not an ASCII letter, a digit or a hyphen is removed.

    Note that file names differing only in the part from '&offset'
    onward produce the same output name.

    str_eachFile -- name of the html file (as listed in the input directory)

    Returns the cleaned title wrapped as 'edu_<title>.txt'.
    """
    # (pattern, replacement) pairs, applied top to bottom
    tuple_rewrites = (
        ('^.*title=', ''),
        ('&offset.*$', ''),
        (' ', '-'),
        ('[^a-zA-Z0-9-]', ''),
    )

    str_title = str_eachFile
    for str_pattern, str_replacement in tuple_rewrites:
        str_title = sub(str_pattern, str_replacement, str_title)

    return 'edu_' + str_title + '.txt'

def getUserNames(list_fileLines,file_wOutput):
    """Write one user name or user ID per relevant line to file_wOutput.

    Only lines starting with '<li><span' are considered. For such a line
    the text following the last 'wiki/User:' marker is used; if the line
    has no such marker, the text following the last
    'wiki/Special:Contributions/' marker is used instead. The extracted
    text is cut at the first '" ' and reduced to ASCII letters, digits,
    underscores and hyphens before being written, followed by a newline.
    Lines containing neither marker produce no output.

    list_fileLines -- iterable of text lines from one html file
    file_wOutput   -- object with a write() method receiving the results
    """
    # (marker, strip-pattern) pairs in priority order: the first marker
    # found in a line decides how that line is handled
    tuple_markers = (
        ('wiki/User:', '^.*wiki/User:'),
        ('wiki/Special:Contributions/', '^.*wiki/Special:Contributions/'),
    )

    for str_line in list_fileLines:
        # Skip everything that is not an edit entry
        if not str_line.startswith('<li><span'):
            continue

        for str_marker, str_prefixPattern in tuple_markers:
            if str_marker not in str_line:
                continue

            # Drop everything up to and including the marker (greedy,
            # so the last occurrence on the line is the one that counts)
            str_afterMarker = sub(str_prefixPattern, '', str_line)
            # Drop the rest of the line starting at the first '" '
            str_rawToken = sub('" .*$', '', str_afterMarker)
            # Keep only letters, digits, '_' and '-'
            str_cleanToken = sub('[^a-zA-Z0-9_-]', '', str_rawToken)
            file_wOutput.write(str_cleanToken + '\n')
            break

# Start a counter that counts down as files are completed
int_fileCounter=len(list_htmlPages)

# Loop through the list of relevant files and parse each one.
for str_eachFile in list_htmlPages:

    # Read all lines of the html file into a list; the `with` block
    # closes the handle even if reading raises
    with open(str_inputDir+str_eachFile) as file_rTheHtml:
        list_fileLines=file_rTheHtml.readlines()

    # Use the function to derive the output file name
    str_fileName=getName(str_eachFile)

    # Open the output file for writing and extract the user names or
    # ids into it; the handle is closed even if parsing raises.
    # NOTE(review): mode 'w' truncates, so input files that getName maps
    # to the same output name overwrite each other -- confirm intended.
    with open(str_outputDir+str_fileName,'w') as file_wOutput:
        getUserNames(list_fileLines,file_wOutput)

    # Print the counter to show how many files are left, along with
    # the file name. The single-argument parenthesised form behaves the
    # same as a print statement on Python 2 and as a call on Python 3.
    print(' '.join([str(int_fileCounter),str_fileName]))
    int_fileCounter-=1