Advertisement
Guest User

wiki_userNamesParser

a guest
Mar 19th, 2013
31
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.44 KB | None | 0 0
  1. # Loop and read wiki html files and then parse the text
  2. # to get a raw list of contributions.
  3.  
  4. from re import sub
  5. from os import listdir
  6.  
  7. # Directory with the html files
  8. str_inputDir='wiki_education/'
  9.  
  10. # Directory where the user lists will go
  11. str_outputDir='rawPosts/'
  12.  
  13. # Scan directory and place all
  14. # wiki html files into a list
  15. list_htmlPages=[]
  16. for str_eachFile in listdir(str_inputDir):
  17.     if str_eachFile.startswith('index.php'):
  18.         list_htmlPages.append(str_eachFile)
  19.  
  20. # Function that extracts the page name
  21. # from the html file name to use in the new file name
  22. def getName(str_eachFile):
  23.     str_pass1=sub('^.*title=','',str_eachFile)
  24.     str_pass2=sub('&offset.*$','',str_pass1)
  25.     str_pass3=sub(' ','-',str_pass2)
  26.     str_pass4=sub('[^a-zA-Z0-9-]','',str_pass3)
  27.     str_fileName='edu_'+str_pass4+'.txt'
  28.     return str_fileName
  29.  
  30. # This is where all the magic happens
  31. # Loop through the lines and get either
  32. # the user name or the user ID for each edit.
  33. def getUserNames(list_fileLines,file_wOutput):
  34.     # Loop through each line in the file
  35.     for str_eachLine in list_fileLines:
  36.         # only lines that are relevant
  37.         if str_eachLine.startswith('<li><span'):
  38.  
  39.             # Get user name if line has user name
  40.             if 'wiki/User:' in str_eachLine:
  41.                 str_pass1=sub('^.*wiki/User:','',str_eachLine)
  42.                 str_pass2=sub('" .*$','',str_pass1)
  43.                 str_userName=sub('[^a-zA-Z0-9_-]','',str_pass2)
  44.                 file_wOutput.write(str_userName+'\n')
  45.  
  46.             # Otherwise, get the id
  47.             elif 'wiki/Special:Contributions/' in str_eachLine:
  48.                 str_pass1=sub('^.*wiki/Special:Contributions/','',str_eachLine)
  49.                 str_pass2=sub('" .*$','',str_pass1)
  50.                 str_userID=sub('[^a-zA-Z0-9_-]','',str_pass2)
  51.                 file_wOutput.write(str_userID+'\n')
  52.  
  53. # start a counter to count down files that are done
  54. int_fileCounter=len(list_htmlPages)
  55.  
  56. # Loop throught the list of relevant files and parse.
  57. for str_eachFile in list_htmlPages:
  58.  
  59.     # open files and place lines in list
  60.     file_rTheHtml=open(str_inputDir+str_eachFile)
  61.     list_fileLines=file_rTheHtml.readlines()
  62.     file_rTheHtml.close()
  63.  
  64.     # use fuction to get new file name
  65.     str_fileName=getName(str_eachFile)
  66.  
  67.     # open new file for writing
  68.     file_wOutput=open(str_outputDir+str_fileName,'w')
  69.     # get user names ir ids using function
  70.     getUserNames(list_fileLines,file_wOutput)
  71.     # Close file
  72.     file_wOutput.close()
  73.  
  74.     # print counter to see how many
  75.         # files are left along with file name
  76.     print ' '.join([str(int_fileCounter),str_fileName])
  77.     int_fileCounter-=1
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement