# Want more features on Pastebin? Sign Up, it's FREE!
# Guest
#
# Cleanup Hackaday lowbrow corpus.
#
# By: a guest on Jul 16th, 2012  |  syntax: Python  |  size: 3.21 KB  |  views: 57  |  expires: Never
# download  |  raw  |  embed  |  report abuse  |  print
# Text below is selected. Please press Ctrl+C to copy to your clipboard. (⌘+C on Mac)
#!/usr/bin/env python

import re
  4. ### Name of the text file your're running this on. ###
  5. file_name = 'lowbrow_dump.txt'
  6. ### File Name of the output from this program WARNING this ###
  7. ### file will be OVERWRITTEN everytime the program is run. ###
  8. out_file_name = 'lowbrow_dump_cleaned.txt'
  9.  
  10.  
  11. # Were going to break up the posts in the file and shove them into this class to make them easier to handle.
  12. class Post:
  13.     def __init__(self, number, message):
  14.         self.number = number
  15.         self.message = message
  16.  
  17.         # Mangle the message a little to squeeze out a few more dupes.
  18.         ouch = message.lower()   # changes all letters to lowercase
  19.         self.ouch = re.sub( r'[^a-z]', '', ouch )   # removes all numbers, punctuation, whitespace, etc.
  20.  
  21.  
  22.     # Overwrite the compare method so we can sort the post back into their original order.
  23.     def __cmp__(self, other):
  24.         return cmp( self.number, other.number )
  25.  
  26.     # Overwrite the hash and eq methods to make this class work in a dictionary. ( This allows use of a dictionary to sort out dupes. )
  27.     def __hash__(self):
  28.         return self.ouch.__hash__()
  29.  
  30.     def __eq__(self, other):
  31.         return self.ouch == other.ouch
  32.  
  33.     # Because were never supposed to directly access the internal structure of a class.
  34.     def read(self):
  35.         return self.message
  36.  
  37.  
  38. # Read the lines from the file into a list.
  39. f = open( file_name, 'r' )
  40.  
  41. lines = f.readlines()
  42. f.close()
  43.  
  44.  
  45. # Begin stage1
  46. stage1 = {}
  47. count = 0
  48.  
  49. for line in lines:
  50.     # Remove some gibberish that shows up repeatedly through out the file.
  51.     message = re.sub( r"Hey! Pay Up! +\(We're paid up through mid April, now!\)", ' ', line )
  52.     message = re.sub( r"Hey! Pay Up! +\(The ads are coming! The ads are coming!\)", ' ', message )
  53.     message = re.sub( r" ?", ' ', message )
  54.     message = re.sub( r"Hey! Pay Up!", ' ', message )
  55.     message = re.sub( r"        Hey! *$", ' ', message )
  56.     message = re.sub( r"        Help Wanted. *$", ' ', message )
  57.     message = re.sub( r"        Why\? +Check for updates +Sat-Sat-Sat-Saturday is Shut-Shut-Shut-Shutdown day. +-->", ' ', message )
  58.  
  59.     # Remove all the whitespace from the start and end of all the lines.
  60.     message = message.strip()
  61.  
  62.     # Create a 'post' object with our message and current line count.
  63.     post = Post( count, message )
  64.  
  65.     # Checks to see if our dictionary already has a copy of our post in it. ( This is where the dupe removal happens. )
  66.     if not stage1.has_key( post ):
  67.         stage1[ post ] = None   # All the information we need is already in our post object, so we don't need to store anything else.
  68.  
  69.     count += 1
  70.  
  71.  
  72.  
  73. # We need to know how well this worked don't we ;-)
  74. print 'Initial count: %i, Dupes removed: %i, New count: %i' % ( count, count - len(stage1), len(stage1) )
  75.  
  76.  
  77.  
  78. # Stage2
  79. # Dump the keys out of the dictionary and sort them.  ( only works because we overrode the __cmp__ method for our class. )
  80. stage2 = stage1.keys()
  81. stage2.sort()
  82.  
  83.  
  84.  
  85. # Open new file and write out cleaned lines.
  86. out_file = open( out_file_name, 'w' )
  87.  
  88. for post in stage2:
  89.     out_file.write( '%s\n\n' % ( post.read() ,) )   # I'd like to be able to read this, so add an extra line between posts.
# clone this paste RAW Paste Data