Clean up the Hackaday lowbrow corpus.

#!/usr/bin/env python

import re
### Name of the text file you're running this on. ###
file_name = 'lowbrow_dump.txt'
### File name of the output from this program. WARNING: this ###
### file will be OVERWRITTEN every time the program is run. ###
out_file_name = 'lowbrow_dump_cleaned.txt'


# We're going to break up the posts in the file and shove them into this class to make them easier to handle.
class Post:
    """One post (line) of the corpus plus the key used to detect duplicates.

    Attributes:
        number:  value supplied by the caller; posts are ordered by it.
        message: the post text exactly as given to the constructor.
        ouch:    ``message`` lowercased with every character other than
                 ``a``-``z`` removed.  Equality and hashing use this, so two
                 posts that differ only in case, digits, punctuation or
                 whitespace count as the same post.

    Note that ordering (by ``number``) and equality (by ``ouch``) are
    deliberately based on different attributes.
    """

    def __init__(self, number, message):
        self.number = number
        self.message = message

        # Mangle the message a little to squeeze out a few more dupes:
        # lowercase everything, then throw away anything that is not a-z
        # (digits, punctuation, whitespace, etc.).
        self.ouch = re.sub(r'[^a-z]', '', message.lower())

    # Overwrite the compare methods so we can sort the posts back into their original order.
    def __cmp__(self, other):
        """Three-way comparison on ``number`` (negative, zero or positive)."""
        if not isinstance(other, Post):
            return NotImplemented
        # Spelled out rather than calling cmp() so it does not depend on that builtin existing.
        return (self.number > other.number) - (self.number < other.number)

    def __lt__(self, other):
        """Return True when this post has the smaller ``number``."""
        if not isinstance(other, Post):
            return NotImplemented
        return self.number < other.number

    # Overwrite the hash and eq methods to make this class work in a dictionary. ( This allows use of a dictionary to sort out dupes. )
    def __hash__(self):
        """Hash of the mangled text, so duplicate posts hash alike."""
        return hash(self.ouch)

    def __eq__(self, other):
        """Two posts are equal when their mangled text matches."""
        if not isinstance(other, Post):
            # Let Python decide instead of blowing up on a missing attribute.
            return NotImplemented
        return self.ouch == other.ouch

    def __ne__(self, other):
        """Exact opposite of ``__eq__`` so ``!=`` and ``==`` never disagree."""
        equal = self.__eq__(other)
        if equal is NotImplemented:
            return equal
        return not equal

    # Because we're never supposed to directly access the internal structure of a class.
    def read(self):
        """Return the original (un-mangled) message text."""
        return self.message


# Read the lines from the file into a list.
# The with-block closes the file even if reading it fails part-way through.
with open(file_name, 'r') as f:
    lines = f.readlines()


# Begin stage1
# Gibberish that shows up repeatedly throughout the file.  Each pattern is
# replaced by a single space, and they are applied in exactly this order: the
# longer "Hey! Pay Up! (...)" variants have to run before the bare
# "Hey! Pay Up!" pattern, or their tails would be left behind.
# Compiled once here rather than being looked up again for every line.
junk_patterns = [
    re.compile(r"Hey! Pay Up! +\(We're paid up through mid April, now!\)"),
    re.compile(r"Hey! Pay Up! +\(The ads are coming! The ads are coming!\)"),
    re.compile(r"&nbsp;?"),
    re.compile(r"Hey! Pay Up!"),
    re.compile(r"        Hey! *$"),
    re.compile(r"        Help Wanted. *$"),
    re.compile(r"        Why\? +Check for updates +Sat-Sat-Sat-Saturday is Shut-Shut-Shut-Shutdown day. +-->"),
]

# Used as an ordered-by-nothing set: the Post keys carry all the information,
# the values are always None.
stage1 = {}
# Number of lines processed so far; doubles as each post's original position.
count = 0

for line in lines:
    # Remove the gibberish.
    message = line
    for junk in junk_patterns:
        message = junk.sub(' ', message)

    # Remove all the whitespace from the start and end of the line.
    message = message.strip()

    # Create a 'post' object with our message and current line count.
    post = Post(count, message)

    # Only the first post with a given mangled text is kept. ( This is where the dupe removal happens. )
    if post not in stage1:
        stage1[post] = None

    count += 1


# We need to know how well this worked, don't we ;-)
# A single pre-formatted string in parentheses prints the same text whether
# print is treated as a statement or as a function.
print('Initial count: %i, Dupes removed: %i, New count: %i' % (count, count - len(stage1), len(stage1)))


# Stage2
# Pull the unique posts (the dictionary keys) out into a list and put them
# back into their original order by sorting on the number each post was
# created with.  An explicit key is used so the result does not depend on how
# the dictionary hands back its keys.
stage2 = sorted(stage1, key=lambda p: p.number)


# Open new file and write out cleaned lines.
# The with-block makes sure the output is flushed and the file closed, even
# if a write fails.
with open(out_file_name, 'w') as out_file:
    for post in stage2:
        # I'd like to be able to read this, so add an extra line between posts.
        out_file.write('%s\n\n' % (post.read(),))