Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python
- import re
### Path of the text file this program reads its posts from. ###
file_name = "lowbrow_dump.txt"

### Path of the file the cleaned posts are written to.        ###
### WARNING: this file is OVERWRITTEN every time the program  ###
### is run.                                                   ###
out_file_name = "lowbrow_dump_cleaned.txt"
# We're going to break the file up into posts and wrap each one in this class
# to make them easier to handle.
class Post(object):
    """A single post from the dump, together with its original position.

    Two posts are considered equal (and hash alike) when their messages are
    identical after lowercasing and removing every character that is not an
    ASCII letter a-z.  Ordering, on the other hand, is by ``number`` only, so
    sorting a collection of posts restores the order they were created in.

    Attributes:
        number:  position of the post in the input (used for ordering).
        message: the post text exactly as it was passed in.
        ouch:    the mangled message used as the duplicate-detection key.
    """

    # Everything except lowercase ASCII letters is dropped from the dedupe key.
    _NON_LETTERS = re.compile(r'[^a-z]')

    def __init__(self, number, message):
        self.number = number
        self.message = message
        # Mangle the message a little to squeeze out a few more dupes:
        # lowercase it, then remove all digits, punctuation, whitespace, etc.
        self.ouch = self._NON_LETTERS.sub('', message.lower())

    def __repr__(self):
        return 'Post(%r, %r)' % (self.number, self.message)

    # Ordering is by post number so posts can be sorted back into their
    # original order.  __lt__ is what sort()/sorted() use; Python 3 ignores
    # __cmp__ entirely, so without it sorting posts raises TypeError there.
    def __lt__(self, other):
        if not isinstance(other, Post):
            return NotImplemented
        return self.number < other.number

    # Kept for Python 2 callers that invoke cmp() on posts.  Written without
    # the cmp() builtin, which no longer exists in Python 3.
    def __cmp__(self, other):
        return (self.number > other.number) - (self.number < other.number)

    # Hash and equality are based on the mangled message so the class works
    # as a dictionary key.  (This allows use of a dictionary to sort out dupes.)
    def __hash__(self):
        return hash(self.ouch)

    def __eq__(self, other):
        # Comparing against something that is not a Post must not blow up
        # with AttributeError; let Python fall back to its default handling.
        if not isinstance(other, Post):
            return NotImplemented
        return self.ouch == other.ouch

    def __ne__(self, other):
        # Python 2 does not derive != from ==, so spell it out to keep the
        # two operators consistent with each other.
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result

    # Accessor for the original, unmangled message text.
    def read(self):
        return self.message
# Gibberish that shows up repeatedly throughout the file.  Each match is
# replaced by a single space.  The patterns are applied in this order, which
# matters: the two long "Hey! Pay Up! (...)" forms must be removed before the
# bare "Hey! Pay Up!" pattern gets a chance to eat their prefix.
#
# NOTE(review): the third pattern read r" ?" in the pasted source.  That
# pattern matches the empty string, so re.sub() inserted a space between every
# character of every line and none of the later patterns could ever match.
# "&nbsp;" is the presumed original (the entity appears to have been lost when
# the paste was rendered as a web page) -- confirm against the real dump.
_junk_patterns = [
    re.compile(pattern)
    for pattern in (
        r"Hey! Pay Up! +\(We're paid up through mid April, now!\)",
        r"Hey! Pay Up! +\(The ads are coming! The ads are coming!\)",
        r"&nbsp; ?",
        r"Hey! Pay Up!",
        r" Hey! *$",
        r" Help Wanted. *$",
        r" Why\? +Check for updates +Sat-Sat-Sat-Saturday is Shut-Shut-Shut-Shutdown day. +-->",
    )
]

# Read the lines from the file into a list.  The with-block closes the file
# even if reading fails part way through.
with open(file_name, 'r') as f:
    lines = f.readlines()

# Begin stage1: clean every line and drop duplicates.
# The dictionary is used purely for its keys; because Post hashes and compares
# on its mangled message, a later post that "looks the same" as an earlier one
# is already present and gets skipped, so the first occurrence wins.
stage1 = {}
for number, line in enumerate(lines):
    message = line
    for junk in _junk_patterns:
        message = junk.sub(' ', message)
    # Remove all the whitespace from the start and end of the line.
    message = message.strip()
    # Create a 'post' object with our message and its line number.
    post = Post(number, message)
    # This is where the dupe removal happens.
    if post not in stage1:
        # All the information we need is already in the post object, so
        # there is nothing else worth storing as the value.
        stage1[post] = None
count = len(lines)

# We need to know how well this worked, don't we ;-)
print('Initial count: %i, Dupes removed: %i, New count: %i'
      % (count, count - len(stage1), len(stage1)))

# Stage2
# Pull the surviving posts out of the dictionary and put them back into their
# original order.  Sorting on the post number explicitly means this does not
# depend on which comparison methods Post happens to define.
stage2 = sorted(stage1, key=lambda post: post.number)

# Open the new file and write out the cleaned lines.  The with-block makes
# sure the output is flushed and closed (the file was previously left open).
with open(out_file_name, 'w') as out_file:
    for post in stage2:
        # I'd like to be able to read this, so add an extra line between posts.
        out_file.write('%s\n\n' % (post.read(),))
Add Comment
Please, Sign In to add comment