Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import getopt
import itertools
import math
import re
import string
import sys
from collections import Counter, defaultdict
from itertools import dropwhile

from nltk import PorterStemmer
from nltk.corpus import stopwords
inp = "inp_6000.txt"  # input file name
out = "bowfilter10"   # output file name

# Hoist loop invariants: a set makes stop-word membership O(1) instead of a
# linear scan of the list for every token, and the stemmer is built once.
stop_words = set(stopwords.words('english'))
stem = PorterStemmer().stem_word

with open(inp, 'r') as plot_data:
    main_dict = Counter()             # corpus-wide term frequencies
    dictionary = defaultdict(Counter) # per-movie term frequencies
    doc_count = Counter()             # number of movies each term appears in
    # The input is laid out in 4-line records: the movie name is on line 0
    # and its plot synopsis on line 2 of each record.
    file1, file2 = itertools.tee(plot_data, 2)
    line_one = itertools.islice(file1, 0, None, 4)
    line_two = itertools.islice(file2, 2, None, 4)
    for movie_name, movie_plot in itertools.izip(line_one, line_two):
        # strip the trailing newline so it is not embedded in the output row
        movie_name = movie_name.strip()
        movie_plot = movie_plot.lower()
        # NOTE: the paste had r'w+' (lost backslash) and combined re.LOCALE
        # with re.UNICODE, which is contradictory; \w + re.UNICODE is the
        # intended tokenisation.
        words = re.findall(r'\w+', movie_plot, flags=re.UNICODE)
        seen_in_this_movie = set()
        for word in words:
            if word in stop_words:
                continue
            word = stem(word)  # Porter-stem each surviving token
            dictionary[movie_name][word] += 1
            main_dict[word] += 1
            # Count each word at most once per synopsis so doc_count is a
            # true document frequency.  (The original's this_mov marker was
            # decremented for *every* word after each movie, going negative
            # and silently dropping later occurrences.)
            if word not in seen_in_this_movie:
                seen_in_this_movie.add(word)
                doc_count[word] += 1

# Remove all words with frequency < 5 across the whole corpus.
# most_common() is sorted descending, so dropwhile skips the frequent head
# and everything that follows is below the threshold.
for key, count in dropwhile(lambda key_count: key_count[1] >= 5,
                            main_dict.most_common()):
    del main_dict[key]

# Compute the tf-idf weighted bag-of-words vector per movie and write one
# comma-separated row per movie: "<name>v1,v2,...,\n".
m = len(dictionary)      # total number of documents (movies)
vocab = list(main_dict)  # fix the vocabulary order for every row
with open(out, 'w') as bow_vec:  # 'with' guarantees the file is closed
    for movie_name in dictionary:
        # float(m): in Python 2, m/doc_count[word] is floor division, which
        # destroys the idf weight (often log2(1) == 0).
        vector = [dictionary[movie_name][word]
                  * math.log(float(m) / doc_count[word], 2)
                  for word in vocab]
        bow_vec.write("%s" % movie_name)
        for item in vector:
            bow_vec.write("%s," % item)
        bow_vec.write("\n")  # the paste had "n" -- another lost backslash
# Translation table mapping every punctuation character to a space.
# Fixed two defects from the paste: 'string.string.punctuation' was a typo,
# and .lower() was applied to the 256-char table (corrupting the mapping)
# when the intent was to lower-case the text being split.
trans_table = string.maketrans(string.punctuation,
                               ' ' * len(string.punctuation))
words = movie_plot.lower().translate(trans_table).split()
# A set makes stop-word membership tests O(1) instead of scanning the list;
# the redundant list-valued assignment that was immediately overwritten is
# dropped.
stops = set(stopwords.words('english'))
- In [49]: my_list = range(100)
- In [50]: %timeit 10 in my_list
- 1000000 loops, best of 3: 193 ns per loop
- In [51]: %timeit 101 in my_list
- 1000000 loops, best of 3: 1.49 us per loop
- In [52]: my_set = set(my_list)
- In [53]: %timeit 101 in my_set
- 10000000 loops, best of 3: 45.2 ns per loop
- In [54]: %timeit 10 in my_set
- 10000000 loops, best of 3: 47.2 ns per loop
- In [30]: %timeit words = 'This is a long; and meaningless - sentence'.split(split_let)
- 1000000 loops, best of 3: 271 ns per loop
- In [31]: %timeit words = re.findall(r'\w+', 'This is a long; and meaningless - sentence', flags = re.UNICODE | re.LOCALE)
- 100000 loops, best of 3: 3.08 us per loop
# Keep only the words whose corpus frequency is at least 5.
# most_common() yields (word, count) pairs sorted by descending count, so
# takewhile stops at the first entry below the threshold.
# Fixed: the paste's predicate 'itemgetter(1) >= 5' compared an itemgetter
# *object* to 5 (always truthy in Python 2), so nothing was ever filtered;
# the tuple element itself must be compared.  The closing brace was missing.
word_dict = {key: count
             for key, count in takewhile(lambda key_count: key_count[1] >= 5,
                                         main_dict.most_common())}
with open(inp, 'r') as plot_data:
    word_dict = Counter()              # corpus-wide term frequencies
    movie_dict = defaultdict(Counter)  # per-movie term frequencies
    # 4-line records: name on line 0, synopsis on line 2 of each record.
    file1, file2 = itertools.tee(plot_data, 2)
    line_one = itertools.islice(file1, 0, None, 4)
    line_two = itertools.islice(file2, 2, None, 4)  # fixed 'itetools' typo
    # Hoist invariants out of the loop; a set gives O(1) stop-word tests.
    all_stop_words = set(stopwords.words('english'))
    stemmer_func = PorterStemmer().stem_word
    for movie_name, movie_plot in itertools.izip(line_one, line_two):
        movie_plot = movie_plot.lower()
        # The paste's "<see above ...>" placeholder stood for the tokenising
        # line of the original post:
        words = re.findall(r'\w+', movie_plot, flags=re.UNICODE)
        all_words = [stemmer_func(word) for word in words
                     if word not in all_stop_words]
        current_word_counter = Counter(all_words)
        movie_dict[movie_name].update(current_word_counter)
        word_dict.update(current_word_counter)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement