Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import getopt
import itertools
import math
import re
import string
import sys
from collections import Counter, defaultdict
from itertools import dropwhile

from nltk import PorterStemmer
from nltk.corpus import stopwords
inp = "inp_6000.txt"  # input file name
out = "bowfilter10"   # output file name

# Hoist loop invariants: a set makes stop-word membership O(1) instead of a
# linear scan of the list for every token, and the stemmer is built once.
stop_words = set(stopwords.words('english'))
stem = PorterStemmer().stem_word

with open(inp, 'r') as plot_data:
    main_dict = Counter()             # corpus-wide term frequencies
    dictionary = defaultdict(Counter) # per-movie term frequencies
    doc_count = Counter()             # number of movies each term appears in
    # The input is laid out in 4-line records: the movie name is on line 0
    # and its plot synopsis on line 2 of each record.
    file1, file2 = itertools.tee(plot_data, 2)
    line_one = itertools.islice(file1, 0, None, 4)
    line_two = itertools.islice(file2, 2, None, 4)
    for movie_name, movie_plot in itertools.izip(line_one, line_two):
        # strip the trailing newline so it is not embedded in the output row
        movie_name = movie_name.strip()
        movie_plot = movie_plot.lower()
        # NOTE: the paste had r'w+' (lost backslash) and combined re.LOCALE
        # with re.UNICODE, which is contradictory; \w + re.UNICODE is the
        # intended tokenisation.
        words = re.findall(r'\w+', movie_plot, flags=re.UNICODE)
        seen_in_this_movie = set()
        for word in words:
            if word in stop_words:
                continue
            word = stem(word)  # Porter-stem each surviving token
            dictionary[movie_name][word] += 1
            main_dict[word] += 1
            # Count each word at most once per synopsis so doc_count is a
            # true document frequency.  (The original's this_mov marker was
            # decremented for *every* word after each movie, going negative
            # and silently dropping later occurrences.)
            if word not in seen_in_this_movie:
                seen_in_this_movie.add(word)
                doc_count[word] += 1

# Remove all words with frequency < 5 across the whole corpus.
# most_common() is sorted descending, so dropwhile skips the frequent head
# and everything that follows is below the threshold.
for key, count in dropwhile(lambda key_count: key_count[1] >= 5,
                            main_dict.most_common()):
    del main_dict[key]

# Compute the tf-idf weighted bag-of-words vector per movie and write one
# comma-separated row per movie: "<name>v1,v2,...,\n".
m = len(dictionary)      # total number of documents (movies)
vocab = list(main_dict)  # fix the vocabulary order for every row
with open(out, 'w') as bow_vec:  # 'with' guarantees the file is closed
    for movie_name in dictionary:
        # float(m): in Python 2, m/doc_count[word] is floor division, which
        # destroys the idf weight (often log2(1) == 0).
        vector = [dictionary[movie_name][word]
                  * math.log(float(m) / doc_count[word], 2)
                  for word in vocab]
        bow_vec.write("%s" % movie_name)
        for item in vector:
            bow_vec.write("%s," % item)
        bow_vec.write("\n")  # the paste had "n" -- another lost backslash
# Translation table mapping every punctuation character to a space.
# Fixed two defects from the paste: 'string.string.punctuation' was a typo,
# and .lower() was applied to the 256-char table (corrupting the mapping)
# when the intent was to lower-case the text being split.
trans_table = string.maketrans(string.punctuation,
                               ' ' * len(string.punctuation))
words = movie_plot.lower().translate(trans_table).split()
# A set makes stop-word membership tests O(1) instead of scanning the list;
# the redundant list-valued assignment that was immediately overwritten is
# dropped.
stops = set(stopwords.words('english'))
- In [49]: my_list = range(100)
- In [50]: %timeit 10 in my_list
- 1000000 loops, best of 3: 193 ns per loop
- In [51]: %timeit 101 in my_list
- 1000000 loops, best of 3: 1.49 us per loop
- In [52]: my_set = set(my_list)
- In [53]: %timeit 101 in my_set
- 10000000 loops, best of 3: 45.2 ns per loop
- In [54]: %timeit 10 in my_set
- 10000000 loops, best of 3: 47.2 ns per loop
- In [30]: %timeit words = 'This is a long; and meaningless - sentence'.split(split_let)
- 1000000 loops, best of 3: 271 ns per loop
- In [31]: %timeit words = re.findall(r'\w+', 'This is a long; and meaningless - sentence', flags = re.UNICODE | re.LOCALE)
- 100000 loops, best of 3: 3.08 us per loop
# Keep only the words whose corpus frequency is at least 5.
# most_common() yields (word, count) pairs sorted by descending count, so
# takewhile stops at the first entry below the threshold.
# Fixed: the paste's predicate 'itemgetter(1) >= 5' compared an itemgetter
# *object* to 5 (always truthy in Python 2), so nothing was ever filtered;
# the tuple element itself must be compared.  The closing brace was missing.
word_dict = {key: count
             for key, count in takewhile(lambda key_count: key_count[1] >= 5,
                                         main_dict.most_common())}
with open(inp, 'r') as plot_data:
    word_dict = Counter()              # corpus-wide term frequencies
    movie_dict = defaultdict(Counter)  # per-movie term frequencies
    # 4-line records: name on line 0, synopsis on line 2 of each record.
    file1, file2 = itertools.tee(plot_data, 2)
    line_one = itertools.islice(file1, 0, None, 4)
    line_two = itertools.islice(file2, 2, None, 4)  # fixed 'itetools' typo
    # Hoist invariants out of the loop; a set gives O(1) stop-word tests.
    all_stop_words = set(stopwords.words('english'))
    stemmer_func = PorterStemmer().stem_word
    for movie_name, movie_plot in itertools.izip(line_one, line_two):
        movie_plot = movie_plot.lower()
        # The paste's "<see above ...>" placeholder stood for the tokenising
        # line of the original post:
        words = re.findall(r'\w+', movie_plot, flags=re.UNICODE)
        all_words = [stemmer_func(word) for word in words
                     if word not in all_stop_words]
        current_word_counter = Counter(all_words)
        movie_dict[movie_name].update(current_word_counter)
        word_dict.update(current_word_counter)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement