import re
import math
import itertools
from nltk.corpus import stopwords
from nltk import PorterStemmer
from collections import defaultdict
from collections import Counter
from itertools import dropwhile

import sys, getopt

inp = "inp_6000.txt"   # input file name
out = "bowfilter10"    # output file name
with open(inp, 'r') as plot_data:
    main_dict = Counter()
    file1, file2 = itertools.tee(plot_data, 2)
    line_one = itertools.islice(file1, 0, None, 4)
    line_two = itertools.islice(file2, 2, None, 4)
    dictionary = defaultdict(Counter)
    doc_count = defaultdict(Counter)
    for movie_name, movie_plot in itertools.izip(line_one, line_two):
        movie_plot = movie_plot.lower()
        words = re.findall(r'\w+', movie_plot, flags=re.UNICODE | re.LOCALE)  # split into words
        elemStopW = filter(lambda x: x not in stopwords.words('english'), words)  # remove stop words (NLTK)
        for word in elemStopW:
            word = PorterStemmer().stem_word(word)  # stem the word with NLTK's Porter stemmer
            # increment the word count for this movie's synopsis
            dictionary[movie_name][word] += 1
            # increment the count of the word in the main dictionary, which stores frequencies over all documents
            main_dict[word] += 1
            # For TF-IDF: note only the first occurrence of the word in this synopsis and ignore the rest,
            # so 'count' becomes the number of documents containing the word.
            if doc_count[word]['this_mov'] == 0:
                doc_count[word].update(count=1, this_mov=1)
        # reset the per-movie flag (only for words flagged in this synopsis)
        for word in doc_count:
            if doc_count[word]['this_mov']:
                doc_count[word].update(this_mov=-1)

#print "---------main_dict---------"
#print main_dict
# Remove all words with frequency less than 5 across the whole set of movies
for key, count in dropwhile(lambda key_count: key_count[1] >= 5, main_dict.most_common()):
    del main_dict[key]
#print main_dict

# Write to file
bow_vec = open(out, 'w')
# calculate the bag-of-words vector and write it
m = len(dictionary)
for movie_name in dictionary.keys():
    #print movie_name
    vector = []
    for word in list(main_dict):
        #print word, dictionary[movie_name][word]
        # term frequency times log2 of (number of movies / number of movies containing the word);
        # float() keeps Python 2 from truncating the ratio with integer division
        x = dictionary[movie_name][word] * math.log(float(m) / doc_count[word]['count'], 2)
        vector.append(x)
    # write to file
    bow_vec.write("%s" % movie_name)
    for item in vector:
        bow_vec.write("%s," % item)
    bow_vec.write("\n")
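The tee/islice pairing above reads the plot file in strides of four lines, which suggests each record spans four lines with the movie title on the first line and the synopsis on the third. A minimal sketch of that assumed layout (the sample titles and plots are made up, not taken from inp_6000.txt):

# Sketch of the input layout implied by islice(file, 0, None, 4) and
# islice(file, 2, None, 4): title on line 0, synopsis on line 2 of each
# four-line record. Sample content is purely illustrative.
import itertools

sample = ["The Matrix\n", "---\n", "a hacker learns reality is a simulation\n", "\n",
          "Alien\n", "---\n", "a mining crew is hunted by a stowaway creature\n", "\n"]

file1, file2 = itertools.tee(sample, 2)
titles = itertools.islice(file1, 0, None, 4)    # lines 0, 4, 8, ...
plots = itertools.islice(file2, 2, None, 4)     # lines 2, 6, 10, ...
for title, plot in itertools.izip(titles, plots):
    print title.strip(), '->', plot.strip()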
import string

trans_table = string.maketrans(string.punctuation,
                               ' ' * len(string.punctuation)).lower()
# lower-casing the table makes translate() also lowercase the text
words = movie_plot.translate(trans_table).split()
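For comparison, a small sketch of both tokenizers on the same made-up plot line (the sample sentence is illustrative):

# The re.findall tokenizer from the main script next to the translate/split
# version above (Python 2); both lowercase and drop punctuation.
import re
import string

movie_plot = "A hacker, Neo, learns the truth; reality is a simulation."

words_re = re.findall(r'\w+', movie_plot.lower(), flags=re.UNICODE)

trans_table = string.maketrans(string.punctuation,
                               ' ' * len(string.punctuation)).lower()
words_tr = movie_plot.translate(trans_table).split()

# both yield ['a', 'hacker', 'neo', 'learns', 'the', 'truth',
#             'reality', 'is', 'a', 'simulation']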
stops = stopwords.words('english')   # list: each membership test scans the whole list

stops = set(stopwords.words('english'))   # set: membership tests are O(1)

In [49]: my_list = range(100)

In [50]: %timeit 10 in my_list
1000000 loops, best of 3: 193 ns per loop

In [51]: %timeit 101 in my_list
1000000 loops, best of 3: 1.49 us per loop

In [52]: my_set = set(my_list)

In [53]: %timeit 101 in my_set
10000000 loops, best of 3: 45.2 ns per loop

In [54]: %timeit 10 in my_set
10000000 loops, best of 3: 47.2 ns per loop
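The upshot of those timings for the script above: membership tests against a set are constant time, so it pays to build the stop-word set once outside the loop instead of calling stopwords.words('english') (a list) for every word. A small sketch of that change (the helper name is illustrative):

# Sketch: filter stop words against a set built once, rather than scanning the
# stop-word list for every word.
from nltk.corpus import stopwords

stops = set(stopwords.words('english'))

def remove_stop_words(words):
    # keep only words that are not stop words; set lookup is O(1)
    return [word for word in words if word not in stops]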
In [30]: %timeit words = 'This is a long; and meaningless - sentence'.split(split_let)
1000000 loops, best of 3: 271 ns per loop

In [31]: %timeit words = re.findall(r'\w+', 'This is a long; and meaningless - sentence', flags = re.UNICODE | re.LOCALE)
100000 loops, best of 3: 3.08 us per loop

from itertools import takewhile

word_dict = {key: count for key, count in
             takewhile(lambda key_count: key_count[1] >= 5,
                       main_dict.most_common())}
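Both the dropwhile deletion in the original script and this takewhile comprehension rely on Counter.most_common() returning (word, count) pairs in descending count order, so everything from the first count below 5 onward can be cut off. A tiny illustration (the sample counts are made up):

# Keep only words seen at least 5 times, using takewhile over most_common().
from collections import Counter
from itertools import takewhile

main_dict = Counter({'alien': 12, 'ship': 7, 'crew': 5, 'xenomorph': 2, 'nostromo': 1})
word_dict = {key: count for key, count in
             takewhile(lambda key_count: key_count[1] >= 5,
                       main_dict.most_common())}
# word_dict -> {'alien': 12, 'ship': 7, 'crew': 5}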
with open(inp, 'r') as plot_data:
    word_dict = Counter()
    file1, file2 = itertools.tee(plot_data, 2)
    line_one = itertools.islice(file1, 0, None, 4)
    line_two = itertools.islice(file2, 2, None, 4)
    all_stop_words = stopwords.words('english')
    movie_dict = defaultdict(Counter)
    stemmer_func = PorterStemmer().stem_word
    for movie_name, movie_plot in itertools.izip(line_one, line_two):
        movie_plot = movie_plot.lower()
        words = <see above - I am updating original post>
        all_words = [stemmer_func(word) for word in words
                     if word not in all_stop_words]
        current_word_counter = Counter(all_words)
        movie_dict[movie_name].update(current_word_counter)
        word_dict.update(current_word_counter)
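To tie the refactored counters back to the original output step, the document frequency of each word can be recovered from movie_dict itself, so the separate doc_count bookkeeping is not needed. A hedged sketch of that final step, assuming word_dict has already been pruned to words seen at least 5 times (the doc_freq name is illustrative, not from the original post):

# Sketch: document frequency from movie_dict, then the same log-weighted
# vector per movie as in the original write loop.
doc_freq = Counter()
for counter in movie_dict.itervalues():
    doc_freq.update(counter.keys())   # +1 per movie containing the word

m = len(movie_dict)
vocabulary = list(word_dict)
with open(out, 'w') as bow_vec:
    for movie_name, counter in movie_dict.iteritems():
        vector = [counter[word] * math.log(float(m) / doc_freq[word], 2)
                  for word in vocabulary]
        bow_vec.write("%s" % movie_name)
        for item in vector:
            bow_vec.write("%s," % item)
        bow_vec.write("\n")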