Guest User

Untitled

a guest
Oct 20th, 2017
78
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 4.51 KB | None | 0 0
  1. import logging
  2. import os
  3. import zipfile
  4. import multiprocessing
  5. from subprocess import call
  6.  
  7. from gensim.corpora.textcorpus import TextCorpus
  8. from gensim.corpora import Dictionary, MmCorpus
  9. from gensim.models import TfidfModel
  10. from gensim import utils
  11.  
  12.  
  13. def get_list_of_files(root=None, file_ext=None):
  14. """
  15. a. traverse directories
  16. b. make a list including file paths which have given file extension.
  17. c. return the list
  18. """
  19. filename_list = []
  20. for root, dirs, files in os.walk(root):
  21. for f in files:
  22. if f.endswith(file_ext):
  23. filename_list.append(os.path.join(root, f))
  24. return filename_list
  25.  
  26.  
  27. def get_zip_file_size(file_path=None):
  28. file_name = os.path.basename(file_path)
  29. file_name, file_extention = os.path.splitext(file_name)
  30. return_value = None
  31. try:
  32. with zipfile.ZipFile(file_path, 'r') as zf:
  33. for i in zf.infolist():
  34. if i.filename == file_name + ".txt":
  35. return_value = i.file_size
  36. except:
  37. e = sys.exc_info()[0]
  38. print "ERROR:", e, file_path
  39. return return_value
  40.  
  41.  
  42. def get_filtered_zip_files(root=None,
  43. file_ext=None,
  44. number_of_files=None,
  45. min_text_size=None,
  46. max_text_size=None):
  47. files_list = get_list_of_files(root, file_ext)
  48. filtered_list = []
  49. for file_path in files_list:
  50. if len(filtered_list) >= number_of_files:
  51. break
  52.  
  53. file_size = get_zip_file_size(file_path)
  54. if file_size > min_text_size and file_size < max_text_size:
  55. filtered_list.append(file_path)
  56. print ">>> number of files:", len(filtered_list)
  57. return filtered_list
  58.  
  59.  
  60. def read_zip_file(file_path=None):
  61. """
  62. a. read a zip file including text file
  63. b. return the text
  64. """
  65. try:
  66. # FIXME: work around
  67. call(['unzip', '-o', file_path])
  68. except:
  69. e = sys.exc_info()[0]
  70. print ">>> ERROR:", e
  71. file_name = os.path.basename(file_path)
  72. file_name, ext = os.path.splitext(file_name)
  73. unzipped_text_file_name = file_name + ".txt"
  74. with open(unzipped_text_file_name, 'rb') as fp:
  75. text = fp.read()
  76. os.remove(unzipped_text_file_name)
  77. return text
  78.  
  79.  
  80. def process_text(filename):
  81. text = read_zip_file(filename)
  82. if text is not None:
  83. text = utils.to_unicode(text, 'utf8', errors='ignore')
  84. text = utils.lemmatize(text)
  85. else:
  86. text = []
  87. return [filename, text]
  88.  
  89.  
class GutenbergCorpus(TextCorpus):
    """Streaming gensim corpus over Project Gutenberg zip archives.

    Each archive is expected to contain one ".txt" member; texts are
    lemmatized in parallel via a multiprocessing pool.
    """

    def __init__(self, input=None):
        # Worker count for the pool created in get_texts().
        self.processes = max(1, multiprocessing.cpu_count())
        # Counts how many times get_texts() has been iterated.
        # NOTE(review): TextCorpus.__init__ presumably consumes get_texts()
        # once to build self.dictionary — confirm against the installed
        # gensim version; the >= 2 check below depends on it.
        self.iteration = 0
        # Paths of processed archives, recorded on the second and later
        # passes only (see get_texts).
        self.filenames = []
        super(GutenbergCorpus, self).__init__(input)

    def get_texts(self):
        """Yield one lemmatized token list per selected archive.

        Uses pool.imap so archives are processed in parallel but yielded
        in the original file order.
        """
        self.iteration += 1
        pool = multiprocessing.Pool(self.processes)
        # Hard-coded selection: at most 20000 ".zip" archives whose embedded
        # text size is between 100000 and 700000 bytes.
        file_names = get_filtered_zip_files(self.input, '.zip', 20000, 100000, 700000)
        for index, item in enumerate(pool.imap(process_text, file_names)):
            print ">> processing", index + 1, "/", len(file_names)
            # Skip the first pass (dictionary building) so filenames are
            # not collected twice.
            if self.iteration >= 2 :
                self.filenames.append(item[0])
            # item is [filename, tokens]; only the tokens are yielded.
            yield item[1]
  107.  
  108.  
# Vocabulary cap for the (currently disabled) dictionary.filter_extremes
# call in the __main__ block below.
DEFAULT_DICT_SIZE = 100000
  111.  
  112. if __name__ == '__main__':
  113. import sys
  114. import logging
  115. import gensim
  116. import bz2
  117.  
  118. root = '../www.gutenberg.lib.md.us'
  119. prefix = 'gutenberg'
  120. gutenberg = GutenbergCorpus(root)
  121. """
  122. gutenberg.dictionary.filter_extremes(no_below=10,
  123. no_above=0.2,
  124. keep_n=DEFAULT_DICT_SIZE)
  125. """
  126. MmCorpus.serialize(prefix + '_bow.mm', gutenberg, progress_cnt=10000)
  127. gutenberg.dictionary.save_as_text(prefix + '_wordids.txt.bz2')
  128. with open('gutenberg_filename.txt', 'wb') as f:
  129. for filename in gutenberg.filenames:
  130. print >> f, filename
  131. dictionary = Dictionary.load_from_text(prefix + '_wordids.txt.bz2')
  132. del gutenberg
  133. mm = MmCorpus(prefix + '_bow.mm')
  134. tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
  135. MmCorpus.serialize(prefix + '_tfidf.mm', tfidf[mm], progress_cnt=10000)
  136.  
  137. logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
  138. id2word = gensim.corpora.Dictionary.load_from_text('gutenberg_wordids.txt.bz2')
  139. mm = gensim.corpora.MmCorpus('gutenberg_tfidf.mm')
  140. lda = gensim.models.ldamodel.LdaModel(corpus=mm, id2word=id2word, num_topics=20, chunksize=100)
  141. lda.save('gutenberg_idf.model')
Add Comment
Please, Sign In to add comment