Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import logging
import multiprocessing
import os
import sys
import zipfile
from subprocess import call

from gensim import utils
from gensim.corpora import Dictionary, MmCorpus
from gensim.corpora.textcorpus import TextCorpus
from gensim.models import TfidfModel
def get_list_of_files(root=None, file_ext=None):
    """Recursively walk *root* and return the paths of all files whose
    name ends with *file_ext*, in ``os.walk`` traversal order."""
    return [
        os.path.join(dir_path, name)
        for dir_path, _dirs, names in os.walk(root)
        for name in names
        if name.endswith(file_ext)
    ]
def get_zip_file_size(file_path=None):
    """Return the uncompressed size (in bytes) of the ``<basename>.txt``
    member inside the zip archive at *file_path*.

    Returns ``None`` when the archive cannot be read or contains no
    member named after the archive itself (the Project Gutenberg layout:
    ``12345.zip`` wraps ``12345.txt``).
    """
    file_name, _file_extension = os.path.splitext(os.path.basename(file_path))
    try:
        with zipfile.ZipFile(file_path, 'r') as zf:
            for info in zf.infolist():
                if info.filename == file_name + ".txt":
                    return info.file_size
    # Narrow handling: a bare ``except:`` here previously swallowed
    # everything (and referenced an unimported ``sys``).
    except (zipfile.BadZipFile, OSError) as e:
        print("ERROR:", e, file_path)
    return None
def get_filtered_zip_files(root=None,
                           file_ext=None,
                           number_of_files=None,
                           min_text_size=None,
                           max_text_size=None):
    """Collect up to *number_of_files* archive paths under *root* whose
    embedded text size lies strictly between *min_text_size* and
    *max_text_size* bytes.

    Scanning stops as soon as the quota is reached; the resulting count
    is printed for progress visibility.
    """
    filtered_list = []
    for file_path in get_list_of_files(root, file_ext):
        if len(filtered_list) >= number_of_files:
            break
        file_size = get_zip_file_size(file_path)
        # file_size is None for unreadable archives or ones missing the
        # expected .txt member; comparing None with int raises TypeError
        # on Python 3, so it must be filtered out explicitly.
        if file_size is not None and min_text_size < file_size < max_text_size:
            filtered_list.append(file_path)
    print(">>> number of files:", len(filtered_list))
    return filtered_list
def read_zip_file(file_path=None):
    """Read and return the raw bytes of the ``<basename>.txt`` member of
    the zip archive at *file_path*, or ``None`` on failure.

    Uses :mod:`zipfile` directly instead of the previous workaround of
    shelling out to the external ``unzip`` binary, extracting into the
    working directory and deleting the file afterwards — no external
    tool required, no temp files, and a missing member no longer
    crashes the caller (``process_text`` already handles ``None``).
    """
    file_name, _ext = os.path.splitext(os.path.basename(file_path))
    try:
        with zipfile.ZipFile(file_path, 'r') as zf:
            return zf.read(file_name + ".txt")
    # KeyError: member not present; BadZipFile/OSError: unreadable archive.
    except (zipfile.BadZipFile, KeyError, OSError) as e:
        print(">>> ERROR:", e)
        return None
def process_text(filename):
    """Read one zipped text and lemmatize it.

    Returns a two-element list ``[filename, tokens]`` where *tokens* is
    the lemmatized token list, or ``[]`` when the archive could not be
    read.
    """
    raw = read_zip_file(filename)
    if raw is None:
        return [filename, []]
    unicode_text = utils.to_unicode(raw, 'utf8', errors='ignore')
    return [filename, utils.lemmatize(unicode_text)]
class GutenbergCorpus(TextCorpus):
    """Text corpus over zipped Project Gutenberg books.

    Each iteration re-scans the input directory for suitably sized
    archives and lemmatizes them in a worker pool. File names are
    recorded from the second pass onward (gensim's ``TextCorpus``
    consumes the first pass itself to build the dictionary).
    """

    def __init__(self, input=None):
        # One worker per CPU core, but always at least one.
        self.processes = max(1, multiprocessing.cpu_count())
        self.iteration = 0
        self.filenames = []
        super(GutenbergCorpus, self).__init__(input)

    def get_texts(self):
        """Yield one lemmatized token list per selected archive."""
        self.iteration += 1
        file_names = get_filtered_zip_files(self.input, '.zip', 20000, 100000, 700000)
        pool = multiprocessing.Pool(self.processes)
        try:
            for index, item in enumerate(pool.imap(process_text, file_names)):
                print(">> processing", index + 1, "/", len(file_names))
                # Skip the first pass: it is the dictionary-building run.
                if self.iteration >= 2:
                    self.filenames.append(item[0])
                yield item[1]
        finally:
            # The pool was previously leaked on every call; make sure
            # worker processes are reaped even if iteration stops early.
            pool.terminate()
            pool.join()
# Upper bound on vocabulary size when filtering dictionary extremes.
DEFAULT_DICT_SIZE = 100000

if __name__ == '__main__':
    import gensim

    # Configure logging BEFORE the heavy corpus pass so gensim's
    # progress messages are actually emitted (it was previously set up
    # only after serialization had finished).
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    root = '../www.gutenberg.lib.md.us'
    prefix = 'gutenberg'

    gutenberg = GutenbergCorpus(root)
    """
    gutenberg.dictionary.filter_extremes(no_below=10,
                                         no_above=0.2,
                                         keep_n=DEFAULT_DICT_SIZE)
    """
    # Bag-of-words pass (this iterates the corpus a second time, so
    # gutenberg.filenames gets populated).
    MmCorpus.serialize(prefix + '_bow.mm', gutenberg, progress_cnt=10000)
    gutenberg.dictionary.save_as_text(prefix + '_wordids.txt.bz2')

    # Record which files made it into the corpus, one path per line.
    with open('gutenberg_filename.txt', 'w') as f:
        for filename in gutenberg.filenames:
            f.write(filename + '\n')

    dictionary = Dictionary.load_from_text(prefix + '_wordids.txt.bz2')
    del gutenberg  # release the corpus before the TF-IDF pass

    mm = MmCorpus(prefix + '_bow.mm')
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    MmCorpus.serialize(prefix + '_tfidf.mm', tfidf[mm], progress_cnt=10000)

    # Train and persist an LDA topic model on the TF-IDF corpus.
    id2word = gensim.corpora.Dictionary.load_from_text('gutenberg_wordids.txt.bz2')
    mm = gensim.corpora.MmCorpus('gutenberg_tfidf.mm')
    lda = gensim.models.ldamodel.LdaModel(corpus=mm, id2word=id2word, num_topics=20, chunksize=100)
    lda.save('gutenberg_idf.model')
Add Comment
Please, Sign In to add comment