Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # coding: utf8
import io
import json
import math
import time
from collections import Counter
from multiprocessing import Process
from os import walk
# Minimal logger.
def log(text, writeType='a'):
    """Write ``text`` followed by a newline to ``logs/tfidf.log``.

    ``writeType`` is passed to ``open`` as the file mode: the default ``'a'``
    appends to the log, ``'w'`` starts it over.
    """
    log_handle = open('logs/tfidf.log', writeType)
    with log_handle:
        log_handle.write(text + '\n')
# Directory with the JSON documents to process.
path = 'db/materials/'

# walk() yields the top directory first; only the file names directly inside
# `path` are wanted, so nothing past that first triple is consumed.  A missing
# directory yields nothing and leaves the list empty.
top_level = next(walk(path), None)
files = list(top_level[2]) if top_level is not None else []

log('files collected', 'w')
# Per-document state, keyed by file name:
#   docs_sourse - the document's lemma list exactly as stored (with repeats)
#   docs_lemmas - one {'lemma', 'tf', 'idf', 'tfidf'} record per distinct lemma
docs_sourse = {}
docs_lemmas = {}
# Number of documents each lemma occurs in, keyed by the UTF-8 encoded lemma.
docs_count_for_lemma = {}

for _file in files:
    # Plain read mode: nothing is written back to the file at this stage.
    with io.open('db/materials/' + _file, 'r', encoding='utf8') as data_file:
        data = json.load(data_file)
    lemmas = data['lemmas']
    docs_sourse[_file] = lemmas
    # One counting pass per document instead of a full list.count() scan for
    # every distinct lemma.
    term_counts = Counter(lemmas)
    records = []
    # Iterate the set (as before) so the order of the records is unchanged.
    for lemma in set(lemmas):
        _lemma = lemma.encode('utf-8')
        docs_count_for_lemma[_lemma] = docs_count_for_lemma.get(_lemma, 0) + 1
        records.append({'lemma': lemma, 'tf': term_counts[lemma], 'idf': 0.0, 'tfidf': 0.0})
    docs_lemmas[_file] = records

# Every lemma seen in any document is exactly the key set of the counter.
all_words = set(docs_count_for_lemma)
log('lemmas inited - ' + str(len(all_words)))

docs_count = len(docs_sourse)
calculated_count = 0
# Start of the timed section; the elapsed time is printed after the scoring.
time_start = time.time()
def calculate(files):
    """Fill in ``idf`` and ``tfidf`` for every lemma record of the given documents.

    For each file name in ``files`` the records in ``docs_lemmas[name]`` are
    updated in place:

    * ``idf``   = ``docs_count / docs_count_for_lemma[lemma]``
    * ``tfidf`` = ``tf * log2(idf)``

    Reads the module-level ``docs_lemmas``, ``docs_count`` and
    ``docs_count_for_lemma``.  Returns nothing.  Raises ``KeyError`` for a
    file name or lemma that is missing from those tables.
    """
    # Invariant across all documents, so convert once.
    total_docs = float(docs_count)
    for _file in files:
        for lemma in docs_lemmas[_file]:
            # The document-frequency table is keyed by UTF-8 encoded lemmas.
            docs_count_lemma = docs_count_for_lemma[lemma['lemma'].encode('utf-8')]
            lemma['idf'] = total_docs / float(docs_count_lemma)
            lemma['tfidf'] = lemma['tf'] * math.log(lemma['idf'], 2)
# The scores are computed in this process on purpose.  calculate() stores its
# results in docs_lemmas in place; when it was run inside multiprocessing
# workers, the updates landed in each worker's own copy of that dict and this
# process went on to save the untouched 0.0 placeholders.
calculate(list(docs_lemmas))
print(time.time() - time_start)
log('tfidf calculated')
# Write the scores back: each document gains a 'lemmas_tfidf' list and its
# JSON file is rewritten in place.
for _file in files:
    with io.open('db/materials/' + _file, 'r+', encoding='utf8') as data_file:
        data = json.load(data_file)
        # A document without records simply gets an empty 'lemmas_tfidf' list.
        data['lemmas_tfidf'] = [
            {'lemma': record['lemma'], 'tfidf': record['tfidf']}
            for record in docs_lemmas[_file]
        ]
        serialized = json.dumps(data, sort_keys=False, indent=4, ensure_ascii=False)
        # Rewind, overwrite, then cut off whatever remains of the old content.
        data_file.seek(0)
        data_file.write(serialized)
        data_file.truncate()

log('tfidf saved')
print('tfidf saved')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement