Advertisement
Guest User

tfidf.py

a guest
May 25th, 2019
63
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.50 KB | None | 0 0
# coding: utf8

import io
import json
import math
import os
import time
from collections import Counter
from multiprocessing import Process
from os import walk
  9.  
  10. # минимальный логгер
  11. def log(text, writeType = 'a'):
  12. with open('logs/tfidf.log', writeType) as _file:
  13. _file.write(text + '\n')
  14.  
  15. path = 'db/materials/'
  16. files = []
  17.  
  18. for (dirpath, dirnames, filenames) in walk(path):
  19. files.extend(filenames)
  20. break
  21.  
  22. log('files collected', 'w')
  23.  
  24. docs_sourse = {}
  25. docs_lemmas = {}
  26. all_words = []
  27. docs_count_for_lemma = {}
  28.  
  29.  
  30. for _file in files:
  31. with io.open('db/materials/' + _file, 'r+', encoding='utf8') as data_file:
  32. data = json.load(data_file)
  33. docs_lemmas[_file] = []
  34. docs_sourse[_file] = data['lemmas']
  35. uniq_lemmas = set(data['lemmas'])
  36. for lemma in uniq_lemmas:
  37. _lemma = lemma.encode('utf-8')
  38. if (_lemma in docs_count_for_lemma):
  39. docs_count_for_lemma[_lemma] = docs_count_for_lemma[_lemma] + 1
  40. else:
  41. docs_count_for_lemma[_lemma] = 1
  42.  
  43. all_words.append(_lemma)
  44. docs_lemmas[_file].append({'lemma': lemma, 'tf': data['lemmas'].count(lemma), 'idf': 0.0, 'tfidf': 0.0})
  45.  
  46. all_words = set(all_words)
  47. log('lemmas inited - ' + str(len(all_words)))
  48.  
  49. docs_count = len(docs_sourse)
  50. calculated_count = 0
  51. time_start = time.time()
  52.  
  53. def calculate(files):
  54. for i in xrange(100000000): pass
  55. for _file in files:
  56. doc = docs_lemmas[_file]
  57. words_count = len(docs_sourse[_file])
  58. for lemma in doc:
  59. docs_count_lemma = docs_count_for_lemma[lemma['lemma'].encode('utf-8')]
  60.  
  61. lemma['idf'] = float(docs_count) / float(docs_count_lemma)
  62. lemma['tfidf'] = lemma['tf'] * math.log(lemma['idf'], 2)
  63. #calculated_count = calculated_count + 1
  64. #log(str(calculated_count) + '/' + str(docs_count) + ' done')
  65.  
  66. #calculate(docs_lemmas.keys()[:500])
  67. #calculate(docs_lemmas.keys()[500:])
  68.  
  69. p1 = Process(target=calculate, args=(docs_lemmas.keys()[:500],))
  70. p2 = Process(target=calculate, args=(docs_lemmas.keys()[500:],))
  71. p1.start()
  72. p2.start()
  73. p1.join()
  74. p2.join()
  75.  
  76. print(time.time() - time_start)
  77.  
  78. log('tfidf calculated')
  79.  
  80. for _file in files:
  81. with io.open('db/materials/' + _file, 'r+', encoding='utf8') as data_file:
  82. data = json.load(data_file)
  83. data['lemmas_tfidf'] = []
  84. if (docs_lemmas[_file]):
  85. for lemma in docs_lemmas[_file]:
  86. data['lemmas_tfidf'].append({'lemma': lemma['lemma'], 'tfidf': lemma['tfidf']})
  87. data_file.seek(0)
  88. data_file.write(json.dumps(data, sort_keys = False, indent = 4, ensure_ascii=False))
  89. data_file.truncate()
  90.  
  91. log('tfidf saved')
  92.  
  93. print('tfidf saved')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement