Guest User

Untitled

a guest
Oct 19th, 2018
89
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.13 KB | None | 0 0
  /**
   * Build the canonical lookup key for a term.
   *
   * The term is decomposed with NFD so accents become separate combining
   * marks, the combining diacritical marks U+0300-U+036F are dropped, every
   * character that is not a letter, mark, decimal digit or underscore is
   * removed, and the result is lower-cased.
   *
   * Letters are matched with Unicode property escapes instead of the
   * ASCII-only `\w` class, so non-ASCII letters that have no decomposition
   * (for example "ß" or "ø") and non-Latin scripts are kept rather than
   * silently deleted. Results for pure ASCII input are the same as with `\w`.
   *
   * @param {string} term - Raw term; must be a string (`normalize` is called on it).
   * @returns {string} The normalized key. It is the empty string when the
   *   term contains no letter, digit or underscore.
   */
  function getTermKey(term) {
    return term
      .normalize('NFD')
      // Accent folding: strip the combining marks produced by NFD.
      .replace(/[\u0300-\u036f]/g, '')
      // Keep letters, remaining (script-specific) marks, digits and "_".
      .replace(/[^\p{L}\p{M}\p{Nd}_]/gu, '')
      .toLowerCase();
  }
  8.  
  /**
   * Split a document into its normalized term keys, in document order.
   *
   * The text is split on runs of whitespace, underscores and the punctuation
   * characters `( ) : . ! ? , ;`, and every piece is passed through
   * `getTermKey`. Duplicates are kept, so the result can be used to count
   * occurrences.
   *
   * @param {string} document - Text of the document.
   * @returns {string[]} Term keys; never contains the empty string.
   */
  function getTermsIn(document) {
    return document
      .split(/[\s_():.!?,;]+/)
      .map(getTermKey)
      // `split` emits "" when the text starts or ends with a separator, and a
      // punctuation-only piece normalizes to "". Those are not terms and would
      // otherwise inflate the term count of the document.
      .filter((termKey) => termKey !== '');
  }
  13.  
  /**
   * Create an in-memory corpus that scores terms with TF-IDF.
   *
   * Documents are registered with `add`; only the set of distinct term keys
   * of each document is retained, not the text itself.
   *
   * @returns {{
   *   add: function(string): void,
   *   tf: function(string, string): number,
   *   idf: function(string): number,
   *   tfIdf: function(string, string): number
   * }} The corpus API.
   */
  function corpus() {
    // One Set of distinct term keys per added document.
    const documentTermSets = [];

    /**
     * Register a document in the corpus.
     *
     * @param {string} document - Text of the document.
     */
    function add(document) {
      documentTermSets.push(new Set(getTermsIn(document)));
    }

    /**
     * Inverse document frequency: log10(N / (1 + n)), where N is the number
     * of added documents and n the number of them containing the term.
     * Because of the "+ 1", the value is negative for a term that occurs in
     * every document.
     *
     * @param {string} term - Raw term; it is normalized with `getTermKey`.
     * @returns {number} The IDF, or 0 when no document has been added yet.
     */
    function idf(term) {
      // log10(0) is -Infinity, which would turn tfIdf into NaN or -Infinity.
      if (documentTermSets.length === 0) {
        return 0;
      }
      const termKey = getTermKey(term);
      const documentsWithTerm = documentTermSets.filter((termSet) =>
        termSet.has(termKey)
      ).length;
      return Math.log10(documentTermSets.length / (1 + documentsWithTerm));
    }

    /**
     * Term frequency: occurrences of the term in the document divided by the
     * number of terms in the document.
     *
     * @param {string} term - Raw term; it is normalized with `getTermKey`.
     * @param {string} document - Text of the document (it does not need to
     *   have been added to the corpus).
     * @returns {number} The frequency, or 0 when the document yields no terms.
     */
    function tf(term, document) {
      const termKey = getTermKey(term);
      const documentTermKeys = getTermsIn(document);
      // Never evaluate 0 / 0.
      if (documentTermKeys.length === 0) {
        return 0;
      }
      const occurrences = documentTermKeys.filter(
        (documentTermKey) => documentTermKey === termKey
      ).length;
      return occurrences / documentTermKeys.length;
    }

    /**
     * TF-IDF score of a term for a document: `tf(term, document) * idf(term)`.
     *
     * @param {string} term - Raw term.
     * @param {string} document - Text of the document.
     * @returns {number} The TF-IDF score.
     */
    function tfIdf(term, document) {
      return tf(term, document) * idf(term);
    }

    return {
      add,
      tf,
      idf,
      tfIdf,
    };
  }
Add Comment
Please, Sign In to add comment