Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
/**
 * Normalizes a term into a case- and accent-insensitive lookup key.
 *
 * The term is decomposed (NFD), combining diacritical marks in
 * U+0300–U+036F are dropped, every character that is not a letter, mark,
 * number or underscore is removed, and the result is lower-cased.
 *
 * @param {string} term - Raw term, possibly with accents or punctuation.
 * @returns {string} The normalized key; empty when the term holds no
 *   letters, numbers or underscores.
 */
function getTermKey(term) {
  return term
    .normalize("NFD")
    .replace(/[\u0300-\u036f]/g, "")
    // Unicode-aware on purpose: ASCII-only `\w` would delete letters that NFD
    // cannot reduce to ASCII (e.g. "ß", "œ", Cyrillic), corrupting the term or
    // collapsing it to "". "_" stays allowed because `\w` allowed it.
    .replace(/[^\p{L}\p{M}\p{N}_]/gu, "")
    .toLowerCase();
}
/**
 * Breaks a document into term keys.
 *
 * The text is split on runs of whitespace, underscores, parentheses and
 * the punctuation characters `: . ! ? , ;`, and each resulting piece is
 * passed through getTermKey. Pieces are kept in document order and are not
 * deduplicated. A document that starts or ends with a separator produces an
 * empty piece at that position, so callers may see empty keys.
 *
 * @param {string} document - Text to tokenize.
 * @returns {string[]} One key per split piece.
 */
function getTermsIn(document) {
  const pieces = document.split(/[\s_():.!?,;]+/);
  const keys = [];
  for (const piece of pieces) {
    keys.push(getTermKey(piece));
  }
  return keys;
}
/**
 * Creates an in-memory corpus for TF-IDF scoring.
 *
 * Documents are stored only as sets of distinct term keys (as produced by
 * getTermsIn / getTermKey); the original text is not retained.
 *
 * @returns {{
 *   add: (document: string) => void,
 *   tf: (term: string, document: string) => number,
 *   idf: (term: string) => number,
 *   tfIdf: (term: string, document: string) => number
 * }} The corpus API.
 */
function corpus() {
  // One Set of distinct term keys per added document.
  const documentTermSets = [];

  // Term keys of a document without the empty keys that getTermsIn yields for
  // leading/trailing separators and punctuation-only pieces. Those are not
  // real terms and would otherwise inflate lengths and match empty queries.
  function termKeysOf(document) {
    return getTermsIn(document).filter((termKey) => termKey !== "");
  }

  /**
   * Adds a document to the corpus.
   * @param {string} document - Text of the document.
   */
  function add(document) {
    documentTermSets.push(new Set(termKeysOf(document)));
  }

  /**
   * Inverse document frequency: log10(N / (1 + n)), where N is the number of
   * documents added and n the number of them containing the term.
   * The value is negative when the term occurs in every document.
   * @param {string} term - Term to look up (normalized with getTermKey).
   * @returns {number} The IDF, or 0 when no document has been added yet.
   */
  function idf(term) {
    const totalDocuments = documentTermSets.length;
    // log10(0) is -Infinity, which would turn tfIdf into NaN (0 * -Infinity).
    if (totalDocuments === 0) {
      return 0;
    }
    const termKey = getTermKey(term);
    const documentsWithTerm = documentTermSets.filter((termSet) =>
      termSet.has(termKey)
    ).length;
    return Math.log10(totalDocuments / (1 + documentsWithTerm));
  }

  /**
   * Term frequency: occurrences of the term divided by the number of terms
   * in the document.
   * @param {string} term - Term to count (normalized with getTermKey).
   * @param {string} document - Text of the document.
   * @returns {number} A ratio in [0, 1]; 0 when the document has no terms.
   */
  function tf(term, document) {
    const termKeys = termKeysOf(document);
    // Avoid 0 / 0 for empty or punctuation-only documents.
    if (termKeys.length === 0) {
      return 0;
    }
    const termKey = getTermKey(term);
    const occurrences = termKeys.filter(
      (candidate) => candidate === termKey
    ).length;
    return occurrences / termKeys.length;
  }

  /**
   * TF-IDF score of a term for a document against this corpus.
   * @param {string} term - Term to score.
   * @param {string} document - Text of the document.
   * @returns {number} tf(term, document) * idf(term).
   */
  function tfIdf(term, document) {
    return tf(term, document) * idf(term);
  }

  return {
    add,
    tf,
    idf,
    tfIdf,
  };
}
Add Comment
Please, Sign In to add comment