Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- // Salita.js
- // Extracts words from dialogues.
- // Prerequisites: Node.js
- // How to run: `node salita.js <file>`
- // (c) 2018 Ned Palacios
- const fs = require('fs');
// Words that are excluded from the extracted vocabulary. Entries are
// compared against lower-cased, symbol-stripped words.
// NOTE: sanitize() checks one whitespace-separated word at a time, so the
// two-word entry 'ang mga' never matches on its own; 'ang' and 'mga' are
// listed separately and cover it.
const blacklist = [
  'ang', 'ang mga', 'si', 'sina', 'so', 'ah', 'ahh', 'at', 'kung', 'po',
  'sa', 'ka', 'ko', 'ay', 'ba', 'na', 'ng', 'hay', 'eh', 'oo',
  'o', 'op', 'nyo', 'mo', 'umm', 'may', 'oh', 'ok', 'kahit', 'duterte',
  'bali', 'nalang', 'tsaka', 'mga', 'ninyo', 'niyo', 'ako', 'din', 'pero',
];

// The dialogue file is the first CLI argument: `node salita.js <file>`.
// Fail with a usage message instead of an opaque TypeError from fs.
const inputPath = process.argv[2];
if (!inputPath) {
  console.error('Usage: node salita.js <file>');
  process.exit(1);
}

// Get contents of the file.
const textContents = fs.readFileSync(inputPath, 'utf8');

// Each dialogue line looks like "<speaker>: <sentence>". Keep only the
// sentence part:
//  - lines without the separator (blank lines, a trailing newline, ...) are
//    skipped instead of producing `undefined` entries;
//  - everything after the FIRST separator is kept, so a sentence that itself
//    contains ": " is no longer truncated;
//  - both "\n" and "\r\n" line endings are accepted.
const SPEAKER_SEPARATOR = ': ';
const sentences = textContents
  .split(/\r?\n/)
  .filter((line) => line.includes(SPEAKER_SEPARATOR))
  .map((line) => line.slice(line.indexOf(SPEAKER_SEPARATOR) + SPEAKER_SEPARATOR.length));
/**
 * Splits a sentence into words, strips every non-letter character from each
 * word and drops blacklisted words.
 *
 * The original casing of each kept word is preserved; only the blacklist
 * comparison is done in lower case.
 *
 * @param {string} sentence - One line of dialogue text.
 * @returns {string[]} The remaining words, in their original order. An empty
 *   array is returned when `sentence` is not a string (e.g. `undefined` for a
 *   line that had no dialogue part).
 */
function sanitize(sentence) {
  // Lines without a dialogue part reach this function as `undefined`.
  if (typeof sentence !== 'string') {
    return [];
  }

  return sentence
    // Split on any whitespace run so tabs / repeated spaces cannot glue two
    // words together once the symbols are stripped.
    .split(/\s+/)
    // Keep letters only. "ñ" is part of the Filipino alphabet, so it must
    // survive the stripping (otherwise "piña" would become "pia").
    .map((word) => word.replace(/[^a-zA-ZñÑ]/g, ''))
    // Drop words that became empty and words on the blacklist.
    .filter((word) => word !== '' && !blacklist.includes(word.toLowerCase()));
}
/**
 * Builds the list of unique (lower-cased) words found in `sentences`, writes
 * it to `./output.txt` (one word per line, separated by ", \n") and prints a
 * short summary to the console.
 *
 * Reads the module-level `sentences` array and uses `sanitize()` to clean
 * each sentence.
 *
 * @returns {void}
 */
function analyze() {
  // Ignore entries that are not text (lines without a dialogue part show up
  // as `undefined`), so neither the sanitizing nor the counting can crash.
  const spoken = sentences.filter((s) => typeof s === 'string');

  // Sanitized words, one array per sentence.
  const wordsArray = spoken.map((s) => sanitize(s));

  // A Set keeps first-seen order and gives constant-time membership checks,
  // replacing the `includes`-inside-a-loop scan of the growing result array.
  const unique = new Set();
  wordsArray.forEach((list) => {
    list.forEach((word) => unique.add(word.toLowerCase()));
  });
  const words = [...unique];

  // Output final result to output.txt
  fs.writeFileSync('./output.txt', words.join(', \n'), { encoding: 'utf-8' });

  // Explicit initial values keep `reduce` from throwing on empty input.
  const originalCount = spoken.reduce((total, s) => total + s.split(' ').length, 0);
  const sanitizedCount = wordsArray.reduce((total, list) => total + list.length, 0);

  // Indicates success.
  console.log('Done.');
  console.log(`Original: ${originalCount} Words`);
  console.log(`Sanitized: ${sanitizedCount} Words`);
  console.log(`Reduced to ${words.length} words; ${Math.floor(words.length/5)} per member`);
}
- // Execute: run the analysis immediately when the script is loaded.
- analyze();
Add Comment
Please, Sign In to add comment