Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# extractSamples: for every word listed in the file named by the global
# `zhwd` (fields separated by "|"), grep the subtitle files under ./subs8/
# and write one INSERT statement per matching subtitle line to
# ./05-samples.csv.
#
# Globals read:    zhwd (path of the word list; defined outside this function)
# Globals written: subs, subs8, samp, uniq, out, FS (all kept global, as in
#                  the original, in case other code relies on them)
# Calls:           mysqlEscape() (defined elsewhere in the file)
#
# The names after the gap in the parameter list are locals (the usual AWK
# idiom); callers still invoke extractSamples() with no arguments.
function extractSamples(    i, nf, record, fields, word, quoted, cmd, line, parts, sample)
{
    print "Extracting samples from subtitles..."

    subs  = "./subs/";
    subs8 = "./subs8/";
    samp  = "./05-samples.csv";
    uniq  = "./05.1-uniq-samples.csv";
    out   = "";

    # Create/truncate the output file up front so it exists (and is empty)
    # even when nothing matches.
    printf "" > samp;

    # Preserved side effect of the original: the global field separator is
    # switched to "|". The parsing below does not depend on it.
    FS = "|"

    # Read each record into a variable instead of $0, so the nested
    # `cmd | getline` cannot clobber the fields being iterated. The explicit
    # `> 0` stops the loop on EOF (0) and on read errors (-1); a bare
    # `while (getline < file)` spins forever when the file is unreadable.
    while ((getline record < zhwd) > 0)
    {# match every entry of `cnwd` or `twwd` against subtitles files
        # find . -iname '*' -exec iconv -f BIG5 -t UTF8 \{} -o ../subs-utf8/\{} \; # re-encode from BIG5 to UTF8
        nf = split(record, fields, "|");

        # NOTE(review): the last field is skipped (i < nf), exactly as the
        # original did with `i < NF`; presumably records end with a trailing
        # "|" -- confirm against the word-list format.
        for (i = 1; i < nf; i++)
        {
            word = fields[i];

            # An empty pattern would match every subtitle line.
            if (word == "")
                continue;

            # Single-quote the word for the shell (' becomes '\'') and pass
            # it with -e so a leading "-" is not parsed as a grep option.
            quoted = word;
            gsub(/'/, "'\\''", quoted);
            cmd = sprintf("grep -h -e '%s' %s/*", quoted, subs8); # -h, --no-filename

            while ((cmd | getline line) > 0)
            {
                # The original stored $1 of the grep output line, i.e. the
                # text before the first "|". NOTE(review): confirm whether
                # the whole line was intended instead.
                split(line, parts, "|");
                sample = parts[1];

                # The sample is escaped too; raw subtitle text may contain quotes.
                out = sprintf("INSERT INTO `samples` VALUES('%s', '%s')", mysqlEscape(word), mysqlEscape(sample));
                print out > samp;
            }
            close(cmd); # mandatory
            # system("uniq "samp" "uniq);
        }
    }

    # Release the input and flush the output so later steps (such as the
    # commented-out external `uniq` pass) see the complete file.
    close(zhwd);
    close(samp);

    print "\tOK"
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement