Advertisement
Guest User

Untitled

a guest
Sep 22nd, 2017
91
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Awk 0.84 KB | None | 0 0
  1. function extractSamples()
  2. {# Extracting samples
  3.   print "Extracting samples from subtitles..."
  4.   subs = "./subs/";
  5.   subs8 = "./subs8/";
  6.   samp = "./05-samples.csv";
  7.   uniq = "./05.1-uniq-samples.csv";
  8.   out = "";
  9.  
  10.   system("cat /dev/null >"samp);
  11.   FS="|"
  12.   while (getline <zhwd)
  13.   {# match every entry of `cnwd` or `twwd` againt subtitles files
  14.     # find . -iname '*' -exec iconv -f BIG5 -t UTF8 \{} -o ../subs-utf8/\{} \; # re-encode from BIG5 to UTF8
  15.     for (i=1; i<NF; i++)
  16.     {
  17.       word = $i
  18.         cmd = sprintf("grep -h %s %s/*", word, subs8); # -h, --no-filename
  19.         while (cmd | getline)
  20.         {
  21.           out = sprintf("INSERT INTO `samples` VALUES('%s', '%s')", mysqlEscape(word), $1);
  22.           print out >samp;
  23.         }
  24.       close(cmd); # mandatory
  25.     # system("uniq "samp" "uniq);
  26.     }
  27.   }
  28.   print "\tOK"
  29. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement