Guest User

Untitled

a guest
Jan 21st, 2018
69
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 0.87 KB | None | 0 0
  -- Builds a crude word-translation table from two-column text input.
  -- For every input row, each token of column `en` is paired with each token
  -- of column `er`; pair occurrences are counted across all rows, and for each
  -- `en` token only the most frequent partner is kept.
  -- NOTE(review): the relation name `counts_by_eng_word` suggests `en` holds
  -- English text and `er` its translation - confirm against the data.

  -- Input split sizing and default reducer parallelism.
  set mapred.max.split.size 20000;
  set pig.maxCombinedSplitSize 200000;
  set default_parallel 30;


  -- Each row: two chararray columns (default loader, tab-delimited).
  er = LOAD 'er' AS (en : chararray, er : chararray);

  -- Split both columns into bags of word tokens.
  tokenized = FOREACH er GENERATE TOKENIZE(en) AS en, TOKENIZE(er) AS er;

  -- Flattening two bags in one GENERATE yields their cross product, i.e.
  -- every (en_word, er_word) combination that co-occurs in a row.
  pairs = FOREACH tokenized GENERATE FLATTEN(en) AS en_word, FLATTEN(er) AS er_word;

  -- Keep only pairs where both words are at least 5 characters long.
  pairs_long = FILTER pairs BY (SIZE(en_word) > 4) AND (SIZE(er_word) > 4);

  -- Count how many times each distinct pair occurs.
  grped = GROUP pairs_long BY (en_word, er_word);
  pair_counts = FOREACH grped GENERATE group AS pair, COUNT(pairs_long) AS count;

  -- NOTE(review): Pig may apply `set` values to the whole script rather than
  -- from this point onward; if so, this value replaces the 200000 set at the
  -- top for every job - confirm the intended scope.
  set pig.maxCombinedSplitSize 2000000;

  -- For each en_word (pair.$0), keep the single pair with the highest count.
  -- Ties are broken arbitrarily by LIMIT.
  counts_by_eng_word = GROUP pair_counts BY pair.$0;
  translations = FOREACH counts_by_eng_word {
      ordered = ORDER pair_counts BY count DESC;
      top_translation = LIMIT ordered 1;
      GENERATE group, FLATTEN(top_translation);
  };

  -- Fields of `translations`: $0 = en_word, $1 = (en_word, er_word), $2 = count.
  -- Drop pairs seen 5 times or fewer.
  trans_good = FILTER translations BY $2 > 5;

  -- Order the filtered result, most frequent first. The original ordered
  -- `translations` here, which left `trans_good` unused and let every
  -- low-count pair through to the output.
  trans_ordered = ORDER trans_good BY $2 DESC;


  store trans_ordered into 'out';
Add Comment
Please, Sign In to add comment