Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
-- Word co-occurrence "translation" table.
--
-- Reads two-column rows (en, er) from 'er', pairs every token of `en` with
-- every token of `er` on the same row, counts how often each pair occurs,
-- keeps the most frequent `er` token for each `en` token, and stores the
-- pairs seen more than 5 times, most frequent first, into 'out'.

-- Job tuning: input split sizes and default reducer parallelism.
set mapred.max.split.size 20000;
set pig.maxCombinedSplitSize 200000;
set default_parallel 30;

-- Each input row holds two text fields.
er = LOAD 'er' AS (en : chararray, er : chararray);

-- Split both fields into bags of tokens.
tokenized = FOREACH er GENERATE TOKENIZE(en) AS en, TOKENIZE(er) AS er;

-- Flattening two bags in one GENERATE yields their cross product, i.e. every
-- (en token, er token) combination found on the same row.
pairs = FOREACH tokenized GENERATE FLATTEN(en) AS en_word, FLATTEN(er) AS er_word;

-- Keep only pairs where both words are longer than 4 characters.
pairs_long = FILTER pairs BY (SIZE(en_word) > 4) AND (SIZE(er_word) > 4);

-- Count occurrences of each distinct (en_word, er_word) pair.
grped = GROUP pairs_long BY (en_word, er_word);
pair_counts = FOREACH grped GENERATE group AS pair, COUNT(pairs_long) AS count;

-- NOTE(review): the combined split size is raised here for the later stages.
-- Pig may apply `set` values script-wide rather than per statement, in which
-- case this value also overrides the earlier 200000 -- confirm.
set pig.maxCombinedSplitSize 2000000;

-- For every en_word (first element of the pair) keep its single most
-- frequent pairing.
counts_by_eng_word = GROUP pair_counts BY pair.$0;
translations = FOREACH counts_by_eng_word {
    ordered = ORDER pair_counts BY count DESC;
    top_translation = LIMIT ordered 1;
    -- Output columns: $0 = en_word, $1 = (en_word, er_word) pair, $2 = count.
    GENERATE group, FLATTEN(top_translation);
};

-- Drop pairs seen 5 times or fewer.
trans_good = FILTER translations BY $2 > 5;

-- Fix: order the filtered relation. The original ordered `translations`,
-- leaving `trans_good` unused, so low-count pairs were still written out.
trans_ordered = ORDER trans_good BY $2 DESC;

store trans_ordered into 'out';
Add Comment
Please, Sign In to add comment