Advertisement
Not a member of Pastebin yet?
Sign up — it unlocks many cool features!
- #!/usr/bin/env bash
- OUT_DIR="out"
- NUM_REDUCERS=4
- hadoop fs -rm -r -skipTrash ${OUT_DIR}.tmp >/dev/null
- hadoop jar /opt/cloudera/parcels/CDH/lib/hadoop-mapreduce/hadoop-streaming.jar \
- -D mapreduce.job.name="113_1" \
- -D mapreduce.job.reduces=$NUM_REDUCERS \
- -files mapper.py,reducer.py \
- -mapper mapper.py \
- -reducer reducer.py \
- -input /data/wiki/en_articles \
- -output ${OUT_DIR}.tmp >/dev/null
- hadoop fs -rm -r -skipTrash ${OUT_DIR} >/dev/null
- hadoop jar /opt/cloudera/parcels/CDH/lib/hadoop-mapreduce/hadoop-streaming.jar \
- -D mapreduce.job.name="113_2" \
- -D mapreduce.job.reduces=1 \
- -D mapred.output.key.comparator.class=org.apache.hadoop.mapred.lib.KeyFieldBasedComparator \
- -D mapred.text.key.comparator.options="-k2nr 1" \
- -D stream.num.map.output.key.fields=2 \
- -mapper cat \
- -reducer cat \
- -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner \
- -input ${OUT_DIR}.tmp \
- -output ${OUT_DIR} >/dev/null
- hadoop fs -cat ${OUT_DIR}/part-00000 2>/dev/null | head
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement