Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- Matvey Kryuchkov, [14 Nov 2019 at 22:37:39]:
#!/usr/bin/env python3
"""Hadoop-streaming mapper: prefix every input line with a random sort key.

The random key makes the shuffle phase deliver ids to reducers in a
randomized order, which is what the downstream sampling reducer relies on.
"""
import sys
import random


def map_line(line, key_max=10**4):
    """Return *line* (stripped) prefixed with a random integer key.

    Args:
        line: one raw input line (trailing newline allowed).
        key_max: upper bound (inclusive) of the random key; kept as a
            parameter instead of the original hard-coded 10**4.

    Returns:
        A string of the form "<key> <stripped line>".
    """
    return f"{random.randint(1, key_max)} {line.strip()}"


if __name__ == "__main__":
    for line in sys.stdin:
        print(map_line(line))
#!/usr/bin/env python3
"""Hadoop-streaming reducer: emit randomly-sized comma-joined groups of ids.

Each input line is "<rand> <id>" (as produced by the mapper after the
shuffle). Ids are collected into groups of random size 1..5 and each
group is printed as a single comma-separated record, stopping shortly
after 50 groups have been emitted.

Bug fixes vs. the original:
* The final flush printed every leftover id with a trailing comma and
  then printed the LAST id a second time, duplicating it in the output.
* That flush also raised IndexError when no leftover ids remained.
"""
import sys
import random


def reduce_ids(lines, max_groups=50, group_max=5):
    """Yield comma-joined groups of ids read from *lines*.

    Args:
        lines: iterable of "<rand> <id>" strings; lines that do not split
            into exactly two space-separated fields are skipped (mapper
            input containing spaces produces such lines).
        max_groups: soft cap on emitted groups — iteration stops once
            more than this many groups have been yielded, matching the
            original `total_count > 50` check.
        group_max: maximum random group size (original hard-coded 5).

    Yields:
        Strings like "id1,id2,id3". Any leftover partial group is flushed
        after the input ends, but only when fewer than *max_groups*
        groups were emitted (original `total_count < 50` condition).
    """
    target = random.randint(1, group_max)  # size of the group being built
    group = []
    emitted = 0
    for line in lines:
        try:
            _rand, id_ = line.strip().split(" ")
        except ValueError:
            # Malformed record (wrong field count) — ignore, as before.
            continue
        group.append(id_)
        if len(group) == target:
            yield ",".join(group)
            group = []
            target = random.randint(1, group_max)
            emitted += 1
        if emitted > max_groups:
            break
    # Flush the trailing partial group exactly once (no duplicated id).
    if emitted < max_groups and group:
        yield ",".join(group)


if __name__ == "__main__":
    for record in reduce_ids(sys.stdin):
        print(record)
#!/usr/bin/env bash
# Run the random-sampling MapReduce streaming job and print the first
# reducer partition's output.

OUT_DIR="out"        # HDFS output directory; the job fails if it already exists
NUM_REDUCERS=5

# Remove any previous output directories (out, out1, ...) so the job can
# start cleanly. The trailing * glob is intentional; errors (nothing to
# delete) are ignored. Variables are quoted to avoid word splitting (SC2086).
hadoop fs -rm -r -skipTrash "${OUT_DIR}"* > /dev/null

# Streaming job: mapper.py prefixes each id with a random key, the shuffle
# randomizes order, reducer.py emits random-sized comma-joined id groups.
yarn jar /opt/cloudera/parcels/CDH/lib/hadoop-mapreduce/hadoop-streaming.jar \
    -D mapreduce.job.reduces="${NUM_REDUCERS}" \
    -files mapper.py,reducer.py \
    -mapper mapper.py \
    -reducer reducer.py \
    -input /data/ids_part \
    -output "${OUT_DIR}" > /dev/null

# Show the first reducer's partition.
hdfs dfs -cat "${OUT_DIR}"/part-00000
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement