Advertisement
Not a member of Pastebin yet?
Sign up — it unlocks many cool features!
-- HDFS-CMD-COUNTER: counts Ranger HDFS audit events per day (as 'TOTAL') and
-- per (day, user), then writes tab-separated (day, who, count) rows, ordered
-- by day and count, to hdfs-reports/cmduse/.
set job.name HDFS-CMD-COUNTER;

-- Split/combine sizing.
-- NOTE(review): 1573741824 is ~1.46 GB — possibly a typo for 1073741824 (1 GB)
-- or 1610612736 (1.5 GB); confirm intent before changing. Left as-is.
set pig.maxCombinedSplitSize 1573741824;
set pig.splitCombination true;
set pig.exec.reducers.max 15000;
set pig.tmpfilecompression true;
set mapreduce.input.fileinputformat.split.minsize 1573741824;
set mapreduce.input.fileinputformat.split.maxsize 1573741824;
set mapreduce.task.io.sort.mb 2047;
set mapreduce.map.memory.mb 4096;
set mapreduce.map.java.opts '-Xmx3g';
-- NOTE(review): 16192 is not a power of two; 16384 may have been intended.
set mapreduce.reduce.memory.mb 16192;
set mapreduce.reduce.java.opts '-Xmx15g';

-- Tez equivalents of the MR sizing above, so the job behaves the same on
-- either execution engine.
set tez.am.grouping.min-size 1573741824;
set tez.am.grouping.max-size 1573741824;
set tez.am.grouping.split-waves 1.0;
set tez.runtime.io.sort.mb 2047;
set tez.am.resource.memory.mb 32000;
set tez.am.launch.cmd-opts '-XX:+PrintGCDetails -verbose:gc -XX:+PrintGCTimeStamps -XX:+UseNUMA -XX:+UseParallelGC -Xmx30g';
set tez.task.resource.memory.mb 4096;
set tez.task.launch.cmd-opts '-XX:+PrintGCDetails -verbose:gc -XX:+PrintGCTimeStamps -XX:+UseNUMA -XX:+UseParallelGC -Xmx3500m';

-- rmf (force remove) ignores a missing path, so reruns don't abort the way
-- plain `rm` does when the output directory is not there yet.
rmf hdfs-reports/cmduse/

%DEFAULT USER 'jniemie8'
%DEFAULT RANGERHDFSLOGSINPUT '/ranger/audit/$USER/hdfs/*/*'

-- Load Ranger HDFS audit JSON. The schema string is kept byte-identical.
-- NOTE(review): Ranger emits the field as "repoType" (capital T); `repotype`
-- here would load as null if the loader matches field names case-sensitively.
-- Harmless today because repotype is never referenced downstream — verify
-- before relying on that column.
rawhdfsrangerlog = LOAD '$RANGERHDFSLOGSINPUT' using JsonLoader('repotype:chararray, repo:chararray, reqUser:chararray, evtTime:chararray, access:chararray, resource:chararray, resType:chararray, action:chararray, result:chararray, policy:chararray, enforcer:chararray, sess:chararray, cliType:chararray, cliIP:chararray, reqData:chararray, agentHost:chararray, logType:chararray, id:chararray, seq_num:chararray, event_count:chararray, event_dur_ms:chararray');

-- (review) Dropped the unused relations types_proj/types_group/types: they
-- were never STOREd or DUMPed, and DISTINCT on an already-GROUPed relation
-- was a no-op (group keys are unique by construction).

-- Project only what the counts need: the calendar day (derived from evtTime)
-- and the requesting user. The unused raw timestamp column and `access` field
-- were dropped to shrink the shuffle.
TypeUserDate = FOREACH rawhdfsrangerlog GENERATE
    ToString(ToDate(evtTime, 'yyyy-MM-dd HH:mm:ss.SSS'), 'yyyy-MM-dd') AS day,
    reqUser;

-- Per-day grand totals, tagged 'TOTAL' in the user position so both result
-- sets share one (day, who, cnt) schema.
TotalPerDayGroups = GROUP TypeUserDate BY day;
PerDayCount = FOREACH TotalPerDayGroups GENERATE
    group AS day,
    'TOTAL' AS who,
    COUNT(TypeUserDate) AS cnt;

-- Per-day, per-user counts.
TotalPerUserGroups = GROUP TypeUserDate BY (day, reqUser);
PerDayPerUserCount = FOREACH TotalPerUserGroups GENERATE
    FLATTEN(group) AS (day, who),
    COUNT(TypeUserDate) AS cnt;

-- Both branches now carry identical field names/types, so the union keeps a
-- schema and ORDER can use names instead of the positional $0, $2.
unionForStorage = UNION PerDayCount, PerDayPerUserCount;

-- PARALLEL 1 forces a single reducer so the output is one globally-ordered file.
orderedStorageUnion = ORDER unionForStorage BY day, cnt PARALLEL 1;

STORE orderedStorageUnion INTO 'hdfs-reports/cmduse/' USING PigStorage();
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement