Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- -- Counts events per (type, day) from the validator output and dumps the result,
- -- followed by a 10-row sample of the extracted fields for eyeballing.
- -- Sample of the timestamp field being parsed:
- --   "received_at": "2018-06-22T06:04:12-0400"
- set tez.am.resource.memory.mb 4096;
- set tez.task.resource.memory.mb 4096;
- -- TextLoader yields one chararray per input line, referenced below as $0.
- raw = LOAD '/project/sina_hdfs_validator_out/*' using TextLoader();
- -- Optional single-day variant. It is kept commented out because it was never
- -- consumed by the counts below and originally referenced aliases before they
- -- were defined. To use it, keep dt as a datetime and filter before bucketing:
- --   extracted_dt = FOREACH raw GENERATE ToDate(REGEX_EXTRACT($0, '"received_at": *"([^"]*)"', 1)) as dt, REGEX_EXTRACT($0, '"type": *"([^"]*)", *"event"', 1) as type;
- --   filtered_dt = FILTER extracted_dt BY (dt >= ToDate('2018-06-23T00:00:00.000-0400') ) AND (dt <= ToDate('2018-06-23T23:59:59.999-0400') );
- -- [^"]* stops at the field's own closing quote; a greedy .* would run on to the
- -- last quote of the line. ' *' tolerates optional spaces after ':' and ','.
- -- dt is reduced to a day string (with UTC offset) so it can serve as a group key.
- extracted = FOREACH raw GENERATE ToString(ToDate(REGEX_EXTRACT($0, '"received_at": *"([^"]*)"', 1)), 'yyyy-MM-ddZ') as dt, REGEX_EXTRACT($0, '"type": *"([^"]*)", *"event"', 1) as type;
- grouped = GROUP extracted by (type, dt);
- -- Output columns: count, type, dt.
- counts = FOREACH grouped GENERATE COUNT(extracted), FLATTEN(group);
- dump counts;
- -- Small sample of the parsed rows, useful for checking the regexes.
- lim = limit extracted 10;
- dump lim;
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement