Advertisement
josephxsxn

sina_out_counter.pig

Jun 27th, 2018
372
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 0.79 KB | None | 0 0
  -- sina_out_counter.pig
  -- Counts records under /project/sina_hdfs_validator_out/ per (type, day).
  --
  -- Sample of the timestamp field the patterns below look for:
  --   "received_at": "2018-06-22T06:04:12-0400"

  -- Tez memory settings for the application master and for each task.
  set tez.am.resource.memory.mb 4096;
  set tez.task.resource.memory.mb 4096;

  -- TextLoader yields one single-field tuple per input line; $0 is the raw line.
  raw = LOAD '/project/sina_hdfs_validator_out/*' using TextLoader();

  -- Disabled scratch variant kept for reference: keeps dt as a datetime and
  -- restricts the data to a single day. In the original paste these two
  -- statements sat above the LOAD (and in reverse order), so they referenced
  -- aliases that did not exist yet; `filtered_dt` was never consumed.
  -- extracted = FOREACH raw GENERATE ToDate(REGEX_EXTRACT($0, '"received_at":"(.*)"', 1)) as dt, REGEX_EXTRACT($0, '"type":"(.*)","event"', 1) as type;
  -- filtered_dt = FILTER extracted BY (dt >= ToDate('2018-06-23T00:00:00.000-0400') ) AND (dt <= ToDate('2018-06-23T23:59:59.999-0400') );

  -- Pull the day (with UTC offset) and the record type out of each line.
  -- The captures use [^"]* instead of a greedy .* so they stop at the closing
  -- quote of the value rather than at the last quote on the line, and " *"
  -- tolerates the optional space after ':' / ',' seen in the sample above.
  extracted = FOREACH raw GENERATE
      ToString(ToDate(REGEX_EXTRACT($0, '"received_at": *"([^"]*)"', 1)), 'yyyy-MM-ddZ') as dt,
      REGEX_EXTRACT($0, '"type": *"([^"]*)", *"event"', 1) as type;

  -- One group per distinct (type, day) pair.
  grouped = GROUP extracted by (type, dt);

  -- Emit: count, type, day.
  -- NOTE(review): COUNT skips tuples whose first field (dt) is null, so lines
  -- whose timestamp could not be extracted show up with a count of 0; switch to
  -- COUNT_STAR if those lines should be counted as well.
  counts = FOREACH grouped GENERATE COUNT(extracted), FLATTEN(group);
  dump counts;

  -- Debug aid: print a small sample of the extracted (dt, type) rows.
  lim = limit extracted 10;
  dump lim;
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement