Advertisement
josephxsxn

hbase_filter_2

Mar 23rd, 2018
146
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Bash 1.38 KB | None | 0 0
  /*
   * Purpose
   *   Scan one column of an HBase table that holds JSON documents, keep the
   *   documents whose crawlDate field falls strictly between START_DATE and
   *   END_DATE, and write the surviving JSON strings to STORAGE_DIR.
   *
   * Run from the CLI with parameters like
   * pig -x tez -f hbase_filter.pig -t ConstantCalculator -param START_DATE=1478790000000L -param END_DATE=1479790000000L -param STORAGE_DIR='/data/extract/TencentNews_stream/test_data' -param TABLE_NAME='TencentNews_stream' -param JSON_DATA_COLUMNFAM_CELL=cfInfo:value -param JOB_NAME=PIG_HBASE_EXTRACTION_TEST_RUN
   *
   * Parameters (every one has a default below, so all are optional)
   *   START_DATE               exclusive lower bound, epoch milliseconds as a Pig long literal
   *   END_DATE                 exclusive upper bound, epoch milliseconds as a Pig long literal
   *   STORAGE_DIR              output directory, DELETED at the start of every run
   *   TABLE_NAME               HBase table to scan
   *   JSON_DATA_COLUMNFAM_CELL column in family:qualifier form that holds the JSON text
   *   JOB_NAME                 name given to the submitted job
   *   ROW_LIMIT                value handed to the HBaseStorage -limit option
   */
  %default START_DATE '1478790000000L'
  %default END_DATE '1479790000000L'
  %default STORAGE_DIR '/data/extract/TencentNews_stream/test_data'
  %default TABLE_NAME 'TencentNews_stream'
  %default JSON_DATA_COLUMNFAM_CELL 'cfInfo:tcArticle'
  %default JOB_NAME 'PIG_HBASE_EXTRACTION_TEST_RUN'
  -- Previously hard-coded as 1000 inside the loader options. Kept as the default
  -- so existing invocations behave the same. Raise it for a full extraction.
  %default ROW_LIMIT '1000'

  set job.name '$JOB_NAME';
  set tez.task.resource.memory.mb '2500';
  set mapreduce.map.memory.mb '2500';
  set mapreduce.map.java.opts '-Xmx2200m';

  -- Clear the previous output because STORE refuses to write into an existing
  -- directory. rmf is used instead of rm so that a run where the directory does
  -- not exist yet (for example the very first run) is not aborted.
  rmf $STORAGE_DIR

  -- Expose the Hive builtin get_json_object to Pig under the same name.
  define get_json_object HiveUDF('get_json_object');

  -- Each tuple is (row key, cell content of the configured column).
  -- -loadKey true makes the row key the first field.
  hbase_rows = LOAD 'hbase://$TABLE_NAME'
      USING org.apache.pig.backend.hadoop.hbase.HBaseStorage(
          '$JSON_DATA_COLUMNFAM_CELL', '-loadKey true -limit $ROW_LIMIT')
      AS (key:chararray, value:chararray);

  -- Pull crawlDate out of the JSON text and convert it to epoch milliseconds.
  -- NOTE(review): assumes crawlDate is a date string that ToDate can parse. Confirm
  -- the format and time zone against the producer of this table.
  rows_with_crawl_ms = FOREACH hbase_rows GENERATE
      key,
      value,
      ToMilliSeconds(ToDate(get_json_object(value, '$.crawlDate'))) AS crawl_ms;

  -- Both bounds are exclusive. Rows whose crawl_ms is null do not satisfy the
  -- comparison and are dropped here as well.
  rows_in_window = FILTER rows_with_crawl_ms
      BY (crawl_ms > $START_DATE AND crawl_ms < $END_DATE);

  -- Only the JSON payload is written out. The row key and the helper column are discarded.
  json_only_projection = FOREACH rows_in_window GENERATE value;

  STORE json_only_projection INTO '$STORAGE_DIR' USING PigStorage('\t');
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement