Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
/*
 * hbase_filter.pig
 *
 * Loads JSON documents from a single HBase column, keeps the ones whose
 * crawlDate field lies strictly between START_DATE and END_DATE, and
 * writes the raw JSON strings to STORAGE_DIR.
 *
 * run from cli with parameters like
 * pig -x tez -f hbase_filter.pig -t ConstantCalculator -param START_DATE=1478790000000L -param END_DATE=1479790000000L -param STORAGE_DIR='/data/extract/TencentNews_stream/test_data' -param TABLE_NAME='TencentNews_stream' -param JSON_DATA_COLUMNFAM_CELL=cfInfo:value -param JOB_NAME=PIG_HBASE_EXTRACTION_TEST_RUN
 *
 * Parameters (all optional, defaults below):
 *   START_DATE                lower bound, epoch milliseconds as a Pig long literal (exclusive)
 *   END_DATE                  upper bound, epoch milliseconds as a Pig long literal (exclusive)
 *   STORAGE_DIR               output directory; it is deleted before the job runs
 *   TABLE_NAME                HBase table to scan
 *   JSON_DATA_COLUMNFAM_CELL  family:qualifier of the column holding the JSON document
 *   JOB_NAME                  name the job is submitted under
 *   ROW_LIMIT                 value handed to the HBaseStorage -limit option
 */
%default START_DATE '1478790000000L'
%default END_DATE '1479790000000L'
%default STORAGE_DIR '/data/extract/TencentNews_stream/test_data'
%default TABLE_NAME 'TencentNews_stream'
%default JSON_DATA_COLUMNFAM_CELL 'cfInfo:tcArticle'
%default JOB_NAME 'PIG_HBASE_EXTRACTION_TEST_RUN'
-- Previously hard-coded as 1000 inside the loader options; the default keeps that behavior.
%default ROW_LIMIT '1000'

set job.name '$JOB_NAME';
-- The JVM heap (2200 MB) is kept below the 2500 MB requested for each task.
set tez.task.resource.memory.mb '2500';
set mapreduce.map.memory.mb '2500';
set mapreduce.map.java.opts '-Xmx2200m';

-- rmf instead of rm: plain rm fails the script when the output directory
-- does not exist yet (for example on the very first run).
rmf $STORAGE_DIR

define get_json_object HiveUDF('get_json_object');

-- -loadKey true puts the HBase row key in the first field.
-- NOTE(review): per the HBaseStorage docs, -limit caps rows per region, not
-- per job, so the total row count can exceed ROW_LIMIT. Confirm this is intended.
raw = LOAD 'hbase://$TABLE_NAME'
    USING org.apache.pig.backend.hadoop.hbase.HBaseStorage(
        '$JSON_DATA_COLUMNFAM_CELL', '-loadKey true -limit $ROW_LIMIT')
    AS (key:chararray, value:chararray);

-- Pull crawlDate out of the JSON document and convert it to epoch milliseconds.
-- NOTE(review): rows without a parseable crawlDate presumably yield null here
-- and are then dropped by the filter below. Verify against real data.
fix_date = FOREACH raw GENERATE
    key,
    value,
    ToMilliSeconds(ToDate(get_json_object(value, '$.crawlDate'))) AS crawl_millis;

-- Both bounds are exclusive.
filtered_range = FILTER fix_date BY (crawl_millis > $START_DATE AND crawl_millis < $END_DATE);

-- Only the JSON document is written out; the row key and timestamp are discarded.
json_only_projection = FOREACH filtered_range GENERATE value;

STORE json_only_projection INTO '$STORAGE_DIR' USING PigStorage('\t');
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement