Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- drwxr-xr-x - nifi hdfs 0 2018-07-26 09:52 /project/jsn/rtsf/pq_reddit_std
- drwxr-xr-x - nifi hdfs 0 2018-07-26 09:50 /project/jsn/rtsf/rtsf_reddit_std
- /*
- "ExtKey":"(.*)","Pub
- {
- "ExtKey": "55424fadc.e30rtbv"
- }
- (558979e74.e30ru72)
- */
- /*
- {
- "report": {
- "postid": "559d67134.e30s1gp"
- }
- }
- (588986aa0.e30rw7j)
- */
- /*
-  Compare the ids found in the pq extract against those found in the rtsf extract.
-  TextLoader yields one single-field tuple per input line, so $0 is the raw line text.
-  REGEX_EXTRACT returns null when the pattern does not match; those rows are dropped
-  before joining so that a null after the FULL OUTER join can only mean "no partner".
-  The capture group is [^"]* rather than .* so it stops at the id's closing quote
-  and cannot run on to a later occurrence of the trailing anchor in the same line.
- */
- raw_pq = LOAD '/project/jsn/rtsf/pq_reddit_std_atbody_02' using TextLoader();
- pq_ids = FOREACH raw_pq GENERATE REGEX_EXTRACT($0, '"ExtKey":"([^"]*)","Pub', 1) as pq_id;
- pq_not_null = FILTER pq_ids BY pq_id is NOT null;
- raw_rtsf = LOAD '/project/jsn/rtsf/rtsf_reddit_std_atbody_02' using TextLoader();
- rtsf_ids = FOREACH raw_rtsf GENERATE REGEX_EXTRACT($0, '"postid":"([^"]*)","site', 1) as rtsf_id;
- rtsf_not_null = FILTER rtsf_ids BY rtsf_id is NOT null;
- /* joined: every id from either side; the missing side's column is null. */
- joined = join rtsf_not_null by rtsf_id FULL OUTER, pq_not_null by pq_id;
- /* both: inner join, i.e. ids present in both extracts. */
- both = join rtsf_not_null by rtsf_id, pq_not_null by pq_id;
- /* rtsf_only / pq_only: rows of the outer join that have no partner on the other side. */
- rtsf_only = FILTER joined BY pq_id is null;
- pq_only = FILTER joined BY rtsf_id is null;
- /* counts
- Input(s):
- Successfully read 1026 records (2500986 bytes) from: "/project/jsn/rtsf/rtsf_reddit_std_atbody_01"
- Successfully read 98 records (185917 bytes) from: "/project/jsn/rtsf/pq_reddit_std_atbody_01"
- Output(s):
- Successfully stored 72 records (3132 bytes) in: "hdfs://hadoopdevha/tmp/temp-863063160/tmp2025957871"
- Input(s):
- Successfully read 124 records (241016 bytes) from: "/project/jsn/rtsf/pq_reddit_std_atbody_01"
- Successfully read 1045 records (2548255 bytes) from: "/project/jsn/rtsf/rtsf_reddit_std_atbody_01"
- Output(s):
- Successfully stored 91 records (3966 bytes) in: "hdfs://hadoopdevha/tmp/temp-863063160/tmp-367750326"
- Input(s):
- Successfully read 137 records (268376 bytes) from: "/project/jsn/rtsf/pq_reddit_std_atbody_01"
- Successfully read 1054 records (2574792 bytes) from: "/project/jsn/rtsf/rtsf_reddit_std_atbody_01"
- Output(s):
- Successfully stored 101 records (4406 bytes) in: "hdfs://hadoopdevha/tmp/temp-863063160/tmp-115032914"
- Input(s):
- Successfully read 165 records (331760 bytes) from: "/project/jsn/rtsf/pq_reddit_std_atbody_01"
- Successfully read 1070 records (2626027 bytes) from: "/project/jsn/rtsf/rtsf_reddit_std_atbody_01"
- Output(s):
- Successfully stored 118 records (5152 bytes) in: "hdfs://hadoopdevha/tmp/temp-863063160/tmp1019834540"
- Input(s):
- Successfully read 7579 records (19435127 bytes) from: "/project/jsn/rtsf/rtsf_reddit_std_atbody_01"
- Successfully read 8896 records (19825368 bytes) from: "/project/jsn/rtsf/pq_reddit_std_atbody_01"
- Output(s):
- Successfully stored 6072 records (266104 bytes) in: "hdfs://hadoopdevha/tmp/temp1770655473/tmp-108573611"
- ### BATCH2
- Input(s):
- Successfully read 571 records (1430316 bytes) from: "/project/jsn/rtsf/rtsf_reddit_std_atbody_02"
- Successfully read 638 records (1375314 bytes) from: "/project/jsn/rtsf/pq_reddit_std_atbody_02"
- Output(s):
- Successfully stored 504 records (22164 bytes) in: "hdfs://hadoopdevha/tmp/temp-1478985887/tmp-182892140"
- */
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement