Advertisement
josephxsxn

pig_rtsf_compare_pq.pig

Jul 26th, 2018
154
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Bash 2.98 KB | None | 0 0
  1. drwxr-xr-x   - nifi hdfs          0 2018-07-26 09:52 /project/jsn/rtsf/pq_reddit_std
  2. drwxr-xr-x   - nifi hdfs          0 2018-07-26 09:50 /project/jsn/rtsf/rtsf_reddit_std
  3. /*
  4. "ExtKey":"(.*)","Pub
  5. {
  6. "ExtKey": "55424fadc.e30rtbv"
  7. }
  8. (558979e74.e30ru72)
  9. */
  10.  
  11.  
  12. /*
  13. {
  14.     "report": {
  15.         "postid": "559d67134.e30s1gp"
  16.    }
  17. }
  18.  
  19. (588986aa0.e30rw7j)
  20.  
  21.    */
  22.  
  23.  
  24. raw_pq = LOAD '/project/jsn/rtsf/pq_reddit_std_atbody_02' using TextLoader;
  25. pq_ids = FOREACH raw_pq GENERATE REGEX_EXTRACT($0, '"ExtKey":"(.*)","Pub', 1) as pq_id;
  26. pq_not_null = FILTER pq_ids BY pq_id is NOT null;
  27.    
  28. raw_rtsf = LOAD '/project/jsn/rtsf/rtsf_reddit_std_atbody_02' using TextLoader();
  29. rtsf_ids = FOREACH raw_rtsf GENERATE REGEX_EXTRACT($0, '"postid":"(.*)","site', 1) as rtsf_id;
  30. rtsf_not_null = FILTER rtsf_ids BY rtsf_id is NOT null;
  31.  
  32. joined = join rtsf_not_null by rtsf_id FULL OUTER, pq_not_null by pq_id;
  33.  
  34. both = join rtsf_not_null by rtsf_id, pq_not_null by pq_id;
  35. rtsf_only = FILTER joined BY pq_id is null;
  36. pq_only = FILTER joined BY rtsf_id is null;
  37.  
  38.  
  39. /* counts
  40. Input(s):
  41. Successfully read 1026 records (2500986 bytes) from: "/project/jsn/rtsf/rtsf_reddit_std_atbody_01"
  42. Successfully read 98 records (185917 bytes) from: "/project/jsn/rtsf/pq_reddit_std_atbody_01"
  43.  
  44. Output(s):
  45. Successfully stored 72 records (3132 bytes) in: "hdfs://hadoopdevha/tmp/temp-863063160/tmp2025957871"
  46.  
  47. Input(s):
  48. Successfully read 124 records (241016 bytes) from: "/project/jsn/rtsf/pq_reddit_std_atbody_01"
  49. Successfully read 1045 records (2548255 bytes) from: "/project/jsn/rtsf/rtsf_reddit_std_atbody_01"
  50.  
  51. Output(s):
  52. Successfully stored 91 records (3966 bytes) in: "hdfs://hadoopdevha/tmp/temp-863063160/tmp-367750326"
  53.  
  54. Input(s):
  55. Successfully read 137 records (268376 bytes) from: "/project/jsn/rtsf/pq_reddit_std_atbody_01"
  56. Successfully read 1054 records (2574792 bytes) from: "/project/jsn/rtsf/rtsf_reddit_std_atbody_01"
  57.  
  58. Output(s):
  59. Successfully stored 101 records (4406 bytes) in: "hdfs://hadoopdevha/tmp/temp-863063160/tmp-115032914"
  60.  
  61. Input(s):
  62. Successfully read 165 records (331760 bytes) from: "/project/jsn/rtsf/pq_reddit_std_atbody_01"
  63. Successfully read 1070 records (2626027 bytes) from: "/project/jsn/rtsf/rtsf_reddit_std_atbody_01"
  64.  
  65. Output(s):
  66. Successfully stored 118 records (5152 bytes) in: "hdfs://hadoopdevha/tmp/temp-863063160/tmp1019834540"
  67.  
  68. Input(s):
  69. Successfully read 7579 records (19435127 bytes) from: "/project/jsn/rtsf/rtsf_reddit_std_atbody_01"
  70. Successfully read 8896 records (19825368 bytes) from: "/project/jsn/rtsf/pq_reddit_std_atbody_01"
  71.  
  72. Output(s):
  73. Successfully stored 6072 records (266104 bytes) in: "hdfs://hadoopdevha/tmp/temp1770655473/tmp-108573611"
  74.  
  75.  
  76.  
  77. ### BATCH2
  78. Input(s):
  79. Successfully read 571 records (1430316 bytes) from: "/project/jsn/rtsf/rtsf_reddit_std_atbody_02"
  80. Successfully read 638 records (1375314 bytes) from: "/project/jsn/rtsf/pq_reddit_std_atbody_02"
  81.  
  82. Output(s):
  83. Successfully stored 504 records (22164 bytes) in: "hdfs://hadoopdevha/tmp/temp-1478985887/tmp-182892140"
  84.  
  85. */
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement