SHARE
TWEET

Untitled

a guest Sep 17th, 2019 95 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. /*
  2.  * event_tree.mode_lob and event_tree.event_lob contain contain textual data which refers to rows in other tables.
  3.  * It is similar to FK constraints, but there is no DBMS support. So we have to parse these columns and update parts
  4.  * that refer to rows in other tables with the new identifiers for those rows. Given that the structure of event_tree.event_lob
  5.  * is far from trivial, doing this in SQL is ugly and not easy, but the alternative
  6.  * (download (in pieces) all the data and process it in Java by making more requests to DBMS)
  7.  * is likely not a viable solution performance-wise.
  8.  *
  9.  * The strings are parsed by
  10.  * - splitting them into arrays;
  11.  * - unnesting arrays into tables;
  12.  * - updating elements of arrays (which are arrays on their own) that require updates;
  13.  * - aggregating unnested array elements back into arrays;
  14.  * - joining arrays back into strings.
  15.  * Note that event_tree.mode_lob effectively contains an array of arrays,
  16.  * while event_tree.event_lob may contain array of arrays of arrays (i.e. double nesting)
  17.  * which makes the job of updating event_tree.event_lob much more difficult.
  18.  *
  19.  * Note that indices of PostgreSQL arrays are 1-based
  20.  * and DBMS allows them to go out of bounds of an array in which case the value is considered null.
  21.  *
  22.  * Originally this was an update command, but it was changed to be a select and
  23.  * real tables were replaced with fake ones produced by using values lists, so the query could be run immediately anywhere.
  24.  */
  25. with
  26.   new_event_tree_model_lob as (--the format of event_tree.model_lob is specified below
  27.     select
  28.     event_tree_id,
  29.     string_agg(
  30.       array_to_string(
  31.         model_attr_array[:1] ||--a slice of the array with indices <=1, we need an array type here instead of an element of the array
  32.         --if there is no matching file then model_lob refers to a file that does not exist, which means the data is corrupted
  33.         check_not_null(
  34.           files.new_id,
  35.           array['files for event_tree', event_tree_id::text, 'model_idx', model_idx::text] || 'model_attr_array'::text || model_attr_array)::text ||
  36.         model_attr_array[3] ||
  37.         check_not_null(
  38.           stream_function.new_id,
  39.           array['stream_function for event_tree', event_tree_id::text, 'model_idx', model_idx::text] || 'model_attr_array'::text || model_attr_array)::text ||
  40.         check_not_null(
  41.           function_metrics.new_id,
  42.           array['function_metrics for event_tree', event_tree_id::text, 'model_idx', model_idx::text] || 'model_attr_array'::text || model_attr_array)::text,
  43.         ','
  44.       ),
  45.       ';' order by model_idx
  46.     ) as new_model_lob
  47.     from
  48.       (select id as event_tree_id, model_idx, string_to_array(model, ',') as model_attr_array
  49.       from
  50.         (values (1001, '0,111,1,113,114;0,121,1,123,124'), (1002, '0,211,1,213,214')) as event_tree (id, model_lob)
  51.         cross join unnest(string_to_array(model_lob, ';')) with ordinality as model (model, model_idx)--a table containing each model from model_lob in its own row
  52.       ) as parsed_model_lob--a table containing array of attributes representing each model (an array of attributes) from model_lob in its own row
  53.       --we use left join despite we must always find a corresponding row in the right-side table in order to be able to check this requirement by using pg_temp.check_not_null
  54.       left join (values (111, -111), (121, -121), (211, -211)) as files (id, new_id) on model_attr_array[2]::bigint = files.id
  55.       left join (values (113, -113), (123, -123), (213, -213)) as stream_function (id, new_id) on model_attr_array[4]::bigint = stream_function.id
  56.       left join (values (114, -114), (124, -124), (214, -214)) as function_metrics (id, new_id) on model_attr_array[5]::bigint = function_metrics.id
  57.     group by event_tree_id
  58.   ),
  59.   new_event_tree_event_lob as (--the format of event_tree.event_lob is specified below
  60.     with
  61.       parsed_event_lob_parsed_new_attr11 as (--same as parsed_event_lob_parsed_attr11 but with updated value of 11th attr
  62.         select
  63.           event_tree_id,
  64.           event_idx,
  65.           /*
  66.            * There is only a single row for each event_tree_id and event_idx
  67.            * that has non-null event_attr_array. We are using max aggregate function to extract that non-null value instead of
  68.            * having the value in all rows and group by not only event_tree_id and event_idx
  69.            * but also event_attr_array which would have been more resource-consuming.
  70.            *
  71.            * max is an immutable function, so PostgreSQL can (and most likely does) calculate the result only once per selected row,
  72.            * thus we should not worry about "calling" the function multiple times per row with the same arguments.
  73.            */
  74.           case when (max(event_attr_array)::text[])[1] = 'S'
  75.             then (max(event_attr_array)::text[])[:10] || string_agg(new_element, '&' order by elmnt_idx) || (max(event_attr_array)::text[])[12:]
  76.             else max(event_attr_array)
  77.           end event_attr_array_new_attr11
  78.         from
  79.           (with
  80.             parsed_event_lob_parsed_attr11 as (
  81.               select
  82.                 event_tree_id, event_idx,
  83.                 /*
  84.                  * Select array only for the first element.
  85.                  * This allows us to group by event_idx instead of grouping by both event_idx and event_attr_array_no_attr11 thus reduces costs
  86.                  * and also potentially reduces memory / temporary storage consumption.
  87.                  */
  88.                 case
  89.                   when elmnt_idx = 1 then event_attr_array
  90.                   else null
  91.                 end as event_attr_array,
  92.                 elmnt_idx,
  93.                 string_to_array(elmnt, ':') as element_value_array
  94.               from
  95.                 (select
  96.                   event_tree_id,
  97.                   event_idx,
  98.                   event_attr_array,
  99.                   string_to_array(
  100.                     case
  101.                       when event_attr_array[1] = 'S' then
  102.                         case
  103.                           --'null_attr11' is a special dummy value for empty 11th attr to make sure we are not loosing it
  104.                           when event_attr_array[11] = '' then 'null_attr11'
  105.                           else event_attr_array[11]
  106.                         end
  107.                       /*
  108.                        * Have to use a string and later specify it as a null-string in string_to_array
  109.                        * because otherwise string_to_array produces an array with no elements which when unnested results in a table with no rows,
  110.                        * and cross join (which is also used when no join type is specified) with such a table also produces no rows, which leads to us losing data.
  111.                        */
  112.                       else 'null'
  113.                     end,
  114.                     '&', 'null'--specify null-string as mentioned above
  115.                   ) as event_attr11_element_array
  116.                 from
  117.                   (select id as event_tree_id, event_idx, string_to_array(event, ',') as event_attr_array
  118.                   from
  119.                     (values (1001, 'C,111,112,0~0,0,-1,0,113,-1,0;P,111,112,0~0,0,-1,114,0,0,0,0'), (1002, 'G,211,212,0~0,0,-1;S,221,222,0~0,0,-1,0,-1,-1,0,,-;S,221,222,0~0,0,-1,0,-1,-1,0,com/File1.java:123:com.Cls.function1:223&com/File2.java:231:com.Cls.function2:224&com/File3.java:312:com.Cls.function3:225&/java/lang/Thread.java:-1:java.lang.Thread.run:-1,-'), (1003, 'R,211,212,0~0,0,-1,-')) as event_tree (id, event_lob)
  120.                     cross join unnest(string_to_array(event_lob, ';')) with ordinality as event (event, event_idx)--a table containing each event from event_lob in its own row
  121.                   where event_tree.id != 1--event tree 1 is an invalid dummy entry
  122.                   ) as parsed_event_lob--a table containing array of attributes representing each event from event_lob in its own row
  123.                 --a table containing array of attributes representing each event from event_lob in its own row with 11th attr extracted in a separate column and parsed into an array of elements
  124.                 ) as parsed_event_lob_preparsed_attr11
  125.                 cross join unnest(event_attr11_element_array) with ordinality as elmnt (elmnt, elmnt_idx)
  126.             )
  127.           select--a table with updated element_value_array concatenated into string
  128.             event_tree_id,
  129.             event_idx,
  130.             event_attr_array,
  131.             elmnt_idx,
  132.             case
  133.               when element_value_array[1] = 'null_attr11' then ''
  134.               else array_to_string(
  135.                 element_value_array[:3] ||
  136.                 case
  137.                   when element_value_array[4] != '-1'--file instance is supposed to be in DB
  138.                   then check_not_null(--if there is no matching file instance then event_lob refers to a file instance that does not exist, which means the data is corrupted
  139.                     file_instance.new_id,
  140.                     array['file_instance for event_tree ', event_tree_id::text, 'event_idx', event_idx::text, 'elmnt_idx=', elmnt_idx::text] || 'element_value_array'::text || element_value_array)::text
  141.                   else '-1'
  142.                 end,
  143.               ':')
  144.             end as new_element
  145.           from
  146.             parsed_event_lob_parsed_attr11
  147.             --we use left join despite we must always find a corresponding row in the right-side table in order to be able to check this requirement by using pg_temp.check_not_null
  148.             left join (values (223, -223), (224, -224), (225, -225)) as file_instance (id, new_id) on element_value_array[4]::bigint = file_instance.id
  149.           ) parsed_event_lob_parsed_new_attr11--same as parsed_event_lob_parsed_attr11 but with updated value of 11th attr
  150.         group by event_tree_id, event_idx
  151.       )
  152.     select
  153.     event_tree_id,
  154.     string_agg(
  155.       array_to_string(
  156.         event_attr_array_new_attr11[:1] ||--a slice of the array with indices <=1, we need an array type here instead of an element of the array
  157.         check_not_null(--if there is no matching event tag then event_lob refers to an event tag that does not exist, which means the data is corrupted
  158.           event_tag.new_id,
  159.           array['event_tag for event_tree', event_tree_id::text, 'event_idx', event_idx::text] || 'event_attr_array_new_attr11'::text || event_attr_array_new_attr11)::text ||
  160.         check_not_null(
  161.           event_description.new_id,
  162.           array['event_description for event_tree', event_tree_id::text, 'event_idx', event_idx::text] || 'event_attr_array_new_attr11'::text || event_attr_array_new_attr11)::text ||
  163.         case
  164.           when event_attr_array_new_attr11[1] = 'C' or event_attr_array_new_attr11[1] = 'S' then
  165.             event_attr_array_new_attr11[4:7] ||
  166.             case
  167.               when event_attr_array_new_attr11[8] != '-1'--file is supposed to be in DB
  168.               then check_not_null(
  169.                 files.new_id,
  170.                 array['files for event_tree', event_tree_id::text, 'event_idx', event_idx::text] || 'event_attr_array_new_attr11'::text || event_attr_array_new_attr11)::text
  171.               else '-1'
  172.             end ||
  173.             event_attr_array_new_attr11[9:12]
  174.           when event_attr_array_new_attr11[1] = 'P' then
  175.             event_attr_array_new_attr11[4:6] ||
  176.             case
  177.               when event_attr_array_new_attr11[7] != '-1'--path_condition is supposed to be in DB
  178.               then check_not_null(
  179.                 path_condition.new_id,
  180.                 array['path_condition for event_tree', event_tree_id::text, 'event_idx', event_idx::text] || 'event_attr_array_new_attr11'::text || event_attr_array_new_attr11)::text
  181.               else '-1'
  182.             end ||
  183.             event_attr_array_new_attr11[8:11]
  184.           else
  185.             --it is [4:7] for 'R' and [4:6] for 'G', but array_to_string skips null (and therefore non-existing) array elements
  186.             event_attr_array_new_attr11[4:7]
  187.         end,
  188.         ','
  189.       ),
  190.       ';' order by event_idx
  191.     ) as new_event_lob
  192.     from
  193.       parsed_event_lob_parsed_new_attr11
  194.       --we use left join despite we must always find a corresponding row in the right-side table in order to be able to check this requirement by using pg_temp.check_not_null
  195.       left join (values (111, -111), (211, -211), (221, -221)) as event_tag (id, new_id) on event_attr_array_new_attr11[2]::bigint = event_tag.id
  196.       left join (values (112, -112), (212, -212), (222, -222)) as event_description (id, new_id) on event_attr_array_new_attr11[3]::bigint = event_description.id
  197.       left join (values (113, -113)) as files (id, new_id) on
  198.         case
  199.           when event_attr_array_new_attr11[1] = 'C' or event_attr_array_new_attr11[1] = 'S'
  200.           then event_attr_array_new_attr11[8]::bigint = files.id
  201.           else false
  202.         end
  203.       left join (values (114, -114)) as path_condition (id, new_id) on
  204.         case
  205.           when event_attr_array_new_attr11[1] = 'P' then event_attr_array_new_attr11[7]::bigint = path_condition.id
  206.           else false
  207.         end
  208.     group by event_tree_id
  209.   )
  210. select
  211.   event_tree_id,
  212.   new_model_lob,
  213.   new_event_lob,
  214.   /*
  215.    * PostgreSQL md5 function accepts a string instead of bytes, which means its behaviour depends on server_encoding
  216.    * (use "show server_encoding;" to query it).
  217.    * Since the application uses UTF-8 charset for converting a string into bytes,
  218.    * we can use PostgreSQL md5 function only if server_encoding is UTF-8.
  219.    * Fortunately, we require using UTF-8, so we are all good.
  220.    */
  221.   md5(coalesce(new_model_lob, '') || '|' || coalesce(new_event_lob, '')) as events_md5
  222. from
  223.   new_event_tree_model_lob
  224.   full join new_event_tree_event_lob using (event_tree_id);
  225.  
  226. --this function is used by the query above
  227. create or replace function check_not_null(in v bigint, in msg text[]) returns bigint as $$
  228. begin
  229.   if v is null then
  230.     raise exception 'The argument must not be null. %', msg;
  231.   end if;
  232.   return v;
  233. end;
  234. $$ language plpgsql immutable parallel safe;
  235.  
  236. /*
  237. event_tree.model_lob format:
  238.   models are separated by ';'
  239.   pieces of model data (attributes) are separated by ','
  240.   attributes:
  241.     [1] irrelevant
  242.     [2] FK -> files.id
  243.     [3] irrelevant
  244.     [4] FK -> stream_function.id
  245.     [5] FK -> function_metrics.id
  246.  
  247. event_tree.event_lob format:
  248.   events are separated by ';'
  249.   pieces of event data (attributes) are separated by ','
  250.   attributes:
  251.     [0] event type (one of C, S, P, R, G),
  252.     [1] FK -> event_tag.id,
  253.     [2] FK -> event_description.id,
  254.     [3] irrelevant
  255.     [4] irrelevant
  256.     [5] irrelevant
  257.   "C"-specific attributes:
  258.     [6] irrelevant
  259.     [7] FK -> files.id//can be -1 which means "no value"
  260.     [8] irrelevant
  261.     [9] irrelevant
  262.     "S"-specific attributes:
  263.       [10] elements
  264.         format:
  265.           elements are separated by '&'
  266.           pieces of element data (values) are separated by ':'
  267.           values:
  268.             [0] irrelevant
  269.             [1] irrelevant
  270.             [2] irrelevant
  271.             [3] FK -> file_instance.id//can be -1 which means "no value"
  272.       [11] irrelevant
  273.   "P"-specific attributes:
  274.     [6] FK -> path_condition.id//can be -1 which means "no value"
  275.     [7] irrelevant
  276.     [8] irrelevant
  277.     [9] irrelevant
  278.     [10] irrelevant
  279.   "R"-specific attributes:
  280.     [6] irrelevant
  281.   "G"-specific attributes:
  282.     none
  283. */
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top