Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- /*
- * event_tree.mode_lob and event_tree.event_lob contain contain textual data which refers to rows in other tables.
- * It is similar to FK constraints, but there is no DBMS support. So we have to parse these columns and update parts
- * that refer to rows in other tables with the new identifiers for those rows. Given that the structure of event_tree.event_lob
- * is far from trivial, doing this in SQL is ugly and not easy, but the alternative
- * (download (in pieces) all the data and process it in Java by making more requests to DBMS)
- * is likely not a viable solution performance-wise.
- *
- * The strings are parsed by
- * - splitting them into arrays;
- * - unnesting arrays into tables;
- * - updating elements of arrays (which are arrays on their own) that require updates;
- * - aggregating unnested array elements back into arrays;
- * - joining arrays back into strings.
- * Note that event_tree.mode_lob effectively contains an array of arrays,
- * while event_tree.event_lob may contain array of arrays of arrays (i.e. double nesting)
- * which makes the job of updating event_tree.event_lob much more difficult.
- *
- * Note that indices of PostgreSQL arrays are 1-based
- * and DBMS allows them to go out of bounds of an array in which case the value is considered null.
- *
- * Originally this was an update command, but it was changed to be a select and
- * real tables were replaced with fake ones produced by using values lists, so the query could be run immediately anywhere.
- */
- with
- new_event_tree_model_lob as (--the format of event_tree.model_lob is specified below
- select
- event_tree_id,
- string_agg(
- array_to_string(
- model_attr_array[:1] ||--a slice of the array with indices <=1, we need an array type here instead of an element of the array
- --if there is no matching file then model_lob refers to a file that does not exist, which means the data is corrupted
- check_not_null(
- files.new_id,
- array['files for event_tree', event_tree_id::text, 'model_idx', model_idx::text] || 'model_attr_array'::text || model_attr_array)::text ||
- model_attr_array[3] ||
- check_not_null(
- stream_function.new_id,
- array['stream_function for event_tree', event_tree_id::text, 'model_idx', model_idx::text] || 'model_attr_array'::text || model_attr_array)::text ||
- check_not_null(
- function_metrics.new_id,
- array['function_metrics for event_tree', event_tree_id::text, 'model_idx', model_idx::text] || 'model_attr_array'::text || model_attr_array)::text,
- ','
- ),
- ';' order by model_idx
- ) as new_model_lob
- from
- (select id as event_tree_id, model_idx, string_to_array(model, ',') as model_attr_array
- from
- (values (1001, '0,111,1,113,114;0,121,1,123,124'), (1002, '0,211,1,213,214')) as event_tree (id, model_lob)
- cross join unnest(string_to_array(model_lob, ';')) with ordinality as model (model, model_idx)--a table containing each model from model_lob in its own row
- ) as parsed_model_lob--a table containing array of attributes representing each model (an array of attributes) from model_lob in its own row
- --we use left join despite we must always find a corresponding row in the right-side table in order to be able to check this requirement by using pg_temp.check_not_null
- left join (values (111, -111), (121, -121), (211, -211)) as files (id, new_id) on model_attr_array[2]::bigint = files.id
- left join (values (113, -113), (123, -123), (213, -213)) as stream_function (id, new_id) on model_attr_array[4]::bigint = stream_function.id
- left join (values (114, -114), (124, -124), (214, -214)) as function_metrics (id, new_id) on model_attr_array[5]::bigint = function_metrics.id
- group by event_tree_id
- ),
- new_event_tree_event_lob as (--the format of event_tree.event_lob is specified below
- with
- parsed_event_lob_parsed_new_attr11 as (--same as parsed_event_lob_parsed_attr11 but with updated value of 11th attr
- select
- event_tree_id,
- event_idx,
- /*
- * There is only a single row for each event_tree_id and event_idx
- * that has non-null event_attr_array. We are using max aggregate function to extract that non-null value instead of
- * having the value in all rows and group by not only event_tree_id and event_idx
- * but also event_attr_array which would have been more resource-consuming.
- *
- * max is an immutable function, so PostgreSQL can (and most likely does) calculate the result only once per selected row,
- * thus we should not worry about "calling" the function multiple times per row with the same arguments.
- */
- case when (max(event_attr_array)::text[])[1] = 'S'
- then (max(event_attr_array)::text[])[:10] || string_agg(new_element, '&' order by elmnt_idx) || (max(event_attr_array)::text[])[12:]
- else max(event_attr_array)
- end event_attr_array_new_attr11
- from
- (with
- parsed_event_lob_parsed_attr11 as (
- select
- event_tree_id, event_idx,
- /*
- * Select array only for the first element.
- * This allows us to group by event_idx instead of grouping by both event_idx and event_attr_array_no_attr11 thus reduces costs
- * and also potentially reduces memory / temporary storage consumption.
- */
- case
- when elmnt_idx = 1 then event_attr_array
- else null
- end as event_attr_array,
- elmnt_idx,
- string_to_array(elmnt, ':') as element_value_array
- from
- (select
- event_tree_id,
- event_idx,
- event_attr_array,
- string_to_array(
- case
- when event_attr_array[1] = 'S' then
- case
- --'null_attr11' is a special dummy value for empty 11th attr to make sure we are not loosing it
- when event_attr_array[11] = '' then 'null_attr11'
- else event_attr_array[11]
- end
- /*
- * Have to use a string and later specify it as a null-string in string_to_array
- * because otherwise string_to_array produces an array with no elements which when unnested results in a table with no rows,
- * and cross join (which is also used when no join type is specified) with such a table also produces no rows, which leads to us losing data.
- */
- else 'null'
- end,
- '&', 'null'--specify null-string as mentioned above
- ) as event_attr11_element_array
- from
- (select id as event_tree_id, event_idx, string_to_array(event, ',') as event_attr_array
- from
- (values (1001, 'C,111,112,0~0,0,-1,0,113,-1,0;P,111,112,0~0,0,-1,114,0,0,0,0'), (1002, 'G,211,212,0~0,0,-1;S,221,222,0~0,0,-1,0,-1,-1,0,,-;S,221,222,0~0,0,-1,0,-1,-1,0,com/File1.java:123:com.Cls.function1:223&com/File2.java:231:com.Cls.function2:224&com/File3.java:312:com.Cls.function3:225&/java/lang/Thread.java:-1:java.lang.Thread.run:-1,-'), (1003, 'R,211,212,0~0,0,-1,-')) as event_tree (id, event_lob)
- cross join unnest(string_to_array(event_lob, ';')) with ordinality as event (event, event_idx)--a table containing each event from event_lob in its own row
- where event_tree.id != 1--event tree 1 is an invalid dummy entry
- ) as parsed_event_lob--a table containing array of attributes representing each event from event_lob in its own row
- --a table containing array of attributes representing each event from event_lob in its own row with 11th attr extracted in a separate column and parsed into an array of elements
- ) as parsed_event_lob_preparsed_attr11
- cross join unnest(event_attr11_element_array) with ordinality as elmnt (elmnt, elmnt_idx)
- )
- select--a table with updated element_value_array concatenated into string
- event_tree_id,
- event_idx,
- event_attr_array,
- elmnt_idx,
- case
- when element_value_array[1] = 'null_attr11' then ''
- else array_to_string(
- element_value_array[:3] ||
- case
- when element_value_array[4] != '-1'--file instance is supposed to be in DB
- then check_not_null(--if there is no matching file instance then event_lob refers to a file instance that does not exist, which means the data is corrupted
- file_instance.new_id,
- array['file_instance for event_tree ', event_tree_id::text, 'event_idx', event_idx::text, 'elmnt_idx=', elmnt_idx::text] || 'element_value_array'::text || element_value_array)::text
- else '-1'
- end,
- ':')
- end as new_element
- from
- parsed_event_lob_parsed_attr11
- --we use left join despite we must always find a corresponding row in the right-side table in order to be able to check this requirement by using pg_temp.check_not_null
- left join (values (223, -223), (224, -224), (225, -225)) as file_instance (id, new_id) on element_value_array[4]::bigint = file_instance.id
- ) parsed_event_lob_parsed_new_attr11--same as parsed_event_lob_parsed_attr11 but with updated value of 11th attr
- group by event_tree_id, event_idx
- )
- select
- event_tree_id,
- string_agg(
- array_to_string(
- event_attr_array_new_attr11[:1] ||--a slice of the array with indices <=1, we need an array type here instead of an element of the array
- check_not_null(--if there is no matching event tag then event_lob refers to an event tag that does not exist, which means the data is corrupted
- event_tag.new_id,
- array['event_tag for event_tree', event_tree_id::text, 'event_idx', event_idx::text] || 'event_attr_array_new_attr11'::text || event_attr_array_new_attr11)::text ||
- check_not_null(
- event_description.new_id,
- array['event_description for event_tree', event_tree_id::text, 'event_idx', event_idx::text] || 'event_attr_array_new_attr11'::text || event_attr_array_new_attr11)::text ||
- case
- when event_attr_array_new_attr11[1] = 'C' or event_attr_array_new_attr11[1] = 'S' then
- event_attr_array_new_attr11[4:7] ||
- case
- when event_attr_array_new_attr11[8] != '-1'--file is supposed to be in DB
- then check_not_null(
- files.new_id,
- array['files for event_tree', event_tree_id::text, 'event_idx', event_idx::text] || 'event_attr_array_new_attr11'::text || event_attr_array_new_attr11)::text
- else '-1'
- end ||
- event_attr_array_new_attr11[9:12]
- when event_attr_array_new_attr11[1] = 'P' then
- event_attr_array_new_attr11[4:6] ||
- case
- when event_attr_array_new_attr11[7] != '-1'--path_condition is supposed to be in DB
- then check_not_null(
- path_condition.new_id,
- array['path_condition for event_tree', event_tree_id::text, 'event_idx', event_idx::text] || 'event_attr_array_new_attr11'::text || event_attr_array_new_attr11)::text
- else '-1'
- end ||
- event_attr_array_new_attr11[8:11]
- else
- --it is [4:7] for 'R' and [4:6] for 'G', but array_to_string skips null (and therefore non-existing) array elements
- event_attr_array_new_attr11[4:7]
- end,
- ','
- ),
- ';' order by event_idx
- ) as new_event_lob
- from
- parsed_event_lob_parsed_new_attr11
- --we use left join despite we must always find a corresponding row in the right-side table in order to be able to check this requirement by using pg_temp.check_not_null
- left join (values (111, -111), (211, -211), (221, -221)) as event_tag (id, new_id) on event_attr_array_new_attr11[2]::bigint = event_tag.id
- left join (values (112, -112), (212, -212), (222, -222)) as event_description (id, new_id) on event_attr_array_new_attr11[3]::bigint = event_description.id
- left join (values (113, -113)) as files (id, new_id) on
- case
- when event_attr_array_new_attr11[1] = 'C' or event_attr_array_new_attr11[1] = 'S'
- then event_attr_array_new_attr11[8]::bigint = files.id
- else false
- end
- left join (values (114, -114)) as path_condition (id, new_id) on
- case
- when event_attr_array_new_attr11[1] = 'P' then event_attr_array_new_attr11[7]::bigint = path_condition.id
- else false
- end
- group by event_tree_id
- )
- select
- event_tree_id,
- new_model_lob,
- new_event_lob,
- /*
- * PostgreSQL md5 function accepts a string instead of bytes, which means its behaviour depends on server_encoding
- * (use "show server_encoding;" to query it).
- * Since the application uses UTF-8 charset for converting a string into bytes,
- * we can use PostgreSQL md5 function only if server_encoding is UTF-8.
- * Fortunately, we require using UTF-8, so we are all good.
- */
- md5(coalesce(new_model_lob, '') || '|' || coalesce(new_event_lob, '')) as events_md5
- from
- new_event_tree_model_lob
- full join new_event_tree_event_lob using (event_tree_id);
- --this function is used by the query above
- create or replace function check_not_null(in v bigint, in msg text[]) returns bigint as $$
- begin
- if v is null then
- raise exception 'The argument must not be null. %', msg;
- end if;
- return v;
- end;
- $$ language plpgsql immutable parallel safe;
- /*
- event_tree.model_lob format:
- models are separated by ';'
- pieces of model data (attributes) are separated by ','
- attributes:
- [1] irrelevant
- [2] FK -> files.id
- [3] irrelevant
- [4] FK -> stream_function.id
- [5] FK -> function_metrics.id
- event_tree.event_lob format:
- events are separated by ';'
- pieces of event data (attributes) are separated by ','
- attributes:
- [0] event type (one of C, S, P, R, G),
- [1] FK -> event_tag.id,
- [2] FK -> event_description.id,
- [3] irrelevant
- [4] irrelevant
- [5] irrelevant
- "C"-specific attributes:
- [6] irrelevant
- [7] FK -> files.id//can be -1 which means "no value"
- [8] irrelevant
- [9] irrelevant
- "S"-specific attributes:
- [10] elements
- format:
- elements are separated by '&'
- pieces of element data (values) are separated by ':'
- values:
- [0] irrelevant
- [1] irrelevant
- [2] irrelevant
- [3] FK -> file_instance.id//can be -1 which means "no value"
- [11] irrelevant
- "P"-specific attributes:
- [6] FK -> path_condition.id//can be -1 which means "no value"
- [7] irrelevant
- [8] irrelevant
- [9] irrelevant
- [10] irrelevant
- "R"-specific attributes:
- [6] irrelevant
- "G"-specific attributes:
- none
- */
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement