-------------------------------------------------------------------------------
-- Test run on PostgreSQL 9.1.2 on 2012-02-21
--
-- CONCLUSION: Max in a sub-select seems to be the best approach in general.
-- It is the only approach that enables table elimination. For particular
-- queries using a row_number() = 1 can be faster, but it is not in general.
--
-- Clustering gave no performance boost whatsoever, likely because it does not
-- rearrange the actual table data in PostgreSQL.
-------------------------------------------------------------------------------
DROP VIEW IF EXISTS rowGenerator;
DROP TABLE IF EXISTS ANAT2_AnchorAttribute2 CASCADE;
DROP TABLE IF EXISTS ANAT1_AnchorAttribute1 CASCADE;
DROP TABLE IF EXISTS AN_Anchor CASCADE;
CREATE TABLE IF NOT EXISTS AN_Anchor (
AN_ID int NOT NULL,
CONSTRAINT pk_AN_Anchor PRIMARY KEY(AN_ID)
);
--ALTER TABLE AN_Anchor CLUSTER ON pk_AN_Anchor;
CREATE TABLE IF NOT EXISTS ANAT1_AnchorAttribute1 (
AN_ID int NOT NULL,
ANAT1_AnchorAttribute1Value char(10) NOT NULL,
CONSTRAINT pk_ANAT1_AnchorAttribute1
PRIMARY KEY(AN_ID),
CONSTRAINT fk_ANAT1_AnchorAttribute1
FOREIGN KEY(AN_ID) REFERENCES AN_Anchor(AN_ID)
);
--ALTER TABLE ANAT1_AnchorAttribute1 CLUSTER ON pk_ANAT1_AnchorAttribute1;
CREATE TABLE IF NOT EXISTS ANAT2_AnchorAttribute2 (
AN_ID int NOT NULL,
ANAT2_AnchorAttribute2Value char(10) NOT NULL,
ANAT2_FromDate date NOT NULL,
CONSTRAINT pk_ANAT2_AnchorAttribute2
PRIMARY KEY(AN_ID, ANAT2_FromDate),
CONSTRAINT fk_ANAT2_AnchorAttribute2
FOREIGN KEY(AN_ID) REFERENCES AN_Anchor(AN_ID)
);
--ALTER TABLE ANAT2_AnchorAttribute2 CLUSTER ON pk_ANAT2_AnchorAttribute2;
-------------------------------------------------------------------------------
-- The following will be used to generate som sample data.
-- Anchor: 1 million rows
-- Attribute1: 1 million rows (three values)
-- Attribute2: 3 million rows (three values at three different changing times)
-- It takes approximately 5 minutes to populate tables
-------------------------------------------------------------------------------
CREATE VIEW rowGenerator
AS
WITH RECURSIVE gen(n) AS (
VALUES (1)
UNION ALL
SELECT n+1 FROM gen WHERE n < 1000000
)
SELECT
n
FROM
gen;
INSERT INTO AN_Anchor
SELECT
n
FROM
rowGenerator;
INSERT INTO ANAT1_AnchorAttribute1
SELECT
n,
CASE n%3
WHEN 0 THEN 'Red'
WHEN 1 THEN 'Green'
WHEN 2 THEN 'Blue'
END
FROM
rowGenerator;
INSERT INTO ANAT2_AnchorAttribute2
SELECT
n,
'Car',
to_date('20120101','yyyymmdd')
FROM
rowGenerator;
INSERT INTO ANAT2_AnchorAttribute2
SELECT
n,
'Bus',
to_date('20120102','yyyymmdd')
FROM
rowGenerator;
INSERT INTO ANAT2_AnchorAttribute2
SELECT
n,
'Train',
to_date('20120103','yyyymmdd')
FROM
rowGenerator;
-------------------------------------------------------------------------------
-- Refresh the clustered indexes (1 minute).
-- Update statistics (20 seconds).
-------------------------------------------------------------------------------
/*
CLUSTER; -- needs to be run separately
VACUUM ANALYZE;
*/
-------------------------------------------------------------------------------
-- Looking at some rows from the data.
-------------------------------------------------------------------------------
SELECT * FROM AN_Anchor ORDER BY AN_ID LIMIT 10;
SELECT * FROM ANAT1_AnchorAttribute1 ORDER BY AN_ID LIMIT 10;
SELECT * FROM ANAT2_AnchorAttribute2 ORDER BY AN_ID LIMIT 10;
-------------------------------------------------------------------------------
-- Testing which approach will be the best for finding latest data.
-------------------------------------------------------------------------------
-- explain
SELECT
count(*)
FROM (
SELECT DISTINCT
AN_ID,
first_value(ANAT2_AnchorAttribute2Value) OVER w
AS ANAT2_AnchorAttribute2Value,
first_value(ANAT2_FromDate) OVER w
AS ANAT2_FromDate
FROM
ANAT2_AnchorAttribute2
WINDOW
w AS (PARTITION BY AN_ID ORDER BY ANAT2_FromDate DESC)
) x;
-- takes 16 - 30 seconds (inconsistent)
-- explain
SELECT
count(*)
FROM (
SELECT
AN_ID,
ANAT2_AnchorAttribute2Value,
ANAT2_FromDate
FROM
ANAT2_AnchorAttribute2 ANAT2
WHERE
ANAT2_FromDate = (
SELECT
max(sub.ANAT2_FromDate)
FROM
ANAT2_AnchorAttribute2 sub
WHERE
sub.AN_ID = ANAT2.AN_ID
)
) x;
-- takes 17 seconds (consistent)
-- explain
SELECT
count(*)
FROM (
SELECT
ANAT2.AN_ID,
ANAT2.ANAT2_AnchorAttribute2Value,
ANAT2.ANAT2_FromDate
FROM
ANAT2_AnchorAttribute2 ANAT2
JOIN (
SELECT
AN_ID,
max(ANAT2_FromDate) AS ANAT2_FromDate
FROM
ANAT2_AnchorAttribute2
GROUP BY
AN_ID
) sub
ON
sub.AN_ID = ANAT2.AN_ID
AND
sub.ANAT2_FromDate = ANAT2.ANAT2_FromDate
) x;
-- takes 13 seconds (consistent)
-- explain
SELECT
count(*)
FROM (
SELECT
AN_ID,
ANAT2_AnchorAttribute2Value,
ANAT2_FromDate,
row_number() OVER (
PARTITION BY
AN_ID
ORDER BY
ANAT2_FromDate DESC
) AS rowNumber
FROM
ANAT2_AnchorAttribute2
) x
WHERE
rowNumber = 1;
-- takes 7 - 11 seconds (inconsistent)
-------------------------------------------------------------------------------
-- Create the latest view for the anchor using the best approach found above.
-------------------------------------------------------------------------------
CREATE OR REPLACE VIEW lAN_Anchor_Alt1
AS
SELECT
AN.AN_ID,
ANAT1.ANAT1_AnchorAttribute1Value,
ANAT2.ANAT2_AnchorAttribute2Value,
ANAT2.ANAT2_FromDate
FROM
AN_Anchor AN
LEFT JOIN
ANAT1_AnchorAttribute1 ANAT1
ON
ANAT1.AN_ID = AN.AN_ID
LEFT JOIN (
SELECT
AN_ID,
ANAT2_AnchorAttribute2Value,
ANAT2_FromDate,
row_number() OVER (
PARTITION BY
AN_ID
ORDER BY
ANAT2_FromDate DESC
) AS rowNumber
FROM
ANAT2_AnchorAttribute2
) ANAT2
ON
ANAT2.rowNumber = 1
AND
ANAT2.AN_ID = AN.AN_ID;
SELECT
max(ANAT1_AnchorAttribute1Value)
FROM
lAN_Anchor_Alt1
WHERE
ANAT2_AnchorAttribute2Value = 'Train';
-- takes 20 seconds to run (consistent)
SELECT
max(ANAT1_AnchorAttribute1Value)
FROM
lAN_Anchor_Alt1;
-- takes 12 -15 seconds to run (inconsistent)
-------------------------------------------------------------------------------
-- Create the latest view for the anchor using the original approach.
-------------------------------------------------------------------------------
CREATE OR REPLACE VIEW lAN_Anchor_Alt2
AS
SELECT
AN.AN_ID,
ANAT1.ANAT1_AnchorAttribute1Value,
ANAT2.ANAT2_AnchorAttribute2Value,
ANAT2.ANAT2_FromDate
FROM
AN_Anchor AN
LEFT JOIN
ANAT1_AnchorAttribute1 ANAT1
ON
ANAT1.AN_ID = AN.AN_ID
LEFT JOIN
ANAT2_AnchorAttribute2 ANAT2
ON
ANAT2.AN_ID = AN.AN_ID
AND
ANAT2.ANAT2_FromDate = (
SELECT
max(sub.ANAT2_FromDate)
FROM
ANAT2_AnchorAttribute2 sub
WHERE
sub.AN_ID = AN.AN_ID
);
SELECT
max(ANAT1_AnchorAttribute1Value)
FROM
lAN_Anchor_Alt2
WHERE
ANAT2_AnchorAttribute2Value = 'Train';
-- takes 16 seconds to run (consistent)
SELECT
max(ANAT1_AnchorAttribute1Value)
FROM
lAN_Anchor_Alt2;
-- takes 2 seconds to run (consistent)
-------------------------------------------------------------------------------
-- Create the latest view for the anchor using the group by approach.
-------------------------------------------------------------------------------
CREATE OR REPLACE VIEW lAN_Anchor_Alt3
AS
SELECT
AN.AN_ID,
ANAT1.ANAT1_AnchorAttribute1Value,
ANAT2.ANAT2_AnchorAttribute2Value,
ANAT2.ANAT2_FromDate
FROM
AN_Anchor AN
LEFT JOIN
ANAT1_AnchorAttribute1 ANAT1
ON
ANAT1.AN_ID = AN.AN_ID
LEFT JOIN (
SELECT
aANAT2.AN_ID,
aANAT2.ANAT2_AnchorAttribute2Value,
aANAT2.ANAT2_FromDate
FROM
ANAT2_AnchorAttribute2 aANAT2
JOIN (
SELECT
AN_ID,
max(ANAT2_FromDate) AS ANAT2_FromDate
FROM
ANAT2_AnchorAttribute2
GROUP BY
AN_ID
) bANAT2
ON
bANAT2.AN_ID = aANAT2.AN_ID
AND
bANAT2.ANAT2_FromDate = aANAT2.ANAT2_FromDate
) ANAT2
ON
ANAT2.AN_ID = AN.AN_ID;
SELECT
max(ANAT1_AnchorAttribute1Value)
FROM
lAN_Anchor_Alt3
WHERE
ANAT2_AnchorAttribute2Value = 'Train';
-- takes 19 seconds to run (consistent)
SELECT
max(ANAT1_AnchorAttribute1Value)
FROM
lAN_Anchor_Alt3;
-- takes 16 seconds to run (consistent)
-------------------------------------------------------------------------------
-- Compare some explains when selecting all columns.
-------------------------------------------------------------------------------
explain select * from lAN_Anchor_Alt1;
/*
"Merge Left Join (cost=556437.96..733458.71 rows=1000000 width=30)"
" Merge Cond: (an.an_id = anat2.an_id)"
" -> Merge Left Join (cost=1.49..76797.25 rows=1000000 width=15)"
" Merge Cond: (an.an_id = anat1.an_id)"
" -> Index Scan using pk_an_anchor on an_anchor an (cost=0.00..30408.36 rows=1000000 width=4)"
" -> Index Scan using pk_anat1_anchorattribute1 on anat1_anchorattribute1 anat1 (cost=0.00..31389.36 rows=1000000 width=15)"
" -> Materialize (cost=556436.47..653973.97 rows=15000 width=19)"
" -> Subquery Scan on anat2 (cost=556436.47..653936.47 rows=15000 width=19)"
" Filter: (anat2.rownumber = 1)"
" -> WindowAgg (cost=556436.47..616436.47 rows=3000000 width=19)"
" -> Sort (cost=556436.47..563936.47 rows=3000000 width=19)"
" Sort Key: anat2_anchorattribute2.an_id, anat2_anchorattribute2.anat2_fromdate"
" -> Seq Scan on anat2_anchorattribute2 (cost=0.00..49109.00 rows=3000000 width=19)"
*/
explain select * from lAN_Anchor_Alt2;
/*
"Merge Left Join (cost=556437.96..11943249.83 rows=1000000 width=30)"
" Merge Cond: (an.an_id = anat2.an_id)"
" Join Filter: (anat2.anat2_fromdate = (SubPlan 2))"
" -> Merge Left Join (cost=1.49..76797.25 rows=1000000 width=15)"
" Merge Cond: (an.an_id = anat1.an_id)"
" -> Index Scan using pk_an_anchor on an_anchor an (cost=0.00..30408.36 rows=1000000 width=4)"
" -> Index Scan using pk_anat1_anchorattribute1 on anat1_anchorattribute1 anat1 (cost=0.00..31389.36 rows=1000000 width=15)"
" -> Materialize (cost=556436.47..571436.47 rows=3000000 width=19)"
" -> Sort (cost=556436.47..563936.47 rows=3000000 width=19)"
" Sort Key: anat2.an_id"
" -> Seq Scan on anat2_anchorattribute2 anat2 (cost=0.00..49109.00 rows=3000000 width=19)"
" SubPlan 2"
" -> Result (cost=4.05..4.06 rows=1 width=0)"
" InitPlan 1 (returns $1)"
" -> Limit (cost=0.00..4.05 rows=1 width=4)"
" -> Index Scan Backward using pk_anat2_anchorattribute2 on anat2_anchorattribute2 sub (cost=0.00..12.14 rows=3 width=4)"
" Index Cond: ((an_id = an.an_id) AND (anat2_fromdate IS NOT NULL))"
*/
explain select * from lAN_Anchor_Alt3;
/*
"Merge Left Join (cost=1051344.42..1237691.33 rows=1000000 width=30)"
" Merge Cond: (an.an_id = aanat2.an_id)"
" -> Merge Left Join (cost=1.49..76797.25 rows=1000000 width=15)"
" Merge Cond: (an.an_id = anat1.an_id)"
" -> Index Scan using pk_an_anchor on an_anchor an (cost=0.00..30408.36 rows=1000000 width=4)"
" -> Index Scan using pk_anat1_anchorattribute1 on anat1_anchorattribute1 anat1 (cost=0.00..31389.36 rows=1000000 width=15)"
" -> Materialize (cost=1051342.93..1158217.92 rows=15000 width=19)"
" -> Merge Join (cost=1051342.93..1158180.42 rows=15000 width=19)"
" Merge Cond: (anat2_anchorattribute2.an_id = aanat2.an_id)"
" Join Filter: (aanat2.anat2_fromdate = (max(anat2_anchorattribute2.anat2_fromdate)))"
" -> GroupAggregate (cost=494906.47..528223.13 rows=1081666 width=8)"
" -> Sort (cost=494906.47..502406.47 rows=3000000 width=8)"
" Sort Key: anat2_anchorattribute2.an_id"
" -> Seq Scan on anat2_anchorattribute2 (cost=0.00..49109.00 rows=3000000 width=8)"
" -> Materialize (cost=556436.47..571436.47 rows=3000000 width=19)"
" -> Sort (cost=556436.47..563936.47 rows=3000000 width=19)"
" Sort Key: aanat2.an_id"
" -> Seq Scan on anat2_anchorattribute2 aanat2 (cost=0.00..49109.00 rows=3000000 width=19)"
*/
-------------------------------------------------------------------------------
-- Verify table elimination by selecting only one column.
-------------------------------------------------------------------------------
explain select ANAT1_AnchorAttribute1Value from lAN_Anchor_Alt1;
-- NOTE THAT THERE IS _NO_ TABLE ELIMINIATION OF THE SECOND ATTRIBUTE!
/*
"Merge Left Join (cost=556437.96..733458.71 rows=1000000 width=11)"
" Merge Cond: (an.an_id = anat2.an_id)"
" -> Merge Left Join (cost=1.49..76797.25 rows=1000000 width=15)"
" Merge Cond: (an.an_id = anat1.an_id)"
" -> Index Scan using pk_an_anchor on an_anchor an (cost=0.00..30408.36 rows=1000000 width=4)"
" -> Index Scan using pk_anat1_anchorattribute1 on anat1_anchorattribute1 anat1 (cost=0.00..31389.36 rows=1000000 width=15)"
" -> Materialize (cost=556436.47..653973.97 rows=15000 width=4)"
" -> Subquery Scan on anat2 (cost=556436.47..653936.47 rows=15000 width=4)"
" Filter: (anat2.rownumber = 1)"
" -> WindowAgg (cost=556436.47..616436.47 rows=3000000 width=19)"
" -> Sort (cost=556436.47..563936.47 rows=3000000 width=19)"
" Sort Key: anat2_anchorattribute2.an_id, anat2_anchorattribute2.anat2_fromdate"
" -> Seq Scan on anat2_anchorattribute2 (cost=0.00..49109.00 rows=3000000 width=19)"
*/
explain select ANAT1_AnchorAttribute1Value from lAN_Anchor_Alt2;
-- NOTE THAT THERE _IS_ TABLE ELIMINIATION OF THE SECOND ATTRIBUTE!
/*
"Merge Left Join (cost=1.49..76797.25 rows=1000000 width=11)"
" Merge Cond: (an.an_id = anat1.an_id)"
" -> Index Scan using pk_an_anchor on an_anchor an (cost=0.00..30408.36 rows=1000000 width=4)"
" -> Index Scan using pk_anat1_anchorattribute1 on anat1_anchorattribute1 anat1 (cost=0.00..31389.36 rows=1000000 width=15)"
*/
explain select ANAT1_AnchorAttribute1Value from lAN_Anchor_Alt3;
-- NOTE THAT THERE IS _NO_ TABLE ELIMINIATION OF THE SECOND ATTRIBUTE!
/*
"Merge Left Join (cost=989814.42..1176161.33 rows=1000000 width=11)"
" Merge Cond: (an.an_id = aanat2.an_id)"
" -> Merge Left Join (cost=1.49..76797.25 rows=1000000 width=15)"
" Merge Cond: (an.an_id = anat1.an_id)"
" -> Index Scan using pk_an_anchor on an_anchor an (cost=0.00..30408.36 rows=1000000 width=4)"
" -> Index Scan using pk_anat1_anchorattribute1 on anat1_anchorattribute1 anat1 (cost=0.00..31389.36 rows=1000000 width=15)"
" -> Materialize (cost=989812.93..1096687.92 rows=15000 width=4)"
" -> Merge Join (cost=989812.93..1096650.42 rows=15000 width=4)"
" Merge Cond: (anat2_anchorattribute2.an_id = aanat2.an_id)"
" Join Filter: (aanat2.anat2_fromdate = (max(anat2_anchorattribute2.anat2_fromdate)))"
" -> GroupAggregate (cost=494906.47..528223.13 rows=1081666 width=8)"
" -> Sort (cost=494906.47..502406.47 rows=3000000 width=8)"
" Sort Key: anat2_anchorattribute2.an_id"
" -> Seq Scan on anat2_anchorattribute2 (cost=0.00..49109.00 rows=3000000 width=8)"
" -> Materialize (cost=494906.47..509906.47 rows=3000000 width=8)"
" -> Sort (cost=494906.47..502406.47 rows=3000000 width=8)"
" Sort Key: aanat2.an_id"
" -> Seq Scan on anat2_anchorattribute2 aanat2 (cost=0.00..49109.00 rows=3000000 width=8)"
*/