Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- -- Install DuckDB command line from
- -- https://duckdb.org/docs/installation/
- -- Download the data set:
- -- https://www.stats.govt.nz/large-datasets/csv-files-for-download/
- -- https://www3.stats.govt.nz/2018census/Age-sex-by-ethnic-group-grouped-total-responses-census-usually-resident-population-counts-2006-2013-2018-Censuses-RC-TA-SA2-DHB.zip
- -- Run DuckDB from the command line or in a GUI that supports DuckDB (ex: DBeaver Community edition)
- -- https://dbeaver.io/download/
- -- ================================================================================
- SET VARIABLE path = "C:\Users\bdill\Downloads\NZ-census-2006-2013-2018\";
- -- ================================================================================
- CREATE OR REPLACE TABLE fact_nz AS SELECT * FROM read_csv( CONCAT(getvariable('path'), 'Data8277.csv') ); -- 837 MB in 4.5 seconds
- CREATE OR REPLACE TABLE dim_age AS SELECT * FROM read_csv( CONCAT(getvariable('path'), 'DimenLookupAge8277.csv') );
- CREATE OR REPLACE TABLE dim_area AS SELECT * FROM read_csv( CONCAT(getvariable('path'), 'DimenLookupArea8277.csv') );
- CREATE OR REPLACE TABLE dim_ethnic AS SELECT * FROM read_csv( CONCAT(getvariable('path'), 'DimenLookupEthnic8277.csv') );
- CREATE OR REPLACE TABLE dim_sex AS SELECT * FROM read_csv( CONCAT(getvariable('path'), 'DimenLookupSex8277.csv') ); --1,2,9
- CREATE OR REPLACE TABLE dim_year AS SELECT * FROM read_csv( CONCAT(getvariable('path'), 'DimenLookupYear8277.csv') );
- -- ================================================================================
- -- convert the 800+ MB csv to parquet just to see how fast it is and how much compression we get.
- COPY (SELECT * FROM fact_nz) -- 837 MB -> 75 MB IN 2.1 sec
- TO "C:\Users\bdill\Downloads\NZ-census-2006-2013-2018\Data8777.parquet" (FORMAT parquet);
- -- ================================================================================
- -- SELECT counts for all areas, sex and ethnicities
- -- GROUP BY ages
- -- and PIVOT over the 3 years
- SELECT *
- FROM (
- SELECT fact_nz.YEAR
- --, fact_nz.age
- , da.description AS age_desc
- , da.SortOrder
- --, fact_nz.ethnic
- --, fact_nz.sex
- --, fact_nz.area
- , fact_nz.count AS n
- FROM fact_nz
- JOIN dim_age AS da ON da.code = fact_nz.age
- WHERE area IN ('9999') -- 9999 = Total - New Zealand by Regional Council/SA2
- AND sex = '9' -- 9 = Total people - sex
- AND ethnic = '9999' -- 9999 = Total people
- --AND age IN ('999999') = Total people - age group
- --AND age IN ('1', '2', '3', '4') --
- --AND age IN (SELECT code FROM dim_age WHERE LEN(code) = 2) -- double digit age codes - 5 year bins
- AND age IN (SELECT code FROM dim_age WHERE LEN(code) = 3 AND code < '130') -- triple digit age codes - single year bins
- ) AS X
- PIVOT (
- MAX(N) FOR "YEAR" IN ('2006', '2013', '2018')
- ) AS PIV
- ORDER BY SortOrder
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement