Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- /*
- Pairwise distance between every single members of the parliament
- active in that specific term, for every single voting that happened.
- Generates a 61GB dataset for the 14,452 voting events.
- Next step is reducing the amount of data, using roling window aggregation
- on the time dimension, and clustering on the people dimension.
- */
- WITH
- extended AS (
- SELECT
- person,
- party,
- vote_timestamp,
- -- to reduce data volume, we aggregate
- -- distances down to weekly granurality
- --
- -- also, analytic functions don't work
- -- on date/time data types, we need number
- DATE(vote_timestamp) AS date_week, -- human readable
- DATE_DIFF( -- for analytic functions
- DATE(vote_timestamp),
- DATE('2009-01-01'),
- WEEK
- ) AS week_no,
- vote_subject_hash,
- cycle,
- (
- CASE
- WHEN vote = 'igen'
- THEN 1
- WHEN vote = 'nem'
- THEN 0
- -- considering absence
- -- and not voting equal
- ELSE .5
- END
- ) AS vote
- FROM
- `atlo-parliament-polarization.scraped.ready_for_distance_calculation`
- )
- SELECT
- date_week,
- week_no,
- vote_timestamp,
- vote_subject_hash,
- cycle,
- ARRAY_AGG(
- STRUCT(
- q1.person AS source_person,
- q1.party AS source_party,
- q2.person AS target_person,
- q2.party AS target_party,
- ABS(q1.vote - q2.vote) AS distance
- )
- ) AS edges
- FROM
- extended AS q1
- FULL OUTER JOIN
- extended AS q2
- USING(
- date_week,
- week_no,
- cycle,
- vote_timestamp,
- vote_subject_hash
- )
- WHERE
- -- calculating edges only once
- q1.person > q2.person
- GROUP BY
- date_week,
- week_no,
- vote_timestamp,
- vote_subject_hash,
- cycle
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement