Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Test fixture: a small table of (lead variant, tag variant) rows for a single
# study, followed by a description of the expected result.
# Relies on `spark` and `f` being defined elsewhere; presumably a SparkSession
# and `pyspark.sql.functions` respectively -- TODO confirm against the caller.

# Column order must match the positional order of the tuples in `data`.
column_names = [
    'studyId',
    'leadVariantId',
    'tagVariantId',
    'pValueMantissa',
    'pValueExponent',
    'R_overall',
]

# The p-value of each lead is stored as mantissa/exponent columns
# (e.g. mantissa 4, exponent -9).
# NOTE(review): R_overall is not symmetric for some pairs in this fixture
# (v3 -> v4 is 1.0 while v4 -> v3 is 0.98) -- confirm this is intentional.
data = [
    # variant 1 (independent association):
    ('s1', 'v1', 'v1', 1, -9, 1.0),
    # variant 2, not resolved in ld matrix (independent association):
    ('s1', 'v2', None, 1, -18, None),
    # variant 3, most significant p-value (independent association):
    ('s1', 'v3', 'v4', 1, -18, 1.0),
    ('s1', 'v3', 'v5', 1, -18, 0.98),
    # variant 4: higher p-value, explained by v3:
    ('s1', 'v4', 'v3', 4, -9, 0.98),
    ('s1', 'v4', 'v5', 4, -9, 0.98),
    ('s1', 'v4', 'v6', 4, -9, 1.0),
    # variant 5: higher p-value, tagged with v3, v4 and v6:
    ('s1', 'v5', 'v3', 1, -8, 1.0),
    ('s1', 'v5', 'v4', 1, -8, 0.98),
    ('s1', 'v5', 'v6', 1, -8, 0.98),
    # variant 6: higher p-value, tagged with v5, v4 and v7 but not directly v3:
    ('s1', 'v6', 'v5', 5, -8, 1.0),
    ('s1', 'v6', 'v4', 5, -8, 1.0),
    ('s1', 'v6', 'v7', 5, -8, 1.0),
    # variant 7: low p-value (independent association).
    ('s1', 'v7', 'v6', 1, -18, 1.0),
]

# Build the DataFrame, add an empty `qualityControl` array column to every
# row, and persist the result before it is displayed.
df = (
    spark.createDataFrame(data, column_names)
    .withColumn('qualityControl', f.array())
    .persist()
)
df.show()

# Reference tables only; the string below is never read by the program.
# NOTE(review): the expected output header uses `variantId` whereas the input
# column is `leadVariantId` -- confirm whether the code under test renames it.
'''
Input:
+-------+-------------+------------+--------------+--------------+---------+--------------+
|studyId|leadVariantId|tagVariantId|pValueMantissa|pValueExponent|R_overall|qualityControl|
+-------+-------------+------------+--------------+--------------+---------+--------------+
|     s1|           v1|          v1|             1|            -9|      1.0|            []|
|     s1|           v2|        null|             1|           -18|     null|            []|
|     s1|           v3|          v4|             1|           -18|      1.0|            []|
|     s1|           v3|          v5|             1|           -18|     0.98|            []|
|     s1|           v4|          v3|             4|            -9|     0.98|            []|
|     s1|           v4|          v5|             4|            -9|     0.98|            []|
|     s1|           v4|          v6|             4|            -9|      1.0|            []|
|     s1|           v5|          v3|             1|            -8|      1.0|            []|
|     s1|           v5|          v4|             1|            -8|     0.98|            []|
|     s1|           v5|          v6|             1|            -8|     0.98|            []|
|     s1|           v6|          v5|             5|            -8|      1.0|            []|
|     s1|           v6|          v4|             5|            -8|      1.0|            []|
|     s1|           v6|          v7|             5|            -8|      1.0|            []|
|     s1|           v7|          v6|             1|           -18|      1.0|            []|
+-------+-------------+------------+--------------+--------------+---------+--------------+
Expected output:
+-------+---------+------------+--------------+--------------+---------+----------------------------------+
|studyId|variantId|tagVariantId|pValueMantissa|pValueExponent|R_overall|qualityControl                    |
+-------+---------+------------+--------------+--------------+---------+----------------------------------+
|s1     |v1       |v1          |1             |-9            |1.0      |[]                                |
|s1     |v2       |null        |1             |-18           |null     |[]                                |
|s1     |v3       |v5          |1             |-18           |0.98     |[]                                |
|s1     |v3       |v4          |1             |-18           |1.0      |[]                                |
|s1     |v4       |null        |4             |-9            |null     |[Association explained by: v3]    |
|s1     |v5       |null        |1             |-8            |null     |[Association explained by: v3]    |
|s1     |v6       |null        |5             |-8            |null     |[Association explained by: v3, v7]|
|s1     |v7       |v6          |1             |-18           |1.0      |[]                                |
+-------+---------+------------+--------------+--------------+---------+----------------------------------+
'''
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement