Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- ## UKB Phenotypic Correlation Matrices
- ##
- ## Generates a phenotypic correlation matrix for all GWAS'ed
- ## traits, residualized for all GWAS covariates. Also outputs
- ## a matrix of pairwise N's.
- ##
- ## Created by Caitlin E Carey (ccarey@broadinstitute.org)
- ##
- ## Last updated 9/27/2019
- import pandas as pd
- import numpy as np
- import statsmodels.formula.api as sm
- import functools
- import sys
- # flush the buffer to std.out
- print = functools.partial(print,flush=True)
- # grab commandline input specifying sex to be analyzed
- biosex=sys.argv[1]
- # load the most current manifest file
- manifest = pd.read_table('/stanley/robinson/ccarey/UKBB/ukb_files/UKBBmegaGWAS_manifest_downloaded20190910.tsv')
- # grab all non-covar GWAS phenotypes in manifest
- phenolist = [x for x in manifest.loc[manifest.Sex==biosex,"Phenotype Code"].values.tolist() if pd.notnull(x) and x not in ["age","sex"]]
- phenolist = np.unique(phenolist)
- # grab all participants in GWAS sample remove those who subsequently withdrew
- sampleIDs = [line.rstrip('\n') for line in open('/psych/genetics_data/projects/ukb31063/ukb31063.neale_gwas_samples.'+biosex+'.txt')][1:]
- withdrawn = [line.rstrip('\n') for line in open('/psych/genetics_data/projects/ukb31063/ukb31063.withdrawn_samples.txt')][1:]
- sampleIDs = [int(x) for x in sampleIDs if x not in withdrawn]
- # load all GWAS covariates
- covars = pd.read_table('/stanley/robinson/ccarey/UKBB/ukb_files/megaGWAS_covars/ukb31063.neale_gwas_covariates.'+biosex+'.tsv.bgz',index_col="s",compression="gzip")
- # load all relevant phenotype files
- phesant = pd.read_table('/psych/genetics_data/projects/ukb31063/ukb31063.PHESANT_January_2019.'+biosex+'.tsv.bgz',dtype=object,index_col="s",compression="gzip")
- finngen = pd.read_table('/psych/genetics_data/projects/ukb31063/ukb31063.FINNGEN_phenotypes.'+biosex+'.tsv.bgz',index_col="s",compression="gzip")
- icd10 = pd.read_table('/psych/genetics_data/projects/ukb31063/ukb31063.ICD10_phenotypes.'+biosex+'.tsv.bgz',index_col="s",compression="gzip")
- biomarkers = pd.read_csv('/psych/genetics_data/ccarey/UKBB/ukb_files/biomarkers_megaGWAS/ukb31063.biomarkers_gwas.'+biosex+'.tsv',dtype=object,index_col="s")
- # grab all GWAS'ed phenotypes
- phesant_phenos = np.intersect1d(phesant.columns,phenolist)
- finngen_phenos = np.intersect1d(finngen.columns,phenolist)
- icd10_phenos = np.intersect1d(icd10.columns,phenolist)
- biomarkers_phenos = np.intersect1d(biomarkers.columns,phenolist)
- filephenos = np.concatenate((phesant_phenos,finngen_phenos,icd10_phenos,biomarkers_phenos))
- # extract data for all GWAS'ed phenotypes
- phesant_extracted = phesant.loc[sampleIDs,phesant_phenos]
- finngen_extracted = finngen.loc[sampleIDs,finngen_phenos]
- icd10_extracted = icd10.loc[sampleIDs,icd10_phenos]
- biomarkers_extracted = biomarkers.loc[sampleIDs,biomarkers_phenos]
- manifest_phenos = pd.concat([phesant_extracted,finngen_extracted,icd10_extracted,biomarkers_extracted],axis=1)
- # replace boolean strings with actual booleans
- manifest_phenos.replace('TRUE',True,inplace=True)
- manifest_phenos.replace('FALSE',False,inplace=True)
- # function to residualize GWAS phenotypes by GWAS covars
- def residualize(column):
- temp = manifest_phenos[column].astype(float).to_frame().join(covars,how="left")
- temp['y'] = temp[column]
- model = sm.ols("y ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 + PC11 + PC12 + PC13 + PC14 + PC15 + PC16 + PC17 + PC18 + PC19 + PC20 + age + age_isFemale + age_squared + age_squared_isFemale + isFemale", data=temp,missing="drop").fit()
- return(model.resid)
- # residualize all phenotypes
- manifest_phenos_resid = manifest_phenos.apply(lambda x: residualize(x.name))
- # add in GWAS'ed covariates
- if biosex=="both_sexes":
- isfemale_mod = sm.ols("isFemale.astype(int) ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 + PC11 + PC12 + PC13 + PC14 + PC15 + PC16 + PC17 + PC18 + PC19 + PC20 + age + age_squared", data=covars,missing="drop").fit()
- manifest_phenos_resid["isFemale"]=isfemale_mod.resid
- age_mod = sm.ols("age ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 + PC11 + PC12 + PC13 + PC14 + PC15 + PC16 + PC17 + PC18 + PC19 + PC20 + isFemale", data=covars,missing="drop").fit()
- manifest_phenos_resid["age"]=age_mod.resid
- else:
- age_mod = sm.ols("age ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 + PC11 + PC12 + PC13 + PC14 + PC15 + PC16 + PC17 + PC18 + PC19 + PC20", data=covars,missing="drop").fit()
- manifest_phenos_resid["age"]=age_mod.resid
- # generate residualized phenotypic correlation matrix
- manifest_phenos_resid_corr = manifest_phenos_resid.corr()
- print("correlation matrix generated")
- # save resulting correlation matrix
- manifest_phenos_resid_corr.to_csv('/psych/genetics_data/ccarey/UKBB/h2_corrmat/corrmat/'+biosex+'_resid_corrmat.csv')
- print("saved corrmat to file")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement