Advertisement
Guest User

Untitled

a guest
Oct 16th, 2019
94
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 4.73 KB | None | 0 0
  1. ## UKB Phenotypic Correlation Matrices
  2. ##
  3. ## Generates a phenotypic correlation matrix for all GWAS'ed
  4. ## traits, residualized for all GWAS covariates. Also outputs
  5. ## a matrix of pairwise N's.
  6. ##
  7. ## Created by Caitlin E Carey (ccarey@broadinstitute.org)
  8. ##
  9. ## Last updated 9/27/2019
  10.  
  11. import pandas as pd
  12. import numpy as np
  13. import statsmodels.formula.api as sm
  14. import functools
  15. import sys
  16.  
  17. # flush the buffer to std.out
  18. print = functools.partial(print,flush=True)
  19.  
  20. # grab commandline input specifying sex to be analyzed
  21. biosex=sys.argv[1]
  22.  
  23. # load the most current manifest file
  24. manifest = pd.read_table('/stanley/robinson/ccarey/UKBB/ukb_files/UKBBmegaGWAS_manifest_downloaded20190910.tsv')
  25.  
  26. # grab all non-covar GWAS phenotypes in manifest
  27. phenolist = [x for x in manifest.loc[manifest.Sex==biosex,"Phenotype Code"].values.tolist() if pd.notnull(x) and x not in ["age","sex"]]
  28. phenolist = np.unique(phenolist)
  29.  
  30. # grab all participants in GWAS sample remove those who subsequently withdrew
  31. sampleIDs = [line.rstrip('\n') for line in open('/psych/genetics_data/projects/ukb31063/ukb31063.neale_gwas_samples.'+biosex+'.txt')][1:]
  32. withdrawn = [line.rstrip('\n') for line in open('/psych/genetics_data/projects/ukb31063/ukb31063.withdrawn_samples.txt')][1:]
  33. sampleIDs = [int(x) for x in sampleIDs if x not in withdrawn]
  34.  
  35. # load all GWAS covariates
  36. covars = pd.read_table('/stanley/robinson/ccarey/UKBB/ukb_files/megaGWAS_covars/ukb31063.neale_gwas_covariates.'+biosex+'.tsv.bgz',index_col="s",compression="gzip")
  37.  
  38. # load all relevant phenotype files
  39. phesant = pd.read_table('/psych/genetics_data/projects/ukb31063/ukb31063.PHESANT_January_2019.'+biosex+'.tsv.bgz',dtype=object,index_col="s",compression="gzip")
  40. finngen = pd.read_table('/psych/genetics_data/projects/ukb31063/ukb31063.FINNGEN_phenotypes.'+biosex+'.tsv.bgz',index_col="s",compression="gzip")
  41. icd10 = pd.read_table('/psych/genetics_data/projects/ukb31063/ukb31063.ICD10_phenotypes.'+biosex+'.tsv.bgz',index_col="s",compression="gzip")
  42. biomarkers = pd.read_csv('/psych/genetics_data/ccarey/UKBB/ukb_files/biomarkers_megaGWAS/ukb31063.biomarkers_gwas.'+biosex+'.tsv',dtype=object,index_col="s")
  43.  
  44. # grab all GWAS'ed phenotypes
  45. phesant_phenos = np.intersect1d(phesant.columns,phenolist)
  46. finngen_phenos = np.intersect1d(finngen.columns,phenolist)
  47. icd10_phenos = np.intersect1d(icd10.columns,phenolist)
  48. biomarkers_phenos = np.intersect1d(biomarkers.columns,phenolist)
  49. filephenos = np.concatenate((phesant_phenos,finngen_phenos,icd10_phenos,biomarkers_phenos))
  50.  
  51. # extract data for all GWAS'ed phenotypes
  52. phesant_extracted = phesant.loc[sampleIDs,phesant_phenos]
  53. finngen_extracted = finngen.loc[sampleIDs,finngen_phenos]
  54. icd10_extracted = icd10.loc[sampleIDs,icd10_phenos]
  55. biomarkers_extracted = biomarkers.loc[sampleIDs,biomarkers_phenos]
  56. manifest_phenos = pd.concat([phesant_extracted,finngen_extracted,icd10_extracted,biomarkers_extracted],axis=1)
  57.  
  58. # replace boolean strings with actual booleans
  59. manifest_phenos.replace('TRUE',True,inplace=True)
  60. manifest_phenos.replace('FALSE',False,inplace=True)
  61.  
  62. # function to residualize GWAS phenotypes by GWAS covars
  63. def residualize(column):
  64. temp = manifest_phenos[column].astype(float).to_frame().join(covars,how="left")
  65. temp['y'] = temp[column]
  66. model = sm.ols("y ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 + PC11 + PC12 + PC13 + PC14 + PC15 + PC16 + PC17 + PC18 + PC19 + PC20 + age + age_isFemale + age_squared + age_squared_isFemale + isFemale", data=temp,missing="drop").fit()
  67. return(model.resid)
  68.  
  69. # residualize all phenotypes
  70. manifest_phenos_resid = manifest_phenos.apply(lambda x: residualize(x.name))
  71.  
  72. # add in GWAS'ed covariates
  73. if biosex=="both_sexes":
  74. isfemale_mod = sm.ols("isFemale.astype(int) ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 + PC11 + PC12 + PC13 + PC14 + PC15 + PC16 + PC17 + PC18 + PC19 + PC20 + age + age_squared", data=covars,missing="drop").fit()
  75. manifest_phenos_resid["isFemale"]=isfemale_mod.resid
  76.  
  77. age_mod = sm.ols("age ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 + PC11 + PC12 + PC13 + PC14 + PC15 + PC16 + PC17 + PC18 + PC19 + PC20 + isFemale", data=covars,missing="drop").fit()
  78. manifest_phenos_resid["age"]=age_mod.resid
  79.  
  80. else:
  81. age_mod = sm.ols("age ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 + PC11 + PC12 + PC13 + PC14 + PC15 + PC16 + PC17 + PC18 + PC19 + PC20", data=covars,missing="drop").fit()
  82. manifest_phenos_resid["age"]=age_mod.resid
  83.  
  84. # generate residualized phenotypic correlation matrix
  85. manifest_phenos_resid_corr = manifest_phenos_resid.corr()
  86. print("correlation matrix generated")
  87.  
  88. # save resulting correlation matrix
  89. manifest_phenos_resid_corr.to_csv('/psych/genetics_data/ccarey/UKBB/h2_corrmat/corrmat/'+biosex+'_resid_corrmat.csv')
  90. print("saved corrmat to file")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement