Advertisement
Guest User

Untitled

a guest
Mar 16th, 2019
81
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
R 16.57 KB | None | 0 0
  1. ########################################################################
  2. # Title: Illumina EPIC methylation array preprocessing, normalization and statistics
  3. # Author: Leticia Spindola
  4. # Date: 23/05/2018
  5. # Purpose: Fazer o QC, normalization, and etc with INPD samples and CAG controls
  6. #######################################################################
  7.  
  8. library(wateRmelon)
  9. library(minfi)
  10. require(dplyr)
  11. library(ggplot2)
  12. library(qqman)
  13. library(DMRcate)
  14. library(limma)
  15. library(RColorBrewer)
  16. library(Gviz)
  17.  
  18. dataDirectory <- "~/Documents/ILS_INPD_48"
  19. setwd("~/Documents/ILS_INPD_48")
  20.  
  21. ############################################################
  22. ################ IMPORTANDO DADOS EPIC  ####################
  23. ############################################################
  24. # Importando os dados fenotipicos das amostras
  25. targets_HRC <- read.csv2(file = "~/Documents/ILS_INPD_48/pData_INPDmeth_final_03_07_2018_minfi.csv", header = T)
  26. class(targets_HRC) # [1] "data.frame"
  27.  
  28. # Importando os arquivos brutos do EPIC array (.dat)
  29. RGset_HRC <- read.metharray.exp(targets = targets_HRC, extended = TRUE)
  30. dim(pData(RGset_HRC)) # [1] 48 49
  31.  
  32. annotation(RGset_HRC)
  33. # array                                           annotation
  34. # "IlluminaHumanMethylationEPIC"                 "ilm10b2.hg19"
  35.  
  36. # Renomeando os dados brutos com os IDs corretos das amostras
  37. sampleNames(RGset_HRC) <- targets_HRC$sampleID
  38. RGset_HRC
  39. # class: RGChannelSetExtended
  40. # dim: 1,051,943 48
  41. # rownames(1051943): 1600101 1600111 ... 99810990 99810992
  42. # colnames(48): I211_W0 I331_W0 ... I236_W1 I378_W1
  43. # colData names(49): BARCODES sampleID ... Basename filenames
  44. # array: IlluminaHumanMethylationEPIC
  45. # annotation: ilm10b2.hg19
  46.  
  47. ############################################################
  48. ############ IMPORTANDO DADOS CAG CONTROLS  ################
  49. ############################################################
  50. targets_controls <- read.csv2(file = "~/Documents/ILS_INPD_48/Controls_CAG_minfi_04_07_2018.csv", header = T)
  51. class(targets_controls) # [1] "data.frame"
  52.  
  53. RGset_controls <- read.metharray.exp(targets = targets_controls, extended = TRUE)
  54. dim(pData(RGset_controls)) # [1] 140  11
  55.  
  56. annotation(RGset_controls)
  57. # array                                           annotation
  58. # "IlluminaHumanMethylation450k"                  "ilmn12.hg19"
  59.  
  60. # Renomeando os dados brutos com os IDs corretos das amostras
  61. targets_controls$number <- 1:140
  62. sampleNames(RGset_controls) <- targets_controls$number
  63. RGset_controls
  64. # class: RGChannelSetExtended
  65. # dim: 622399 140
  66. # rownames(622399): 10600313 10600322 ... 74810490 74810492
  67. # colnames(140): 1 2 ... 139 140
  68. # Annotation
  69. # array: IlluminaHumanMethylation450k
  70. # annotation: ilmn12.hg19
  71.  
  72. ############################################################
  73. ################### CONVERTING ARRAY  ######################
  74. ############################################################
  75. # Transformando todas as colunas factor em chacteres
  76. factor_vars <- lapply(colData(RGset_HRC), class) == "factor"
  77. colData(RGset_HRC)[, factor_vars] <- lapply(colData(RGset_HRC)[, factor_vars], as.character)
  78.  
  79. factor_vars <- lapply(colData(RGset_controls), class) == "factor"
  80. colData(RGset_controls)[, factor_vars] <- lapply(colData(RGset_controls)[, factor_vars], as.character)
  81.  
  82. # Juntando os dados dos arrays
  83. rgset_all <- combineArrays(RGset_controls,RGset_HRC,outType = ("IlluminaHumanMethylation450k"))
  84.  
  85. rgset_all
  86. # class: RGChannelSetExtended
  87. # dim: 575385 188
  88. # rownames(575385): 10600322 10600328 ... 74810485 74810492
  89. # colnames(188): 1 2 ... I236_W1 I378_W1
  90. # colData names(60): Methylation.Chip.ID CAG_ID ... AGE ArrayTypes
  91. # Annotation
  92. # array: IlluminaHumanMethylation450k
  93. # annotation: ilmn12.hg19
  94.  
  95. ############################################################
  96. #################### QUALITY CONTROL  ######################
  97. ############################################################
  98. # calculate the detection p-values
  99. detP <- detectionP(rgset_all)
  100. head(detP[,1:5])
  101.  
  102. # Importando informacoes fenotipicas participantes HRC + controles
  103. target <- read.csv2(file = "~/Documents/ILS_INPD_48/Comparacao_Controles_CAG/targets.csv", header = T)
  104. str(target)
  105. target$sampleID <- as.character(target$sampleID)
  106. a <- target$sampleID
  107. b <- colnames(rgset_all)
  108. identical(a,b) # [1] TRUE
  109. gender <- as.numeric(target$GENDER)
  110.  
  111. # Examine mean detection p-values across all samples to identify any failed samples
  112. pal <- brewer.pal(8,"Dark2")
  113. par(mfrow=c(1,2))
  114. barplot(colMeans(detP), col=pal[factor(target$Group)], las=2,
  115.         cex.names=0.8,ylab="Mean detection p-values")
  116. abline(h=0.01,col="red")
  117. legend("topleft", legend=levels(factor(target$Group)), fill=pal,
  118.        bg="white")
  119. barplot(colMeans(detP), col=pal[factor(target$Group)], las=2,
  120.         cex.names=0.8, ylim = c(0,0.002), ylab="Mean detection p-values")
  121. legend("topleft", legend=levels(factor(target$Group)), fill=pal,
  122.        bg="white")
  123.  
  124. qcReport(rgset_all, sampNames=target$sampleID, sampGroups=target$Group,
  125.          pdf="qcReport.pdf")
  126.  
  127. # remove poor quality samples
  128. keep <- colMeans(detP) < 0.05
  129. rgset_all <- rgset_all[,keep]
  130. rgset_all
  131. # class: RGChannelSetExtended
  132. # dim: 575385 188
  133.  
  134. ############################################################
  135. #################### NORMALISATION  ########################
  136. ############################################################
  137. # Normalization by preprocessFunnorm (faz tbm a NOOB background correction)
  138. rgset_norm <- preprocessFunnorm(rgset_all, nPCs=2, sex = gender, bgCorr = TRUE,
  139.                                 dyeCorr = TRUE, keepCN = TRUE, ratioConvert = TRUE,
  140.                                 verbose = TRUE)
  141.  
  142. # visualise what the data looks like before and after normalisation
  143. par(mfrow=c(1,2))
  144. densityPlot(rgset_all, sampGroups=target$Group,main="Raw", legend=FALSE)
  145. legend("top", legend = levels(factor(target$Group)),
  146.        text.col=brewer.pal(8,"Dark2"))
  147. densityPlot(getBeta(rgset_norm), sampGroups=target$Group,
  148.             main="Normalized", legend=FALSE)
  149. legend("top", legend = levels(factor(target$Group)),
  150.        text.col=brewer.pal(8,"Dark2"))
  151.  
  152. ############################################################
  153. ################### DATA EXPLORATION #######################
  154. ############################################################
  155. # MDS plots to look at largest sources of variation
  156. pal <- brewer.pal(8,"Dark2")
  157. plotMDS(getM(rgset_norm), top=1000, gene.selection="common",
  158.         col=pal[factor(target$Group)])
  159. legend("top", legend=levels(factor(target$Group)), text.col=pal,
  160.        bg="white", cex=0.7)
  161. plotMDS(getM(rgset_norm), top=1000, gene.selection="common",
  162.         col=pal[factor(target$GENDER)])
  163. legend("top", legend=levels(factor(target$GENDER)), text.col=pal,
  164.        bg="white", cex=0.7)
  165.  
  166. # Examine higher dimensions to look at other sources of variation
  167. plotMDS(getM(rgset_norm), top=1000, gene.selection="common",
  168.         col=pal[factor(target$Group)], dim=c(1,3))
  169. legend("top", legend=levels(factor(target$Group)), text.col=pal,
  170.        bg="white", cex=0.7)
  171. plotMDS(getM(rgset_norm), top=1000, gene.selection="common",
  172.         col=pal[factor(target$Group)], dim=c(2,3))
  173. legend("top", legend=levels(factor(target$Group)), text.col=pal,
  174.        bg="white", cex=0.7)
  175. plotMDS(getM(rgset_norm), top=1000, gene.selection="common",
  176.         col=pal[factor(target$Group)], dim=c(3,4))
  177. legend("top", legend=levels(factor(target$Group)), text.col=pal,
  178.        bg="white", cex=0.7)
  179.  
  180. ############################################################
  181. ###################### FILTERING  ##########################
  182. ############################################################
  183. # ensure probes are in the same order in the rgset_norm and detP objects
  184. detP <- detP[match(featureNames(rgset_norm),rownames(detP)),]
  185.  
  186. # remove any probes that have failed in one or more samples
  187. keep <- rowSums(detP < 0.01) == ncol(rgset_norm)
  188. table(keep)
  189. # FALSE   TRUE
  190. # 8477  444355
  191.  
  192. rgset_norm_Flt <- rgset_norm[keep,]
  193. rgset_norm_Flt
  194. # class: GenomicRatioSet
  195. # dim: 444355 188
  196.  
  197. # if your data includes males and females, remove probes on the sex chromosomes
  198. ann450k <- getAnnotation(IlluminaHumanMethylation450kanno.ilmn12.hg19)
  199. head(ann450k)
  200.  
  201. keep <- !(featureNames(rgset_norm_Flt) %in% ann450k$Name[ann450k$chr %in% c("chrX","chrY")])
  202.  
  203. table(keep)
  204. # FALSE   TRUE
  205. # 9909    434446
  206.  
  207. rgset_norm_Flt <- rgset_norm_Flt[keep,]
  208. rgset_norm_Flt
  209. # class: GenomicRatioSet
  210. # dim: 434446 188
  211.  
  212. # remove probes with SNPs at CpG site
  213. rgset_norm_Flt <- dropLociWithSnps(rgset_norm_Flt)
  214. rgset_norm_Flt
  215. # class: GenomicRatioSet
  216. # dim: 420347 188
  217.  
  218. # exclude cross reactive probes
  219. # OBSERVACAO: non-CpG- targeting probes (Probe ID prefix = “ch”)
  220. xReactiveProbes <- read.csv2(file=paste(dataDirectory,
  221.                                         "Comparacao_Controles_CAG/48639-non-specific-probes-Illumina450k.csv",
  222.                                         sep="/"))
  223. keep <- !(featureNames(rgset_norm_Flt) %in% xReactiveProbes$TargetID)
  224. table(keep)
  225. # FALSE   TRUE
  226. # 24,495 395,852
  227.  
  228. rgset_norm_Flt <- rgset_norm_Flt[keep, ]
  229. rgset_norm_Flt
  230. # class: GenomicRatioSet
  231. # dim: 395852 188
  232.  
  233. # calculate M-values for statistical analysis
  234. mvalues <- getM(rgset_norm_Flt)
  235.  
  236. # remove probes with SNPs at CpG site according to DMRcate package
  237. mvalues <- rmSNPandCH(mvalues, dist=2, mafcut=0.1, rmXY = TRUE, rmcrosshyb = TRUE)
  238. dim(mvalues) # [1] 389,662    188
  239.  
  240. # Excluir amostras 36, 55 (outlier nos graficos) 94
  241. # 36: report = female; no grafico se agrupa como male
  242. mvalues <- mvalues[,-c(36,55,94)]
  243. dim(mvalues) # [1] 389,662    185
  244.  
  245. target <- target[-c(36,55,94), ]
  246. dim(target) # [1] 185  10
  247.  
  248. a <- target$sampleID
  249. b <- colnames(mvalues)
  250. identical(a,b) # [1] TRUE
  251.  
  252. ############################################################
  253. ################ EWAS USING AGE as IV ######################
  254. ############################################################
  255. # IV: independent variable
  256.  
  257. # Selecionando so os controles e excluindo amostras do HRC
  258. mvalues_age <- mvalues[,1:137]
  259. target_age <- target[1:137, ]
  260.  
  261. a <- target_age$sampleID
  262. b <- colnames(mvalues_age)
  263. identical(a,b) # [1] TRUE
  264.  
  265. # Variaveis no modelo
  266. age <- target_age$AGE
  267.  
  268. # Modelo estatistico
  269. design <- model.matrix(~age)
  270. head(design)
  271.  
  272. myannotation <- cpg.annotate(datatype = c("array"), mvalues_age, what=c("M"),arraytype=c("450K"),
  273.                              analysis.type="differential", design = design, fdr = 0.05, coef="age")
  274. # Your contrast returned 646 individually significant probes
  275.  
  276. cpgs <- data.frame(ID = myannotation$ID, weights = abs(myannotation$stat),CHR = as.character(myannotation$CHR),
  277.                    pos = myannotation$pos,betafc = myannotation$betafc,indfdr = myannotation$indfdr,
  278.                    is.sig = myannotation$is.sig)
  279.  
  280. # Anotacoes genomicas do 450K
  281. ann450k <- getAnnotation(IlluminaHumanMethylation450kanno.ilmn12.hg19)
  282. ann450k <- as.data.frame(ann450k)
  283. dim(ann450k) # [1] 485,512     33
  284. dim(cpgs) # [1] 389,662      7
  285.  
  286. cpgs$ID <- as.character(cpgs$ID)
  287. ann450k$ID <- rownames(ann450k)
  288.  
  289. cpgs_anotado <- left_join(cpgs,ann450k, by = "ID")
  290. dim(cpgs_anotado) # [1] 389,662     40
  291.  
  292. # Salvando cpgs_anotado:
  293. write.csv2(cpgs_anotado, file = "Associacao_Idade_TODAS_Informacoes.csv",
  294.            quote = F, row.names = F)
  295. write.table(cpgs_anotado,file="Associacao_Idade_TODAS_Informacoes.txt", sep="\t",
  296.             row.names=F,quote=FALSE)
  297.  
  298. ############################################################
  299. ######## Excluding Probes Age and Puberty Associated  ######
  300. ############################################################
  301. # Probes to remove: Todas as probes associadas com idade
  302. # na analise anterior utilizando os controles do Hakon +
  303. # Probes encontradas pelo Almstrup et al., 2016 como
  304.  
  305. # Importando os IDs das probes que serao removidas
  306. probes_to_remove <- read.csv2(file = "CpG_Ids_ParaRemover.csv",header = T)
  307. probes_to_remove$ID <- as.character(probes_to_remove$ID)
  308. dim(probes_to_remove) # [1] 757   1
  309.  
  310. # Selecionando so as informacoes do HRC
  311. mvalues_b <- mvalues[,138:185]
  312. dim(mvalues_b) # [1] 389,662    48
  313.  
  314. target_b <- target[138:185, ]
  315. dim(target_b) # [1] 48  10
  316.  
  317. a <- target_b$sampleID
  318. b <- colnames(mvalues_b)
  319. identical(a,b) # [1] TRUE
  320.  
  321. # Removendo de Mvalues as probes associadas com idade e puberdade
  322. remove <- probes_to_remove$ID
  323.  
  324. mvalues_b <- as.data.frame(mvalues_b)
  325. mvalues_b$ID <- rownames(mvalues_b)
  326.  
  327. mvalues_b <- mvalues_b[!(mvalues_b$ID %in% remove), ]
  328. dim(mvalues_b) # [1] 388,924     49
  329. # 389,662 - 388,924 = 738 probes
  330.  
  331. mvalues_b <- mvalues_b[,1:48]
  332. mvalues_b <- as.matrix(mvalues_b)
  333.  
  334. ############################################################
  335. ################# Finding DMP and DMR ######################
  336. ############################################################
  337. # Analise so com os dados do HRC
  338.  
  339. # Como targets_HRC e target_b sao iguais, da para usar as informacoes fenotipicas das duas data frames
  340. a <- as.character(targets_HRC$sampleID)
  341. b <- target_b$sampleID
  342. identical(a,b) # TRUE
  343.  
  344. # Coeficients
  345. time <- factor(targets_HRC$TIME, levels=c("Baseline", "Followup"))
  346. pair <- factor(targets_HRC$pair)
  347. # Covariables
  348. chip_number <- targets_HRC$CHIP_NUMBER
  349. chip_position <- as.numeric(targets_HRC$CHIP_POSITION)
  350.  
  351. # Modelo
  352. design <- model.matrix(~time+pair+chip_number+chip_position)
  353. head(design)
  354.  
  355. ######## DMP ######
  356. myannotation <- cpg.annotate(datatype = c("array"), mvalues_b, what=c("M"),arraytype=c("450K"), analysis.type="differential",
  357.                              design = design, fdr = 0.05, coef="timeFollowup")
  358. # Your contrast returned 663 individually significant probes.
  359.  
  360. ######## DMR ######
  361. DMRoutput <- dmrcate(myannotation, lambda=1000, C=2,p.adjust.method = "BH", min.cpgs = 3)
  362. dim(DMRoutput$results) # [1] 90  6
  363. DMR <- DMRoutput$results
  364.  
  365. write.csv2(DMR, file = "DMR_Longitudinal.csv",
  366.            quote = F, row.names = F)
  367. write.table(DMR,file="DMR_Longitudinal.txt", sep="\t",
  368.             row.names=F,quote=FALSE)
  369.  
  370. ############################################################
  371. ################ GENE EXPRESSION ANALYSIS  #################
  372. ############################################################
  373. # Importando os dados de expressao genica
  374. load("Dados_Expressao_eset_final.RData")
  375. eset_final # 6782 features, 228 samples
  376.  
  377. # selecionando os dados do grupo incidente
  378. pdat <- pData(eset_final)
  379. pdat <- pdat[order(pdat$Sample_Group), ]
  380. pdat_inc <- pdat[58:109,]
  381.  
  382. # 20214,20228 e 21044 esta a mais
  383. # tirar 20795 e 20815
  384. pdat_inc <- pdat_inc[order(pdat_inc$Sample_ID), ]
  385. pdat_met <- pdat_inc[-c(9:12,49,50,33,34), ]
  386.  
  387. # Selecionando os dados das probes
  388. met <- as.character(pdat_met$sampleID)
  389. eset_met <- eset_final[,met]
  390. dim(eset_met)
  391. # Features  Samples
  392. # 6782       44
  393.  
  394. probes <- fData(eset_final)
  395. genes_no_array <- probes[,c(1:3)]
  396. write.csv2(genes_no_array, file = "genes_que_tem_no_GENEX.csv")
  397.  
  398. # CpGs
  399. genes <- read.csv2(file = "Genes_DifMet_ComExpressao_GENEX.csv", header = T)
  400. dim(genes) # [1] 122   3
  401. length(unique(genes$UCSC_RefGene_Name)) # [1] 103
  402.  
  403. # Usando table para verificar se tem duplicacao:
  404. n_occur <- data.frame(table(genes$UCSC_RefGene_Name)) # me da uma tabela com a lista de IDs e o numero de vezes que elas ocorrem
  405. n_occur[n_occur$Freq > 1,] # mostra quais sao os IDs duplicados
  406. genes[genes$UCSC_RefGene_Name %in% n_occur$Var1[n_occur$Freq > 1],] # mostra as linhas duplicadas da data frame
  407.  
  408. genes_row <- as.character(genes$Tem_Expressao_GENEX_Rowname) # 122 caracteres
  409.  
  410. # Dados brutos de expressao genica
  411. d_matrix <- exprs(eset_met)
  412. d_matrix <- as.data.frame(d_matrix)
  413.  
  414. # selecionando os dados dos genes diferencialmente metilados do artigo 1
  415. d_matrix_met <- d_matrix[genes_row, ]
  416. d_matrix_met <- as.matrix(d_matrix_met)
  417. dim(d_matrix_met) # [1] 122  44
  418.  
  419. group <- factor(pdat_met$PHENOTYPE, levels=c("A", "D"))
  420. levels(group)[levels(group)=="A"] <- "Baseline"
  421. levels(group)[levels(group)=="D"] <- "Follow-up"
  422.  
  423. n <- 2
  424. pair <- as.factor(rep(1:22,each=n))
  425.  
  426. # Modelo Estatistico
  427. design <- model.matrix(~group+pair)
  428. head(design)
  429.  
  430. fit <- lmFit(d_matrix_met, design)
  431. fit <- eBayes(fit)
  432.  
  433. results <- topTable(fit, coef="groupFollow-up", adjust.method="BH", number = nrow(d_matrix_met))
  434.  
  435. dim(results) # [1] 122   7
  436. results$probe_row <- rownames(results)
  437. genes$UCSC_RefGene_Name <- as.character(genes$UCSC_RefGene_Name)
  438. genes$Tem_Expressao_GENEX_Rowname <- as.character(genes$Tem_Expressao_GENEX_Rowname)
  439. names(genes)[2] <- "probe_row"
  440.  
  441. results <- full_join(results,genes,by="probe_row")
  442. results <- results[,c(1:6,8,9,7)]
  443.  
  444. # Salvando cpg_genes:
  445. write.csv2(results, file = "GENEX_longitudunal.csv",
  446.            quote = F, row.names = T)
  447. write.table(results,file="GENEX_longitudunal.txt", sep="\t",
  448.             row.names=T,quote=FALSE)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement