Advertisement
GenomicsBootCamp

varLD Tutorial script

Oct 5th, 2021
420
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. # Full script to analyse selections signatures based on variation of linkage disequilibrium
  2. # Software: varLD
  3. # Presented by: Gabor Meszaros, see Genomics Boot Camp YouTube channel
  4.  
  5. # Workflow:
  6.  
  7. # 1) Install Java
  8. # 2) Aquire the data for two populations, for example: http://widde.toulouse.inra.fr/widde/widde/main.do?module=cattle
  9. # 3) Aquire the program: https://blog.nus.edu.sg/sshsphphg/varld/
  10. #    Link in the publication does not work
  11. # 4) Check the manual, i.e. the README for instructions
  12. # 5) Follow the script below to figure out how it is done
  13.  
  14.  
  15. # Clear workspace and load packages
  16. rm(list = ls())
  17. library(tidyverse)
  18.  
  19. # Set the location of the working directory
  20. PATH <- "d:/analysis/2020_GenomicsBootCamp_Demo/varLD/"
  21. setwd(PATH)
  22. # Set the number of autosomes for the loops
  23. autosomeNr <- 29
  24.  
  25.  
  26. ####
  27. # Quality control
  28. ####
  29. # breed 1
  30. system("plink --file Charolais --cow --autosome --geno 0.1 --mind 0.1 --hwe 0.000001 --nonfounders --allow-no-sex --make-bed --out breed1_QC")
  31.  
  32. # breed 2
  33. system("plink --file Holstein --cow --autosome --geno 0.1 --mind 0.1 --hwe 0.000001 --nonfounders --allow-no-sex --make-bed --out breed2_QC")
  34.  
  35.  
  36. ####
  37. # Find the SNP appearing in both populations
  38. ####
  39. map1 <- read_tsv("breed1_QC.bim",col_names = F)
  40. map2 <- read_tsv("breed2_QC.bim",col_names = F)
  41.  
  42. mapOverlap <- inner_join(map1, map2, by="X2") %>%
  43.   select(X2) %>%
  44.   write_tsv("commonSnpPop1Pop2.txt", col_names = F)
  45.  
  46. # START MAIN LOOP
  47. for (chrNr in 1:autosomeNr) {
  48.  
  49. #chrNr <- 1
  50.  
  51. ####
  52. # Population 1
  53. ####
  54.  
  55. # change PLINK file to format close to varLD requirement: A-transpose
  56. system(str_c("plink --bfile breed1_QC --cow --allow-no-sex --nonfounders --chr ", chrNr ,
  57.              " --extract commonSnpPop1Pop2.txt --recode A-transpose --out pop1_forVarLD"))
  58.  
  59.  
  60. # read in and modify the traw file to varLD format
  61. traw <- read_tsv("pop1_forVarLD.traw")
  62.  
  63. # add 1 to all genotypes, to meet the requirements of varLD for AA=1, AB=2, BB=3 - missing values remain NA
  64. traw[,7:ncol(traw)] <- traw[,7:ncol(traw)]+1
  65.  
  66. # replace missing NA values with 4, according to varLD requirements and keep only selected columns
  67. # very unfortunate column names, therefore the index references with numbers
  68. pop1_VarLD <- traw %>%
  69.   mutate(across(starts_with("CHL_"), ~replace_na(.x, 4))) %>%
  70.   select(-c(1,3,5,6)) %>%
  71.   write_tsv(paste0("pop1_chr",chrNr,"_varLD.txt"), col_names = F)
  72.  
  73.  
  74. ####
  75. # Population 2
  76. ####
  77.  
  78. # change PLINK file to format close to varLD requirement: A-transpose
  79. system(str_c("plink --bfile breed2_QC --cow --allow-no-sex --nonfounders --chr ", chrNr ,
  80.              " --extract commonSnpPop1Pop2.txt --recode A-transpose --out pop2_forVarLD"))
  81.  
  82.  
  83. # read in and modify the traw file to varLD format
  84. traw <- read_tsv("pop2_forVarLD.traw")
  85.  
  86. # add 1 to all genotypes, to meet the requirements of varLD for AA=1, AB=2, BB=3 - missing values remain NA
  87. traw[,7:ncol(traw)] <- traw[,7:ncol(traw)]+1
  88.  
  89. # replace missing NA values with 4, according to varLD requirements and keep only selected columns
  90. # very unfortunate column names, therefore the index references with numbers
  91. pop2_VarLD <- traw %>%
  92.   mutate(across(starts_with("HOL_"), ~replace_na(.x, 4))) %>%
  93.   select(-c(1,3,5,6)) %>%
  94.   write_tsv(paste0("pop2_chr",chrNr,"_varLD.txt"), col_names = F)
  95.  
  96.  
  97. ####
  98. # run varLD
  99. ####
  100. system(paste0("java -jar rgenetics-1.0.jar -p VarLD pop1_chr",chrNr,"_varLD.txt pop2_chr",chrNr,"_varLD.txt -o varLD_chr",chrNr,".txt"))
  101.  
  102. ####
  103. # Change decimal comma to decimal point - regional setting Windows
  104. # Use if necessary...
  105. ####
  106. #outVarLD <- read.table(str_c("varLD_chr",chrNr,".txt"), sep="\t", header = T)
  107. #outVarLD$position <- gsub(',', '.', outVarLD$position)
  108. #outVarLD$raw_score <- gsub(',', '.', outVarLD$raw_score)
  109. #write.table(outVarLD,str_c("pointvarLD_chr",chrNr,".txt"), sep="\t", quote = F)
  110.  
  111. # END MAIN LOOP
  112. }
  113.  
  114.  
  115. ####
  116. # Standardize varLD values
  117. ####
  118.  
  119.  
  120. # This software is supplied without any warranty or guaranteed support whatsoever.
  121. # NUS CME can not be responsible for its use, misuse, or functionality.
  122. #
  123. # This R script can be used for standardizing the genome-wide scores to have
  124. # a mean of 0 and a variance of 1, and to output files with the standardized scores.
  125. # It assumes that varld scores are available for all 22 autosomal chromosomes.
  126. # This code also identifies the varLD scores that correspond to the stated percentiles.
  127. #
  128. # Author: rick and yy teo
  129. # Version: 1.0
  130. # Last Updated : 07 March 2010
  131. ######################################################################################
  132. ## global variables to modify for own use
  133. # specifies the folder containing all varld output for 22 autosomal chromosomes
  134. #PATH = "XXXXXXXX"
  135. # filename prefix before the chromosome number, i.e. CHS_INS_chr1, CHS_INS_chr2
  136. FILENAME = "varLD_chr"
  137. # percentile of the genomewide distribution to highlight, default to 95%, 99%, 99.9% and 99.99%.
  138. percentile.out = c(0.95, 0.99, 0.999, 0.9999)
  139.  
  140.  
  141. varLD.out <- {}
  142. chr.store <- {}
  143. for (chr in 1:autosomeNr){
  144.   varLD.temp <- read.table(paste(PATH, FILENAME, chr, ".txt", sep=""), sep="\t", header = T)
  145.   varLD.out <- rbind(varLD.out, varLD.temp)
  146.   chr.store <- c(chr.store, rep(chr, dim(varLD.temp)[1]))
  147.   print(paste("completed reading in unstandardized varLD output file for chromosome ", chr, sep=""))
  148. }
  149. varLD.mean <- mean(varLD.out[,"raw_score"])
  150. varLD.sd <- sd(varLD.out[,"raw_score"])
  151. standardized_score <- (varLD.out[,"raw_score"] - varLD.mean)/varLD.sd
  152. varLD.out <- cbind(varLD.out, standardized_score)
  153. varLD.threshold <- quantile(standardized_score, probs = percentile.out)
  154.  
  155. for (chr in 1:autosomeNr){
  156.   chr.flag <- which(chr.store == chr)
  157.   write.table(varLD.out[chr.flag,], paste(PATH, FILENAME, chr, "_standardized.out", sep=""), sep="\t", quote=F, row.names=F)
  158.   print(paste("completed writing out standardized varLD output file for chromosome ", chr, sep=""))
  159. }
  160. n.length.percentile <- length(percentile.out)
  161. for (i in 1:n.length.percentile){
  162.   print(paste("varLD threshold for ", percentile.out[i], " = ", varLD.threshold[i], sep=""))
  163. }
  164.  
  165.  
  166.  
  167.  
  168.  
  169. ####
  170. # plot VarLD values
  171. ####
  172. # This software is supplied without any warranty or guaranteed support whatsoever.
  173. # NUS CME can not be responsible for its use, misuse, or functionality.
  174. #
  175. # This is example code for producing the varLD region plot of VKORC1 that is similar to
  176. # Figure 1 in the Bioinformatics Application Note.
  177. # Anybody who is moderately proficient in R should be able to modify the codes here
  178. # to customise the plot required.
  179. #
  180. # Author: rick and yy teo
  181. # Version: 1.0
  182. # Last Updated : 07 March 2010
  183. ######################################################################################
  184. # Loop for all chromosomes
  185. for (chrPlot in 1:autosomeNr) {  
  186. ## global variables to modify for own use
  187. # specifies the folder where the output files from varLD is stored
  188. #PATH = "XXXXXXXX"
  189. # filename prefix before the chromosome number, i.e. CHS_INS_chr1, CHS_INS_chr2
  190. FILENAME = "varLD_chr"
  191. # chromosome of region to plot
  192. chr = chrPlot
  193. # percentile of the genomewide distribution to highlight
  194. percentile.out  = c(0.95, 0.99, 0.999, 0.9999)
  195. # varLD score corresponding to the stated percentiles in percentile.out
  196. # !!! skip this, as already computed by the previous script
  197. #varLD.threshold    = c(1.8, 3.4, 6.0, 7.0)
  198. # start and end base-pair coordinates of the region to plot
  199. region.start = 0
  200. region.end = 320000000
  201.  
  202.  
  203. temp.in <- read.table(paste(PATH, FILENAME, chr, "_standardized.out", sep=""), sep="\t", header = T)
  204. region.flag <- which(temp.in[,"position"] >= region.start & temp.in[,"position"] <= region.end)
  205.  
  206. plot(0, 1, xlab = paste0("Physical position (Mb) - ",chrPlot), ylab = "Standardized score", las = 1, type = "n", xlim = c(region.start, region.end)/10^6, ylim = c(min(varLD.threshold), max(temp.in[region.flag,"standardized_score"])), bty = "n", cex.lab = 1.4, cex.main = 1.4)
  207. points(temp.in[region.flag, "position"]/10^6, temp.in[region.flag, "standardized_score"], pch = 16, col = "red")
  208. n.length.threshold <- length(varLD.threshold)
  209. if (n.length.threshold >= 2){
  210.   for (i in 2:n.length.threshold){
  211.     abline(h = varLD.threshold[i], lty = 2)
  212.     text(region.start/10^6 + 0.2, varLD.threshold[i] + 0.15, labels = paste("Top ", round((1 - percentile.out[i]) * 100, 2), "%", sep=""))
  213.   }
  214. }
  215.  
  216.  
  217. }
  218.  
  219.  
  220.  
  221. ####
  222. # Full manhattan plot
  223. ####
  224. # read all files and join to one big file
  225. fullData <- NULL
  226. for (tmpCounter in 1:autosomeNr){
  227.  tmp <- read_tsv(str_c("varLD_chr",tmpCounter,"_standardized.out"), col_names = T)
  228.  tmp$chr <- tmpCounter
  229.  fullData <- rbind(fullData,tmp)
  230.  
  231. }
  232.  
  233. # select only the required columns
  234. forManhattan <- fullData %>%
  235.   select(chr, position, pop1, standardized_score)
  236.  
  237. # draw the manhattan plot with qqman
  238. library(qqman)
  239. manhattan(forManhattan, chr = "chr", bp = "position", p = "standardized_score", snp = "pop1", logp = F,
  240.           xlab = "Chromosomes", ylab="Standardized score", suggestiveline=varLD.threshold[3],
  241.           genomewideline = varLD.threshold[4])
  242.  
  243.  
Advertisement
RAW Paste Data Copied
Advertisement