Advertisement
Guest User

Untitled

a guest
Feb 23rd, 2019
179
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 9.89 KB | None | 0 0
  1. setwd("D://mdr_project/ecoli_raw_fastq/e.coli_output/alignment_result/")
  2. library(tidyverse) # load required libraries
  3. library(viridis)
  4. library(ggridges)
  5. library(cowplot)
  6. library(RColorBrewer)
  7. arg_out <- read.table("../../output/alignment_result/argannot_result_all.txt") # read blastn output against arg-annot database
  8. dim(arg_out) # dimension of the output table
  9. outfmt <- "qseqid sseqid pident qlen slen sseq length mismatch gaps qstart qend sstart send evalue bitscore" # string of headers
  10. col_names <- unlist(strsplit(outfmt,split = " ")) # split the outfmt string to separate header for each column
  11. colnames(arg_out) <- col_names # add headers to each column
  12. dim(arg_out)
  13. split_col <- function(pattern, col, part){ # function for splitting a string into a vector according to a certain pattern
  14. col.chr <- as.character(col)
  15. lst <- sapply(strsplit(col.chr, split = pattern),`[`, part)
  16. vctr <- unlist(lst)
  17. return(vctr)
  18. }
  19.  
  20. sample_id <- split_col("\\..*",arg_out$qseqid,1)
  21. accesion_no <- split_col(":",arg_out$sseqid,2) # extract accession number of each gene
  22. antibiotic_gene <- split_col(":",arg_out$sseqid,1) # extract (antibiotic class) gene_id
  23. gene_id <- gsub("^\\([a-zA-Z_]{3,9}\\)","",antibiotic_gene) # extract gene_id
  24. antibiotic <- gsub(").+$","",antibiotic_gene) ; antibiotic_class <- gsub("^.","",antibiotic) #extract antibiotic class
  25. gene_sample<-paste(gene_id,sample_id,sep = ":") # concatenate gene_id with sample_id
  26. arg_out<- cbind(sample_id, antibiotic_class,accesion_no,gene_id,gene_sample,arg_out) # add new columns to the output table
  27. dim(arg_out)
  28.  
  29. arg_out_flt <- arg_out%>% # select rows wih perecnt-identity more than or equal to 97
  30. filter(pident>=90 & evalue <=1e-4)
  31.  
  32. {ttl_smpl_rds <- nrow(arg_out_flt) # total number of reads
  33. unique_args<- unique(arg_out_flt[,c("gene_sample","slen","antibiotic_class")])
  34. nrow_unique <- nrow(unique_args)
  35. genes <- numeric(nrow_unique)
  36. sample_id <- numeric(nrow_unique)
  37. drug_class <- numeric(nrow_unique)
  38. coverage <- numeric(nrow_unique)
  39. gene_gaps <- numeric(nrow_unique)
  40. no_reads <- numeric(nrow_unique)
  41. gen_cp_num <- numeric(nrow_unique)
  42. hit_thsnd_rds <- numeric(nrow_unique)
  43. nrm_cnt <- numeric(nrow_unique)
  44. test <- numeric(nrow_unique)}
  45. #
  46. for (i in 1:nrow_unique){ # loop over all unique rows in gene_sample column
  47. gene <- as.character((unique_args$gene_sample)[i])
  48. rows <- which((arg_out_flt$gene_sample)==gene) # indecis of gene_sample column equal to gene
  49. genes[i] <- gene # add the unique gene_sample row to "genes" numeric
  50. positions <- vector() # initialize empty numeric
  51. seq_len <- unique_args$slen[i] # gene-sequence length
  52. antibiotic <- unique_args$antibiotic_class[i]
  53. for (n in rows){ # loop over all rows with gene_sample equal to gene
  54. start <- arg_out_flt$sstart[n] # start position of the alignment in gene-sequence
  55. end <- arg_out_flt$send[n] # end position of the alignment in the gene-sequence
  56. read <- start:end # generate a sequence of numbers from start to end
  57. positions <- append(positions,read) # add the sequence of numbers between the start and end of the read to "positions" numeric
  58. }
  59. drug_class[i] <- as.character(antibiotic) # add the antibiotic class of the gene to "drug_class" numeric
  60. no_reads[i] <- length(rows) # add the number of reads aligned to a gene to "no_reads" numeric
  61. total_positions <- length(unique(positions)) # get the length of gene covered by aligned reads (-gaps)
  62. coverage[i] <- (total_positions/seq_len)*100# calculate the proportion of covered sequences and add it to "coverage" numeric
  63. gen_cp_num[i] <- length(rows)/seq_len # calculate gene copy number by dividing the number of reads aligned to a gene over the length of the gene sequence
  64. hit_thsnd_rds[i] <- length(rows)/ttl_smpl_rds*1000 # calculate hits per thousand reads for a gene by dividing the number of reads aligned to this gene over the total number of reads
  65. }
  66. sapply(list(genes,drug_class,coverage,no_reads,gen_cp_num,hit_thsnd_rds),length) # confirm that numerics are of the same length
  67. gene_id <- split_col("\\:", genes, 1) # extract the id of genes
  68. sample_id <- split_col("\\:", genes, 2) # extract the sample id
  69. gene_cov_arg <- data.frame(sample_id,gene_id,drug_class,coverage,no_reads,gen_cp_num,hit_thsnd_rds) # combine the numerics into a dataframe
  70. dim(gene_cov_arg)
  71.  
  72. arg_result <- gene_cov_arg%>% # select genes with coverage equal or more than 90
  73. filter(coverage >= 90)
  74.  
  75. dim(arg_result)
  76.  
  77. #rename the samples and antibiotic classes
  78. arg_result <- arg_result%>%
  79. rename(drug_abbrv = drug_class)%>% # rename the anitbiotic column
  80. mutate(gene_drug = paste(gene_id,drug_abbrv,sep = " : "))%>% # make a new column by concatenating both of the gene_id and drug class
  81. mutate(drug_class=case_when(drug_abbrv== "AGly" ~ "AMG", # replace the abbreviations with the drug full name
  82. drug_abbrv== "Sul" ~ "SUL", #sulphonamide
  83. drug_abbrv== "Bla" ~ "BLCTM", #beta-lactamase
  84. drug_abbrv== "Phe" ~ "PHEN", #phenicol
  85. drug_abbrv== "Tmt" ~ "TMP", #trimethoprim
  86. drug_abbrv== "MLS" ~ "MCLD", #macrolide
  87. drug_abbrv== "Flq" ~ "FLQ", #quinolone
  88. drug_abbrv== "Rif" ~ "RIF", #rifampicin
  89. drug_abbrv== "AGly_Flqn" ~ "AMG_QNL", #aminoglycoside_quinolone
  90. drug_abbrv== "Tet" ~ "TET")) #tetracycline
  91. arg_result$drug_class <- factor(arg_result$drug_class, levels = c("AMG","BLCTM","MCLD","PHEN","AMG_QNL","RIF","SUL","TET","TMP"))
  92. microdata <- read.csv("../../result/R_project/micro.csv")
  93. clindata <- read.csv("../../result/R_project/clinical.csv")
  94. clin_microdata <- inner_join(clindata[,!colnames(clindata)%in% c("MRN", "source","diagnosis")], microdata, c("date"="date"))
  95. arg_result <- left_join(arg_result,clin_microdata,c("sample_id"="sample_name"))
  96.  
  97. # point plot distribution of gene copy number for each gene
  98. arg_result%>%
  99. ggplot(aes(x = log2(gen_cp_num+1), y = gene_id, fill = patient_order))+
  100. geom_point(size = 6, shape = 21, alpha = 0.8, stroke = 1)+
  101. scale_fill_manual(values = c('#e6194b', '#3cb44b', '#ffe119', '#4363d8', '#f58231', '#911eb4',
  102. '#46f0f0', '#f032e6', '#bcf60c', '#fabebe', '#008080', '#e6beff',
  103. '#9a6324', '#fffac8', '#800000', '#aaffc3', '#808000', '#ffd8b1',
  104. '#000075', '#808080', '#ffffff', '#000000'))+
  105. theme_minimal()+
  106. ggtitle("gene copy number of each gene across patients")
  107.  
  108. # point plot distribution of gene coverage
  109. arg_result%>%
  110. ggplot(aes(x = coverage, y = gene_id,fill= patient_order))+
  111. geom_point(size = 6, shape = 21, alpha = 0.8, stroke = 1)+
  112. geom_jitter()+
  113. scale_fill_manual(values = c('#e6194b', '#3cb44b', '#ffe119', '#4363d8', '#f58231', '#911eb4',
  114. '#46f0f0', '#f032e6', '#bcf60c', '#fabebe', '#008080', '#e6beff',
  115. '#9a6324', '#fffac8', '#800000', '#aaffc3', '#808000', '#ffd8b1',
  116. '#000075', '#808080', '#ffffff', '#000000'))+
  117. theme_minimal()+
  118. ggtitle("coverage of each gene across patients")
  119.  
  120. # horizontal plot of the proportion of drug classes in ech sample
  121. ggplot(data=arg_result, aes(x=patient_order, fill=drug_class)) +
  122. geom_bar(stat="count", position=position_fill())+
  123. scale_fill_brewer(palette="Spectral")+
  124. scale_y_continuous(expand = c(0.001, 0) )+
  125. coord_flip()+
  126. theme_minimal()+
  127. ggtitle("proportion of AMR genes of each patient ")+
  128. ylab( "proportion")
  129.  
  130. #
  131. patients <- unique(as.data.frame(arg_result)[,c("patient","patient_order","location_abbrev","diagnosis_abbrev")])%>%
  132. arrange(patient_order)
  133.  
  134. levels_cols <- levels(patients$location_abbrev)
  135. cols_type <-plyr::mapvalues(patients$location_abbrev,
  136. levels_cols,
  137. c('#e6194b', '#3cb44b', '#ffe119', '#4363d8', '#f58231',
  138. '#911eb4', '#46f0f0', '#f032e6', '#bcf60c'))
  139. drugs <- unique(as.data.frame(arg_result)[,c("drug_abbrv","gene_id")])%>%
  140. arrange(gene_id)
  141.  
  142. levels_rows <- levels(drugs$drug_abbrv)
  143. rows_type <-plyr::mapvalues(drugs$drug_abbrv,
  144. levels_rows,
  145. c('#e6194b', '#3cb44b', '#ffe119', '#4363d8', '#f58231',
  146. '#911eb4', '#46f0f0', '#f032e6', '#bcf60c', '#fabebe'))
  147.  
  148. arg_result%>%
  149. ungroup()%>%
  150. select(gene_id,patient_order,gen_cp_num)%>%
  151. spread(gene_id,gen_cp_num)%>%
  152. mutate_all(funs(replace(.,is.na(.),0)))%>%
  153. as.data.frame()%>%
  154. column_to_rownames("patient_order")%>%
  155. t()%>%
  156. heatmap(scale = "column", #scale the data column-wise
  157. cexCol = 1.5, #size of the x labels
  158. cexRow = 1)
  159.  
  160. arg_result%>%
  161. ungroup()%>%
  162. count(patient_order,drug_class)%>%
  163. spread(drug_class,n)%>%
  164. mutate_all(funs(replace(.,is.na(.),0)))%>%
  165. as.data.frame()%>%
  166. column_to_rownames("patient_order")%>%
  167. t()%>%
  168. heatmap(scale = "column", #scale the data column-wise
  169. cexCol = 1.5, #size of the x labels
  170. cexRow = 1,
  171. col= colorRampPalette(brewer.pal(8, "Blues"))(25))
  172.  
  173. library(gplots)
  174.  
  175. library(RColorBrewer)
  176. cols <- brewer.pal(10, "Spectral")
  177. x <- arg_result %>%
  178. select(patient_order, gene_id, coverage) %>%
  179. spread(gene_id, coverage) %>%
  180. mutate_all(funs(replace(.,is.na(.),0))) %>%
  181. column_to_rownames("patient_order") %>%
  182. data.matrix()
  183. heatmap.2(x, Rowv = NULL,Colv = NULL,trace = "none", col = cols,cellnote =round(x))
  184.  
  185. cols_2 <- c("blue","yellow", "red")
  186. cols_2 <- colorRampPalette(cols_2)(100)
  187. cols_3 <- rev(brewer.pal(11,"Spectral"))
  188.  
  189. y <- gene_cov_arg %>%
  190. select(sample_id, gene_id, coverage) %>%
  191. spread(gene_id, coverage) %>%
  192. mutate_all(funs(replace(., is.na(.),0))) %>%
  193. column_to_rownames("sample_id") %>%
  194. data.matrix()
  195.  
  196. heatmap.2(y, Rowv = NULL,Colv = NULL,trace = "none", col = cols_2)
  197.  
  198. #heatmap.2(y, Rowv = NULL,Colv = NULL,trace = "none", col = cols_2, key = TRUE, lwid = c(1.5, 24), lhei = c(1.5, 40))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement