Untitled

setwd("D://mdr_project/ecoli_raw_fastq/e.coli_output/alignment_result/")
library(tidyverse) # load required libraries
library(viridis)
library(ggridges)
library(cowplot)
library(RColorBrewer)
arg_out <- read.table("../../output/alignment_result/argannot_result_all.txt") # read blastn output against arg-annot database
dim(arg_out) # dimension of the output table
outfmt <- "qseqid sseqid pident qlen slen sseq length mismatch gaps qstart qend sstart send evalue bitscore" # string of headers
col_names <- unlist(strsplit(outfmt,split = " ")) # split the outfmt string to separate header for each column
colnames(arg_out) <- col_names # add headers to each column
dim(arg_out)
split_col <- function(pattern, col, part){ # function for splitting a string into a vector according to a certain pattern
  col.chr <- as.character(col)
  lst <- sapply(strsplit(col.chr, split = pattern),`[`, part)
  vctr <- unlist(lst)
  return(vctr)
}

sample_id <- split_col("\\..*",arg_out$qseqid,1)
accesion_no <- split_col(":",arg_out$sseqid,2) # extract accession number of each gene
antibiotic_gene <- split_col(":",arg_out$sseqid,1) # extract (antibiotic class) gene_id
gene_id <- gsub("^\\([a-zA-Z_]{3,9}\\)","",antibiotic_gene) # extract gene_id
antibiotic <- gsub(").+$","",antibiotic_gene) ; antibiotic_class <- gsub("^.","",antibiotic) #extract antibiotic class
gene_sample<-paste(gene_id,sample_id,sep = ":") # concatenate gene_id with sample_id
arg_out<- cbind(sample_id, antibiotic_class,accesion_no,gene_id,gene_sample,arg_out) # add new columns to the output table
dim(arg_out)

arg_out_flt <- arg_out%>% # select rows wih perecnt-identity more than or equal to 97
  filter(pident>=90 & evalue <=1e-4)

{ttl_smpl_rds <- nrow(arg_out_flt) # total number of reads
  unique_args<- unique(arg_out_flt[,c("gene_sample","slen","antibiotic_class")])
  nrow_unique <- nrow(unique_args)
  genes <- numeric(nrow_unique)
  sample_id <- numeric(nrow_unique)
  drug_class <- numeric(nrow_unique)
  coverage <- numeric(nrow_unique)
  gene_gaps <- numeric(nrow_unique)
  no_reads <- numeric(nrow_unique)
  gen_cp_num <- numeric(nrow_unique)
  hit_thsnd_rds <- numeric(nrow_unique)
  nrm_cnt <- numeric(nrow_unique)
  test <- numeric(nrow_unique)}
#
for (i in 1:nrow_unique){ # loop over all unique rows in gene_sample column
    gene <- as.character((unique_args$gene_sample)[i])
    rows <- which((arg_out_flt$gene_sample)==gene) # indecis of gene_sample column equal to gene
    genes[i] <- gene # add the  unique gene_sample row to "genes" numeric
    positions <- vector() # initialize empty numeric
    seq_len <- unique_args$slen[i] # gene-sequence length
    antibiotic <- unique_args$antibiotic_class[i]
    for (n in rows){ # loop over all rows with gene_sample equal to gene
      start <- arg_out_flt$sstart[n] # start position of the alignment in gene-sequence
      end <- arg_out_flt$send[n] # end position of the alignment in the gene-sequence
      read <- start:end # generate a sequence of numbers from start to end
      positions <- append(positions,read) # add the sequence of numbers between the start and end of the read to "positions" numeric
    }
    drug_class[i] <- as.character(antibiotic) # add the antibiotic class of the gene to "drug_class" numeric
    no_reads[i] <- length(rows) # add the number of reads aligned to a gene to "no_reads" numeric
    total_positions <- length(unique(positions)) # get the length of gene covered by aligned reads (-gaps)
    coverage[i] <- (total_positions/seq_len)*100# calculate the proportion of covered sequences and add it to "coverage" numeric
    gen_cp_num[i] <- length(rows)/seq_len # calculate gene copy number by dividing the number of reads aligned to a gene over the length of the gene sequence
    hit_thsnd_rds[i] <- length(rows)/ttl_smpl_rds*1000 # calculate hits per thousand reads for a gene by dividing the number of reads aligned to this gene over the total number of reads
  }
sapply(list(genes,drug_class,coverage,no_reads,gen_cp_num,hit_thsnd_rds),length) # confirm that numerics are of the same length
gene_id <- split_col("\\:", genes, 1) # extract the id of genes
sample_id <- split_col("\\:", genes, 2) # extract the sample id
gene_cov_arg <- data.frame(sample_id,gene_id,drug_class,coverage,no_reads,gen_cp_num,hit_thsnd_rds) # combine the numerics into a dataframe
dim(gene_cov_arg)

arg_result <- gene_cov_arg%>% # select genes with coverage equal or more than 90
  filter(coverage >= 90)

dim(arg_result)

#rename the samples and antibiotic classes
arg_result <- arg_result%>%
  rename(drug_abbrv = drug_class)%>% # rename the anitbiotic column
  mutate(gene_drug = paste(gene_id,drug_abbrv,sep = " : "))%>% # make a new column by concatenating both of the gene_id and drug class
  mutate(drug_class=case_when(drug_abbrv== "AGly" ~ "AMG", # replace the abbreviations with the drug full name
                              drug_abbrv== "Sul" ~ "SUL", #sulphonamide
                              drug_abbrv== "Bla" ~ "BLCTM", #beta-lactamase
                              drug_abbrv== "Phe" ~ "PHEN", #phenicol
                              drug_abbrv== "Tmt" ~ "TMP", #trimethoprim
                              drug_abbrv== "MLS" ~ "MCLD", #macrolide
                              drug_abbrv== "Flq" ~ "FLQ", #quinolone
                              drug_abbrv== "Rif" ~ "RIF", #rifampicin
                              drug_abbrv== "AGly_Flqn" ~ "AMG_QNL", #aminoglycoside_quinolone
                              drug_abbrv== "Tet" ~ "TET")) #tetracycline
arg_result$drug_class <- factor(arg_result$drug_class, levels = c("AMG","BLCTM","MCLD","PHEN","AMG_QNL","RIF","SUL","TET","TMP"))
microdata <- read.csv("../../result/R_project/micro.csv")
clindata <- read.csv("../../result/R_project/clinical.csv")
clin_microdata <-  inner_join(clindata[,!colnames(clindata)%in% c("MRN", "source","diagnosis")], microdata, c("date"="date"))
arg_result <- left_join(arg_result,clin_microdata,c("sample_id"="sample_name"))

# point plot distribution of gene copy number for each gene
arg_result%>%
  ggplot(aes(x = log2(gen_cp_num+1), y = gene_id, fill = patient_order))+
  geom_point(size = 6, shape = 21, alpha = 0.8, stroke = 1)+
  scale_fill_manual(values = c('#e6194b', '#3cb44b', '#ffe119', '#4363d8', '#f58231', '#911eb4',
                               '#46f0f0', '#f032e6', '#bcf60c', '#fabebe', '#008080', '#e6beff',
                               '#9a6324', '#fffac8', '#800000', '#aaffc3', '#808000', '#ffd8b1',
                               '#000075', '#808080', '#ffffff', '#000000'))+
  theme_minimal()+
  ggtitle("gene copy number of each gene across patients")

# point plot distribution of gene coverage
arg_result%>%
  ggplot(aes(x = coverage, y = gene_id,fill= patient_order))+
  geom_point(size = 6, shape = 21, alpha = 0.8, stroke = 1)+
  geom_jitter()+
  scale_fill_manual(values = c('#e6194b', '#3cb44b', '#ffe119', '#4363d8', '#f58231', '#911eb4',
                               '#46f0f0', '#f032e6', '#bcf60c', '#fabebe', '#008080', '#e6beff',
                               '#9a6324', '#fffac8', '#800000', '#aaffc3', '#808000', '#ffd8b1',
                               '#000075', '#808080', '#ffffff', '#000000'))+
  theme_minimal()+
  ggtitle("coverage of each gene across patients")

# horizontal plot of the proportion of drug classes in ech sample
ggplot(data=arg_result, aes(x=patient_order, fill=drug_class)) +
  geom_bar(stat="count", position=position_fill())+
  scale_fill_brewer(palette="Spectral")+
  scale_y_continuous(expand = c(0.001, 0) )+
  coord_flip()+
  theme_minimal()+
  ggtitle("proportion of AMR genes of each patient ")+
  ylab( "proportion")

#
patients <- unique(as.data.frame(arg_result)[,c("patient","patient_order","location_abbrev","diagnosis_abbrev")])%>%
  arrange(patient_order)

levels_cols <- levels(patients$location_abbrev)
cols_type <-plyr::mapvalues(patients$location_abbrev,
                            levels_cols,
                            c('#e6194b', '#3cb44b', '#ffe119', '#4363d8', '#f58231',
                              '#911eb4', '#46f0f0', '#f032e6', '#bcf60c'))
drugs <- unique(as.data.frame(arg_result)[,c("drug_abbrv","gene_id")])%>%
  arrange(gene_id)

levels_rows <- levels(drugs$drug_abbrv)
rows_type <-plyr::mapvalues(drugs$drug_abbrv,
                            levels_rows,
                            c('#e6194b', '#3cb44b', '#ffe119', '#4363d8', '#f58231',
                              '#911eb4', '#46f0f0', '#f032e6', '#bcf60c', '#fabebe'))

arg_result%>%
  ungroup()%>%
  select(gene_id,patient_order,gen_cp_num)%>%
  spread(gene_id,gen_cp_num)%>%
  mutate_all(funs(replace(.,is.na(.),0)))%>%
  as.data.frame()%>%
  column_to_rownames("patient_order")%>%
  t()%>%
  heatmap(scale = "column", #scale the data column-wise
          cexCol = 1.5, #size of the x labels
          cexRow = 1)

arg_result%>%
  ungroup()%>%
  count(patient_order,drug_class)%>%
  spread(drug_class,n)%>%
  mutate_all(funs(replace(.,is.na(.),0)))%>%
  as.data.frame()%>%
  column_to_rownames("patient_order")%>%
  t()%>%
  heatmap(scale = "column", #scale the data column-wise
          cexCol = 1.5, #size of the x labels
          cexRow = 1,
          col= colorRampPalette(brewer.pal(8, "Blues"))(25))

library(gplots)

library(RColorBrewer)
cols <- brewer.pal(10, "Spectral")
x <- arg_result %>%
  select(patient_order, gene_id, coverage) %>%
  spread(gene_id, coverage) %>%
  mutate_all(funs(replace(.,is.na(.),0))) %>%
  column_to_rownames("patient_order") %>%
  data.matrix()
heatmap.2(x, Rowv = NULL,Colv = NULL,trace = "none", col = cols,cellnote =round(x))

cols_2 <- c("blue","yellow", "red")
cols_2 <- colorRampPalette(cols_2)(100)
cols_3 <- rev(brewer.pal(11,"Spectral"))

y <- gene_cov_arg %>%
  select(sample_id, gene_id, coverage) %>%
  spread(gene_id, coverage) %>%
  mutate_all(funs(replace(., is.na(.),0))) %>%
  column_to_rownames("sample_id") %>%
  data.matrix()

heatmap.2(y, Rowv = NULL,Colv = NULL,trace = "none", col = cols_2)

#heatmap.2(y, Rowv = NULL,Colv = NULL,trace = "none", col = cols_2, key = TRUE, lwid = c(1.5, 24), lhei = c(1.5, 40))