Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- setwd("D://mdr_project/ecoli_raw_fastq/e.coli_output/alignment_result/")
- library(tidyverse) # load required libraries
- library(viridis)
- library(ggridges)
- library(cowplot)
- library(RColorBrewer)
- arg_out <- read.table("../../output/alignment_result/argannot_result_all.txt") # read blastn output against arg-annot database
- dim(arg_out) # dimension of the output table
- outfmt <- "qseqid sseqid pident qlen slen sseq length mismatch gaps qstart qend sstart send evalue bitscore" # string of headers
- col_names <- unlist(strsplit(outfmt,split = " ")) # split the outfmt string to separate header for each column
- colnames(arg_out) <- col_names # add headers to each column
- dim(arg_out)
- split_col <- function(pattern, col, part){ # function for splitting a string into a vector according to a certain pattern
- col.chr <- as.character(col)
- lst <- sapply(strsplit(col.chr, split = pattern),`[`, part)
- vctr <- unlist(lst)
- return(vctr)
- }
- sample_id <- split_col("\\..*",arg_out$qseqid,1)
- accesion_no <- split_col(":",arg_out$sseqid,2) # extract accession number of each gene
- antibiotic_gene <- split_col(":",arg_out$sseqid,1) # extract (antibiotic class) gene_id
- gene_id <- gsub("^\\([a-zA-Z_]{3,9}\\)","",antibiotic_gene) # extract gene_id
- antibiotic <- gsub(").+$","",antibiotic_gene) ; antibiotic_class <- gsub("^.","",antibiotic) #extract antibiotic class
- gene_sample<-paste(gene_id,sample_id,sep = ":") # concatenate gene_id with sample_id
- arg_out<- cbind(sample_id, antibiotic_class,accesion_no,gene_id,gene_sample,arg_out) # add new columns to the output table
- dim(arg_out)
- arg_out_flt <- arg_out%>% # select rows wih perecnt-identity more than or equal to 97
- filter(pident>=90 & evalue <=1e-4)
- {ttl_smpl_rds <- nrow(arg_out_flt) # total number of reads
- unique_args<- unique(arg_out_flt[,c("gene_sample","slen","antibiotic_class")])
- nrow_unique <- nrow(unique_args)
- genes <- numeric(nrow_unique)
- sample_id <- numeric(nrow_unique)
- drug_class <- numeric(nrow_unique)
- coverage <- numeric(nrow_unique)
- gene_gaps <- numeric(nrow_unique)
- no_reads <- numeric(nrow_unique)
- gen_cp_num <- numeric(nrow_unique)
- hit_thsnd_rds <- numeric(nrow_unique)
- nrm_cnt <- numeric(nrow_unique)
- test <- numeric(nrow_unique)}
- #
- for (i in 1:nrow_unique){ # loop over all unique rows in gene_sample column
- gene <- as.character((unique_args$gene_sample)[i])
- rows <- which((arg_out_flt$gene_sample)==gene) # indecis of gene_sample column equal to gene
- genes[i] <- gene # add the unique gene_sample row to "genes" numeric
- positions <- vector() # initialize empty numeric
- seq_len <- unique_args$slen[i] # gene-sequence length
- antibiotic <- unique_args$antibiotic_class[i]
- for (n in rows){ # loop over all rows with gene_sample equal to gene
- start <- arg_out_flt$sstart[n] # start position of the alignment in gene-sequence
- end <- arg_out_flt$send[n] # end position of the alignment in the gene-sequence
- read <- start:end # generate a sequence of numbers from start to end
- positions <- append(positions,read) # add the sequence of numbers between the start and end of the read to "positions" numeric
- }
- drug_class[i] <- as.character(antibiotic) # add the antibiotic class of the gene to "drug_class" numeric
- no_reads[i] <- length(rows) # add the number of reads aligned to a gene to "no_reads" numeric
- total_positions <- length(unique(positions)) # get the length of gene covered by aligned reads (-gaps)
- coverage[i] <- (total_positions/seq_len)*100# calculate the proportion of covered sequences and add it to "coverage" numeric
- gen_cp_num[i] <- length(rows)/seq_len # calculate gene copy number by dividing the number of reads aligned to a gene over the length of the gene sequence
- hit_thsnd_rds[i] <- length(rows)/ttl_smpl_rds*1000 # calculate hits per thousand reads for a gene by dividing the number of reads aligned to this gene over the total number of reads
- }
- sapply(list(genes,drug_class,coverage,no_reads,gen_cp_num,hit_thsnd_rds),length) # confirm that numerics are of the same length
- gene_id <- split_col("\\:", genes, 1) # extract the id of genes
- sample_id <- split_col("\\:", genes, 2) # extract the sample id
- gene_cov_arg <- data.frame(sample_id,gene_id,drug_class,coverage,no_reads,gen_cp_num,hit_thsnd_rds) # combine the numerics into a dataframe
- dim(gene_cov_arg)
- arg_result <- gene_cov_arg%>% # select genes with coverage equal or more than 90
- filter(coverage >= 90)
- dim(arg_result)
- #rename the samples and antibiotic classes
- arg_result <- arg_result%>%
- rename(drug_abbrv = drug_class)%>% # rename the anitbiotic column
- mutate(gene_drug = paste(gene_id,drug_abbrv,sep = " : "))%>% # make a new column by concatenating both of the gene_id and drug class
- mutate(drug_class=case_when(drug_abbrv== "AGly" ~ "AMG", # replace the abbreviations with the drug full name
- drug_abbrv== "Sul" ~ "SUL", #sulphonamide
- drug_abbrv== "Bla" ~ "BLCTM", #beta-lactamase
- drug_abbrv== "Phe" ~ "PHEN", #phenicol
- drug_abbrv== "Tmt" ~ "TMP", #trimethoprim
- drug_abbrv== "MLS" ~ "MCLD", #macrolide
- drug_abbrv== "Flq" ~ "FLQ", #quinolone
- drug_abbrv== "Rif" ~ "RIF", #rifampicin
- drug_abbrv== "AGly_Flqn" ~ "AMG_QNL", #aminoglycoside_quinolone
- drug_abbrv== "Tet" ~ "TET")) #tetracycline
- arg_result$drug_class <- factor(arg_result$drug_class, levels = c("AMG","BLCTM","MCLD","PHEN","AMG_QNL","RIF","SUL","TET","TMP"))
- microdata <- read.csv("../../result/R_project/micro.csv")
- clindata <- read.csv("../../result/R_project/clinical.csv")
- clin_microdata <- inner_join(clindata[,!colnames(clindata)%in% c("MRN", "source","diagnosis")], microdata, c("date"="date"))
- arg_result <- left_join(arg_result,clin_microdata,c("sample_id"="sample_name"))
- # point plot distribution of gene copy number for each gene
- arg_result%>%
- ggplot(aes(x = log2(gen_cp_num+1), y = gene_id, fill = patient_order))+
- geom_point(size = 6, shape = 21, alpha = 0.8, stroke = 1)+
- scale_fill_manual(values = c('#e6194b', '#3cb44b', '#ffe119', '#4363d8', '#f58231', '#911eb4',
- '#46f0f0', '#f032e6', '#bcf60c', '#fabebe', '#008080', '#e6beff',
- '#9a6324', '#fffac8', '#800000', '#aaffc3', '#808000', '#ffd8b1',
- '#000075', '#808080', '#ffffff', '#000000'))+
- theme_minimal()+
- ggtitle("gene copy number of each gene across patients")
- # point plot distribution of gene coverage
- arg_result%>%
- ggplot(aes(x = coverage, y = gene_id,fill= patient_order))+
- geom_point(size = 6, shape = 21, alpha = 0.8, stroke = 1)+
- geom_jitter()+
- scale_fill_manual(values = c('#e6194b', '#3cb44b', '#ffe119', '#4363d8', '#f58231', '#911eb4',
- '#46f0f0', '#f032e6', '#bcf60c', '#fabebe', '#008080', '#e6beff',
- '#9a6324', '#fffac8', '#800000', '#aaffc3', '#808000', '#ffd8b1',
- '#000075', '#808080', '#ffffff', '#000000'))+
- theme_minimal()+
- ggtitle("coverage of each gene across patients")
- # horizontal plot of the proportion of drug classes in ech sample
- ggplot(data=arg_result, aes(x=patient_order, fill=drug_class)) +
- geom_bar(stat="count", position=position_fill())+
- scale_fill_brewer(palette="Spectral")+
- scale_y_continuous(expand = c(0.001, 0) )+
- coord_flip()+
- theme_minimal()+
- ggtitle("proportion of AMR genes of each patient ")+
- ylab( "proportion")
- #
- patients <- unique(as.data.frame(arg_result)[,c("patient","patient_order","location_abbrev","diagnosis_abbrev")])%>%
- arrange(patient_order)
- levels_cols <- levels(patients$location_abbrev)
- cols_type <-plyr::mapvalues(patients$location_abbrev,
- levels_cols,
- c('#e6194b', '#3cb44b', '#ffe119', '#4363d8', '#f58231',
- '#911eb4', '#46f0f0', '#f032e6', '#bcf60c'))
- drugs <- unique(as.data.frame(arg_result)[,c("drug_abbrv","gene_id")])%>%
- arrange(gene_id)
- levels_rows <- levels(drugs$drug_abbrv)
- rows_type <-plyr::mapvalues(drugs$drug_abbrv,
- levels_rows,
- c('#e6194b', '#3cb44b', '#ffe119', '#4363d8', '#f58231',
- '#911eb4', '#46f0f0', '#f032e6', '#bcf60c', '#fabebe'))
- arg_result%>%
- ungroup()%>%
- select(gene_id,patient_order,gen_cp_num)%>%
- spread(gene_id,gen_cp_num)%>%
- mutate_all(funs(replace(.,is.na(.),0)))%>%
- as.data.frame()%>%
- column_to_rownames("patient_order")%>%
- t()%>%
- heatmap(scale = "column", #scale the data column-wise
- cexCol = 1.5, #size of the x labels
- cexRow = 1)
- arg_result%>%
- ungroup()%>%
- count(patient_order,drug_class)%>%
- spread(drug_class,n)%>%
- mutate_all(funs(replace(.,is.na(.),0)))%>%
- as.data.frame()%>%
- column_to_rownames("patient_order")%>%
- t()%>%
- heatmap(scale = "column", #scale the data column-wise
- cexCol = 1.5, #size of the x labels
- cexRow = 1,
- col= colorRampPalette(brewer.pal(8, "Blues"))(25))
- library(gplots)
- library(RColorBrewer)
- cols <- brewer.pal(10, "Spectral")
- x <- arg_result %>%
- select(patient_order, gene_id, coverage) %>%
- spread(gene_id, coverage) %>%
- mutate_all(funs(replace(.,is.na(.),0))) %>%
- column_to_rownames("patient_order") %>%
- data.matrix()
- heatmap.2(x, Rowv = NULL,Colv = NULL,trace = "none", col = cols,cellnote =round(x))
- cols_2 <- c("blue","yellow", "red")
- cols_2 <- colorRampPalette(cols_2)(100)
- cols_3 <- rev(brewer.pal(11,"Spectral"))
- y <- gene_cov_arg %>%
- select(sample_id, gene_id, coverage) %>%
- spread(gene_id, coverage) %>%
- mutate_all(funs(replace(., is.na(.),0))) %>%
- column_to_rownames("sample_id") %>%
- data.matrix()
- heatmap.2(y, Rowv = NULL,Colv = NULL,trace = "none", col = cols_2)
- #heatmap.2(y, Rowv = NULL,Colv = NULL,trace = "none", col = cols_2, key = TRUE, lwid = c(1.5, 24), lhei = c(1.5, 40))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement