Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #=========================Preparing GSE63990 data
- library(R.matlab)
- # Load data
- dat = readMat("~/Desktop/Thesis/Data/March/GSE63990/GSE63990.mat")
- #70 bacterial, 115 viral, 88 non-infectious illness = 273
- #table(dat$status) have to unlist first
- status = as.character(unlist(dat$status))
- status1 = as.character(unlist(dat$status1))
- rowlistsubset = as.character(unlist(dat$rowlistsubset)); length(rowlistsubset)
- rowlist = as.character(unlist(dat$rowlist)); length(rowlist)
- collist = as.character(unlist(dat$collist)); length(collist)
- collist1 = as.character(unlist(dat$collist1)); length(collist1)
- X = as.matrix(dat$X) #22,2777 genes x 273 observations
- row.names(X) = rowlist
- colnames(X) = collist
- dim(X)
- head(X)[270:273]
- #X is now the expression data: rows = probe IDs, columns = geoID#s
- #library("hgu133plus2.db")
- #library("hgu133a.db")
- library("hgu133a2.db")
- x = hgu133a2SYMBOL
- mapped_genes = mappedkeys(x)
- symbols = as.data.frame(x[mapped_genes])
- # rm(X)
- # Merge data
- X2 = merge(x = as.data.frame(X), y = symbols, by.x = "row.names", by.y ="probe_id")
- row.names(X2) = X2$Row.names ; X2 = X2[,-1]
- colnames(X2) = gsub(pattern = "symbol", replacement = "Gene.symbol", x = colnames(X2))
- # Clean duplicates
- X2_clean = clean_duplicates(tT = X2); X2_clean = X2_clean[,-grep(pattern = "Gene.symbol", x=colnames(X2_clean))]
- #Write non-transposed expression data to csv for future use
- #instead of write.csv, save X2_clean as object within an R data file called "GSE3990genes":
- #save(X2_clean, file = "~/Desktop/GSE63990genes.RData")
- #If had written X2_clean to .csv it's slower, can comapare times as follows:
- #system.time({X2_clean = read.csv("~/Desktop/GSE63990genes.csv", row.names="X")})
- #system.time({load("~/Desktop/GSE63990genes2.RData")})
- # To add signature scores: (no reason to do this here, better to add them to pheno data, as done below)
- X2clean = as.data.frame(t(X2_clean))
- X2clean$pv = rowSums(calc_pan_viral(X = X2clean, logtrans = F))
- X2clean$lab = calculate_SC_score(df = X2clean, microarray = T, log_t = F)
- X2clean$lab2 = calculate_SC_v2(df = X2clean, log_t = F)
- head(X2clean$pv)
- head(X2clean$lab2)
- #Prepare pheno data:
- #here are two datasets: one with bacteria and virus, another with bacteria and SIRS. missing the healthies!
- #This means that for COCONUT, the 0 = SIRS patients
- #First, import "GSE63990_Bact_vs_Viral_samples"
- class(GSE63990_Bact_vs_Viral_samples)
- GSE63990_Bact_vs_Viral_samples = as.data.frame(GSE63990_Bact_vs_Viral_samples)
- row.names(GSE63990_Bact_vs_Viral_samples) = GSE63990_Bact_vs_Viral_samples$geo_ID
- #First, import "GSE63990_samples"
- GSE63990_samples = as.data.frame(GSE63990_samples)
- row.names(GSE63990_samples) = GSE63990_samples$geo_ID
- pheno = merge(GSE63990_Bact_vs_Viral_samples, GSE63990_samples, by="geo_ID", all=TRUE)
- table(pheno$matlab.y) #70 bacterial, 88 non-infectious illness
- table(pheno$matlab.x) #70 bacterial, 115 viral
- levels(as.factor(pheno$matlab.x)) #checking for white space
- #70+115+88 = 273 - so can take the NAs from matlab.y and replace them with viral (or the NAs from matlab.x and replace with non-infectious illness)
- #is.na(pheno$matlab.y) <- "viral" doesn't work, need:
- pheno[c("matlab.y")][is.na(pheno[c("matlab.y")])] <- "viral"
- table(pheno$matlab.y) #Now we have our 70 bacterial + 115 viral + 88 non-infectious illness cases
- pheno$diagnosis = pheno$matlab.y #rename matlab.y as something more intuitive..."diagnosis"
- pheno$matlab.y = NULL
- pheno$matlab.x = NULL
- pheno$orig.x = NULL
- pheno$orig.y = NULL
- pheno$X1.x = NULL
- pheno$X1.y = NULL
- pheno$X1 = NULL
- #Now add a column for case (infection) vs control (non-infectious illness)
- pheno$caseorctrl = pheno$diagnosis #Make new column for the consolidated responses
- pheno$caseorctrl = factor(pheno$caseorctrl) #Specify it as a factor
- levels(pheno$caseorctrl) #check for white spaces
- levels(pheno$caseorctrl) = list("SIRS" = c("non-infectious illness"),
- "case" = c("bacterial", "viral"))
- table(pheno$caseorctr) #185 cases, 88 controls (non-infectious illness)
- pheno$matlab.x = NULL
- pheno$orig.x = NULL
- pheno$orig.y = NULL
- pheno$X1.x = NULL
- pheno$X1.y = NULL
- #write.csv(pheno, "~/Desktop/GSE63990pheno.csv") #if you want to save as CSV
- #save(pheno, X2_clean, file="~/Desktop/Thesis/Data/March/GSE63900.RData")
- #================== Data prep finished =========#
- #======================== Add scores
- genesT = data.frame(t(GSE63990genes))
- GSE63990pheno$lab = calculate_SC_score(df = genesT, microarray = T, log_t = F)
- GSE63990pheno$lab2 = calculate_SC_v2(df = genesT, log_t = F)
- GSE63990pheno$pv = rowSums(calc_pan_viral(X = genesT, logtrans = F))
- ###### Analysis - First plot the general cases...
- p11 = ggplot(GSE63990pheno, aes(x = diagnosis, y = pv, colour = diagnosis)) + geom_boxplot(outlier.size = NA) + geom_point(position = position_jitter(width = 0.4)) + labs(y = "Pan-Viral")
- p22 = ggplot(GSE63990pheno, aes(x = diagnosis, y = lab, colour = diagnosis)) + geom_boxplot(outlier.size = NA) + geom_point(position = position_jitter(width = 0.4)) + labs(y = "LAB V1")
- p33 = ggplot(GSE63990pheno, aes(x = diagnosis, y = lab2, colour = diagnosis)) + geom_boxplot(outlier.size = NA) + geom_point(position = position_jitter(width = 0.4)) + labs(y = "LAB V2")
- p11; p22; p33
- #Get all viral patients who have lab2 scores above 0:
- virals = subset(GSE63990pheno, diagnosis == "viral")
- above0 = subset(virals, pv >= 0)
- below0 = subset(virals, pv < 0)
- #96 captured / 115 total = 83% of virals were classified as viral
- nonvirals = subset(GSE63990pheno, diagnosis != "viral")
- above0 = subset(nonvirals, pv >= 0)
- below0 = subset(nonvirals, pv < 0)
- #107 true negatives
- #158 conditional negatives (#57 false positives)
- #107/158 true negative / conditional negative = 67% specificity
- #70 + 115virus + 88 = 273 total samples
- save(GSE63990genes, GSE63990pheno, file="~/Desktop/Thesis/Data/March/GSE63990.RData")
Add Comment
Please, Sign In to add comment