Untitled

#=========================Preparing GSE63990 data
library(R.matlab)
# Load data
dat = readMat("~/Desktop/Thesis/Data/March/GSE63990/GSE63990.mat")
#70 bacterial, 115 viral, 88 non-infectious illness = 273
#table(dat$status) have to unlist first
status = as.character(unlist(dat$status))
status1 = as.character(unlist(dat$status1))
rowlistsubset = as.character(unlist(dat$rowlistsubset)); length(rowlistsubset)
rowlist = as.character(unlist(dat$rowlist)); length(rowlist)
collist = as.character(unlist(dat$collist)); length(collist)
collist1 = as.character(unlist(dat$collist1)); length(collist1)

X = as.matrix(dat$X) #22,2777 genes x 273 observations
row.names(X) = rowlist
colnames(X) = collist
dim(X)
head(X)[270:273]
#X is now the expression data: rows = probe IDs, columns = geoID#s

#library("hgu133plus2.db")
#library("hgu133a.db")
library("hgu133a2.db")
x = hgu133a2SYMBOL
mapped_genes = mappedkeys(x)
symbols = as.data.frame(x[mapped_genes])

# rm(X)
# Merge data
X2 = merge(x = as.data.frame(X), y = symbols, by.x = "row.names", by.y ="probe_id")
row.names(X2) = X2$Row.names ;  X2 = X2[,-1]
colnames(X2) = gsub(pattern = "symbol", replacement = "Gene.symbol", x = colnames(X2))

# Clean duplicates
X2_clean = clean_duplicates(tT = X2); X2_clean = X2_clean[,-grep(pattern = "Gene.symbol", x=colnames(X2_clean))]
#Write non-transposed expression data to csv for future use
#instead of write.csv, save X2_clean as object within an R data file called "GSE3990genes":
#save(X2_clean, file = "~/Desktop/GSE63990genes.RData")
#If had written X2_clean to .csv it's slower, can comapare times as follows:
#system.time({X2_clean = read.csv("~/Desktop/GSE63990genes.csv", row.names="X")})
#system.time({load("~/Desktop/GSE63990genes2.RData")})

# To add signature scores: (no reason to do this here, better to add them to pheno data, as done below)
X2clean = as.data.frame(t(X2_clean))
X2clean$pv = rowSums(calc_pan_viral(X = X2clean, logtrans = F))
X2clean$lab = calculate_SC_score(df = X2clean, microarray = T, log_t = F)
X2clean$lab2 = calculate_SC_v2(df = X2clean, log_t = F)
head(X2clean$pv)
head(X2clean$lab2)
#Prepare pheno data:
#here are two datasets: one with bacteria and virus, another with bacteria and SIRS. missing the healthies!
#This means that for COCONUT, the 0 = SIRS patients
#First, import "GSE63990_Bact_vs_Viral_samples"
class(GSE63990_Bact_vs_Viral_samples)
GSE63990_Bact_vs_Viral_samples = as.data.frame(GSE63990_Bact_vs_Viral_samples)
row.names(GSE63990_Bact_vs_Viral_samples) = GSE63990_Bact_vs_Viral_samples$geo_ID

#First, import "GSE63990_samples"
GSE63990_samples = as.data.frame(GSE63990_samples)
row.names(GSE63990_samples) = GSE63990_samples$geo_ID

pheno = merge(GSE63990_Bact_vs_Viral_samples, GSE63990_samples, by="geo_ID", all=TRUE)
table(pheno$matlab.y) #70 bacterial, 88 non-infectious illness
table(pheno$matlab.x) #70 bacterial, 115 viral
levels(as.factor(pheno$matlab.x)) #checking for white space
#70+115+88 = 273 - so can take the NAs from matlab.y and replace them with viral (or the NAs from matlab.x and replace with non-infectious illness)
#is.na(pheno$matlab.y) <- "viral" doesn't work, need:
pheno[c("matlab.y")][is.na(pheno[c("matlab.y")])] <- "viral"
table(pheno$matlab.y) #Now we have our 70 bacterial + 115 viral + 88 non-infectious illness cases
pheno$diagnosis = pheno$matlab.y #rename matlab.y as something more intuitive..."diagnosis"
pheno$matlab.y = NULL
pheno$matlab.x = NULL
pheno$orig.x = NULL
pheno$orig.y = NULL
pheno$X1.x = NULL
pheno$X1.y = NULL
pheno$X1 = NULL


#Now add a column for case (infection) vs control (non-infectious illness)
pheno$caseorctrl = pheno$diagnosis #Make new column for the consolidated responses
pheno$caseorctrl = factor(pheno$caseorctrl) #Specify it as a factor
levels(pheno$caseorctrl) #check for white spaces
levels(pheno$caseorctrl) = list("SIRS" = c("non-infectious illness"),
                            "case" = c("bacterial", "viral"))
table(pheno$caseorctr) #185 cases, 88 controls (non-infectious illness)

pheno$matlab.x = NULL
pheno$orig.x = NULL
pheno$orig.y = NULL
pheno$X1.x = NULL
pheno$X1.y = NULL


#write.csv(pheno, "~/Desktop/GSE63990pheno.csv") #if you want to save as CSV
#save(pheno, X2_clean, file="~/Desktop/Thesis/Data/March/GSE63900.RData")
#================== Data prep finished =========#


#========================  Add scores
genesT = data.frame(t(GSE63990genes))
GSE63990pheno$lab = calculate_SC_score(df = genesT, microarray = T, log_t = F)
GSE63990pheno$lab2 = calculate_SC_v2(df = genesT, log_t = F)
GSE63990pheno$pv = rowSums(calc_pan_viral(X = genesT, logtrans = F))
###### Analysis - First plot the general cases...
p11 = ggplot(GSE63990pheno, aes(x = diagnosis, y = pv, colour = diagnosis)) + geom_boxplot(outlier.size = NA) + geom_point(position = position_jitter(width = 0.4)) + labs(y = "Pan-Viral")
p22 = ggplot(GSE63990pheno, aes(x = diagnosis, y = lab, colour = diagnosis)) + geom_boxplot(outlier.size = NA) + geom_point(position = position_jitter(width = 0.4)) + labs(y = "LAB V1")
p33 = ggplot(GSE63990pheno, aes(x = diagnosis, y = lab2, colour = diagnosis)) + geom_boxplot(outlier.size = NA) + geom_point(position = position_jitter(width = 0.4)) + labs(y = "LAB V2")
p11; p22; p33
#Get all viral patients who have lab2 scores above 0:
virals = subset(GSE63990pheno, diagnosis == "viral")
above0 = subset(virals, pv >= 0)
below0 = subset(virals, pv < 0)
#96 captured / 115 total = 83% of virals were classified as viral

nonvirals = subset(GSE63990pheno, diagnosis != "viral")
above0 = subset(nonvirals, pv >= 0)
below0 = subset(nonvirals, pv < 0)
#107 true negatives
#158 conditional negatives (#57 false positives)
#107/158 true negative / conditional negative = 67% specificity

#70 + 115virus + 88 = 273 total samples

save(GSE63990genes, GSE63990pheno, file="~/Desktop/Thesis/Data/March/GSE63990.RData")