Guest User

Untitled

a guest
Aug 19th, 2018
68
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 5.72 KB | None | 0 0
  1. #=========================Preparing GSE63990 data
  2. library(R.matlab)
  3. # Load data
  4. dat = readMat("~/Desktop/Thesis/Data/March/GSE63990/GSE63990.mat")
  5. #70 bacterial, 115 viral, 88 non-infectious illness = 273
  6. #table(dat$status) have to unlist first
  7. status = as.character(unlist(dat$status))
  8. status1 = as.character(unlist(dat$status1))
  9. rowlistsubset = as.character(unlist(dat$rowlistsubset)); length(rowlistsubset)
  10. rowlist = as.character(unlist(dat$rowlist)); length(rowlist)
  11. collist = as.character(unlist(dat$collist)); length(collist)
  12. collist1 = as.character(unlist(dat$collist1)); length(collist1)
  13.  
  14. X = as.matrix(dat$X) #22,2777 genes x 273 observations
  15. row.names(X) = rowlist
  16. colnames(X) = collist
  17. dim(X)
  18. head(X)[270:273]
  19. #X is now the expression data: rows = probe IDs, columns = geoID#s
  20.  
  21. #library("hgu133plus2.db")
  22. #library("hgu133a.db")
  23. library("hgu133a2.db")
  24. x = hgu133a2SYMBOL
  25. mapped_genes = mappedkeys(x)
  26. symbols = as.data.frame(x[mapped_genes])
  27.  
  28. # rm(X)
  29. # Merge data
  30. X2 = merge(x = as.data.frame(X), y = symbols, by.x = "row.names", by.y ="probe_id")
  31. row.names(X2) = X2$Row.names ; X2 = X2[,-1]
  32. colnames(X2) = gsub(pattern = "symbol", replacement = "Gene.symbol", x = colnames(X2))
  33.  
  34. # Clean duplicates
  35. X2_clean = clean_duplicates(tT = X2); X2_clean = X2_clean[,-grep(pattern = "Gene.symbol", x=colnames(X2_clean))]
  36. #Write non-transposed expression data to csv for future use
  37. #instead of write.csv, save X2_clean as object within an R data file called "GSE3990genes":
  38. #save(X2_clean, file = "~/Desktop/GSE63990genes.RData")
  39. #If had written X2_clean to .csv it's slower, can comapare times as follows:
  40. #system.time({X2_clean = read.csv("~/Desktop/GSE63990genes.csv", row.names="X")})
  41. #system.time({load("~/Desktop/GSE63990genes2.RData")})
  42.  
  43. # To add signature scores: (no reason to do this here, better to add them to pheno data, as done below)
  44. X2clean = as.data.frame(t(X2_clean))
  45. X2clean$pv = rowSums(calc_pan_viral(X = X2clean, logtrans = F))
  46. X2clean$lab = calculate_SC_score(df = X2clean, microarray = T, log_t = F)
  47. X2clean$lab2 = calculate_SC_v2(df = X2clean, log_t = F)
  48. head(X2clean$pv)
  49. head(X2clean$lab2)
  50. #Prepare pheno data:
  51. #here are two datasets: one with bacteria and virus, another with bacteria and SIRS. missing the healthies!
  52. #This means that for COCONUT, the 0 = SIRS patients
  53. #First, import "GSE63990_Bact_vs_Viral_samples"
  54. class(GSE63990_Bact_vs_Viral_samples)
  55. GSE63990_Bact_vs_Viral_samples = as.data.frame(GSE63990_Bact_vs_Viral_samples)
  56. row.names(GSE63990_Bact_vs_Viral_samples) = GSE63990_Bact_vs_Viral_samples$geo_ID
  57.  
  58. #First, import "GSE63990_samples"
  59. GSE63990_samples = as.data.frame(GSE63990_samples)
  60. row.names(GSE63990_samples) = GSE63990_samples$geo_ID
  61.  
  62. pheno = merge(GSE63990_Bact_vs_Viral_samples, GSE63990_samples, by="geo_ID", all=TRUE)
  63. table(pheno$matlab.y) #70 bacterial, 88 non-infectious illness
  64. table(pheno$matlab.x) #70 bacterial, 115 viral
  65. levels(as.factor(pheno$matlab.x)) #checking for white space
  66. #70+115+88 = 273 - so can take the NAs from matlab.y and replace them with viral (or the NAs from matlab.x and replace with non-infectious illness)
  67. #is.na(pheno$matlab.y) <- "viral" doesn't work, need:
  68. pheno[c("matlab.y")][is.na(pheno[c("matlab.y")])] <- "viral"
  69. table(pheno$matlab.y) #Now we have our 70 bacterial + 115 viral + 88 non-infectious illness cases
  70. pheno$diagnosis = pheno$matlab.y #rename matlab.y as something more intuitive..."diagnosis"
  71. pheno$matlab.y = NULL
  72. pheno$matlab.x = NULL
  73. pheno$orig.x = NULL
  74. pheno$orig.y = NULL
  75. pheno$X1.x = NULL
  76. pheno$X1.y = NULL
  77. pheno$X1 = NULL
  78.  
  79.  
  80.  
  81. #Now add a column for case (infection) vs control (non-infectious illness)
  82. pheno$caseorctrl = pheno$diagnosis #Make new column for the consolidated responses
  83. pheno$caseorctrl = factor(pheno$caseorctrl) #Specify it as a factor
  84. levels(pheno$caseorctrl) #check for white spaces
  85. levels(pheno$caseorctrl) = list("SIRS" = c("non-infectious illness"),
  86. "case" = c("bacterial", "viral"))
  87. table(pheno$caseorctr) #185 cases, 88 controls (non-infectious illness)
  88.  
  89. pheno$matlab.x = NULL
  90. pheno$orig.x = NULL
  91. pheno$orig.y = NULL
  92. pheno$X1.x = NULL
  93. pheno$X1.y = NULL
  94.  
  95.  
  96. #write.csv(pheno, "~/Desktop/GSE63990pheno.csv") #if you want to save as CSV
  97. #save(pheno, X2_clean, file="~/Desktop/Thesis/Data/March/GSE63900.RData")
  98. #================== Data prep finished =========#
  99.  
  100.  
  101.  
  102. #======================== Add scores
  103. genesT = data.frame(t(GSE63990genes))
  104. GSE63990pheno$lab = calculate_SC_score(df = genesT, microarray = T, log_t = F)
  105. GSE63990pheno$lab2 = calculate_SC_v2(df = genesT, log_t = F)
  106. GSE63990pheno$pv = rowSums(calc_pan_viral(X = genesT, logtrans = F))
  107. ###### Analysis - First plot the general cases...
  108. p11 = ggplot(GSE63990pheno, aes(x = diagnosis, y = pv, colour = diagnosis)) + geom_boxplot(outlier.size = NA) + geom_point(position = position_jitter(width = 0.4)) + labs(y = "Pan-Viral")
  109. p22 = ggplot(GSE63990pheno, aes(x = diagnosis, y = lab, colour = diagnosis)) + geom_boxplot(outlier.size = NA) + geom_point(position = position_jitter(width = 0.4)) + labs(y = "LAB V1")
  110. p33 = ggplot(GSE63990pheno, aes(x = diagnosis, y = lab2, colour = diagnosis)) + geom_boxplot(outlier.size = NA) + geom_point(position = position_jitter(width = 0.4)) + labs(y = "LAB V2")
  111. p11; p22; p33
  112. #Get all viral patients who have lab2 scores above 0:
  113. virals = subset(GSE63990pheno, diagnosis == "viral")
  114. above0 = subset(virals, pv >= 0)
  115. below0 = subset(virals, pv < 0)
  116. #96 captured / 115 total = 83% of virals were classified as viral
  117.  
  118. nonvirals = subset(GSE63990pheno, diagnosis != "viral")
  119. above0 = subset(nonvirals, pv >= 0)
  120. below0 = subset(nonvirals, pv < 0)
  121. #107 true negatives
  122. #158 conditional negatives (#57 false positives)
  123. #107/158 true negative / conditional negative = 67% specificity
  124.  
  125. #70 + 115virus + 88 = 273 total samples
  126.  
  127. save(GSE63990genes, GSE63990pheno, file="~/Desktop/Thesis/Data/March/GSE63990.RData")
Add Comment
Please, Sign In to add comment