bioScript.R

source("https://bioconductor.org/biocLite.R")
biocLite("genefilter")

source("https://bioconductor.org/biocLite.R")
biocLite("EBImage")

source("https://bioconductor.org/biocLite.R")
biocLite("rhdf5")

source("https://bioconductor.org/biocLite.R")
biocLite("DESeq")

source("https://bioconductor.org/biocLite.R")
biocLite("hom.Hs.inp.db")

source("https://bioconductor.org/biocLite.R")
biocLite("AnnotationDbi")

source("https://bioconductor.org/biocLite.R")
biocLite("org.Mm.eg.db")
biocLite("rhdf5")

install.packages("rhdf5")

library(rhdf5)
library(statmod)
library(gdata)
library(genefilter)
library(EBImage)
library(rhdf5)
library(DESeq)
library(statmod)
library(hom.Hs.inp.db)
library(org.Hs.eg.db)
library(AnnotationDbi)
library(org.Mm.eg.db)

setwd("~/Desktop")
countsQuartz = read.csv("readCounts.csv", header=TRUE)
dataQuartz = dataCB


#log-transformed counts
LCountsQuartz <- log10(countsQuartz+1)
LmeansQuartz <- rowMeans( LCountsQuartz )
LvarsQuartz <- rowVars( LCountsQuartz )
Lcv2Quartz <- LvarsQuartz / LmeansQuartz^2


LogNcountsList = list()
useForFitL = LmeansQuartz>0.3
LogNcountsList$mean = LmeansQuartz[useForFitL]
LogNcountsList$cv2 = Lcv2Quartz[useForFitL]
fit_loglin = nls(cv2 ~ a* 10^(-k*mean), LogNcountsList,start=c(a=10,k=2))

#variable genes
is_het = (coefficients(fit_loglin)["a"] *10^(-coefficients(fit_loglin)["k"]*LmeansQuartz) < Lcv2Quartz) &  LmeansQuartz>0.3

LogVar_techQuartz_logfit <- coefficients(fit_loglin)["a"] *10^(-coefficients(fit_loglin)["k"]*LmeansQuartz)*LmeansQuartz^2

#plot mean/cv2 relationship and variable genes
plot( LmeansQuartz, Lcv2Quartz, log="y", col=1+is_het,xlab='meansLogQuartz',ylab='cv2LogQuartz',ylim=c(1e-3,1e2))
xg <- seq( 0, 4.5, length.out=100 )
lines( xg, coefficients(fit_loglin)["a"] *10^(-coefficients(fit_loglin)["k"]*xg ),lwd=2,col='blue' )
legend('topright',c('Variable genes'),pch=c(1),col=c('red'),cex=0.8)

#gene names in the Quartz-Seq mESC data.
gene_names = rownames(dataQuartz)
gene_names_het = gene_names[is_het]

#all Cycle base genes homologs (top 600 genes)
hu2musAll = inpIDMapper(dataCB[1:600,3],'HOMSA','MUSMU',srcIDType='ENSEMBL',destIDType='ENSEMBL')
cellcyclegenes_filterCB = dataQuartz$gene_systematic_name

#get cell cycle genes from GO
xxGO <- as.list(org.Hs.egGO2EG)
print(xxGO)
cell_cycleEG <-unlist(xxGO['GO:0007049'])
#get ENSEMBLE ids
x <- org.Hs.egENSEMBL
mapped_genes <- mappedkeys(x)
xxE <- as.list(x[mapped_genes])
ens_ids_cc<-unlist(xxE[cell_cycleEG])
cc_gene_indices <- na.omit(match(ens_ids_cc, rownames(dataQuartz)))

#ensemble IDs to gene symbols
x <- org.Hs.egSYMBOL
# Get the gene symbol that are mapped to an entrez gene identifiers
mapped_genes <- mappedkeys(x)
# Convert to a list
xx <- as.list(x[mapped_genes])
xxenseg <- as.list(org.Hs.egENSEMBL2EG)
gene_syms=unlist(xx[unlist(xxenseg[gene_names])])
gene_names_list<-(lapply(xxenseg[gene_names],function(x){if(is.null(x)){x=NA}else{x=x[1]}}))
sym_names=unlist(lapply(xx[unlist(gene_names_list)],function(x){if(is.null(x)){x=NA}else{x=x[1]}}))
sym_names[is.na(sym_names)]=gene_names[is.na(sym_names)]

#rename a few variables...
tech_noise = LogVar_techQuartz_logfit
cellcyclegenes_filter <- cc_gene_indices
cell_names <- colnames(countsQuartz)
Y <- LCountsQuartz
genes_het_bool <- (is_het)*1

h5createFile("test.h5")
h5write(cellcyclegenes_filterCB,gene_names,sym_names,cellcyclegenes_filter,cell_names,Y,genes_het_bool,tech_noise,file='test.h5', name='dataset_1')

h5save(cellcyclegenes_filterCB,gene_names,sym_names,cellcyclegenes_filter,cell_names,Y,genes_het_bool,tech_noise,file='test.h5f', name=NULL, createnewfile=TRUE)