# Filter the Spellman data set to genes with few missing expression values

library(tidyr)
library(dplyr)
library(magrittr)
library(ggplot2)

# read the expression table; it is stored in "wide" layout (genes in columns)
spellman <- read.csv(file = "spellman-reformated.csv")

# reshape to "long" layout: expt and time are kept as identifier columns and
# every other column is stacked into gene (the former column name) and
# expression (the cell value) pairs
spellman.long <-
  spellman %>%
  gather(key = "gene", value = "expression", -expt, -time)

# tally the missing (NA) expression values for every gene; the result has
# one row per gene, with the tally stored in the na.count column
spellman.na <- summarize(
  group_by(spellman.long, gene),
  na.count = sum(is.na(expression))
)

# get the names of genes with fewer than 5 missing values (na.count of 0
# through 4); the strict `<` excludes a gene with exactly 5 missing values
# NOTE(review): if the intended rule is "no more than 5 missing", the test
# should be `na.count <= 5` -- confirm which cutoff is wanted
good.genes <-
  spellman.na %>%
  filter(na.count < 5) %>%
  pull(gene)  # extract the gene column as a plain vector

# keep only the columns of the genes that passed the missing-value filter.
# good.genes holds gene column names only, so expt and time are not part of
# the result. all_of() is used instead of the superseded one_of(): it raises
# an error if a requested name is absent from the data rather than warning.
spellman.filtered <-
  spellman %>%
  select(all_of(good.genes))

dim(spellman)  # dimensions of original data
dim(spellman.filtered) # dimensions of filtered data