# Untitled

# From: Marcel Curlin
# Sent: Thursday, October 20, 2016 9:11 AM
# To: Biostatistics & Design Program
# Subject: Help with coding problem
#
# Hi
#
# I have a stats problem and would very much appreciate some help. It is a
# fairly simple coding task in R. Is there anyone who could help? My problem is
# described below.

# Estimate each patient's date of HIV infection as the midpoint between the
# last negative test and the first positive test, using simulated data.
# Objects created: mydf (raw data), mydf2 (sorted, classified rows),
# mydf2Neg / mydf2Pos (negative rows / first positive per patient),
# mydf3 (negatives paired with the first positive), lookup (smallest gap per
# patient) and mydf4 (final result with dateInfection).

# Fix the RNG seed so the simulated data set is reproducible.
set.seed(as.integer(as.Date("2016-10-20")))

# CREATE SAMPLE DATA - this looks like my dataset
# replicate() builds four columns of patient IDs; only the first is kept as
# the ID, columns 2-4 are placeholders that are overwritten just below. The
# random draws are made in the same order as before so the data are unchanged.
mydf <- data.frame(
  replicate(4, sample(c("ptid1", "ptid2", "ptid3"), 40, replace = TRUE))
)
mydf[, 2] <- sample(
  seq(as.Date("1999-01-01"), as.Date("2000-01-01"), by = "day"),
  40
)
mydf[, 3] <- sample(c(1, 0, 0, 0, "", "", "", "", ""), 40, replace = TRUE)
mydf[, 4] <- sample(
  c(sample(100:30000, 5, replace = TRUE), rep("", 20), rep(0, 10)),
  40,
  replace = TRUE
)
colnames(mydf) <- c("ID", "TESTDATE", "ELISA", "HIV_VL")

# CLEAN UP DATA
# Blank results become NA, then both result columns are stored as numbers.
# Left as text, `HIV_VL > 0` is a string comparison that only happens to give
# the right answer for plain non-negative integers.
mydf$ELISA[mydf$ELISA == ""] <- NA
mydf$HIV_VL[mydf$HIV_VL == ""] <- NA
# as.character() first so a factor column converts by label, not by code.
mydf$ELISA <- as.numeric(as.character(mydf$ELISA))
mydf$HIV_VL <- as.numeric(as.character(mydf$HIV_VL))
# don't forget to make dates column as.Date in real dataset

# SORT DATA BY PTID AND DATE
mydf2 <- mydf[order(mydf$ID, mydf$TESTDATE), ]

# DEFINE GOLD STANDARD AS RESULTS OF ELISA OR HIV_VL
# %in% and is.na() keep NA out of the logical masks, so rows with no usable
# result simply stay NA.
is_neg <- mydf2$ELISA %in% 0 | mydf2$HIV_VL %in% 0
is_pos <- mydf2$ELISA %in% 1 | (!is.na(mydf2$HIV_VL) & mydf2$HIV_VL > 0)
mydf2$goldstandard <- NA_character_
mydf2$goldstandard[is_neg] <- "neg"
# Assigned last on purpose: a positive on either test overrides a negative.
mydf2$goldstandard[is_pos] <- "pos"

# Keep rows where goldstandard is not NA, then partition off the negative
# and the positive results.
mydf2 <- mydf2[!is.na(mydf2$goldstandard), ]
mydf2Neg <- mydf2[mydf2$goldstandard == "neg", c("ID", "TESTDATE")]
mydf2Pos <- mydf2[mydf2$goldstandard == "pos", c("ID", "TESTDATE")]

# Pick off the earliest positive result per patient. aggregate() stops with
# "no rows to aggregate" on empty input, so it is skipped in that case.
if (nrow(mydf2Pos) > 0L) {
  mydf2Pos <- aggregate(TESTDATE ~ ID, data = mydf2Pos, FUN = min)
}

# Pair every negative result with the patient's first positive result and
# keep only the negatives dated before it.
mydf3 <- merge(mydf2Neg, mydf2Pos, by = "ID", suffixes = c("_neg", "_pos"))
mydf3$dateDiff <- mydf3$TESTDATE_pos - mydf3$TESTDATE_neg
mydf3 <- mydf3[mydf3$dateDiff > 0, ]

# The smallest gap per patient identifies the last negative before the first
# positive; merging the lookup back keeps only that pairing.
if (nrow(mydf3) > 0L) {
  lookup <- aggregate(dateDiff ~ ID + TESTDATE_pos, data = mydf3, FUN = min)
  mydf4 <- merge(mydf3, lookup, by = c("ID", "TESTDATE_pos", "dateDiff"))
} else {
  # No patient has a negative before a positive: return empty results with
  # the same columns instead of failing inside aggregate().
  lookup <- mydf3[, c("ID", "TESTDATE_pos", "dateDiff")]
  mydf4 <- mydf3[, c("ID", "TESTDATE_pos", "dateDiff", "TESTDATE_neg")]
}

# Estimated date of infection: midpoint between last negative and first
# positive. An odd gap yields a fractional (half-day) Date value.
mydf4$dateInfection <- mydf4$TESTDATE_neg +
  (mydf4$TESTDATE_pos - mydf4$TESTDATE_neg) / 2