Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# From: Marcel Curlin
# Sent: Thursday, October 20, 2016 9:11 AM
# To: Biostatistics & Design Program
# Subject: Help with coding problem
#
# Hi
#
# I have a stats problem and would very much appreciate some help. It is a
# fairly simple coding task in R. Is there anyone who could help? My problem is
# described below.

# Purpose: for every patient ID, find the first positive test, the closest
# negative test that precedes it, and estimate the date of infection as the
# midpoint between those two dates. The result is `mydf4`, one row per ID with
# columns ID, TESTDATE_pos, dateDiff, TESTDATE_neg, dateInfection.

set.seed(as.integer(as.Date("2016-10-20")))

# CREATE SAMPLE DATA - this looks like my dataset ----
# Four ID-like columns are drawn first; columns 2-4 are then overwritten, so
# only the first one survives as the patient ID.
mydf <- data.frame(
  replicate(4, sample(c("ptid1", "ptid2", "ptid3"), 40, replace = TRUE)),
  stringsAsFactors = FALSE
)
mydf[, 2] <- sample(
  seq(as.Date("1999/01/01"), as.Date("2000/01/01"), by = "day"),
  40
)
# Mixing numbers with "" makes these columns character, with "" standing in
# for a missing result.
mydf[, 3] <- sample(c(1, 0, 0, 0, "", "", "", "", ""), 40, replace = TRUE)
mydf[, 4] <- sample(
  c(sample(100:30000, 5, replace = TRUE), rep("", 20), rep(0, 10)),
  40,
  replace = TRUE
)
colnames(mydf) <- c("ID", "TESTDATE", "ELISA", "HIV_VL")

# CLEAN UP DATA ----
mydf$ELISA[mydf$ELISA %in% ""] <- NA
mydf$HIV_VL[mydf$HIV_VL %in% ""] <- NA
# Convert to numeric so that `== 0` and `> 0` below are numeric comparisons;
# on character data `>` compares strings, not magnitudes.
mydf$ELISA <- as.numeric(mydf$ELISA)
mydf$HIV_VL <- as.numeric(mydf$HIV_VL)
# don't forget to make dates column as.Date in real dataset
stopifnot(inherits(mydf$TESTDATE, "Date"))

# SORT DATA BY PTID AND DATE ----
mydf2 <- mydf[order(mydf$ID, mydf$TESTDATE), ]

# DEFINE GOLD STANDARD AS RESULTS OF ELISA OR HIV_VL ----
# "pos" is assigned after "neg", so a row with discordant results (one test
# negative, the other positive) ends up "pos". which() drops comparisons that
# evaluate to NA; rows matching neither rule stay NA and are removed.
mydf2$goldstandard <- NA_character_
mydf2$goldstandard[which(mydf2$ELISA == 0 | mydf2$HIV_VL == 0)] <- "neg"
mydf2$goldstandard[which(mydf2$ELISA == 1 | mydf2$HIV_VL > 0)] <- "pos"
mydf2 <- mydf2[!is.na(mydf2$goldstandard), ]

# Partition off the negative and the positive results.
mydf2Neg <- mydf2[mydf2$goldstandard == "neg", c("ID", "TESTDATE")]
mydf2Pos <- mydf2[mydf2$goldstandard == "pos", c("ID", "TESTDATE")]

# Earliest positive result per ID. mydf2 is sorted by ID then TESTDATE, so the
# first row seen for each ID is its earliest positive; unlike aggregate(), this
# also works when there are no positive rows at all.
mydf2Pos <- mydf2Pos[!duplicated(mydf2Pos$ID), ]
rownames(mydf2Pos) <- NULL

# Pair every negative result with the first positive result of the same ID.
mydf3 <- merge(mydf2Neg, mydf2Pos, by = "ID", suffixes = c("_neg", "_pos"))
# Gap between each negative result and the first positive result.
mydf3$dateDiff <- mydf3$TESTDATE_pos - mydf3$TESTDATE_neg
# Keep rows where the negative result was before the positive result.
mydf3 <- mydf3[mydf3$dateDiff > 0, ]

# Closest preceding negative per ID: sort by gap and keep the first row of each
# ID. This yields exactly one row per ID even when two negative rows share a
# date, and avoids joining on a difftime key.
nearest_neg <- mydf3[order(mydf3$ID, mydf3$dateDiff), ]
nearest_neg <- nearest_neg[!duplicated(nearest_neg$ID), ]
rownames(nearest_neg) <- NULL

# Lookup of the smallest negative-to-positive gap per ID.
lookup <- nearest_neg[, c("ID", "TESTDATE_pos", "dateDiff")]
mydf4 <- nearest_neg[, c("ID", "TESTDATE_pos", "dateDiff", "TESTDATE_neg")]

# Estimated date of infection: midpoint between last negative and first
# positive. An odd gap gives a Date holding a fractional (half) day.
mydf4$dateInfection <- mydf4$TESTDATE_neg + mydf4$dateDiff / 2
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement