deprofile.R

# deprofile.R by Michael Stravs, Eawag <michael.stravs@eawag.ch>
# this code is public domain, make of it what you want and don't blame me
# Needed for na.locf, which the R-style implementation of deprofile.fwhm relies on:
library(zoo)

#' Convert profile data to "centroid" mode using full-width, half-height algorithm or
#' local maximum method.
#'
#' WARNING: Not yet thoroughly tested!
#'
#' The "deprofile" functions convert profile-mode high-resolution input data to "centroid"-mode
#' data amenable to i.e. centWave.
#'
#' The "deprofile.fwhm" method is basically an R-semantic version of the "Exact Mass" m/z deprofiler
#' from MZmine. It takes the center between the m/z values at half-maximum intensity for the exact peak mass.
#' The logic is stolen verbatim from the Java MZmine algorithm, but it has been rewritten to use the fast
#' R vector operations instead of loops wherever possible. It's slower than the Java implementation, but
#' still decently fast IMO. Using matrices instead of the data frame would be more memory-efficient
#' and also faster, probably.
#'
#' The "deprofile.localMax" method uses local maxima and is probably the same used by e.g. Xcalibur.
#' For well-formed peaks, "deprofile.fwhm" gives more accurate mass results; for some applications,
#' "deprofile.localMax" might be better (e.g. for fine isotopic structure peaks which are
#' not separated by a valley and also not at half maximum.) For MS2 peaks, which have no isotopes,
#' "deprofile.fwhm" is probably the better choice generally.
#'
#' The word "centroid" is used for convenience to denote not-profile-mode data.
#' The data points are NOT mathematical centroids; we would like to have a better word for it.
#'
#' The "noise" parameter was only included for completeness, I personally don't use it.
#'
#' \code{deprofile.fwhm} and \code{deprofile.localMax} are the workhorses; \code{deprofile.xcmsRaw} and
#' \code{deprofile.scan} take an xcmsRaw object or a 2-column scan as inputs, respectively.
#' \code{deprofile} dispatches the call to the appropriate worker method.
#'
#' @note Known limitations: If the absolute leftmost stick or the absolute rightmost stick in
#'              a scan are maxima, they will be discarded! However, I don't think this will
#'              ever present a practical problem.
#'
#' @aliases deprofile, deprofile.xcmsRaw, deprofile.fwhm, deprofile.localMax
#' @usage       deprofile.scan(scan, noise, method, ...)
#'              deprofile.xcmsRaw(xraw, noise, method, ...)
#'              deprofile(df, noise, method, ...)
#'              deprofile.fwhm(df, noise, cut)
#'              deprofile.localMax(df, noise)
#' @param scan A matrix with 2 columns for m/z and intensity; from profile-mode high-resolution data. Corresponds
#'              to the spectra obtained with xcms::getScan or mzR::peaks.
#' @param noise The noise cutoff. A peak is not included if the maximum stick intensity of the peak
#'              is below the noise cutoff.
#' @param copy for \code{xcmsRaw}: if \code{TRUE}, the \code{xcmsRaw} object is deep-copied and calculations
#'              are performed on the copy. Otherwise (default) the calculations are performed in-place.
#' @param method "deprofile.fwhm" for full-width half-maximum or "deprofile.localMax" for
#'              local maximum.
#' @param colnames For deprofile.scan: return matrix with column names (xcms-style,
#'              \code{TRUE}, default) or plain (mzR-style, \code{FALSE}).
#' @param df A dataframe with at least the columns \code{mz} and \code{int} to perform deprofiling on.
#' @param xraw An xcmsRaw object with high-resolution profile mode MS data, e.g. Orbitrap.
#' @param cut A parameter for "deprofile.fwhm" indicating where the peak flanks should be taken. Standard is 0.5
#'              (as the algorithm name says, at half maximum.) Setting \code{cut = 0.75} would instead determine the peak
#'              width at 3/4 maximum, which might give a better accuracy for merged peaks, but could be less accurate
#'              if too few data points are present.
#' @return \code{deprofile.scan}: a matrix with 2 columns for m/z and intensity
#'              \code{deprofile.xcmsRaw}: a new xcmsRaw object, if \code{copy = TRUE}. Otherwise,
#'              nothing is returned and the conversion is performed in place.
#'
#' @examples
#' \dontrun{
#' # from xcmsRaw:
#' xraw <- xcmsRaw("myfile.mzML")
#' # by FWHM method:
#' xraw.sticked <- deprofile.xcmsRaw(xraw, copy=T, method="deprofile.fwhm")
#' # local maximum: faster, but less accurate especially for "low" resolution
#' xraw.sticked <- deprofile.xcmsRaw(xraw, copy=T, method="deprofile.localMax")
#' scan.xraw.profile <- getScan(xraw, 50)
#' scan.xraw <- getScan(xraw.sticked, 50)
#' # alternatively directly from scan:
#' scan.xraw.direct <- deprofile.scan(scan.xraw.profile)
#'
#' # the same scan from mzR:
#' mzrFile <- openMSfile("myfile.mzML")
#' acqNo <- xraw@acquisitionNum[[50]]
#' scan.mzML.profile <- mzR::peaks(mzrFile, acqNo)
#' scan.mzML <- deprofile.scan(scan.mzML.profile)
#' close(mzrFile)
#' }
#'
#' @author Michael Stravs, Eawag <michael.stravs@eawag.ch>
#' @references mzMine source code http://sourceforge.net/svn/?group_id=139835
#' @export
deprofile.scan <- function(scan, noise = NA, method="deprofile.fwhm", colnames=T, ...)
{
    # Format the input
    df <- as.data.frame(scan)
    colnames(df) <- c("mz", "int")
    # Call the actual workhorse
    peaklist <- deprofile(df, noise, method, ...)
    # return the centroided peaklist
    peaklist.m <- as.matrix(peaklist[,c("mz", "int")])
    if(colnames)
        colnames(peaklist.m) <- c("mz", "intensity")
    else
        colnames(peaklist.m) <- NULL
    return(peaklist.m)
}

# This implementation is cooler because it doesn't iterate over the scans,
# but it fails on my lab PC because it needs too much memory. Also, it's slower :(
# and I don't understand why.
#olddoc' Additionally, if using deprofile.fwhm.xcmsRaw, a peak might be interpreted as
#olddoc'                too large if it crosses 2 scans (i.e. na.locf(fromLast=T) carries a groupmax
#olddoc'                into a preceding scan or vice versa, see code).
#olddoc'                Again, I think this is not a practical problem. But we have to verify this.
## deprofile.xcmsRaw <- function(xraw, noise = NA, copy=F, method="deprofile.fwhm", ...)
## {
##     if(copy)
##         xraw <- deepCopy(xraw)
##     # This computes the entire centroid matrix in one flush without splitting the data by scans.
##     # Is it really error-free? We have to check.
##     # make a suitable input for masslist.fwhm
##     df <- data.frame(mz = xraw@env$mz, int = xraw@env$intensity)
##     # add additional input to be preserved: the scan index
##     df$scanindex <- NA
##     df[xraw@scanindex + 1, "scanindex"] <- xraw@scanindex
##     df$scanindex.full <- na.locf(df$scanindex)
##     peaklist <- deprofile(df, noise, method, ...)
##     # Now we can get the mz, int values from the new peaklist but we have
##     # to recompute the correct scan indices.
##     xraw@env$mz <- peaklist$mz
##     xraw@env$intensity <- peaklist$int
##     # Since the entry where scanindex != NA might have been cut and we can't rely
##     # on it always being there, we have to find the first occurrence of every scanindex
##     # and make it 0- instead of 1-based
##     peaklist$n2 <- 1:nrow(peaklist)
##     xraw@scanindex <- as.integer(which(!duplicated(peaklist$scanindex.full)) - 1)
##     # I do not recalculate TICs or the profile matrix. The profile matrix is low-resolution
##     # anyway and would look the same if recalculated.
##     if(copy)
##         return(xraw)
##     # That's all, folks.
## }

deprofile.xcmsRaw <- function(xraw, noise = NA, copy=F, method="deprofile.fwhm", ...)
{
    if(copy)
        xraw <- deepCopy(xraw)
    # This splits the data by scan and recomposes it
    # because the other version causes memory issues on my lab PC.
    # Using actual split() would be extremely slow.
    nscans <- length(xraw@scanindex)
    scanindex.lo <- xraw@scanindex + 1
    scanindex.hi <- c(xraw@scanindex[-1], length(xraw@env$mz))
    mz <- numeric(0)
    int <- numeric(0)
    scanindex <- c(0)
    for(n in 1:nscans)
    {
        df <- data.frame(mz = xraw@env$mz[scanindex.lo[[n]]:scanindex.hi[[n]]]
        , int = xraw@env$intensity[scanindex.lo[[n]]:scanindex.hi[[n]]])
        peaklist <- deprofile(df, noise, method, ...)
        # prolong the vectors
        mz <- c(mz, peaklist$mz)
        int <- c(int, peaklist$int)
        scanindex <- c(scanindex, length(mz))
    }
    # remove the last element from scanindex:
    scanindex <- scanindex[-length(scanindex)]
    # Now store everything into the xraw
    xraw@env$mz <- mz
    xraw@env$intensity <- int
    xraw@scanindex <- as.integer(scanindex)
    # I do not recalculate TICs or the profile matrix. The profile matrix is low-resolution
    # anyway and would look the same if recalculated.
    if(copy)
        return(xraw)
    # That's all, folks.
}


deprofile <- function(df, noise, method, ...)
{
    return(do.call(method, list(df, noise, ...)))
}

deprofile.fwhm <- function(df, noise=NA, cut=0.5)
{
    # split sticks into groups according to how MzMine does it:
    # a new group starts at zeroes and at new ascending points
    df$groupmax <- NA
    rows <- nrow(df)
    df <- within(df,
            {
                # identify local maxima
                groupmax[which(diff(sign(diff(int)))<0) + 1] <- which(diff(sign(diff(int)))<0) + 1
                # make forward-filled and backward-filled list for which was the last maximum.
                # This assigns the sticks to groups.
                groupmax_f <- na.locf(groupmax, na.rm=F)
                groupmax_b <- na.locf(groupmax, fromLast=T, na.rm=F)
                # take backward-filled group as default
                # and forward-filled group where the peak was ascending
                groupmax_b[which(diff(int)<0)+1 ] <- groupmax_f[which(diff(int) <0)+1]
                # eliminate zeroes
                groupmax_b[which(int==0)]<-NA
                # add "next intensities" and "next m/z" as well as "index" (n)
                n <- 1:rows
                int1 <- c(int[-1],0)
                mz1 <- c(mz[-1],0)
                # delete all the intensity+1 values from points which are last-in-group and therefore have int1 from next group!
                int1[which(!is.na(groupmax))-1] <- 0
                # find maximal intensity point for each peak member
                maxint <- df[groupmax_b, "int"]
                hm <- maxint * cut
                up <- ifelse( (int <= hm) & (int1 >= hm) & (n < groupmax_b), groupmax_b, NA)
                down <- ifelse( (int >= hm) & (int1 <= hm) & (n > groupmax_b), groupmax_b, NA)
            })

    # Compile finished peak list
    peaklist <- df[which(!is.na(df$groupmax)),]
    # Noise parameter:
    if(!is.na(noise))
        peaklist <- subset(peaklist, maxint > noise)
    # Find which peaks might have a FWHM value to substitue for the maxint mz value
    # We isolate the peaklists for left-hand and for right-hand FWHM peak.
    # If any straight line is found, the rightmost of the left points and the leftmost of the right points is used.
    peaklist.left <- df[which(!is.na(df$up) & !duplicated(df$up, fromLast=T)),]
    peaklist.right <- df[which(!is.na(df$down) & !duplicated(df$down)),]
    # calculate the slopes and the corresponding m/z value at half maximum
    peaklist.left$slope <- (peaklist.left$int1 - peaklist.left$int) / (peaklist.left$mz1 - peaklist.left$mz)
    peaklist.right$slope <- (peaklist.right$int1 - peaklist.right$int) / (peaklist.right$mz1 - peaklist.right$mz)
    peaklist.left$mzleft <- peaklist.left$mz + (peaklist.left$hm - peaklist.left$int) / peaklist.left$slope
    peaklist.right$mzright <- peaklist.right$mz + (peaklist.right$hm - peaklist.right$int) / peaklist.right$slope
    # add the two values to the full-peaklist where they exist
    peaklist <- merge(peaklist, peaklist.left[,c("up", "mzleft")], by.x="groupmax", by.y="up", all.x=T, suffix=c("", ".left"))
    peaklist <- merge(peaklist, peaklist.right[,c("down", "mzright")], by.x="groupmax", by.y="down", all.x=T, suffix=c("", ".right"))
    # Find which entries have both a left and a right end,
    # and calculate the center mass for them.
    peaklist.indexMzhm <- which(!is.na(peaklist$mzleft) & !is.na(peaklist$mzright))
    peaklist[peaklist.indexMzhm, "mz"] <- (peaklist[peaklist.indexMzhm, "mzleft"] + peaklist[peaklist.indexMzhm, "mzright"]) / 2

    return(peaklist)
}

deprofile.localMax <- function(df, noise=NA)
{
    # split sticks into groups:
    # a new group starts at zeroes and at new ascending points
    df$groupmax <- NA
    rows <- nrow(df)
    df$groupmax[which(diff(sign(diff(df$int)))<0) + 1] <- which(diff(sign(diff(df$int)))<0) + 1
    peaklist <- df[which(!is.na(df$groupmax)),]
    # Noise parameter:
    if(!is.na(noise))
        peaklist <- subset(peaklist, int > noise)
    # And that's it.
    return(peaklist)
}