Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # deprofile.R by Michael Stravs, Eawag <michael.stravs@eawag.ch>
- # this code is public domain, make of it what you want and don't blame me
- # Needed for na.locf, which the R-style implementation of deprofile.fwhm relies on:
- library(zoo)
- #' Convert profile data to "centroid" mode using full-width, half-height algorithm or
- #' local maximum method.
- #'
- #' WARNING: Not yet thoroughly tested!
- #'
- #' The "deprofile" functions convert profile-mode high-resolution input data to "centroid"-mode
- #' data amenable to i.e. centWave.
- #'
- #' The "deprofile.fwhm" method is basically an R-semantic version of the "Exact Mass" m/z deprofiler
- #' from MZmine. It takes the center between the m/z values at half-maximum intensity for the exact peak mass.
- #' The logic is stolen verbatim from the Java MZmine algorithm, but it has been rewritten to use the fast
- #' R vector operations instead of loops wherever possible. It's slower than the Java implementation, but
- #' still decently fast IMO. Using matrices instead of the data frame would be more memory-efficient
- #' and also faster, probably.
- #'
- #' The "deprofile.localMax" method uses local maxima and is probably the same used by e.g. Xcalibur.
- #' For well-formed peaks, "deprofile.fwhm" gives more accurate mass results; for some applications,
- #' "deprofile.localMax" might be better (e.g. for fine isotopic structure peaks which are
- #' not separated by a valley and also not at half maximum.) For MS2 peaks, which have no isotopes,
- #' "deprofile.fwhm" is probably the better choice generally.
- #'
- #' The word "centroid" is used for convenience to denote not-profile-mode data.
- #' The data points are NOT mathematical centroids; we would like to have a better word for it.
- #'
- #' The "noise" parameter was only included for completeness, I personally don't use it.
- #'
- #' \code{deprofile.fwhm} and \code{deprofile.localMax} are the workhorses; \code{deprofile.xcmsRaw} and
- #' \code{deprofile.scan} take an xcmsRaw object or a 2-column scan as inputs, respectively.
- #' \code{deprofile} dispatches the call to the appropriate worker method.
- #'
- #' @note Known limitations: If the absolute leftmost stick or the absolute rightmost stick in
- #' a scan are maxima, they will be discarded! However, I don't think this will
- #' ever present a practical problem.
- #'
- #' @aliases deprofile, deprofile.xcmsRaw, deprofile.fwhm, deprofile.localMax
- #' @usage deprofile.scan(scan, noise, method, ...)
- #' deprofile.xcmsRaw(xraw, noise, method, ...)
- #' deprofile(df, noise, method, ...)
- #' deprofile.fwhm(df, noise, cut)
- #' deprofile.localMax(df, noise)
- #' @param scan A matrix with 2 columns for m/z and intensity; from profile-mode high-resolution data. Corresponds
- #' to the spectra obtained with xcms::getScan or mzR::peaks.
- #' @param noise The noise cutoff. A peak is not included if the maximum stick intensity of the peak
- #' is below the noise cutoff.
- #' @param copy for \code{xcmsRaw}: if \code{TRUE}, the \code{xcmsRaw} object is deep-copied and calculations
- #' are performed on the copy. Otherwise (default) the calculations are performed in-place.
- #' @param method "deprofile.fwhm" for full-width half-maximum or "deprofile.localMax" for
- #' local maximum.
- #' @param colnames For deprofile.scan: return matrix with column names (xcms-style,
- #' \code{TRUE}, default) or plain (mzR-style, \code{FALSE}).
- #' @param df A dataframe with at least the columns \code{mz} and \code{int} to perform deprofiling on.
- #' @param xraw An xcmsRaw object with high-resolution profile mode MS data, e.g. Orbitrap.
- #' @param cut A parameter for "deprofile.fwhm" indicating where the peak flanks should be taken. Standard is 0.5
- #' (as the algorithm name says, at half maximum.) Setting \code{cut = 0.75} would instead determine the peak
- #' width at 3/4 maximum, which might give a better accuracy for merged peaks, but could be less accurate
- #' if too few data points are present.
- #' @return \code{deprofile.scan}: a matrix with 2 columns for m/z and intensity
- #' \code{deprofile.xcmsRaw}: a new xcmsRaw object, if \code{copy = TRUE}. Otherwise,
- #' nothing is returned and the conversion is performed in place.
- #'
- #' @examples
- #' \dontrun{
- #' # from xcmsRaw:
- #' xraw <- xcmsRaw("myfile.mzML")
- #' # by FWHM method:
- #' xraw.sticked <- deprofile.xcmsRaw(xraw, copy=T, method="deprofile.fwhm")
- #' # local maximum: faster, but less accurate especially for "low" resolution
- #' xraw.sticked <- deprofile.xcmsRaw(xraw, copy=T, method="deprofile.localMax")
- #' scan.xraw.profile <- getScan(xraw, 50)
- #' scan.xraw <- getScan(xraw.sticked, 50)
- #' # alternatively directly from scan:
- #' scan.xraw.direct <- deprofile.scan(scan.xraw.profile)
- #'
- #' # the same scan from mzR:
- #' mzrFile <- openMSfile("myfile.mzML")
- #' acqNo <- xraw@acquisitionNum[[50]]
- #' scan.mzML.profile <- mzR::peaks(mzrFile, acqNo)
- #' scan.mzML <- deprofile.scan(scan.mzML.profile)
- #' close(mzrFile)
- #' }
- #'
- #' @author Michael Stravs, Eawag <michael.stravs@eawag.ch>
- #' @references mzMine source code http://sourceforge.net/svn/?group_id=139835
- #' @export
- deprofile.scan <- function(scan, noise = NA, method="deprofile.fwhm", colnames=T, ...)
- {
- # Format the input
- df <- as.data.frame(scan)
- colnames(df) <- c("mz", "int")
- # Call the actual workhorse
- peaklist <- deprofile(df, noise, method, ...)
- # return the centroided peaklist
- peaklist.m <- as.matrix(peaklist[,c("mz", "int")])
- if(colnames)
- colnames(peaklist.m) <- c("mz", "intensity")
- else
- colnames(peaklist.m) <- NULL
- return(peaklist.m)
- }
- # This implementation is cooler because it doesn't iterate over the scans,
- # but it fails on my lab PC because it needs too much memory. Also, it's slower :(
- # and I don't understand why.
- #olddoc' Additionally, if using deprofile.fwhm.xcmsRaw, a peak might be interpreted as
- #olddoc' too large if it crosses 2 scans (i.e. na.locf(fromLast=T) carries a groupmax
- #olddoc' into a preceding scan or vice versa, see code).
- #olddoc' Again, I think this is not a practical problem. But we have to verify this.
- ## deprofile.xcmsRaw <- function(xraw, noise = NA, copy=F, method="deprofile.fwhm", ...)
- ## {
- ## if(copy)
- ## xraw <- deepCopy(xraw)
- ## # This computes the entire centroid matrix in one flush without splitting the data by scans.
- ## # Is it really error-free? We have to check.
- ## # make a suitable input for masslist.fwhm
- ## df <- data.frame(mz = xraw@env$mz, int = xraw@env$intensity)
- ## # add additional input to be preserved: the scan index
- ## df$scanindex <- NA
- ## df[xraw@scanindex + 1, "scanindex"] <- xraw@scanindex
- ## df$scanindex.full <- na.locf(df$scanindex)
- ## peaklist <- deprofile(df, noise, method, ...)
- ## # Now we can get the mz, int values from the new peaklist but we have
- ## # to recompute the correct scan indices.
- ## xraw@env$mz <- peaklist$mz
- ## xraw@env$intensity <- peaklist$int
- ## # Since the entry where scanindex != NA might have been cut and we can't rely
- ## # on it always being there, we have to find the first occurrence of every scanindex
- ## # and make it 0- instead of 1-based
- ## peaklist$n2 <- 1:nrow(peaklist)
- ## xraw@scanindex <- as.integer(which(!duplicated(peaklist$scanindex.full)) - 1)
- ## # I do not recalculate TICs or the profile matrix. The profile matrix is low-resolution
- ## # anyway and would look the same if recalculated.
- ## if(copy)
- ## return(xraw)
- ## # That's all, folks.
- ## }
- deprofile.xcmsRaw <- function(xraw, noise = NA, copy=F, method="deprofile.fwhm", ...)
- {
- if(copy)
- xraw <- deepCopy(xraw)
- # This splits the data by scan and recomposes it
- # because the other version causes memory issues on my lab PC.
- # Using actual split() would be extremely slow.
- nscans <- length(xraw@scanindex)
- scanindex.lo <- xraw@scanindex + 1
- scanindex.hi <- c(xraw@scanindex[-1], length(xraw@env$mz))
- mz <- numeric(0)
- int <- numeric(0)
- scanindex <- c(0)
- for(n in 1:nscans)
- {
- df <- data.frame(mz = xraw@env$mz[scanindex.lo[[n]]:scanindex.hi[[n]]]
- , int = xraw@env$intensity[scanindex.lo[[n]]:scanindex.hi[[n]]])
- peaklist <- deprofile(df, noise, method, ...)
- # prolong the vectors
- mz <- c(mz, peaklist$mz)
- int <- c(int, peaklist$int)
- scanindex <- c(scanindex, length(mz))
- }
- # remove the last element from scanindex:
- scanindex <- scanindex[-length(scanindex)]
- # Now store everything into the xraw
- xraw@env$mz <- mz
- xraw@env$intensity <- int
- xraw@scanindex <- as.integer(scanindex)
- # I do not recalculate TICs or the profile matrix. The profile matrix is low-resolution
- # anyway and would look the same if recalculated.
- if(copy)
- return(xraw)
- # That's all, folks.
- }
- deprofile <- function(df, noise, method, ...)
- {
- return(do.call(method, list(df, noise, ...)))
- }
- deprofile.fwhm <- function(df, noise=NA, cut=0.5)
- {
- # split sticks into groups according to how MzMine does it:
- # a new group starts at zeroes and at new ascending points
- df$groupmax <- NA
- rows <- nrow(df)
- df <- within(df,
- {
- # identify local maxima
- groupmax[which(diff(sign(diff(int)))<0) + 1] <- which(diff(sign(diff(int)))<0) + 1
- # make forward-filled and backward-filled list for which was the last maximum.
- # This assigns the sticks to groups.
- groupmax_f <- na.locf(groupmax, na.rm=F)
- groupmax_b <- na.locf(groupmax, fromLast=T, na.rm=F)
- # take backward-filled group as default
- # and forward-filled group where the peak was ascending
- groupmax_b[which(diff(int)<0)+1 ] <- groupmax_f[which(diff(int) <0)+1]
- # eliminate zeroes
- groupmax_b[which(int==0)]<-NA
- # add "next intensities" and "next m/z" as well as "index" (n)
- n <- 1:rows
- int1 <- c(int[-1],0)
- mz1 <- c(mz[-1],0)
- # delete all the intensity+1 values from points which are last-in-group and therefore have int1 from next group!
- int1[which(!is.na(groupmax))-1] <- 0
- # find maximal intensity point for each peak member
- maxint <- df[groupmax_b, "int"]
- hm <- maxint * cut
- up <- ifelse( (int <= hm) & (int1 >= hm) & (n < groupmax_b), groupmax_b, NA)
- down <- ifelse( (int >= hm) & (int1 <= hm) & (n > groupmax_b), groupmax_b, NA)
- })
- # Compile finished peak list
- peaklist <- df[which(!is.na(df$groupmax)),]
- # Noise parameter:
- if(!is.na(noise))
- peaklist <- subset(peaklist, maxint > noise)
- # Find which peaks might have a FWHM value to substitue for the maxint mz value
- # We isolate the peaklists for left-hand and for right-hand FWHM peak.
- # If any straight line is found, the rightmost of the left points and the leftmost of the right points is used.
- peaklist.left <- df[which(!is.na(df$up) & !duplicated(df$up, fromLast=T)),]
- peaklist.right <- df[which(!is.na(df$down) & !duplicated(df$down)),]
- # calculate the slopes and the corresponding m/z value at half maximum
- peaklist.left$slope <- (peaklist.left$int1 - peaklist.left$int) / (peaklist.left$mz1 - peaklist.left$mz)
- peaklist.right$slope <- (peaklist.right$int1 - peaklist.right$int) / (peaklist.right$mz1 - peaklist.right$mz)
- peaklist.left$mzleft <- peaklist.left$mz + (peaklist.left$hm - peaklist.left$int) / peaklist.left$slope
- peaklist.right$mzright <- peaklist.right$mz + (peaklist.right$hm - peaklist.right$int) / peaklist.right$slope
- # add the two values to the full-peaklist where they exist
- peaklist <- merge(peaklist, peaklist.left[,c("up", "mzleft")], by.x="groupmax", by.y="up", all.x=T, suffix=c("", ".left"))
- peaklist <- merge(peaklist, peaklist.right[,c("down", "mzright")], by.x="groupmax", by.y="down", all.x=T, suffix=c("", ".right"))
- # Find which entries have both a left and a right end,
- # and calculate the center mass for them.
- peaklist.indexMzhm <- which(!is.na(peaklist$mzleft) & !is.na(peaklist$mzright))
- peaklist[peaklist.indexMzhm, "mz"] <- (peaklist[peaklist.indexMzhm, "mzleft"] + peaklist[peaklist.indexMzhm, "mzright"]) / 2
- return(peaklist)
- }
- deprofile.localMax <- function(df, noise=NA)
- {
- # split sticks into groups:
- # a new group starts at zeroes and at new ascending points
- df$groupmax <- NA
- rows <- nrow(df)
- df$groupmax[which(diff(sign(diff(df$int)))<0) + 1] <- which(diff(sign(diff(df$int)))<0) + 1
- peaklist <- df[which(!is.na(df$groupmax)),]
- # Noise parameter:
- if(!is.na(noise))
- peaklist <- subset(peaklist, int > noise)
- # And that's it.
- return(peaklist)
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement