Advertisement
meowcat

deprofile.R

Jun 24th, 2012
57
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
R 12.18 KB | None | 0 0
  1.  
  2. # Needed for na.locf, which the R-style implementation of deprofile.fwhm relies on:
  3. library(zoo)
  4.  
  5. #' Convert profile data to "centroid" mode using full-width, half-height algorithm or
  6. #' local maximum method.
  7. #'
  8. #' WARNING: Not yet thoroughly tested!
  9. #'
  10. #' The "deprofile" functions convert profile-mode high-resolution input data to "centroid"-mode
  11. #' data amenable to i.e. centWave.
  12. #'
  13. #' The "deprofile.fwhm" method is basically an R-semantic version of the "Exact Mass" m/z deprofiler
  14. #' from MZmine. It takes the center between the m/z values at half-maximum intensity for the exact peak mass.
  15. #' The logic is stolen verbatim from the Java MZmine algorithm, but it has been rewritten to use the fast
  16. #' R vector operations instead of loops wherever possible. It's slower than the Java implementation, but
  17. #' still decently fast IMO. Using matrices instead of the data frame would be more memory-efficient
  18. #' and also faster, probably.
  19. #'
  20. #' The "deprofile.localMax" method uses local maxima and is probably the same used by e.g. Xcalibur.
  21. #' For well-formed peaks, "deprofile.fwhm" gives more accurate mass results; for some applications,
  22. #' "deprofile.localMax" might be better (e.g. for fine isotopic structure peaks which are
  23. #' not separated by a valley and also not at half maximum.) For MS2 peaks, which have no isotopes,
  24. #' "deprofile.fwhm" is probably the better choice generally.
  25. #'
  26. #' The word "centroid" is used for convenience to denote not-profile-mode data.
  27. #' The data points are NOT mathematical centroids; we would like to have a better word for it.
  28. #'
  29. #' The "noise" parameter was only included for completeness, I personally don't use it.
  30. #'
  31. #' \code{deprofile.fwhm} and \code{deprofile.localMax} are the workhorses; \code{deprofile.xcmsRaw} and
  32. #' \code{deprofile.scan} take an xcmsRaw object or a 2-column scan as inputs, respectively.
  33. #' \code{deprofile} dispatches the call to the appropriate worker method.
  34. #'
  35. #' @note Known limitations: If the absolute leftmost stick or the absolute rightmost stick in
  36. #'              a scan are maxima, they will be discarded! However, I don't think this will
  37. #'              ever present a practical problem.
  38. #'
  39. #' @aliases deprofile, deprofile.xcmsRaw, deprofile.fwhm, deprofile.localMax
  40. #' @usage       deprofile.scan(scan, noise, method, ...)
  41. #'              deprofile.xcmsRaw(xraw, noise, method, ...)
  42. #'              deprofile(df, noise, method, ...)
  43. #'              deprofile.fwhm(df, noise, cut)
  44. #'              deprofile.localMax(df, noise)
  45. #' @param scan A matrix with 2 columns for m/z and intensity; from profile-mode high-resolution data. Corresponds
  46. #'              to the spectra obtained with xcms::getScan or mzR::peaks.
  47. #' @param noise The noise cutoff. A peak is not included if the maximum stick intensity of the peak
  48. #'              is below the noise cutoff.
  49. #' @param copy for \code{xcmsRaw}: if \code{TRUE}, the \code{xcmsRaw} object is deep-copied and calculations
  50. #'              are performed on the copy. Otherwise (default) the calculations are performed in-place.
  51. #' @param method "deprofile.fwhm" for full-width half-maximum or "deprofile.localMax" for
  52. #'              local maximum.
  53. #' @param colnames For deprofile.scan: return matrix with column names (xcms-style,
  54. #'              \code{TRUE}, default) or plain (mzR-style, \code{FALSE}).
  55. #' @param df A dataframe with at least the columns \code{mz} and \code{int} to perform deprofiling on.
  56. #' @param xraw An xcmsRaw object with high-resolution profile mode MS data, e.g. Orbitrap.
  57. #' @param cut A parameter for "deprofile.fwhm" indicating where the peak flanks should be taken. Standard is 0.5
  58. #'              (as the algorithm name says, at half maximum.) Setting \code{cut = 0.75} would instead determine the peak
  59. #'              width at 3/4 maximum, which might give a better accuracy for merged peaks, but could be less accurate
  60. #'              if too few data points are present.
  61. #' @return \code{deprofile.scan}: a matrix with 2 columns for m/z and intensity
  62. #'              \code{deprofile.xcmsRaw}: a new xcmsRaw object, if \code{copy = TRUE}. Otherwise,
  63. #'              nothing is returned and the conversion is performed in place.
  64. #'
  65. #' @examples
  66. #' \dontrun{
  67. #' # from xcmsRaw:
  68. #' xraw <- xcmsRaw("myfile.mzML")
  69. #' # by FWHM method:
  70. #' xraw.sticked <- deprofile.xcmsRaw(xraw, copy=T, method="deprofile.fwhm")
  71. #' # local maximum: faster, but less accurate especially for "low" resolution
  72. #' xraw.sticked <- deprofile.xcmsRaw(xraw, copy=T, method="deprofile.localMax")
  73. #' scan.xraw.profile <- getScan(xraw, 50)
  74. #' scan.xraw <- getScan(xraw.sticked, 50)
  75. #' # alternatively directly from scan:
  76. #' scan.xraw.direct <- deprofile.scan(scan.xraw.profile)
  77. #'
  78. #' # the same scan from mzR:
  79. #' mzrFile <- openMSfile("myfile.mzML")
  80. #' acqNo <- xraw@acquisitionNum[[50]]
  81. #' scan.mzML.profile <- mzR::peaks(mzrFile, acqNo)
  82. #' scan.mzML <- deprofile.scan(scan.mzML.profile)
  83. #' close(mzrFile)
  84. #' }
  85. #'
  86. #' @author Michael Stravs, Eawag <michael.stravs@eawag.ch>
  87. #' @references mzMine source code http://sourceforge.net/svn/?group_id=139835
  88. #' @export
  89. deprofile.scan <- function(scan, noise = NA, method="deprofile.fwhm", colnames=T, ...)
  90. {
  91.     # Format the input
  92.     df <- as.data.frame(scan)
  93.     colnames(df) <- c("mz", "int")
  94.     # Call the actual workhorse
  95.     peaklist <- deprofile(df, noise, method, ...)
  96.     # return the centroided peaklist
  97.     peaklist.m <- as.matrix(peaklist[,c("mz", "int")])
  98.     if(colnames)
  99.         colnames(peaklist.m) <- c("mz", "intensity")
  100.     else
  101.         colnames(peaklist.m) <- NULL
  102.     return(peaklist.m)
  103. }
  104.  
  105. # This implementation is cooler because it doesn't iterate over the scans,
  106. # but it fails on my lab PC because it needs too much memory. Also, it's slower :(
  107. # and I don't understand why.
  108. #olddoc' Additionally, if using deprofile.fwhm.xcmsRaw, a peak might be interpreted as
  109. #olddoc'                too large if it crosses 2 scans (i.e. na.locf(fromLast=T) carries a groupmax
  110. #olddoc'                into a preceding scan or vice versa, see code).
  111. #olddoc'                Again, I think this is not a practical problem. But we have to verify this.
  112. ## deprofile.xcmsRaw <- function(xraw, noise = NA, copy=F, method="deprofile.fwhm", ...)
  113. ## {
  114. ##     if(copy)
  115. ##         xraw <- deepCopy(xraw)
  116. ##     # This computes the entire centroid matrix in one flush without splitting the data by scans.
  117. ##     # Is it really error-free? We have to check.
  118. ##     # make a suitable input for masslist.fwhm
  119. ##     df <- data.frame(mz = xraw@env$mz, int = xraw@env$intensity)
  120. ##     # add additional input to be preserved: the scan index
  121. ##     df$scanindex <- NA
  122. ##     df[xraw@scanindex + 1, "scanindex"] <- xraw@scanindex
  123. ##     df$scanindex.full <- na.locf(df$scanindex)
  124. ##     peaklist <- deprofile(df, noise, method, ...)
  125. ##     # Now we can get the mz, int values from the new peaklist but we have
  126. ##     # to recompute the correct scan indices.
  127. ##     xraw@env$mz <- peaklist$mz
  128. ##     xraw@env$intensity <- peaklist$int
  129. ##     # Since the entry where scanindex != NA might have been cut and we can't rely
  130. ##     # on it always being there, we have to find the first occurrence of every scanindex
  131. ##     # and make it 0- instead of 1-based
  132. ##     peaklist$n2 <- 1:nrow(peaklist)
  133. ##     xraw@scanindex <- as.integer(which(!duplicated(peaklist$scanindex.full)) - 1)
  134. ##     # I do not recalculate TICs or the profile matrix. The profile matrix is low-resolution
  135. ##     # anyway and would look the same if recalculated.
  136. ##     if(copy)
  137. ##         return(xraw)
  138. ##     # That's all, folks.
  139. ## }
  140.  
  141. deprofile.xcmsRaw <- function(xraw, noise = NA, copy=F, method="deprofile.fwhm", ...)
  142. {
  143.     if(copy)
  144.         xraw <- deepCopy(xraw)
  145.     # This splits the data by scan and recomposes it
  146.     # because the other version causes memory issues on my lab PC.
  147.     # Using actual split() would be extremely slow.
  148.     nscans <- length(xraw@scanindex)
  149.     scanindex.lo <- xraw@scanindex + 1
  150.     scanindex.hi <- c(xraw@scanindex[-1], length(xraw@env$mz))
  151.     mz <- numeric(0)
  152.     int <- numeric(0)
  153.     scanindex <- c(0)
  154.     for(n in 1:nscans)
  155.     {
  156.         df <- data.frame(mz = xraw@env$mz[scanindex.lo[[n]]:scanindex.hi[[n]]]
  157.         , int = xraw@env$intensity[scanindex.lo[[n]]:scanindex.hi[[n]]])
  158.         peaklist <- deprofile(df, noise, method, ...)
  159.         # prolong the vectors
  160.         mz <- c(mz, peaklist$mz)
  161.         int <- c(int, peaklist$int)
  162.         scanindex <- c(scanindex, length(mz))
  163.     }
  164.     # remove the last element from scanindex:
  165.     scanindex <- scanindex[-length(scanindex)]
  166.     # Now store everything into the xraw
  167.     xraw@env$mz <- mz
  168.     xraw@env$intensity <- int
  169.     xraw@scanindex <- as.integer(scanindex)
  170.     # I do not recalculate TICs or the profile matrix. The profile matrix is low-resolution
  171.     # anyway and would look the same if recalculated.
  172.     if(copy)
  173.         return(xraw)
  174.     # That's all, folks.
  175. }
  176.  
  177.  
  178. deprofile <- function(df, noise, method, ...)
  179. {
  180.     return(do.call(method, list(df, noise, ...)))
  181. }
  182.  
  183. deprofile.fwhm <- function(df, noise=NA, cut=0.5)
  184. {
  185.     # split sticks into groups according to how MzMine does it:
  186.     # a new group starts at zeroes and at new ascending points
  187.     df$groupmax <- NA
  188.     rows <- nrow(df)
  189.     df <- within(df,
  190.             {
  191.                 # identify local maxima
  192.                 groupmax[which(diff(sign(diff(int)))<0) + 1] <- which(diff(sign(diff(int)))<0) + 1
  193.                 # make forward-filled and backward-filled list for which was the last maximum.
  194.                 # This assigns the sticks to groups.
  195.                 groupmax_f <- na.locf(groupmax, na.rm=F)
  196.                 groupmax_b <- na.locf(groupmax, fromLast=T, na.rm=F)
  197.                 # take backward-filled group as default
  198.                 # and forward-filled group where the peak was ascending
  199.                 groupmax_b[which(diff(int)<0)+1 ] <- groupmax_f[which(diff(int) <0)+1]
  200.                 # eliminate zeroes
  201.                 groupmax_b[which(int==0)]<-NA
  202.                 # add "next intensities" and "next m/z" as well as "index" (n)
  203.                 n <- 1:rows
  204.                 int1 <- c(int[-1],0)
  205.                 mz1 <- c(mz[-1],0)
  206.                 # delete all the intensity+1 values from points which are last-in-group and therefore have int1 from next group!
  207.                 int1[which(!is.na(groupmax))-1] <- 0
  208.                 # find maximal intensity point for each peak member
  209.                 maxint <- df[groupmax_b, "int"]
  210.                 hm <- maxint * cut
  211.                 up <- ifelse( (int <= hm) & (int1 >= hm) & (n < groupmax_b), groupmax_b, NA)
  212.                 down <- ifelse( (int >= hm) & (int1 <= hm) & (n > groupmax_b), groupmax_b, NA)
  213.             })
  214.  
  215.     # Compile finished peak list
  216.     peaklist <- df[which(!is.na(df$groupmax)),]
  217.     # Noise parameter:
  218.     if(!is.na(noise))
  219.         peaklist <- subset(peaklist, maxint > noise)
  220.     # Find which peaks might have a FWHM value to substitue for the maxint mz value
  221.     # We isolate the peaklists for left-hand and for right-hand FWHM peak.
  222.     # If any straight line is found, the rightmost of the left points and the leftmost of the right points is used.
  223.     peaklist.left <- df[which(!is.na(df$up) & !duplicated(df$up, fromLast=T)),]
  224.     peaklist.right <- df[which(!is.na(df$down) & !duplicated(df$down)),]
  225.     # calculate the slopes and the corresponding m/z value at half maximum
  226.     peaklist.left$slope <- (peaklist.left$int1 - peaklist.left$int) / (peaklist.left$mz1 - peaklist.left$mz)
  227.     peaklist.right$slope <- (peaklist.right$int1 - peaklist.right$int) / (peaklist.right$mz1 - peaklist.right$mz)
  228.     peaklist.left$mzleft <- peaklist.left$mz + (peaklist.left$hm - peaklist.left$int) / peaklist.left$slope
  229.     peaklist.right$mzright <- peaklist.right$mz + (peaklist.right$hm - peaklist.right$int) / peaklist.right$slope
  230.     # add the two values to the full-peaklist where they exist
  231.     peaklist <- merge(peaklist, peaklist.left[,c("up", "mzleft")], by.x="groupmax", by.y="up", all.x=T, suffix=c("", ".left"))
  232.     peaklist <- merge(peaklist, peaklist.right[,c("down", "mzright")], by.x="groupmax", by.y="down", all.x=T, suffix=c("", ".right"))
  233.     # Find which entries have both a left and a right end,
  234.     # and calculate the center mass for them.
  235.     peaklist.indexMzhm <- which(!is.na(peaklist$mzleft) & !is.na(peaklist$mzright))
  236.     peaklist[peaklist.indexMzhm, "mz"] <- (peaklist[peaklist.indexMzhm, "mzleft"] + peaklist[peaklist.indexMzhm, "mzright"]) / 2
  237.    
  238.     return(peaklist)
  239. }
  240.  
  241. deprofile.localMax <- function(df, noise=NA)
  242. {
  243.     # split sticks into groups:
  244.     # a new group starts at zeroes and at new ascending points
  245.     df$groupmax <- NA
  246.     rows <- nrow(df)
  247.     df$groupmax[which(diff(sign(diff(df$int)))<0) + 1] <- which(diff(sign(diff(df$int)))<0) + 1
  248.     peaklist <- df[which(!is.na(df$groupmax)),]
  249.     # Noise parameter:
  250.     if(!is.na(noise))
  251.         peaklist <- subset(peaklist, int > noise)
  252.     # And that's it.
  253.     return(peaklist)
  254. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement