Guest User

Untitled

a guest
Jan 16th, 2018
68
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 12.03 KB | None | 0 0
  1. #!/usr/sbin/dtrace -Cs
  2. #pragma D option quiet
  3. #pragma D option dynvarsize=16M
  4. /*
  5. * This script is intended for observing IOs as they transition from
  6. * Wait Queue to Run Queue and to completion. Latency is measured as
  7. * the IOs enter and then exit the Wait Queue, and then as they leave
  8. * the Wait Queue and enter the Run Queue.
  9. * We collect this information by pool, so we can see differences in
  10. * terms of time spent in these two queues.
  11. * Also collected are counts of IOs by priority classification. See the
  12. * zio_priority enum below, as well zio_priority.h for more details.
  13. * Ideally we want to see as little as possible time spent in these queues.
  14. * And dispersion between lowest and highest times remaining tight and fairly
  15. * consistent without many offending outliers.
  16. *
  17. * aggregates prefixed with `tot` are tracking each ZIO from time it gets into
  18. * the queue to the time it is processed and is removed from queue.
  19. * We can think of `tot*` aggregates as a proxy for latency that we would
  20. * normally measure at the application level, roughly.
  21. */
  22. long wqueue[vdev_queue_t *, zio_t *]; /* measure wait queue time */
  23. long rqueue[vdev_queue_t *, zio_t *]; /* measure run queue time */
  24. long totqueue[zio_t *]; /* measure combined queue time */
  25.  
  26. inline const int SLOW = 10000 ; /* >10ms considered slow */
  27.  
  28. /* see usr/src/uts/common/fs/zfs/sys/zio_priority.h for details */
  29. typedef enum zio_priority {
  30. ZIO_PRIORITY_SYNC_READ,
  31. ZIO_PRIORITY_SYNC_WRITE, /* ZIL */
  32. ZIO_PRIORITY_ASYNC_READ, /* prefetch */
  33. ZIO_PRIORITY_ASYNC_WRITE, /* spa_sync() */
  34. ZIO_PRIORITY_SCRUB, /* asynchronous scrub/resilver reads */
  35. ZIO_PRIORITY_NUM_QUEUEABLE,
  36.  
  37. ZIO_PRIORITY_NOW /* non-queued i/os (e.g. free) */
  38. } zio_priority_t;
  39.  
  40. BEGIN {
  41. printf("ts,pool,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n",
  42. "rqav", "rqmin", "rqmax", "rqsd", "rqlt1ms", "rq1to10ms", "rq10to100ms",
  43. "rq100msto1s", "rqgt1s", "rqslowct", "rqaddct", "rqrmct", "rqctav",
  44. "rqctmin", "rqctmax", "rqctsd",
  45. "wqav", "wqmin", "wqmax", "wqsd", "wqlt1ms", "wq1to10ms", "wq10to100ms",
  46. "wq100msto1s", "wqgt1s", "wqslowct", "wqaddct", "wqrmct", "wqctav",
  47. "wqctmin", "wqctmax", "wqctsd",
  48. "totqav", "totqmin", "totqmax", "totqsd", "totqct", "syncrdct", "syncwrct", "asyncrdct", "asyncwrct", "scrubct"
  49. );
  50. interval = walltimestamp - (walltimestamp%1000000000);
  51. }
  52.  
  53. /* (z)IO is entering Wait Queue */
  54. ::vdev_queue_io_add:entry {
  55. wqueue[args[0], args[1]] = args[1]->io_queued_timestamp ;
  56. this->spa = args[1]->io_spa ;
  57. this->allqueued = 0 ; /* reset to avoid bogus data */
  58. this->allqueued += this->spa->spa_queue_stats[ZIO_PRIORITY_SYNC_READ].spa_queued ;
  59. this->allqueued += this->spa->spa_queue_stats[ZIO_PRIORITY_SYNC_WRITE].spa_queued ;
  60. this->allqueued += this->spa->spa_queue_stats[ZIO_PRIORITY_ASYNC_READ].spa_queued ;
  61. this->allqueued += this->spa->spa_queue_stats[ZIO_PRIORITY_ASYNC_WRITE].spa_queued ;
  62. this->allqueued += this->spa->spa_queue_stats[ZIO_PRIORITY_SCRUB].spa_queued ;
  63. @wqctav[interval, this->spa->spa_name] = avg(this->allqueued) ;
  64. @wqctmin[interval, this->spa->spa_name] = min(this->allqueued) ;
  65. @wqctmax[interval, this->spa->spa_name] = max(this->allqueued) ;
  66. @wqctsd[interval, this->spa->spa_name] = stddev(this->allqueued == 0 ?
  67. 1 : this->allqueued) ;
  68. @wqaddct[interval, this->spa->spa_name] = count() ; /* Count ++ to Wait Queue */
  69. }
  70.  
  71. /*
  72. * Count number of IOs by priority classification. In other words
  73. * we bucket each IO by ZIO priority as defined in the zio_priority enum.
  74. * This is in effect a histogram with 5 buckets, one each for:
  75. * sync-read, sync-write, async-read, async-write, and scrub.
  76. */
  77. ::vdev_queue_io_add:entry {
  78. this->spa = args[1]->io_spa ;
  79. this->pri = args[1]->io_priority ;
  80. @syncrdct[interval, this->spa->spa_name] =
  81. sum(this->pri == ZIO_PRIORITY_SYNC_READ ? 1 : 0) ;
  82. @syncwrct[interval, this->spa->spa_name] =
  83. sum(this->pri == ZIO_PRIORITY_SYNC_WRITE ? 1 : 0) ;
  84. @asyncrdct[interval, this->spa->spa_name] =
  85. sum(this->pri == ZIO_PRIORITY_ASYNC_READ ? 1 : 0) ;
  86. @asyncwrct[interval, this->spa->spa_name] =
  87. sum(this->pri == ZIO_PRIORITY_ASYNC_WRITE ? 1 : 0) ;
  88. @scrubct[interval, this->spa->spa_name] =
  89. sum(this->pri == ZIO_PRIORITY_SCRUB ? 1 : 0) ;
  90. }
  91.  
  92. ::vdev_queue_io_remove:entry {
  93. this->spa = args[1]->io_spa ;
  94. @wqrmct[interval, this->spa->spa_name] = count() ; /* Count -- from Wait Queue */
  95. }
  96.  
  97. /* (z)IO is leaving Wait Queue on its way to Run Queue */
  98. ::vdev_queue_io_remove:entry
  99. /wqueue[args[0], args[1]] == args[1]->io_queued_timestamp/ {
  100. this->delta = (timestamp - wqueue[args[0], args[1]]) / 1000 ;
  101. this->spa = args[1]->io_spa ;
  102. @wqavlat[interval, this->spa->spa_name] = avg(this->delta) ;
  103. @wqminlat[interval, this->spa->spa_name] = min(this->delta) ;
  104. @wqmaxlat[interval, this->spa->spa_name] = max(this->delta) ;
  105. @wqsdlat[interval, this->spa->spa_name] = stddev(this->delta) ;
  106. /* If spending to long in the queue, identify as slow IO. */
  107. @wqslowct[interval, this->spa->spa_name] = sum(this->delta >= SLOW ? 1 : 0) ;
  108. /*
  109. * Maintain a histogram of wait qeueue latency distribution
  110. * using base10 buckets: <1ms, 1-10ms, 10-100ms, 100ms-1s.
  111. */
  112. @wqlatlt1ms[interval, this->spa->spa_name] =
  113. sum(this->delta < 1000 ? 1 : 0) ;
  114. @wqlat1to10ms[interval, this->spa->spa_name] =
  115. sum(this->delta >= 1000 && this->delta < 10000 ? 1 : 0) ;
  116. @wqlat10to100ms[interval, this->spa->spa_name]=
  117. sum(this->delta >= 10000 && this->delta < 100000 ? 1 : 0) ;
  118. @wqlat100msto1s[interval, this->spa->spa_name]=
  119. sum(this->delta >= 100000 && this->delta < 1000000 ? 1 : 0) ;
  120. @wqlatgt1s[interval, this->spa->spa_name] = sum(this->delta > 1000000 ? 1 : 0) ;
  121. /*
  122. * The zio_t structures are going to be recycled over and over and
  123. * as such, we need to make sure that we avoid chances of bogus data
  124. * because we are checking entry points into the clauses for a !0
  125. * value.
  126. */
  127. wqueue[args[0], args[1]] = 0 ;
  128. }
  129.  
  130. /* (z)IO is entering Run Queue */
  131. ::vdev_queue_pending_add:entry {
  132. rqueue[args[0], args[1]] = timestamp ; /* time IO entered Run Queue */
  133. this->spa = args[1]->io_spa ;
  134. this->allactive = 0 ; /* reset to avoid bogus data */
  135. this->allactive += this->spa->spa_queue_stats[ZIO_PRIORITY_SYNC_READ].spa_active ;
  136. this->allactive += this->spa->spa_queue_stats[ZIO_PRIORITY_SYNC_WRITE].spa_active ;
  137. this->allactive += this->spa->spa_queue_stats[ZIO_PRIORITY_ASYNC_READ].spa_active ;
  138. this->allactive += this->spa->spa_queue_stats[ZIO_PRIORITY_ASYNC_WRITE].spa_active ;
  139. this->allactive += this->spa->spa_queue_stats[ZIO_PRIORITY_SCRUB].spa_active ;
  140. @rqctav[interval, this->spa->spa_name] = avg(this->allactive) ;
  141. @rqctmin[interval, this->spa->spa_name] = min(this->allactive) ;
  142. @rqctmax[interval, this->spa->spa_name] = max(this->allactive) ;
  143. @rqctsd[interval, this->spa->spa_name] = stddev(this->allactive) ;
  144. @rqaddct[interval, this->spa->spa_name] = count() ; /* Count ++ to Run Queue */
  145. }
  146.  
  147. /* IO enters the queue */
  148. ::vdev_queue_io:entry {
  149. /*
  150. * This should capture the total time we spend waiting and processing
  151. * for each zio.
  152. */
  153. totqueue[args[0]] = args[0]->io_queued_timestamp ;
  154. }
  155.  
  156. ::vdev_queue_pending_remove:entry {
  157. this->spa = args[1]->io_spa ;
  158. @rqrmct[interval, this->spa->spa_name] = count() ; /* Count -- from Run Queue */
  159. }
  160.  
  161. /* (z)IO is leaving Run Queue, which implies IO completed */
  162. ::vdev_queue_pending_remove:entry /rqueue[args[0], args[1]]/ {
  163. this->delta = (timestamp - rqueue[args[0], args[1]]) / 1000 ;
  164. this->spa = args[1]->io_spa ;
  165. @rqavlat[interval, this->spa->spa_name] = avg(this->delta) ;
  166. @rqminlat[interval, this->spa->spa_name] = min(this->delta) ;
  167. @rqmaxlat[interval, this->spa->spa_name] = max(this->delta) ;
  168. @rqsdlat[interval, this->spa->spa_name] = stddev(this->delta == 0 ?
  169. 1 : this->delta) ;
  170. @rqslowct[interval, this->spa->spa_name] = sum(this->delta >= SLOW ? 1 : 0) ;
  171. /*
  172. * Maintain a histogram of run qeueue latency distribution
  173. * using base10 buckets: <1ms, 1-10ms, 10-100ms, 100ms-1s.
  174. */
  175. @rqlatlt1ms[interval, this->spa->spa_name] =
  176. sum(this->delta < 1000 ? 1 : 0) ;
  177. @rqlat1to10ms[interval, this->spa->spa_name] =
  178. sum(this->delta >= 1000 && this->delta < 10000 ? 1 : 0) ;
  179. @rqlat10to100ms[interval, this->spa->spa_name]=
  180. sum(this->delta >= 10000 && this->delta < 100000 ? 1 : 0) ;
  181. @rqlat100msto1s[interval, this->spa->spa_name]=
  182. sum(this->delta >= 100000 && this->delta < 1000000 ? 1 : 0) ;
  183. @rqlatgt1s[interval, this->spa->spa_name] =
  184. sum(this->delta > 1000000 ? 1 : 0) ;
  185. /*
  186. * The zio_t structures are going to be recycled over and over and
  187. * as such, we need to make sure that we avoid chances of bogus data
  188. * because we are checking entry points into the clauses for a !0
  189. * value.
  190. */
  191. rqueue[args[0], args[1]] = 0 ;
  192. }
  193.  
  194. /* IO completed */
  195. ::vdev_queue_io_done:entry
  196. /totqueue[args[0]] == args[0]->io_queued_timestamp/ {
  197. /*
  198. * This is where we measure amount of time all the way from IO entering the
  199. * wait queue through all the stages to completion.
  200. */
  201. this->spa = args[0]->io_spa ;
  202. this->delta = (timestamp - totqueue[args[0]]) / 1000 ;
  203. @totqct[interval, this->spa->spa_name] = count() ;
  204. @totqavlat[interval, this->spa->spa_name] = avg(this->delta) ;
  205. @totqminlat[interval, this->spa->spa_name] = min(this->delta) ;
  206. @totqmaxlat[interval, this->spa->spa_name] = max(this->delta) ;
  207. @totqsdlat[interval, this->spa->spa_name] = stddev(this->delta == 0 ?
  208. 1 : this->delta) ;
  209. totqueue[args[0]] = 0 ;
  210. }
  211.  
  212. tick-5sec {
  213. /*
  214. * If the counts are not equal between these histograms, we may have
  215. * interrupted the script at the very moment an IO was transitioning
  216. * and it did not get accounted for correctly. In general we expect
  217. * that these counts are all equal.
  218. */
  219. printa("%ld,%s,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d\n",
  220. @rqavlat, @rqminlat, @rqmaxlat, @rqsdlat, @rqlatlt1ms, @rqlat1to10ms,
  221. @rqlat10to100ms, @rqlat100msto1s, @rqlatgt1s,
  222. @rqslowct, @rqaddct, @rqrmct, @rqctav, @rqctmin, @rqctmax, @rqctsd,
  223. @wqavlat, @wqminlat, @wqmaxlat, @wqsdlat, @wqlatlt1ms, @wqlat1to10ms,
  224. @wqlat10to100ms, @wqlat100msto1s, @wqlatgt1s, @wqslowct, @wqaddct,
  225. @wqrmct, @wqctav, @wqctmin, @wqctmax, @wqctsd,
  226. @totqavlat, @totqminlat, @totqmaxlat, @totqsdlat, @totqct,
  227. @syncrdct, @syncwrct, @asyncrdct, @asyncwrct, @scrubct
  228. ) ;
  229. trunc(@rqavlat) ; trunc(@rqminlat) ; trunc(@rqmaxlat) ; trunc(@rqsdlat) ;
  230. trunc(@rqlatlt1ms) ; trunc(@rqlat1to10ms) ; trunc(@rqlat10to100ms) ;
  231. trunc(@rqlat100msto1s) ; trunc(@rqlatgt1s) ;
  232. trunc(@rqslowct) ; trunc(@rqaddct) ; trunc(@rqrmct) ;
  233. trunc(@rqctav) ; trunc(@rqctmin) ; trunc(@rqctmax) ; trunc(@rqctsd) ;
  234. trunc(@wqavlat) ; trunc(@wqminlat) ; trunc(@wqmaxlat) ; trunc(@wqsdlat) ;
  235. trunc(@wqlatlt1ms) ; trunc(@wqlat1to10ms) ; trunc(@wqlat10to100ms) ;
  236. trunc(@wqlat100msto1s) ; trunc(@wqlatgt1s) ;
  237. trunc(@wqslowct) ; trunc(@wqaddct) ; trunc(@wqrmct) ;
  238. trunc(@wqctav) ; trunc(@wqctmin) ; trunc(@wqctmax) ; trunc(@wqctsd) ;
  239. trunc(@totqavlat) ; trunc(@totqminlat) ; trunc(@totqmaxlat) ;
  240. trunc(@totqsdlat) ; trunc(@totqct) ;
  241. trunc(@syncrdct); trunc(@syncwrct); trunc(@asyncrdct);
  242. trunc(@asyncwrct); trunc(@scrubct) ;
  243. interval = walltimestamp - (walltimestamp%1000000000);
  244. }
Add Comment
Please, Sign In to add comment