#!/usr/sbin/dtrace -Cs
/* -C runs the script through cpp; -s names the script file. */
#pragma D option quiet
#pragma D option dynvarsize=16M
/*
 * This script is intended for observing IOs as they transition from
 * Wait Queue to Run Queue and to completion. Latency is measured as
 * the IOs enter and then exit the Wait Queue, and then as they leave
 * the Wait Queue and enter the Run Queue.
 * We collect this information by pool, so we can see differences in
 * terms of time spent in these two queues.
 * Also collected are counts of IOs by priority classification. See the
 * zio_priority enum below, as well as zio_priority.h for more details.
 * Ideally we want to see as little time as possible spent in these queues,
 * and dispersion between lowest and highest times remaining tight and fairly
 * consistent without many offending outliers.
 *
 * Aggregates prefixed with `tot` track each ZIO from the time it gets into
 * the queue to the time it is processed and is removed from the queue.
 * We can think of `tot*` aggregates as a proxy for latency that we would
 * normally measure at the application level, roughly.
 *
 * Output is CSV, one row per (one-second interval, pool), flushed every
 * five seconds by the tick-5sec clause.
 */
/*
 * Global associative arrays recording when each zio entered a queue.
 * Keyed by (queue, zio) pointers; values are nanosecond timestamps.
 * Entries are zeroed on removal because zio_t structures get recycled.
 */
long wqueue[vdev_queue_t *, zio_t *]; /* measure wait queue time */
long rqueue[vdev_queue_t *, zio_t *]; /* measure run queue time */
long totqueue[zio_t *]; /* measure combined queue time */
inline const int SLOW = 10000 ; /* >=10ms (in us) considered slow */
/*
 * Local copy of the kernel's ZIO priority enum; see
 * usr/src/uts/common/fs/zfs/sys/zio_priority.h for details.
 * NOTE(review): values must stay in sync with the kernel's definition.
 */
typedef enum zio_priority {
	ZIO_PRIORITY_SYNC_READ,
	ZIO_PRIORITY_SYNC_WRITE, /* ZIL */
	ZIO_PRIORITY_ASYNC_READ, /* prefetch */
	ZIO_PRIORITY_ASYNC_WRITE, /* spa_sync() */
	ZIO_PRIORITY_SCRUB, /* asynchronous scrub/resilver reads */
	ZIO_PRIORITY_NUM_QUEUEABLE,
	ZIO_PRIORITY_NOW /* non-queued i/os (e.g. free) */
} zio_priority_t;
/*
 * Print the CSV header once. Column order here must match the aggregation
 * order in the printa() of the tick-5sec clause: after ts and pool come
 * 16 run-queue columns, 16 wait-queue columns, 5 total-queue columns and
 * 5 per-priority count columns (42 data columns total).
 */
BEGIN {
	printf("ts,pool,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n",
	    "rqav", "rqmin", "rqmax", "rqsd", "rqlt1ms", "rq1to10ms", "rq10to100ms",
	    "rq100msto1s", "rqgt1s", "rqslowct", "rqaddct", "rqrmct", "rqctav",
	    "rqctmin", "rqctmax", "rqctsd",
	    "wqav", "wqmin", "wqmax", "wqsd", "wqlt1ms", "wq1to10ms", "wq10to100ms",
	    "wq100msto1s", "wqgt1s", "wqslowct", "wqaddct", "wqrmct", "wqctav",
	    "wqctmin", "wqctmax", "wqctsd",
	    "totqav", "totqmin", "totqmax", "totqsd", "totqct", "syncrdct", "syncwrct", "asyncrdct", "asyncwrct", "scrubct"
	);
	/* Truncate wall-clock time down to a whole second; used as row key. */
	interval = walltimestamp - (walltimestamp%1000000000);
}
/* (z)IO is entering Wait Queue */
::vdev_queue_io_add:entry {
	/* Remember when this zio was queued so the remove clause can match it. */
	wqueue[args[0], args[1]] = args[1]->io_queued_timestamp ;
	this->sp = args[1]->io_spa ;
	/*
	 * Snapshot the pool-wide count of queued IOs across every queueable
	 * priority class, then feed that depth into the wq* statistics.
	 */
	this->nqueued =
	    this->sp->spa_queue_stats[ZIO_PRIORITY_SYNC_READ].spa_queued +
	    this->sp->spa_queue_stats[ZIO_PRIORITY_SYNC_WRITE].spa_queued +
	    this->sp->spa_queue_stats[ZIO_PRIORITY_ASYNC_READ].spa_queued +
	    this->sp->spa_queue_stats[ZIO_PRIORITY_ASYNC_WRITE].spa_queued +
	    this->sp->spa_queue_stats[ZIO_PRIORITY_SCRUB].spa_queued ;
	@wqctav[interval, this->sp->spa_name] = avg(this->nqueued) ;
	@wqctmin[interval, this->sp->spa_name] = min(this->nqueued) ;
	@wqctmax[interval, this->sp->spa_name] = max(this->nqueued) ;
	/* A depth of 0 is recorded as 1 so stddev never sees a zero sample. */
	@wqctsd[interval, this->sp->spa_name] = stddev(this->nqueued ? this->nqueued : 1) ;
	@wqaddct[interval, this->sp->spa_name] = count() ; /* Count ++ to Wait Queue */
}
/*
 * Count number of IOs by priority classification. In other words
 * we bucket each IO by ZIO priority as defined in the zio_priority enum.
 * This is in effect a histogram with 5 buckets, one each for:
 * sync-read, sync-write, async-read, async-write, and scrub.
 */
::vdev_queue_io_add:entry {
	this->pool = args[1]->io_spa->spa_name ;
	this->prio = args[1]->io_priority ;
	/* Each sum() contributes 1 only when the priority matches its bucket. */
	@syncrdct[interval, this->pool] = sum(this->prio == ZIO_PRIORITY_SYNC_READ ? 1 : 0) ;
	@syncwrct[interval, this->pool] = sum(this->prio == ZIO_PRIORITY_SYNC_WRITE ? 1 : 0) ;
	@asyncrdct[interval, this->pool] = sum(this->prio == ZIO_PRIORITY_ASYNC_READ ? 1 : 0) ;
	@asyncwrct[interval, this->pool] = sum(this->prio == ZIO_PRIORITY_ASYNC_WRITE ? 1 : 0) ;
	@scrubct[interval, this->pool] = sum(this->prio == ZIO_PRIORITY_SCRUB ? 1 : 0) ;
}
/* Every removal from the Wait Queue is counted per pool, unconditionally. */
::vdev_queue_io_remove:entry {
	@wqrmct[interval, args[1]->io_spa->spa_name] = count() ; /* Count -- from Wait Queue */
}
/*
 * (z)IO is leaving Wait Queue on its way to Run Queue.
 * The predicate verifies the stored timestamp matches this zio's own
 * io_queued_timestamp, so a recycled zio_t cannot pair with stale data.
 */
::vdev_queue_io_remove:entry
/wqueue[args[0], args[1]] == args[1]->io_queued_timestamp/ {
	/* Time spent in the wait queue, converted from ns to us. */
	this->delta = (timestamp - wqueue[args[0], args[1]]) / 1000 ;
	this->spa = args[1]->io_spa ;
	@wqavlat[interval, this->spa->spa_name] = avg(this->delta) ;
	@wqminlat[interval, this->spa->spa_name] = min(this->delta) ;
	@wqmaxlat[interval, this->spa->spa_name] = max(this->delta) ;
	@wqsdlat[interval, this->spa->spa_name] = stddev(this->delta) ;
	/* If spending too long in the queue, identify as slow IO. */
	@wqslowct[interval, this->spa->spa_name] = sum(this->delta >= SLOW ? 1 : 0) ;
	/*
	 * Maintain a histogram of wait queue latency distribution
	 * using base10 buckets: <1ms, 1-10ms, 10-100ms, 100ms-1s, >=1s.
	 */
	@wqlatlt1ms[interval, this->spa->spa_name] =
	    sum(this->delta < 1000 ? 1 : 0) ;
	@wqlat1to10ms[interval, this->spa->spa_name] =
	    sum(this->delta >= 1000 && this->delta < 10000 ? 1 : 0) ;
	@wqlat10to100ms[interval, this->spa->spa_name] =
	    sum(this->delta >= 10000 && this->delta < 100000 ? 1 : 0) ;
	@wqlat100msto1s[interval, this->spa->spa_name] =
	    sum(this->delta >= 100000 && this->delta < 1000000 ? 1 : 0) ;
	/*
	 * FIX: was `> 1000000`, which left a delta of exactly 1,000,000us
	 * (one second) counted by no bucket; `>=` makes the buckets
	 * exhaustive and mutually exclusive.
	 */
	@wqlatgt1s[interval, this->spa->spa_name] = sum(this->delta >= 1000000 ? 1 : 0) ;
	/*
	 * The zio_t structures are going to be recycled over and over and
	 * as such, we need to make sure that we avoid chances of bogus data
	 * because we are checking entry points into the clauses for a !0
	 * value.
	 */
	wqueue[args[0], args[1]] = 0 ;
}
/* (z)IO is entering Run Queue */
::vdev_queue_pending_add:entry {
	rqueue[args[0], args[1]] = timestamp ; /* time IO entered Run Queue */
	this->sp = args[1]->io_spa ;
	/*
	 * Snapshot the pool-wide count of in-flight (active) IOs across
	 * every queueable priority class for the rq* depth statistics.
	 */
	this->nactive =
	    this->sp->spa_queue_stats[ZIO_PRIORITY_SYNC_READ].spa_active +
	    this->sp->spa_queue_stats[ZIO_PRIORITY_SYNC_WRITE].spa_active +
	    this->sp->spa_queue_stats[ZIO_PRIORITY_ASYNC_READ].spa_active +
	    this->sp->spa_queue_stats[ZIO_PRIORITY_ASYNC_WRITE].spa_active +
	    this->sp->spa_queue_stats[ZIO_PRIORITY_SCRUB].spa_active ;
	@rqctav[interval, this->sp->spa_name] = avg(this->nactive) ;
	@rqctmin[interval, this->sp->spa_name] = min(this->nactive) ;
	@rqctmax[interval, this->sp->spa_name] = max(this->nactive) ;
	@rqctsd[interval, this->sp->spa_name] = stddev(this->nactive) ;
	@rqaddct[interval, this->sp->spa_name] = count() ; /* Count ++ to Run Queue */
}
/* IO enters the queue */
::vdev_queue_io:entry {
	/*
	 * This should capture the total time we spend waiting and processing
	 * for each zio. We store the zio's own io_queued_timestamp so the
	 * io_done clause can verify it is matching the same (non-recycled)
	 * zio instance before computing a total latency.
	 */
	totqueue[args[0]] = args[0]->io_queued_timestamp ;
}
/* Every removal from the Run Queue is counted per pool, unconditionally. */
::vdev_queue_pending_remove:entry {
	@rqrmct[interval, args[1]->io_spa->spa_name] = count() ; /* Count -- from Run Queue */
}
/* (z)IO is leaving Run Queue, which implies IO completed */
::vdev_queue_pending_remove:entry /rqueue[args[0], args[1]]/ {
	/* Time spent in the run queue, converted from ns to us. */
	this->delta = (timestamp - rqueue[args[0], args[1]]) / 1000 ;
	this->spa = args[1]->io_spa ;
	@rqavlat[interval, this->spa->spa_name] = avg(this->delta) ;
	@rqminlat[interval, this->spa->spa_name] = min(this->delta) ;
	@rqmaxlat[interval, this->spa->spa_name] = max(this->delta) ;
	/* A delta of 0 is recorded as 1 so stddev never sees a zero sample. */
	@rqsdlat[interval, this->spa->spa_name] = stddev(this->delta == 0 ?
	    1 : this->delta) ;
	@rqslowct[interval, this->spa->spa_name] = sum(this->delta >= SLOW ? 1 : 0) ;
	/*
	 * Maintain a histogram of run queue latency distribution
	 * using base10 buckets: <1ms, 1-10ms, 10-100ms, 100ms-1s, >=1s.
	 */
	@rqlatlt1ms[interval, this->spa->spa_name] =
	    sum(this->delta < 1000 ? 1 : 0) ;
	@rqlat1to10ms[interval, this->spa->spa_name] =
	    sum(this->delta >= 1000 && this->delta < 10000 ? 1 : 0) ;
	@rqlat10to100ms[interval, this->spa->spa_name] =
	    sum(this->delta >= 10000 && this->delta < 100000 ? 1 : 0) ;
	@rqlat100msto1s[interval, this->spa->spa_name] =
	    sum(this->delta >= 100000 && this->delta < 1000000 ? 1 : 0) ;
	/*
	 * FIX: was `> 1000000`, which left a delta of exactly 1,000,000us
	 * (one second) counted by no bucket; `>=` makes the buckets
	 * exhaustive and mutually exclusive.
	 */
	@rqlatgt1s[interval, this->spa->spa_name] =
	    sum(this->delta >= 1000000 ? 1 : 0) ;
	/*
	 * The zio_t structures are going to be recycled over and over and
	 * as such, we need to make sure that we avoid chances of bogus data
	 * because we are checking entry points into the clauses for a !0
	 * value.
	 */
	rqueue[args[0], args[1]] = 0 ;
}
/* IO completed */
::vdev_queue_io_done:entry
/totqueue[args[0]] == args[0]->io_queued_timestamp/ {
	/*
	 * This is where we measure the amount of time all the way from the
	 * IO entering the wait queue, through all the stages, to completion.
	 * The predicate above guards against recycled zio_t structures.
	 */
	this->pool = args[0]->io_spa->spa_name ;
	this->qtime = (timestamp - totqueue[args[0]]) / 1000 ; /* ns -> us */
	@totqct[interval, this->pool] = count() ;
	@totqavlat[interval, this->pool] = avg(this->qtime) ;
	@totqminlat[interval, this->pool] = min(this->qtime) ;
	@totqmaxlat[interval, this->pool] = max(this->qtime) ;
	/* A delta of 0 is recorded as 1 so stddev never sees a zero sample. */
	@totqsdlat[interval, this->pool] = stddev(this->qtime ? this->qtime : 1) ;
	totqueue[args[0]] = 0 ; /* clear so a recycled zio cannot match stale data */
}
/*
 * Every 5 seconds: emit one CSV row per (interval, pool) key, reset all
 * aggregations, and re-anchor the interval key. The aggregation order in
 * printa() must match the header printed by the BEGIN clause (42 %@d
 * fields after the %ld timestamp and %s pool name).
 */
tick-5sec {
	/*
	 * If the counts are not equal between these histograms, we may have
	 * interrupted the script at the very moment an IO was transitioning
	 * and it did not get accounted for correctly. In general we expect
	 * that these counts are all equal.
	 */
	printa("%ld,%s,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d,%@d\n",
	    @rqavlat, @rqminlat, @rqmaxlat, @rqsdlat, @rqlatlt1ms, @rqlat1to10ms,
	    @rqlat10to100ms, @rqlat100msto1s, @rqlatgt1s,
	    @rqslowct, @rqaddct, @rqrmct, @rqctav, @rqctmin, @rqctmax, @rqctsd,
	    @wqavlat, @wqminlat, @wqmaxlat, @wqsdlat, @wqlatlt1ms, @wqlat1to10ms,
	    @wqlat10to100ms, @wqlat100msto1s, @wqlatgt1s, @wqslowct, @wqaddct,
	    @wqrmct, @wqctav, @wqctmin, @wqctmax, @wqctsd,
	    @totqavlat, @totqminlat, @totqmaxlat, @totqsdlat, @totqct,
	    @syncrdct, @syncwrct, @asyncrdct, @asyncwrct, @scrubct
	) ;
	/* Clear every aggregation so the next row covers only its own window. */
	trunc(@rqavlat) ; trunc(@rqminlat) ; trunc(@rqmaxlat) ; trunc(@rqsdlat) ;
	trunc(@rqlatlt1ms) ; trunc(@rqlat1to10ms) ; trunc(@rqlat10to100ms) ;
	trunc(@rqlat100msto1s) ; trunc(@rqlatgt1s) ;
	trunc(@rqslowct) ; trunc(@rqaddct) ; trunc(@rqrmct) ;
	trunc(@rqctav) ; trunc(@rqctmin) ; trunc(@rqctmax) ; trunc(@rqctsd) ;
	trunc(@wqavlat) ; trunc(@wqminlat) ; trunc(@wqmaxlat) ; trunc(@wqsdlat) ;
	trunc(@wqlatlt1ms) ; trunc(@wqlat1to10ms) ; trunc(@wqlat10to100ms) ;
	trunc(@wqlat100msto1s) ; trunc(@wqlatgt1s) ;
	trunc(@wqslowct) ; trunc(@wqaddct) ; trunc(@wqrmct) ;
	trunc(@wqctav) ; trunc(@wqctmin) ; trunc(@wqctmax) ; trunc(@wqctsd) ;
	trunc(@totqavlat) ; trunc(@totqminlat) ; trunc(@totqmaxlat) ;
	trunc(@totqsdlat) ; trunc(@totqct) ;
	trunc(@syncrdct); trunc(@syncwrct); trunc(@asyncrdct);
	trunc(@asyncwrct); trunc(@scrubct) ;
	/* Re-anchor the interval key to the current whole second. */
	interval = walltimestamp - (walltimestamp%1000000000);
}