Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- commit 6363651597963ecf3034d6b88855dc164ab28fed
- Author: Andrew Heybey <ath@niksun.com>
- Date: Tue Nov 18 15:00:57 2014 -0500
- zdb: Add -Z flag to print where a block lands on the children of a RAID-Z vdev instead of reading it, like http://mbruning.blogspot.com/2009/12/zfs-raidz-data-walk.html
- diff --git a/cddl/contrib/opensolaris/cmd/zdb/zdb.c b/cddl/contrib/opensolaris/cmd/zdb/zdb.c
- index 04970fc..93653600 100644
- --- a/cddl/contrib/opensolaris/cmd/zdb/zdb.c
- +++ b/cddl/contrib/opensolaris/cmd/zdb/zdb.c
- @@ -59,6 +59,7 @@
- #include <sys/ddt.h>
- #include <sys/zfeature.h>
- #include <zfs_comutil.h>
- +#include <sys/vdev_raidz.h>
- #undef ZFS_MAXNAMELEN
- #undef verify
- #include <libzfs.h>
- @@ -3023,6 +3024,168 @@ zdb_dump_block(char *label, void *buf, uint64_t size, int flags)
- }
- }
- +
- +typedef struct raidz_col { /* one column of a raidz_map_t */
- + uint64_t rc_devidx; /* index of the child device holding this column */
- + uint64_t rc_offset; /* byte offset on that child device */
- + uint64_t rc_size; /* I/O size in bytes; 0 for columns past rm_cols */
- + void *rc_data; /* I/O data; vdev_raidz_map() leaves it NULL */
- + void *rc_gdata; /* used to store the "good" version; NULL as well */
- + int rc_error; /* I/O error for this device; initialized to 0 */
- + uint8_t rc_tried; /* Did we attempt this I/O column? Initialized to 0 */
- + uint8_t rc_skipped; /* Did we skip this I/O column? Initialized to 0 */
- +} raidz_col_t;
- +
- +typedef struct raidz_map { /* column layout computed by vdev_raidz_map() */
- + uint64_t rm_cols; /* Regular column count: columns that are accessed */
- + uint64_t rm_scols; /* Count including skipped columns; rm_col[] length */
- + uint64_t rm_bigcols; /* Number of oversized (one extra sector) columns */
- + uint64_t rm_asize; /* Column sizes summed, rounded up for padding */
- + uint64_t rm_missingdata; /* Count of missing data devices; set to 0 */
- + uint64_t rm_missingparity; /* Count of missing parity devices; set to 0 */
- + uint64_t rm_firstdatacol; /* First data column/parity count */
- + uint64_t rm_nskip; /* Skipped sectors for padding */
- + uint64_t rm_skipstart; /* Column index of padding start */
- + void *rm_datacopy; /* rm_asize-buffer of copied data; set to NULL */
- + uintptr_t rm_reports; /* # of referencing checksum reports; set to 0 */
- + uint8_t rm_freed; /* map no longer has referencing ZIO; set to 0 */
- + uint8_t rm_ecksuminjected; /* checksum error was injected; set to 0 */
- + raidz_col_t rm_col[1]; /* rm_scols I/O columns, allocated past the [1] */
- +} raidz_map_t;
- +
- +/*
- + * Work out how a block is laid out across the children of a RAID-Z vdev,
- + * without attaching buffers or doing any I/O.  Divides the IO evenly
- + * across all child vdevs; usually, dcols is the number of children in the
- + * target vdev.
- + *
- + *	size		length of the block in bytes
- + *	offset		byte offset of the block on the RAID-Z vdev
- + *	unit_shift	log2 of the vdev's minimum sector size
- + *	dcols		number of columns to spread the block over
- + *	nparity		number of parity columns per row
- + *
- + * Returns a map with rm_scols entries in rm_col[], each giving a child
- + * index, a byte offset on that child and a byte count.  The map has no
- + * columns when the block is shorter than one sector or when dcols leaves
- + * no room for data (dcols <= nparity).  The caller releases the map with
- + * umem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols])).
- + *
- + * copy-pasted from vdev_raidz in the ZFS sources
- + */
- +raidz_map_t *
- +vdev_raidz_map(uint64_t size, uint64_t offset, uint64_t unit_shift,
- +    uint64_t dcols, uint64_t nparity)
- +{
- +	raidz_map_t *rm;
- +	/* The starting RAIDZ (parent) vdev sector of the block. */
- +	uint64_t b = offset >> unit_shift;
- +	/* The zio's size in units of the vdev's minimum sector size. */
- +	uint64_t s = size >> unit_shift;
- +	uint64_t f, o, q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
- +
- +	/*
- +	 * Nothing can be laid out when there are no more columns than parity
- +	 * (the divisions below would be by zero or by a wrapped-around
- +	 * value) or when the block is shorter than one sector (the map would
- +	 * hold no columns, yet the single-parity swap at the end touches
- +	 * columns 0 and 1).  Return a map with zero columns in both cases so
- +	 * that the caller can print and free it like any other map.
- +	 */
- +	if (dcols <= nparity || s == 0) {
- +		rm = umem_alloc(offsetof(raidz_map_t, rm_col[0]), KM_SLEEP);
- +		rm->rm_cols = rm->rm_scols = rm->rm_bigcols = 0;
- +		rm->rm_asize = rm->rm_nskip = rm->rm_skipstart = 0;
- +		rm->rm_missingdata = rm->rm_missingparity = 0;
- +		rm->rm_firstdatacol = nparity;
- +		rm->rm_datacopy = NULL;
- +		rm->rm_reports = 0;
- +		rm->rm_freed = rm->rm_ecksuminjected = 0;
- +		return (rm);
- +	}
- +
- +	/* The first column for this stripe. */
- +	f = b % dcols;
- +	/* The starting byte offset on each child vdev. */
- +	o = (b / dcols) << unit_shift;
- +
- +	/*
- +	 * "Quotient": The number of data sectors for this stripe on all but
- +	 * the "big column" child vdevs that also contain "remainder" data.
- +	 */
- +	q = s / (dcols - nparity);
- +
- +	/*
- +	 * "Remainder": The number of partial stripe data sectors in this I/O.
- +	 * This will add a sector to some, but not all, child vdevs.
- +	 */
- +	r = s - q * (dcols - nparity);
- +
- +	/* The number of "big columns" - those which contain remainder data. */
- +	bc = (r == 0 ? 0 : r + nparity);
- +
- +	/*
- +	 * The total number of data and parity sectors associated with
- +	 * this I/O.
- +	 */
- +	tot = s + nparity * (q + (r == 0 ? 0 : 1));
- +
- +	/* acols: The columns that will be accessed. */
- +	/* scols: The columns that will be accessed or skipped. */
- +	if (q == 0) {
- +		/* Our I/O request doesn't span all child vdevs. */
- +		acols = bc;
- +		scols = MIN(dcols, roundup(bc, nparity + 1));
- +	} else {
- +		acols = dcols;
- +		scols = dcols;
- +	}
- +
- +	rm = umem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
- +
- +	rm->rm_cols = acols;
- +	rm->rm_scols = scols;
- +	rm->rm_bigcols = bc;
- +	rm->rm_skipstart = bc;
- +	rm->rm_missingdata = 0;
- +	rm->rm_missingparity = 0;
- +	rm->rm_firstdatacol = nparity;
- +	rm->rm_datacopy = NULL;
- +	rm->rm_reports = 0;
- +	rm->rm_freed = 0;
- +	rm->rm_ecksuminjected = 0;
- +
- +	asize = 0;
- +
- +	for (c = 0; c < scols; c++) {
- +		col = f + c;
- +		coff = o;
- +		/* Columns past the last child wrap to child 0, one sector on. */
- +		if (col >= dcols) {
- +			col -= dcols;
- +			coff += 1ULL << unit_shift;
- +		}
- +		rm->rm_col[c].rc_devidx = col;
- +		rm->rm_col[c].rc_offset = coff;
- +		rm->rm_col[c].rc_data = NULL;
- +		rm->rm_col[c].rc_gdata = NULL;
- +		rm->rm_col[c].rc_error = 0;
- +		rm->rm_col[c].rc_tried = 0;
- +		rm->rm_col[c].rc_skipped = 0;
- +
- +		/* Skipped columns are empty; big columns get one more sector. */
- +		if (c >= acols)
- +			rm->rm_col[c].rc_size = 0;
- +		else if (c < bc)
- +			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
- +		else
- +			rm->rm_col[c].rc_size = q << unit_shift;
- +
- +		asize += rm->rm_col[c].rc_size;
- +	}
- +
- +	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
- +	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
- +
- +	/*
- +	 * If all data stored spans all columns, there's a danger that parity
- +	 * will always be on the same device and, since parity isn't read
- +	 * during normal operation, that that device's I/O bandwidth won't be
- +	 * used effectively. We therefore switch the parity every 1MB.
- +	 *
- +	 * ... at least that was, ostensibly, the theory. As a practical
- +	 * matter unless we juggle the parity between all devices evenly, we
- +	 * won't see any benefit. Further, occasional writes that aren't a
- +	 * multiple of the LCM of the number of children and the minimum
- +	 * stripe width are sufficient to avoid pessimal behavior.
- +	 * Unfortunately, this decision created an implicit on-disk format
- +	 * requirement that we need to support for all eternity, but only
- +	 * for single-parity RAID-Z.
- +	 *
- +	 * If we intend to skip a sector in the zeroth column for padding
- +	 * we must make sure to note this swap. We will never intend to
- +	 * skip the first column since at least one data and one parity
- +	 * column must appear in each row.
- +	 *
- +	 * The check at the top of the function guarantees at least two
- +	 * columns here whenever there is a single parity column.
- +	 */
- +	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
- +		devidx = rm->rm_col[0].rc_devidx;
- +		o = rm->rm_col[0].rc_offset;
- +		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
- +		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
- +		rm->rm_col[1].rc_devidx = devidx;
- +		rm->rm_col[1].rc_offset = o;
- +
- +		if (rm->rm_skipstart == 0)
- +			rm->rm_skipstart = 1;
- +	}
- +
- +	return (rm);
- +}
- +
- +
- /*
- * There are two acceptable formats:
- * leaf_name - For example: c1t0d0 or /tmp/ztest.0a
- @@ -3081,8 +3244,10 @@ name:
- }
- /*
- - * Read a block from a pool and print it out. The syntax of the
- - * block descriptor is:
- + * Read a block from a pool and print it out, or (if Zflag is true)
- + * print out where the block is found on the constituents of the vdev.
- + *
- + * The syntax of the block descriptor is:
- *
- * pool:vdev_specifier:offset:size[:flags]
- *
- @@ -3103,7 +3268,7 @@ name:
- * * = not yet implemented
- */
- static void
- -zdb_read_block(char *thing, spa_t *spa)
- +zdb_read_block(char *thing, spa_t *spa, boolean_t Zflag)
- {
- blkptr_t blk, *bp = &blk;
- dva_t *dva = bp->blk_dva;
- @@ -3183,6 +3348,22 @@ zdb_read_block(char *thing, spa_t *spa)
- psize = size;
- lsize = size;
- + if (Zflag) {
- + raidz_map_t* rm;
- + rm = vdev_raidz_map(psize, offset, vd->vdev_ashift,
- + vd->vdev_children, vd->vdev_nparity);
- + (void) printf("columns %lu bigcols %lu asize %lu firstdatacol %lu\n",
- + rm->rm_cols, rm->rm_bigcols, rm->rm_asize,
- + rm->rm_firstdatacol);
- + for (int c = 0; c < rm->rm_scols; ++c) {
- + raidz_col_t* rc = &rm->rm_col[c];
- + (void) printf("devidx %lu offset 0x%lx size 0x%lx\n",
- + rc->rc_devidx, rc->rc_offset, rc->rc_size);
- + }
- + umem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
- + return;
- + }
- +
- pbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
- lbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
- @@ -3404,7 +3585,7 @@ main(int argc, char **argv)
- dprintf_setup(&argc, argv);
- while ((c = getopt(argc, argv,
- - "bcdhilmMI:suCDRSAFLXx:evp:t:U:P")) != -1) {
- + "bcdhilmMI:suCDRSAFLXx:evp:t:U:PZ")) != -1) {
- switch (c) {
- case 'b':
- case 'c':
- @@ -3420,6 +3601,7 @@ main(int argc, char **argv)
- case 'M':
- case 'R':
- case 'S':
- + case 'Z':
- dump_opt[c]++;
- dump_all = 0;
- break;
- @@ -3490,6 +3672,9 @@ main(int argc, char **argv)
- if (dump_all)
- verbose = MAX(verbose, 1);
- + if (dump_opt['Z'])
- + dump_opt['R'] = 1;
- +
- for (c = 0; c < 256; c++) {
- if (dump_all && !strchr("elAFLRSXP", c))
- dump_opt[c] = 1;
- @@ -3618,7 +3803,7 @@ main(int argc, char **argv)
- flagbits['r'] = ZDB_FLAG_RAW;
- for (i = 0; i < argc; i++)
- - zdb_read_block(argv[i], spa);
- + zdb_read_block(argv[i], spa, dump_opt['Z']);
- }
- (os != NULL) ? dmu_objset_disown(os, FTAG) : spa_close(spa, FTAG);
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement