Advertisement
Guest User

FBSD 10.1 zdb -Z patch

a guest
Dec 10th, 2014
218
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Diff 8.68 KB | None | 0 0
  1. commit 6363651597963ecf3034d6b88855dc164ab28fed
  2. Author: Andrew Heybey <ath@niksun.com>
  3. Date:   Tue Nov 18 15:00:57 2014 -0500
  4.  
  5.     zdb: Add -Z flag like http://mbruning.blogspot.com/2009/12/zfs-raidz-data-walk.html
  6.  
  7. diff --git a/cddl/contrib/opensolaris/cmd/zdb/zdb.c b/cddl/contrib/opensolaris/cmd/zdb/zdb.c
  8. index 04970fc..93653600 100644
  9. --- a/cddl/contrib/opensolaris/cmd/zdb/zdb.c
  10. +++ b/cddl/contrib/opensolaris/cmd/zdb/zdb.c
  11. @@ -59,6 +59,7 @@
  12.  #include <sys/ddt.h>
  13.  #include <sys/zfeature.h>
  14.  #include <zfs_comutil.h>
  15. +#include <sys/vdev_raidz.h>
  16.  #undef ZFS_MAXNAMELEN
  17.  #undef verify
  18.  #include <libzfs.h>
  19. @@ -3023,6 +3024,168 @@ zdb_dump_block(char *label, void *buf, uint64_t size, int flags)
  20.     }
  21.  }
  22.  
  23. +
  24. +typedef struct raidz_col { /* one column (child-device extent) of a map */
  25. +   uint64_t rc_devidx;     /* child device index for I/O */
  26. +   uint64_t rc_offset;     /* byte offset on that child device */
  27. +   uint64_t rc_size;       /* I/O size in bytes; 0 for a skipped column */
  28. +   void *rc_data;          /* I/O data; vdev_raidz_map() leaves it NULL */
  29. +   void *rc_gdata;         /* used to store the "good" version */
  30. +   int rc_error;           /* I/O error for this device */
  31. +   uint8_t rc_tried;       /* Did we attempt this I/O column? */
  32. +   uint8_t rc_skipped;     /* Did we skip this I/O column? */
  33. +} raidz_col_t;
  34. +
  35. +typedef struct raidz_map { /* column layout of one block on a RAID-Z vdev */
  36. +   uint64_t rm_cols;       /* Columns that will be accessed */
  37. +   uint64_t rm_scols;      /* Columns accessed or skipped */
  38. +   uint64_t rm_bigcols;        /* Columns holding one extra sector */
  39. +   uint64_t rm_asize;      /* I/O bytes, rounded to nparity + 1 sectors */
  40. +   uint64_t rm_missingdata;    /* Count of missing data devices */
  41. +   uint64_t rm_missingparity;  /* Count of missing parity devices */
  42. +   uint64_t rm_firstdatacol;   /* First data column/parity count */
  43. +   uint64_t rm_nskip;      /* Skipped sectors for padding */
  44. +   uint64_t rm_skipstart;      /* Column index of padding start */
  45. +   void *rm_datacopy;      /* rm_asize-buffer of copied data */
  46. +   uintptr_t rm_reports;       /* # of referencing checksum reports */
  47. +   uint8_t rm_freed;       /* map no longer has referencing ZIO */
  48. +   uint8_t rm_ecksuminjected;  /* checksum error was injected */
  49. +   raidz_col_t rm_col[1];      /* rm_scols entries, sized at allocation */
  50. +} raidz_map_t;
  51. +
  52. +/*
  53. + * Compute the RAID-Z column layout of a 'size'-byte block at byte 'offset'
  54. + * on a vdev with 'dcols' children, 'nparity' parity columns and sectors of
  55. + * 1 << unit_shift bytes.  No I/O is issued and no data buffers are set up.
  56. + * Copied from vdev_raidz in the ZFS sources.  dcols must exceed nparity
  57. + * because q divides by their difference.  The caller releases the map with
  58. + * umem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols])).
  59. + */
  60. +static raidz_map_t *
  61. +vdev_raidz_map(uint64_t size, uint64_t offset, uint64_t unit_shift,
  62. +          uint64_t dcols, uint64_t nparity)
  63. +{
  64. +   raidz_map_t *rm;
  65. +   /* The starting RAIDZ (parent) vdev sector of the block. */
  66. +   uint64_t b = offset >> unit_shift;
  67. +   /* The block size in units of the vdev's minimum sector size. */
  68. +   uint64_t s = size >> unit_shift;
  69. +   /* The first column for this stripe. */
  70. +   uint64_t f = b % dcols;
  71. +   /* The starting byte offset on each child vdev. */
  72. +   uint64_t o = (b / dcols) << unit_shift;
  73. +   uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
  74. +
  75. +   /*
  76. +    * "Quotient": The number of data sectors for this stripe on all but
  77. +    * the "big column" child vdevs that also contain "remainder" data.
  78. +    */
  79. +   q = s / (dcols - nparity);
  80. +
  81. +   /*
  82. +    * "Remainder": The number of partial stripe data sectors in this I/O.
  83. +    * This will add a sector to some, but not all, child vdevs.
  84. +    */
  85. +   r = s - q * (dcols - nparity);
  86. +
  87. +   /* The number of "big columns" - those which contain remainder data. */
  88. +   bc = (r == 0 ? 0 : r + nparity);
  89. +
  90. +   /*
  91. +    * The total number of data and parity sectors associated with
  92. +    * this I/O.
  93. +    */
  94. +   tot = s + nparity * (q + (r == 0 ? 0 : 1));
  95. +
  96. +   /* acols: columns accessed; scols: columns accessed or skipped. */
  97. +   if (q == 0) {
  98. +       /* Our I/O request doesn't span all child vdevs. */
  99. +       acols = bc;
  100. +       scols = MIN(dcols, roundup(bc, nparity + 1));
  101. +   } else {
  102. +       acols = dcols;
  103. +       scols = dcols;
  104. +   }
  105. +
  106. +   rm = umem_alloc(offsetof(raidz_map_t, rm_col[scols]), UMEM_NOFAIL);
  107. +
  108. +   rm->rm_cols = acols;
  109. +   rm->rm_scols = scols;
  110. +   rm->rm_bigcols = bc;
  111. +   rm->rm_skipstart = bc;
  112. +   rm->rm_missingdata = 0;
  113. +   rm->rm_missingparity = 0;
  114. +   rm->rm_firstdatacol = nparity;
  115. +   rm->rm_datacopy = NULL;
  116. +   rm->rm_reports = 0;
  117. +   rm->rm_freed = 0;
  118. +   rm->rm_ecksuminjected = 0;
  119. +   asize = 0;
  120. +
  121. +   for (c = 0; c < scols; c++) {
  122. +       col = f + c;
  123. +       coff = o;
  124. +       if (col >= dcols) {
  125. +           col -= dcols;
  126. +           coff += 1ULL << unit_shift;
  127. +       }
  128. +       rm->rm_col[c].rc_devidx = col;
  129. +       rm->rm_col[c].rc_offset = coff;
  130. +       rm->rm_col[c].rc_data = NULL;
  131. +       rm->rm_col[c].rc_gdata = NULL;
  132. +       rm->rm_col[c].rc_error = 0;
  133. +       rm->rm_col[c].rc_tried = 0;
  134. +       rm->rm_col[c].rc_skipped = 0;
  135. +
  136. +       if (c >= acols)
  137. +           rm->rm_col[c].rc_size = 0;
  138. +       else if (c < bc)
  139. +           rm->rm_col[c].rc_size = (q + 1) << unit_shift;
  140. +       else
  141. +           rm->rm_col[c].rc_size = q << unit_shift;
  142. +
  143. +       asize += rm->rm_col[c].rc_size;
  144. +   }
  145. +
  146. +   rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
  147. +   rm->rm_nskip = roundup(tot, nparity + 1) - tot;
  148. +
  149. +   /*
  150. +    * If all data stored spans all columns, there's a danger that parity
  151. +    * will always be on the same device and, since parity isn't read
  152. +    * during normal operation, that that device's I/O bandwidth won't be
  153. +    * used effectively. We therefore switch the parity every 1MB.
  154. +    *
  155. +    * ... at least that was, ostensibly, the theory. As a practical
  156. +    * matter unless we juggle the parity between all devices evenly, we
  157. +    * won't see any benefit. Further, occasional writes that aren't a
  158. +    * multiple of the LCM of the number of children and the minimum
  159. +    * stripe width are sufficient to avoid pessimal behavior.
  160. +    * Unfortunately, this decision created an implicit on-disk format
  161. +    * requirement that we need to support for all eternity, but only
  162. +    * for single-parity RAID-Z.
  163. +    *
  164. +    * If we intend to skip a sector in the zeroth column for padding we
  165. +    * must make sure to note this swap. We will never intend to skip the
  166. +    * first column since at least one data and one parity column must
  167. +    * appear in each row. A sub-sector request has no columns to swap.
  168. +    */
  169. +   if (rm->rm_firstdatacol == 1 && scols > 1 && (offset & (1ULL << 20))) {
  170. +       devidx = rm->rm_col[0].rc_devidx;
  171. +       o = rm->rm_col[0].rc_offset;
  172. +       rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
  173. +       rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
  174. +       rm->rm_col[1].rc_devidx = devidx;
  175. +       rm->rm_col[1].rc_offset = o;
  176. +
  177. +       if (rm->rm_skipstart == 0)
  178. +           rm->rm_skipstart = 1;
  179. +   }
  180. +
  181. +   return (rm);
  182. +}
  183. +
  184. +
  185.  /*
  186.   * There are two acceptable formats:
  187.   * leaf_name     - For example: c1t0d0 or /tmp/ztest.0a
  188. @@ -3081,8 +3244,10 @@ name:
  189.  }
  190.  
  191.  /*
  192. - * Read a block from a pool and print it out.  The syntax of the
  193. - * block descriptor is:
  194. + * Read a block from a pool and print it out, or (if Zflag is true)
  195. + * print out where the block is found on the constituents of the vdev.
  196. + *
  197. + * The syntax of the block descriptor is:
  198.   *
  199.   * pool:vdev_specifier:offset:size[:flags]
  200.   *
  201. @@ -3103,7 +3268,7 @@ name:
  202.   *              * = not yet implemented
  203.   */
  204.  static void
  205. -zdb_read_block(char *thing, spa_t *spa)
  206. +zdb_read_block(char *thing, spa_t *spa, boolean_t Zflag)
  207.  {
  208.     blkptr_t blk, *bp = &blk;
  209.     dva_t *dva = bp->blk_dva;
  210. @@ -3183,6 +3348,22 @@ zdb_read_block(char *thing, spa_t *spa)
  211.     psize = size;
  212.     lsize = size;
  213.  
  214. +   if (Zflag) {
  215. +       raidz_map_t* rm;
  216. +       rm = vdev_raidz_map(psize, offset, vd->vdev_ashift,
  217. +                   vd->vdev_children, vd->vdev_nparity);
  218. +       (void) printf("columns %lu bigcols %lu asize %lu firstdatacol %lu\n",
  219. +                 rm->rm_cols, rm->rm_bigcols, rm->rm_asize,
  220. +                 rm->rm_firstdatacol);
  221. +       for (int c = 0; c < rm->rm_scols; ++c) {
  222. +           raidz_col_t* rc = &rm->rm_col[c];
  223. +           (void) printf("devidx %lu offset 0x%lx size 0x%lx\n",
  224. +                     rc->rc_devidx, rc->rc_offset, rc->rc_size);
  225. +       }
  226. +       umem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
  227. +       return;
  228. +   }
  229. +
  230.     pbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
  231.     lbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
  232.  
  233. @@ -3404,7 +3585,7 @@ main(int argc, char **argv)
  234.     dprintf_setup(&argc, argv);
  235.  
  236.     while ((c = getopt(argc, argv,
  237. -       "bcdhilmMI:suCDRSAFLXx:evp:t:U:P")) != -1) {
  238. +       "bcdhilmMI:suCDRSAFLXx:evp:t:U:PZ")) != -1) {
  239.         switch (c) {
  240.         case 'b':
  241.         case 'c':
  242. @@ -3420,6 +3601,7 @@ main(int argc, char **argv)
  243.         case 'M':
  244.         case 'R':
  245.         case 'S':
  246. +       case 'Z':
  247.             dump_opt[c]++;
  248.             dump_all = 0;
  249.             break;
  250. @@ -3490,6 +3672,9 @@ main(int argc, char **argv)
  251.     if (dump_all)
  252.         verbose = MAX(verbose, 1);
  253.  
  254. +   if (dump_opt['Z'])
  255. +       dump_opt['R'] = 1;
  256. +
  257.     for (c = 0; c < 256; c++) {
  258.         if (dump_all && !strchr("elAFLRSXP", c))
  259.             dump_opt[c] = 1;
  260. @@ -3618,7 +3803,7 @@ main(int argc, char **argv)
  261.         flagbits['r'] = ZDB_FLAG_RAW;
  262.  
  263.         for (i = 0; i < argc; i++)
  264. -           zdb_read_block(argv[i], spa);
  265. +           zdb_read_block(argv[i], spa, dump_opt['Z']);
  266.     }
  267.  
  268.     (os != NULL) ? dmu_objset_disown(os, FTAG) : spa_close(spa, FTAG);
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement