Advertisement
Guest User

btrfs device replace can cause silent or noisy corruption

a guest
May 17th, 2018
180
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 17.73 KB | None | 0 0
  1. root@archiso ~ # uname -r
  2. 4.16.5-1-ARCH
  3. root@archiso /mnt # btrfs --version
  4. btrfs-progs v4.16
  5.  
  6. # Setup a 3 device RAID1. This example is all on the same disk, but it makes no difference.
  7.  
  8. root@archiso ~ # mkfs.btrfs --data raid1 --metadata raid1 /dev/vda{1,2,3} -f
  9. btrfs-progs v4.16
  10. See http://btrfs.wiki.kernel.org for more information.
  11.  
  12. Label: (null)
  13. UUID: b765aaea-c99c-4950-a347-354b28e6cc50
  14. Node size: 16384
  15. Sector size: 4096
  16. Filesystem size: 30.00GiB
  17. Block group profiles:
  18. Data: RAID1 1.00GiB
  19. Metadata: RAID1 1.00GiB
  20. System: RAID1 8.00MiB
  21. SSD detected: no
  22. Incompat features: extref, skinny-metadata
  23. Number of devices: 3
  24. Devices:
  25. ID SIZE PATH
  26. 1 10.00GiB /dev/vda1
  27. 2 10.00GiB /dev/vda2
  28. 3 10.00GiB /dev/vda3
  29.  
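  30. # Mount using default compression (originally described here as zstd, but `-o compress` with no type selects zlib, and the 78 5e bytes at the start of the on-disk dumps below are a zlib header). Lzo has the same behavior, and I'm assuming the other algorithms do too.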
  31.  
  32. root@archiso ~ # mount -o compress /dev/vda1 /mnt
  33. root@archiso ~ # cd /mnt
  34.  
  35. # Create a 1MB file filled with zeros, then force it to be compressed, despite being NOCOW and NODATASUM, by submitting it for defragmentation with compression.
  36.  
  37. root@archiso /mnt # touch zero
  38. root@archiso /mnt # chattr +C zero
  39. root@archiso /mnt # dd if=/dev/zero of=zero bs=1M count=1
  40. 1+0 records in
  41. 1+0 records out
  42. 1048576 bytes (1.0 MB, 1.0 MiB) copied, 0.00568061 s, 185 MB/s
  43. root@archiso /mnt # sync
  44. root@archiso /mnt # filefrag -v zero
  45. Filesystem type is: 9123683e
  46. File size of zero is 1048576 (256 blocks of 4096 bytes)
  47. ext: logical_offset: physical_offset: length: expected: flags:
  48. 0: 0.. 255: 269568.. 269823: 256: last,eof
  49. zero: 1 extent found
  50. root@archiso /mnt # btrfs fi def -c zero
  51.  
  52. # Confirm the file is now stored as compressed. Note filefrag correctly shows ending logical offsets, but only understands a single length field, which is the uncompressed length, so it shows a much later ending physical offset than is actually used. It just doesn't understand btrfs compression. Note the starting physical offsets are consecutive, and the 1MB of zeros has been compressed into 8 extents of 4k each (32k total), proving the file was forced to be compressed.
  53.  
  54. root@archiso /mnt # filefrag -v zero
  55. Filesystem type is: 9123683e
  56. File size of zero is 1048576 (256 blocks of 4096 bytes)
  57. ext: logical_offset: physical_offset: length: expected: flags:
  58. 0: 0.. 31: 269952.. 269983: 32: encoded
  59. 1: 32.. 63: 269953.. 269984: 32: 269984: encoded
  60. 2: 64.. 95: 269954.. 269985: 32: 269985: encoded
  61. 3: 96.. 127: 269955.. 269986: 32: 269986: encoded
  62. 4: 128.. 159: 269956.. 269987: 32: 269987: encoded
  63. 5: 160.. 191: 269957.. 269988: 32: 269988: encoded
  64. 6: 192.. 223: 269958.. 269989: 32: 269989: encoded
  65. 7: 224.. 255: 269959.. 269990: 32: 269990: last,encoded,eof
  66. zero: 8 extents found
  67.  
  68. # Verify RAID is fine, and note the file is mirrored on devices 1 & 2, by the data bytes.
  69.  
  70. root@archiso /mnt # btrfs scrub start -BdR /mnt
  71. scrub device /dev/vda1 (id 1) done
  72. scrub started at Thu May 17 08:53:54 2018 and finished after 00:00:00
  73. data_extents_scrubbed: 24
  74. tree_extents_scrubbed: 7
  75. data_bytes_scrubbed: 1572864
  76. tree_bytes_scrubbed: 114688
  77. read_errors: 0
  78. csum_errors: 0
  79. verify_errors: 0
  80. no_csum: 384
  81. csum_discards: 0
  82. super_errors: 0
  83. malloc_errors: 0
  84. uncorrectable_errors: 0
  85. unverified_errors: 0
  86. corrected_errors: 0
  87. last_physical: 2169503744
  88. scrub device /dev/vda2 (id 2) done
  89. scrub started at Thu May 17 08:53:54 2018 and finished after 00:00:00
  90. data_extents_scrubbed: 24
  91. tree_extents_scrubbed: 1
  92. data_bytes_scrubbed: 1572864
  93. tree_bytes_scrubbed: 16384
  94. read_errors: 0
  95. csum_errors: 0
  96. verify_errors: 0
  97. no_csum: 384
  98. csum_discards: 0
  99. super_errors: 0
  100. malloc_errors: 0
  101. uncorrectable_errors: 0
  102. unverified_errors: 0
  103. corrected_errors: 0
  104. last_physical: 1083179008
  105. scrub device /dev/vda3 (id 3) done
  106. scrub started at Thu May 17 08:53:54 2018 and finished after 00:00:00
  107. data_extents_scrubbed: 0
  108. tree_extents_scrubbed: 8
  109. data_bytes_scrubbed: 0
  110. tree_bytes_scrubbed: 131072
  111. read_errors: 0
  112. csum_errors: 0
  113. verify_errors: 0
  114. no_csum: 0
  115. csum_discards: 0
  116. super_errors: 0
  117. malloc_errors: 0
  118. uncorrectable_errors: 0
  119. unverified_errors: 0
  120. corrected_errors: 0
  121. last_physical: 1083179008
  122.  
  123. # Calculate the btrfs logical byte address of the first file extent, which filefrag reports as starting at physical offset 269952, in units of 4k blocks.
  124.  
  125. root@archiso /mnt # echo $[269952*4096]
  126. 1105723392
  127.  
  128. # Read mirrored copies of the disk data (so, in compressed form) for the first file extent. Note btrfs-map-logical takes a device in the volume (not a mountpoint) as its last argument, so it can be run offline. That device argument does NOT select which mirrored copy is written to the output file; "-c" does that.
  129.  
  130. root@archiso /mnt # btrfs-map-logical -l 1105723392 -b 4096 -o /root/1105723392.begin.copy1 -c 1 /dev/vda1
  131. mirror 1 logical 1105723392 physical 11010048 device /dev/vda2
  132. mirror 2 logical 1105723392 physical 1097334784 device /dev/vda1
  133. root@archiso /mnt # btrfs-map-logical -l 1105723392 -b 4096 -o /root/1105723392.begin.copy2 -c 2 /dev/vda1
  134. mirror 1 logical 1105723392 physical 11010048 device /dev/vda2
  135. mirror 2 logical 1105723392 physical 1097334784 device /dev/vda1
  136.  
  137. # As expected, the mirrored copies are the same. Take a brief look at the compressed form.
  138.  
  139. root@archiso /mnt # diff --brief /root/1105723392.begin.copy*
  140. root@archiso /mnt # xxd /root/1105723392.begin.copy1 | head -n 5
  141. 00000000: 785e ecd0 010d 0000 00c2 a0f7 4f6d 0f07 x^..........Om..
  142. 00000010: 1128 0c18 3060 c080 0103 060c 1830 60c0 .(..0`.......0`.
  143. 00000020: 8081 f781 0100 00ff ffec d081 0c00 0000 ................
  144. 00000030: c020 7feb 7b7c 2114 9f01 0306 0c18 3060 . ..{|!.......0`
  145. 00000040: c080 0103 060c 1830 f01f 0800 00ff ffec .......0........
  146. root@archiso /mnt # xxd /root/1105723392.begin.copy2 | head -n 5
  147. 00000000: 785e ecd0 010d 0000 00c2 a0f7 4f6d 0f07 x^..........Om..
  148. 00000010: 1128 0c18 3060 c080 0103 060c 1830 60c0 .(..0`.......0`.
  149. 00000020: 8081 f781 0100 00ff ffec d081 0c00 0000 ................
  150. 00000030: c020 7feb 7b7c 2114 9f01 0306 0c18 3060 . ..{|!.......0`
  151. 00000040: c080 0103 060c 1830 f01f 0800 00ff ffec .......0........
  152.  
  153. # Simulate device 1 failure and replacement. Shouldn't have data loss as long as we get the replacement in without another drive failing.
  154.  
  155. root@archiso /mnt # cd
  156. root@archiso ~ # umount /mnt
  157. root@archiso ~ # dd if=/dev/zero of=/dev/vda1 bs=1M
  158. dd: error writing '/dev/vda1': No space left on device
  159. 10241+0 records in
  160. 10240+0 records out
  161. 10737418240 bytes (11 GB, 10 GiB) copied, 22.4585 s, 478 MB/s
  162. dd if=/dev/zero of=/dev/vda1 bs=1M 0.01s user 6.50s system 28% cpu 22.460 total
  163. 1 root@archiso ~ # sync :(
  164.  
  165. # Make sure check passes. Space cache warnings are documented as safe to ignore, and btrfs easily sidesteps the problem and fixes them. They usually indicate that something weird has happened, and here we had a complete drive failure.
  166.  
  167. root@archiso ~ # btrfs check /dev/vda2
  168. Checking filesystem on /dev/vda2
  169. UUID: b765aaea-c99c-4950-a347-354b28e6cc50
  170. checking extents
  171. checking free space cache
  172. failed to load free space cache for block group 30408704
  173. failed to load free space cache for block group 1104150528
  174. checking fs roots
  175. checking csums
  176. checking root refs
  177. found 688128 bytes used, no error found
  178. total csum bytes: 0
  179. total tree bytes: 131072
  180. total fs tree bytes: 32768
  181. total extent tree bytes: 16384
  182. btree space waste bytes: 121695
  183. file data blocks allocated: 557056
  184. referenced 1572864
  185.  
  186. # Mount the volume degraded. Mounting with or without compression makes no difference to how replace will misbehave.
  187.  
  188. root@archiso ~ # mount -o compress,degraded /dev/vda2 /mnt
  189.  
  190. # Make sure scrub passes. (Not that there are any files with checksums.) Note all data bytes are on device 2, since device 1 is missing.
  191.  
  192. root@archiso ~ # btrfs scrub start -BdR /mnt
  193. WARNING: device 1 not present
  194. scrub device /dev/vda1 (id 1) canceled
  195. scrub started at Thu May 17 08:56:59 2018 and was aborted after 00:00:00
  196. data_extents_scrubbed: 0
  197. tree_extents_scrubbed: 0
  198. data_bytes_scrubbed: 0
  199. tree_bytes_scrubbed: 0
  200. read_errors: 0
  201. csum_errors: 0
  202. verify_errors: 0
  203. no_csum: 0
  204. csum_discards: 0
  205. super_errors: 0
  206. malloc_errors: 0
  207. uncorrectable_errors: 0
  208. unverified_errors: 0
  209. corrected_errors: 0
  210. last_physical: 0
  211. scrub device /dev/vda2 (id 2) done
  212. scrub started at Thu May 17 08:56:59 2018 and finished after 00:00:00
  213. data_extents_scrubbed: 16
  214. tree_extents_scrubbed: 1
  215. data_bytes_scrubbed: 557056
  216. tree_bytes_scrubbed: 16384
  217. read_errors: 0
  218. csum_errors: 0
  219. verify_errors: 0
  220. no_csum: 136
  221. csum_discards: 0
  222. super_errors: 0
  223. malloc_errors: 0
  224. uncorrectable_errors: 0
  225. unverified_errors: 0
  226. corrected_errors: 0
  227. last_physical: 1385168896
  228. scrub device /dev/vda3 (id 3) done
  229. scrub started at Thu May 17 08:56:59 2018 and finished after 00:00:00
  230. data_extents_scrubbed: 0
  231. tree_extents_scrubbed: 8
  232. data_bytes_scrubbed: 0
  233. tree_bytes_scrubbed: 131072
  234. read_errors: 0
  235. csum_errors: 0
  236. verify_errors: 0
  237. no_csum: 0
  238. csum_discards: 0
  239. super_errors: 0
  240. malloc_errors: 0
  241. uncorrectable_errors: 0
  242. unverified_errors: 0
  243. corrected_errors: 0
  244. last_physical: 1385168896
  245.  
  246. # Check our file's integrity is intact in uncompressed form.
  247.  
  248. root@archiso ~ # xxd /mnt/zero | head -n 5
  249. 00000000: 0000 0000 0000 0000 0000 0000 0000 0000 ................
  250. 00000010: 0000 0000 0000 0000 0000 0000 0000 0000 ................
  251. 00000020: 0000 0000 0000 0000 0000 0000 0000 0000 ................
  252. 00000030: 0000 0000 0000 0000 0000 0000 0000 0000 ................
  253. 00000040: 0000 0000 0000 0000 0000 0000 0000 0000 ................
  254.  
  255. # Check our file's integrity is intact in its on-disk (compressed) form. Note that since device 1 is missing, the only readable copy is mirror 1 on device 2; the device for mirror 2 is now reported as (null).
  256.  
  257. root@archiso ~ # btrfs-map-logical -l 1105723392 -b 4096 -o /root/1105723392.degraded.copy1 -c 1 /dev/vda2
  258. mirror 1 logical 1105723392 physical 11010048 device /dev/vda2
  259. mirror 2 logical 1105723392 physical 1097334784 device (null)
  260. root@archiso ~ # xxd /root/1105723392.degraded.copy1| head -n 5
  261. 00000000: 785e ecd0 010d 0000 00c2 a0f7 4f6d 0f07 x^..........Om..
  262. 00000010: 1128 0c18 3060 c080 0103 060c 1830 60c0 .(..0`.......0`.
  263. 00000020: 8081 f781 0100 00ff ffec d081 0c00 0000 ................
  264. 00000030: c020 7feb 7b7c 2114 9f01 0306 0c18 3060 . ..{|!.......0`
  265. 00000040: c080 0103 060c 1830 f01f 0800 00ff ffec .......0........
  266. root@archiso ~ # diff --brief /root/1105723392.{begin,degraded}.copy1
  267.  
  268. # Check our file's integrity, in that it can still be read and uncompressed.
  269.  
  270. root@archiso ~ # cat /mnt/zero > /dev/null
  271.  
  272. # Replace of (missing) device 1 with our new device 1.
  273.  
  274. root@archiso ~ # btrfs replace start -B 1 /dev/vda1 /mnt
  275. root@archiso ~ # btrfs replace status /mnt
  276. Started on 17.May 08:58:14, finished on 17.May 08:58:14, 0 write errs, 0 uncorr. read errs
  277.  
  278. # Unmount the volume and make sure btrfs check passes
  279.  
  280. root@archiso ~ # umount /mnt
  281. root@archiso ~ # btrfs check /dev/vda1
  282. Checking filesystem on /dev/vda1
  283. UUID: b765aaea-c99c-4950-a347-354b28e6cc50
  284. checking extents
  285. checking free space cache
  286. checking fs roots
  287. checking csums
  288. checking root refs
  289. found 753664 bytes used, no error found
  290. total csum bytes: 0
  291. total tree bytes: 131072
  292. total fs tree bytes: 32768
  293. total extent tree bytes: 16384
  294. btree space waste bytes: 121191
  295. file data blocks allocated: 622592
  296. referenced 1638400
  297.  
  298. # Mount the volume.
  299.  
  300. root@archiso ~ # mount -o compress /dev/vda1 /mnt
  301.  
  302. # Make sure scrub passes. (Still no files with checksums.) Note data bytes are now on devices 1 and 2. Also note no_csum used to be 136 and is now 152, there's an extra data_extent, 2 extra tree_extents, extra data and tree bytes, and last physical is later than before our drive failure. This surprises me, but I have no idea if it's important or somehow expected.
  303.  
  304. root@archiso ~ # btrfs scrub start -BdR /mnt
  305. scrub device /dev/vda1 (id 1) done
  306. scrub started at Thu May 17 09:00:29 2018 and finished after 00:00:00
  307. data_extents_scrubbed: 17
  308. tree_extents_scrubbed: 7
  309. data_bytes_scrubbed: 622592
  310. tree_bytes_scrubbed: 114688
  311. read_errors: 0
  312. csum_errors: 0
  313. verify_errors: 0
  314. no_csum: 152
  315. csum_discards: 0
  316. super_errors: 0
  317. malloc_errors: 0
  318. uncorrectable_errors: 0
  319. unverified_errors: 0
  320. corrected_errors: 0
  321. last_physical: 2169503744
  322. scrub device /dev/vda2 (id 2) done
  323. scrub started at Thu May 17 09:00:29 2018 and finished after 00:00:00
  324. data_extents_scrubbed: 17
  325. tree_extents_scrubbed: 1
  326. data_bytes_scrubbed: 622592
  327. tree_bytes_scrubbed: 16384
  328. read_errors: 0
  329. csum_errors: 0
  330. verify_errors: 0
  331. no_csum: 152
  332. csum_discards: 0
  333. super_errors: 0
  334. malloc_errors: 0
  335. uncorrectable_errors: 0
  336. unverified_errors: 0
  337. corrected_errors: 0
  338. last_physical: 1418723328
  339. scrub device /dev/vda3 (id 3) done
  340. scrub started at Thu May 17 09:00:29 2018 and finished after 00:00:00
  341. data_extents_scrubbed: 0
  342. tree_extents_scrubbed: 8
  343. data_bytes_scrubbed: 0
  344. tree_bytes_scrubbed: 131072
  345. read_errors: 0
  346. csum_errors: 0
  347. verify_errors: 0
  348. no_csum: 0
  349. csum_discards: 0
  350. super_errors: 0
  351. malloc_errors: 0
  352. uncorrectable_errors: 0
  353. unverified_errors: 0
  354. corrected_errors: 0
  355. last_physical: 1418723328
  356.  
  357. # Go straight to checking our file's integrity by checking its on disk (compressed) form. Note copy 1 is now on device 2, and copy 2 is now on device 1.
  358. # Oh, no! These shouldn't differ! We have mirrored copies that are different, and the data that is actually read off the disk will depend on which device btrfs goes to.
  359.  
  360. root@archiso ~ # btrfs-map-logical -l 1105723392 -b 4096 -o /root/1105723392.replaced.copy1 -c 1 /dev/vda1
  361. mirror 1 logical 1105723392 physical 11010048 device /dev/vda2
  362. mirror 2 logical 1105723392 physical 1097334784 device /dev/vda1
  363. root@archiso ~ # btrfs-map-logical -l 1105723392 -b 4096 -o /root/1105723392.replaced.copy2 -c 2 /dev/vda1
  364. mirror 1 logical 1105723392 physical 11010048 device /dev/vda2
  365. mirror 2 logical 1105723392 physical 1097334784 device /dev/vda1
  366. root@archiso ~ # diff --brief /root/1105723392.replaced.copy*
  367. Files /root/1105723392.replaced.copy1 and /root/1105723392.replaced.copy2 differ
  368.  
  369. # Copy 1, which is the one on device 2, is correct. But copy 2 on device 1 (the drive we just added as a replacement) contains uncompressed data. If the file is filled with all 1 bits at the beginning instead of 0's, copy 2 shows all 'FF' bytes here.
  370.  
  371. 1 root@archiso ~ # xxd /root/1105723392.replaced.copy1 | head -n 5 :(
  372. 00000000: 785e ecd0 010d 0000 00c2 a0f7 4f6d 0f07 x^..........Om..
  373. 00000010: 1128 0c18 3060 c080 0103 060c 1830 60c0 .(..0`.......0`.
  374. 00000020: 8081 f781 0100 00ff ffec d081 0c00 0000 ................
  375. 00000030: c020 7feb 7b7c 2114 9f01 0306 0c18 3060 . ..{|!.......0`
  376. 00000040: c080 0103 060c 1830 f01f 0800 00ff ffec .......0........
  377. root@archiso ~ # xxd /root/1105723392.replaced.copy2 | head -n 5
  378. 00000000: 0000 0000 0000 0000 0000 0000 0000 0000 ................
  379. 00000010: 0000 0000 0000 0000 0000 0000 0000 0000 ................
  380. 00000020: 0000 0000 0000 0000 0000 0000 0000 0000 ................
  381. 00000030: 0000 0000 0000 0000 0000 0000 0000 0000 ................
  382. 00000040: 0000 0000 0000 0000 0000 0000 0000 0000 ................
  383.  
  384. # Reading and uncompressing the file has undefined behavior. If btrfs goes to the disk with compressed data, mirror 1 (device 2), as it did in this example, the user has hidden file corruption.
  385.  
  386. root@archiso ~ # cat /mnt/zero
  387. root@archiso ~ # umount /mnt
  388.  
  389. # To force it to go to the disk with uncompressed data, let's simulate a failure on device 2, so the only mirror it has left is on device 1.
  390.  
  391. root@archiso ~ # dd if=/dev/zero of=/dev/vda2 bs=1M
  392. dd: error writing '/dev/vda2': No space left on device
  393. 10241+0 records in
  394. 10240+0 records out
  395. 10737418240 bytes (11 GB, 10 GiB) copied, 21.3278 s, 503 MB/s
  396. 1 root@archiso ~ # mount -o compress,degraded /dev/vda1 /mnt :(
  397.  
  398. # What happens here depends on the already uncompressed data, and how the decompression algorithm handles it. All zeros, as in this example, seem to always give an Input/output error. Other data has been seen to cause random kernel memory corruption and likely bring the system down with it, without a helpful oops or stack trace pointing to the culprit.
  399.  
  400. root@archiso ~ # cat /mnt/zero
  401. cat: /mnt/zero: Input/output error
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement