sync: don't block the flusher thread waiting on IO

On Tue, Jul 02, 2013 at 10:19:37AM +0200, Jan Kara wrote:
> On Tue 02-07-13 16:29:54, Dave Chinner wrote:
> > > > We could, but we just end up in the same place with sync as we are
> > > > now - with a long list of clean inodes with a few inodes hidden in
> > > > it that are under IO. i.e. we still have to walk lots of clean
> > > > inodes to find the dirty ones that we need to wait on....
> > > If the syncs are rare then yes. If they are relatively frequent, you
> > > would win because the first sync will clean up the list and subsequent ones
> > > will be fast.
> >
> > I haven't done this yet, because I've found an interesting
> > performance problem with our sync implementation. Basically, sync(2)
> > on a filesystem that is being constantly dirtied blocks the flusher
> > thread waiting for IO completion like so:
> >
> > # echo w > /proc/sysrq-trigger
> > [ 1968.031001] SysRq : Show Blocked State
> > [ 1968.032748] task PC stack pid father
> > [ 1968.034534] kworker/u19:2 D ffff8800bed13140 3448 4830 2 0x00000000
> > [ 1968.034534] Workqueue: writeback bdi_writeback_workfn (flush-253:32)
> > [ 1968.034534] ffff8800bdca3998 0000000000000046 ffff8800bd1cae20 ffff8800bdca3fd8
> > [ 1968.034534] ffff8800bdca3fd8 ffff8800bdca3fd8 ffff88003ea10000 ffff8800bd1cae20
> > [ 1968.034534] ffff8800bdca3968 ffff8800bd1cae20 ffff8800bed139a0 0000000000000002
> > [ 1968.034534] Call Trace:
> > [ 1968.034534] [<ffffffff81bff7c9>] schedule+0x29/0x70
> > [ 1968.034534] [<ffffffff81bff89f>] io_schedule+0x8f/0xd0
> > [ 1968.034534] [<ffffffff8113263e>] sleep_on_page+0xe/0x20
> > [ 1968.034534] [<ffffffff81bfd030>] __wait_on_bit+0x60/0x90
> > [ 1968.034534] [<ffffffff81132770>] wait_on_page_bit+0x80/0x90
> > [ 1968.034534] [<ffffffff81132881>] filemap_fdatawait_range+0x101/0x190
> > [ 1968.034534] [<ffffffff81132937>] filemap_fdatawait+0x27/0x30
> > [ 1968.034534] [<ffffffff811a7f18>] __writeback_single_inode+0x1b8/0x220
> > [ 1968.034534] [<ffffffff811a88ab>] writeback_sb_inodes+0x27b/0x410
> > [ 1968.034534] [<ffffffff811a8c00>] wb_writeback+0xf0/0x2c0
> > [ 1968.034534] [<ffffffff811aa668>] wb_do_writeback+0xb8/0x210
> > [ 1968.034534] [<ffffffff811aa832>] bdi_writeback_workfn+0x72/0x160
> > [ 1968.034534] [<ffffffff8109e487>] process_one_work+0x177/0x400
> > [ 1968.034534] [<ffffffff8109eb82>] worker_thread+0x122/0x380
> > [ 1968.034534] [<ffffffff810a5508>] kthread+0xd8/0xe0
> > [ 1968.034534] [<ffffffff81c091ec>] ret_from_fork+0x7c/0xb0
> >
> > i.e. this code:
> >
> > static int
> > __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
> > {
> >         struct address_space *mapping = inode->i_mapping;
> >         long nr_to_write = wbc->nr_to_write;
> >         unsigned dirty;
> >         int ret;
> >
> >         WARN_ON(!(inode->i_state & I_SYNC));
> >
> >         trace_writeback_single_inode_start(inode, wbc, nr_to_write);
> >
> >         ret = do_writepages(mapping, wbc);
> >
> >         /*
> >          * Make sure to wait on the data before writing out the metadata.
> >          * This is important for filesystems that modify metadata on data
> >          * I/O completion.
> >          */
> >         if (wbc->sync_mode == WB_SYNC_ALL) {
> >                 int err = filemap_fdatawait(mapping);
> >                 if (ret == 0)
> >                         ret = err;
> >         }
> >         ....
> >
> > It is completely serialising IO dispatch during sync. We are not
> > batching IO submission at all - we are dispatching it one inode at a
> > time and then waiting for it to complete. Guess where in the
> > benchmark run I ran sync:
> >
> > FSUse%        Count         Size    Files/sec     App Overhead
> > .....
> >      0       640000         4096      35154.6          1026984
> >      0       720000         4096      36740.3          1023844
> >      0       800000         4096      36184.6           916599
> >      0       880000         4096       1282.7          1054367
> >      0       960000         4096       3951.3           918773
> >      0      1040000         4096      40646.2           996448
> >      0      1120000         4096      43610.1           895647
> >      0      1200000         4096      40333.1           921048
> >
> > sync absolutely *murders* asynchronous IO performance right now
> > because it stops background writeback completely and stalls all new
> > writes in balance_dirty_pages like:
> >
> > [ 1968.034534] fs_mark D ffff88007ed13140 3680 9219 7127 0x00000000
> > [ 1968.034534] ffff88005a279a38 0000000000000046 ffff880040318000 ffff88005a279fd8
> > [ 1968.034534] ffff88005a279fd8 ffff88005a279fd8 ffff88003e9fdc40 ffff880040318000
> > [ 1968.034534] ffff88005a279a28 ffff88005a279a70 ffff88007e9e0000 0000000100065d20
> > [ 1968.034534] Call Trace:
> > [ 1968.034534] [<ffffffff81bff7c9>] schedule+0x29/0x70
> > [ 1968.034534] [<ffffffff81bfcd3b>] schedule_timeout+0x10b/0x1f0
> > [ 1968.034534] [<ffffffff81bfe492>] io_schedule_timeout+0xa2/0x100
> > [ 1968.034534] [<ffffffff8113d6fb>] balance_dirty_pages_ratelimited+0x37b/0x7a0
> > [ 1968.034534] [<ffffffff811322e8>] generic_file_buffered_write+0x1b8/0x280
> > [ 1968.034534] [<ffffffff8144e649>] xfs_file_buffered_aio_write+0x109/0x1a0
> > [ 1968.034534] [<ffffffff8144e7ae>] xfs_file_aio_write+0xce/0x140
> > [ 1968.034534] [<ffffffff8117f4b0>] do_sync_write+0x80/0xb0
> > [ 1968.034534] [<ffffffff811801c1>] vfs_write+0xc1/0x1c0
> > [ 1968.034534] [<ffffffff81180642>] SyS_write+0x52/0xa0
> > [ 1968.034534] [<ffffffff81c09299>] system_call_fastpath+0x16/0x1b
> >
> > IOWs, blocking the flusher thread for IO completion on WB_SYNC_ALL
> > writeback is very harmful. Given that we rely on ->sync_fs to
> > guarantee all inode metadata is written back - the async pass up
> > front doesn't do any waiting so any inode metadata updates done
> > after IO completion have to be caught by ->sync_fs - why are we
> > doing IO completion waiting here for sync(2) writeback?
> So I did a bit of digging in history and the wait in
> __writeback_single_inode() (at that time it was just
> writeback_single_inode()) was introduced by Christoph in commit
> 26821ed40. It is there for calls like sync_inode() or write_inode_now()
> where it really is necessary.
>
> You are right that for syncing the whole filesystem like sync(2) does, the
> wait in __writeback_single_inode() isn't necessary. After all, all the
> inode data might have been written back asynchronously, so we may never
> even see the inode in __writeback_single_inode() with sync_mode ==
> WB_SYNC_ALL, and each filesystem has to make sure inode metadata is
> correctly on disk anyway. So removing that wait for global sync isn't
> going to introduce any new problem.
>
> Now how to implement that practically - __writeback_single_inode() is
> called from two places. From writeback_single_inode() - that is the
> function for writing only a single inode, and from writeback_sb_inodes()
> - that is used by flusher threads. I'd be inclined to move the
> do_writepages() call from __writeback_single_inode() to the callsites and
> move the wait to writeback_single_inode() only. But that would mean also
> moving the tracepoints and it starts to get a bit ugly. Maybe we could
> instead create a new flag in struct writeback_control to indicate that we
> are doing global sync - a for_sync flag? Then we could use that flag in
> __writeback_single_inode() to decide whether to wait or not.

Snap!

I wrote a patch a few hours ago that does exactly this, right down
to the "for_sync" variable name. :)
See below.

> As a bonus, filesystems could also optimize their write_inode() methods
> when they know ->sync_fs() is going to happen in the future. E.g. ext4
> wouldn't have to do the stupid ext4_force_commit() after each written
> inode in WB_SYNC_ALL mode.

Yeah, that's true.
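
Something like this in ext4's ->write_inode would get most of that - a
rough, untested sketch only, following the shape of today's
ext4_write_inode(), where the wbc->for_sync test is the only new bit and
the nojournal branch is elided:

int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
{
        int err = 0;

        if (current->flags & PF_MEMALLOC)
                return 0;

        if (EXT4_SB(inode->i_sb)->s_journal) {
                /*
                 * When sync(2) writeback is in progress, the subsequent
                 * ->sync_fs() call commits the journal anyway, so the
                 * per-inode forced commit can be skipped entirely.
                 */
                if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync)
                        return 0;

                err = ext4_force_commit(inode->i_sb);
        } else {
                /*
                 * nojournal case unchanged: read the inode buffer and
                 * write it back synchronously for WB_SYNC_ALL (elided).
                 */
        }
        return err;
}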

Since XFS now catches all metadata modifications in its journal, it
doesn't have a ->write_inode method anymore. Only ->fsync,
->sync_fs and ->commit_metadata are defined as data integrity
operations that require metadata to be synchronised, and we ensure the
journal is committed in those methods. WB_SYNC_ALL writeback is
really only a method for getting data dispatched to disk, so I
suspect that we should ensure that waiting for data IO completion
happens at higher levels, not hidden deep in the guts of writing
back inode metadata.
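
For reference, the data integrity side of XFS's ->sync_fs is essentially
just a synchronous log force. Roughly (a simplified sketch of
xfs_fs_sync_fs(); error handling and the laptop_mode work are trimmed):

STATIC int
xfs_fs_sync_fs(
        struct super_block      *sb,
        int                     wait)
{
        struct xfs_mount        *mp = XFS_M(sb);

        /*
         * Doing anything during the async pass would be counterproductive,
         * so only act on the second, waiting pass of sync(2).
         */
        if (!wait)
                return 0;

        /* push all journalled metadata changes to disk and wait for them */
        xfs_log_force(mp, XFS_LOG_SYNC);

        return 0;
}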

Cheers,

Dave.
--
Dave Chinner
david@fromorbit.com

sync: don't block the flusher thread waiting on IO

From: Dave Chinner <dchinner@redhat.com>

When sync does its WB_SYNC_ALL writeback, it issues data IO and
then immediately waits for IO completion. This is done in the
context of the flusher thread, and hence completely ties up the
flusher thread for the backing device until all the dirty inodes
have been synced. On filesystems that are dirtying inodes constantly
and quickly, this means the flusher thread can be tied up for
minutes per sync call and hence badly affect system level write IO
performance as the page cache cannot be cleaned quickly.

We already have a wait loop for IO completion for sync(2), so cut
this out of the flusher thread and delegate it to wait_sb_inodes().
Hence we can do rapid IO submission, and then wait for it all to
complete.
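
For context, that existing wait loop is wait_sb_inodes(), which
sync_inodes_sb() already runs after the flusher-thread pass; it walks the
superblock's inode list and does the filemap_fdatawait() for each inode
there. A simplified sketch of that loop, with locking and refcounting
details abbreviated:

static void wait_sb_inodes(struct super_block *sb)
{
        struct inode *inode, *old_inode = NULL;

        /*
         * Data integrity sync: wait for writeback to complete on every
         * inode of this superblock that has pages in its mapping.
         */
        spin_lock(&inode_sb_list_lock);
        list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
                struct address_space *mapping = inode->i_mapping;

                spin_lock(&inode->i_lock);
                if ((inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW)) ||
                    mapping->nrpages == 0) {
                        spin_unlock(&inode->i_lock);
                        continue;
                }
                __iget(inode);
                spin_unlock(&inode->i_lock);
                spin_unlock(&inode_sb_list_lock);

                /* drop the previous inode's reference outside the lock */
                iput(old_inode);
                old_inode = inode;

                /* this is where the IO completion wait now happens */
                filemap_fdatawait(mapping);

                cond_resched();

                spin_lock(&inode_sb_list_lock);
        }
        spin_unlock(&inode_sb_list_lock);
        iput(old_inode);
}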

Effect of sync on fsmark before the patch:

FSUse%        Count         Size    Files/sec     App Overhead
.....
     0       640000         4096      35154.6          1026984
     0       720000         4096      36740.3          1023844
     0       800000         4096      36184.6           916599
     0       880000         4096       1282.7          1054367
     0       960000         4096       3951.3           918773
     0      1040000         4096      40646.2           996448
     0      1120000         4096      43610.1           895647
     0      1200000         4096      40333.1           921048

And a single sync pass took:

  real    0m52.407s
  user    0m0.000s
  sys     0m0.090s

After the patch, there is no impact on fsmark results, and each
individual sync(2) operation run concurrently with the same fsmark
workload takes roughly 7s:

  real    0m6.930s
  user    0m0.000s
  sys     0m0.039s

IOWs, sync is 7-8x faster on a busy filesystem and does not have an
adverse impact on ongoing async data write operations.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/fs-writeback.c         |    9 +++++++--
 include/linux/writeback.h |    1 +
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 25a766c..ea56583 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -45,6 +45,7 @@ struct wb_writeback_work {
         unsigned int for_kupdate:1;
         unsigned int range_cyclic:1;
         unsigned int for_background:1;
+        unsigned int for_sync:1;        /* sync(2) WB_SYNC_ALL writeback */
         enum wb_reason reason;          /* why was writeback initiated? */

         struct list_head list;          /* pending work list */
@@ -476,9 +477,11 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
         /*
          * Make sure to wait on the data before writing out the metadata.
          * This is important for filesystems that modify metadata on data
-         * I/O completion.
+         * I/O completion. We don't do it for sync(2) writeback because it has a
+         * separate, external IO completion path and ->sync_fs for guaranteeing
+         * inode metadata is written back correctly.
          */
-        if (wbc->sync_mode == WB_SYNC_ALL) {
+        if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) {
                 int err = filemap_fdatawait(mapping);
                 if (ret == 0)
                         ret = err;
@@ -611,6 +614,7 @@ static long writeback_sb_inodes(struct super_block *sb,
                 .tagged_writepages      = work->tagged_writepages,
                 .for_kupdate            = work->for_kupdate,
                 .for_background         = work->for_background,
+                .for_sync               = work->for_sync,
                 .range_cyclic           = work->range_cyclic,
                 .range_start            = 0,
                 .range_end              = LLONG_MAX,
@@ -1442,6 +1446,7 @@ void sync_inodes_sb(struct super_block *sb)
                 .range_cyclic   = 0,
                 .done           = &done,
                 .reason         = WB_REASON_SYNC,
+                .for_sync       = 1,
         };

         /* Nothing to do? */
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 579a500..abfe117 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -78,6 +78,7 @@ struct writeback_control {
         unsigned tagged_writepages:1;   /* tag-and-write to avoid livelock */
         unsigned for_reclaim:1;         /* Invoked from the page allocator */
         unsigned range_cyclic:1;        /* range_start is cyclic */
+        unsigned for_sync:1;            /* sync(2) WB_SYNC_ALL writeback */
 };

 /*
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/