From 42d8be1ecd81ef608f797fbc67b071e0382c0e34 Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Thu, 25 Jun 2026 12:46:16 -0400 Subject: [PATCH 1/7] writeback: Avoid contention on wb->list_lock when switching inodes jira jira SECO-535 bugfix: writeback softlockups commit-author Jan Kara commit e1b849cfa6b61f1c866a908c9e8dd9b5aaab820b upstream-diff | Due to the change in bdi_writeback it propagates a kabi breakage through every pointer version of this and backing_dev_info we have to use RH_KABI_EXTEND() on bdi_writeback to prevent the CRC miscalculation. There can be multiple inode switch works that are trying to switch inodes to / from the same wb. This can happen in particular if some cgroup exits which owns many (thousands) inodes and we need to switch them all. In this case several inode_switch_wbs_work_fn() instances will be just spinning on the same wb->list_lock while only one of them makes forward progress. This wastes CPU cycles and quickly leads to softlockup reports and unusable system. Instead of running several inode_switch_wbs_work_fn() instances in parallel switching to the same wb and contending on wb->list_lock, run just one work item per wb and manage a queue of isw items switching to this wb. Acked-by: Tejun Heo Signed-off-by: Jan Kara (cherry picked from commit e1b849cfa6b61f1c866a908c9e8dd9b5aaab820b) Signed-off-by: Jonathan Maple --- fs/fs-writeback.c | 99 ++++++++++++++++++++------------ include/linux/backing-dev-defs.h | 7 +++ include/linux/writeback.h | 2 + mm/backing-dev.c | 5 ++ 4 files changed, 77 insertions(+), 36 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 128fd2f99a7bf..1748434997e1b 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -369,7 +369,8 @@ static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode) } struct inode_switch_wbs_context { - struct rcu_work work; + /* List of queued switching contexts for the wb */ + struct llist_node list; /* * Multiple inodes can be switched at once. The switching procedure @@ -379,7 +380,6 @@ struct inode_switch_wbs_context { * array embedded into struct inode_switch_wbs_context. Otherwise * an inode could be left in a non-consistent state. */ - struct bdi_writeback *new_wb; struct inode *inodes[]; }; @@ -487,13 +487,11 @@ static bool inode_do_switch_wbs(struct inode *inode, return switched; } -static void inode_switch_wbs_work_fn(struct work_struct *work) +static void process_inode_switch_wbs(struct bdi_writeback *new_wb, + struct inode_switch_wbs_context *isw) { - struct inode_switch_wbs_context *isw = - container_of(to_rcu_work(work), struct inode_switch_wbs_context, work); struct backing_dev_info *bdi = inode_to_bdi(isw->inodes[0]); struct bdi_writeback *old_wb = isw->inodes[0]->i_wb; - struct bdi_writeback *new_wb = isw->new_wb; unsigned long nr_switched = 0; struct inode **inodep; @@ -544,6 +542,38 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) atomic_dec(&isw_nr_in_flight); } +void inode_switch_wbs_work_fn(struct work_struct *work) +{ + struct bdi_writeback *new_wb = container_of(work, struct bdi_writeback, + switch_work); + struct inode_switch_wbs_context *isw, *next_isw; + struct llist_node *list; + + /* + * Grab out reference to wb so that it cannot get freed under us + * after we process all the isw items. + */ + wb_get(new_wb); + while (1) { + list = llist_del_all(&new_wb->switch_wbs_ctxs); + /* Nothing to do? */ + if (!list) + break; + /* + * In addition to synchronizing among switchers, I_WB_SWITCH + * tells the RCU protected stat update paths to grab the i_page + * lock so that stat transfer can synchronize against them. + * Let's continue after I_WB_SWITCH is guaranteed to be + * visible. + */ + synchronize_rcu(); + + llist_for_each_entry_safe(isw, next_isw, list, list) + process_inode_switch_wbs(new_wb, isw); + } + wb_put(new_wb); +} + static bool inode_prepare_wbs_switch(struct inode *inode, struct bdi_writeback *new_wb) { @@ -573,6 +603,13 @@ static bool inode_prepare_wbs_switch(struct inode *inode, return true; } +static void wb_queue_isw(struct bdi_writeback *wb, + struct inode_switch_wbs_context *isw) +{ + if (llist_add(&isw->list, &wb->switch_wbs_ctxs)) + queue_work(isw_wq, &wb->switch_work); +} + /** * inode_switch_wbs - change the wb association of an inode * @inode: target inode @@ -586,6 +623,7 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) struct backing_dev_info *bdi = inode_to_bdi(inode); struct cgroup_subsys_state *memcg_css; struct inode_switch_wbs_context *isw; + struct bdi_writeback *new_wb = NULL; /* noop if seems to be already in progress */ if (inode->i_state & I_WB_SWITCH) @@ -610,40 +648,34 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) if (!memcg_css) goto out_free; - isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); + new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); css_put(memcg_css); - if (!isw->new_wb) + if (!new_wb) goto out_free; - if (!inode_prepare_wbs_switch(inode, isw->new_wb)) + if (!inode_prepare_wbs_switch(inode, new_wb)) goto out_free; isw->inodes[0] = inode; - /* - * In addition to synchronizing among switchers, I_WB_SWITCH tells - * the RCU protected stat update paths to grab the i_page - * lock so that stat transfer can synchronize against them. - * Let's continue after I_WB_SWITCH is guaranteed to be visible. - */ - INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn); - queue_rcu_work(isw_wq, &isw->work); + wb_queue_isw(new_wb, isw); return; out_free: atomic_dec(&isw_nr_in_flight); - if (isw->new_wb) - wb_put(isw->new_wb); + if (new_wb) + wb_put(new_wb); kfree(isw); } -static bool isw_prepare_wbs_switch(struct inode_switch_wbs_context *isw, +static bool isw_prepare_wbs_switch(struct bdi_writeback *new_wb, + struct inode_switch_wbs_context *isw, struct list_head *list, int *nr) { struct inode *inode; list_for_each_entry(inode, list, i_io_list) { - if (!inode_prepare_wbs_switch(inode, isw->new_wb)) + if (!inode_prepare_wbs_switch(inode, new_wb)) continue; isw->inodes[*nr] = inode; @@ -667,6 +699,7 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb) { struct cgroup_subsys_state *memcg_css; struct inode_switch_wbs_context *isw; + struct bdi_writeback *new_wb; int nr; bool restart = false; @@ -679,12 +712,12 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb) for (memcg_css = wb->memcg_css->parent; memcg_css; memcg_css = memcg_css->parent) { - isw->new_wb = wb_get_create(wb->bdi, memcg_css, GFP_KERNEL); - if (isw->new_wb) + new_wb = wb_get_create(wb->bdi, memcg_css, GFP_KERNEL); + if (new_wb) break; } - if (unlikely(!isw->new_wb)) - isw->new_wb = &wb->bdi->wb; /* wb_get() is noop for bdi's wb */ + if (unlikely(!new_wb)) + new_wb = &wb->bdi->wb; /* wb_get() is noop for bdi's wb */ nr = 0; spin_lock(&wb->list_lock); @@ -696,27 +729,21 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb) * bandwidth restrictions, as writeback of inode metadata is not * accounted for. */ - restart = isw_prepare_wbs_switch(isw, &wb->b_attached, &nr); + restart = isw_prepare_wbs_switch(new_wb, isw, &wb->b_attached, &nr); if (!restart) - restart = isw_prepare_wbs_switch(isw, &wb->b_dirty_time, &nr); + restart = isw_prepare_wbs_switch(new_wb, isw, &wb->b_dirty_time, + &nr); spin_unlock(&wb->list_lock); /* no attached inodes? bail out */ if (nr == 0) { atomic_dec(&isw_nr_in_flight); - wb_put(isw->new_wb); + wb_put(new_wb); kfree(isw); return restart; } - /* - * In addition to synchronizing among switchers, I_WB_SWITCH tells - * the RCU protected stat update paths to grab the i_page - * lock so that stat transfer can synchronize against them. - * Let's continue after I_WB_SWITCH is guaranteed to be visible. - */ - INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn); - queue_rcu_work(isw_wq, &isw->work); + wb_queue_isw(new_wb, isw); return restart; } diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 2ad261082bba5..9637a59ba71ab 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -157,6 +157,13 @@ struct bdi_writeback { struct work_struct release_work; struct rcu_head rcu; }; + + RH_KABI_EXTEND(struct work_struct switch_work) /* work used to perform + * inode switching to + * this wb */ + RH_KABI_EXTEND(struct llist_head switch_wbs_ctxs) /* queued contexts for + * writeback switching + * */ #endif }; diff --git a/include/linux/writeback.h b/include/linux/writeback.h index d6db822e4bb30..d27a0435c5b84 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -293,6 +293,8 @@ static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio) bio_associate_blkg_from_css(bio, wbc->wb->blkcg_css); } +void inode_switch_wbs_work_fn(struct work_struct *work); + #else /* CONFIG_CGROUP_WRITEBACK */ static inline void inode_attach_wb(struct inode *inode, struct folio *folio) diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 783904d8c5ef8..0beaca6bacf77 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -633,6 +633,7 @@ static void cgwb_release_workfn(struct work_struct *work) wb_exit(wb); bdi_put(bdi); WARN_ON_ONCE(!list_empty(&wb->b_attached)); + WARN_ON_ONCE(work_pending(&wb->switch_work)); call_rcu(&wb->rcu, cgwb_free_rcu); } @@ -709,6 +710,8 @@ static int cgwb_create(struct backing_dev_info *bdi, wb->memcg_css = memcg_css; wb->blkcg_css = blkcg_css; INIT_LIST_HEAD(&wb->b_attached); + INIT_WORK(&wb->switch_work, inode_switch_wbs_work_fn); + init_llist_head(&wb->switch_wbs_ctxs); INIT_WORK(&wb->release_work, cgwb_release_workfn); set_bit(WB_registered, &wb->state); bdi_get(bdi); @@ -839,6 +842,8 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi) if (!ret) { bdi->wb.memcg_css = &root_mem_cgroup->css; bdi->wb.blkcg_css = blkcg_root_css; + INIT_WORK(&bdi->wb.switch_work, inode_switch_wbs_work_fn); + init_llist_head(&bdi->wb.switch_wbs_ctxs); } return ret; } From 40a213c0fb0e9a27b3dda327e92b4cd9e7204bc8 Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Thu, 25 Jun 2026 12:56:23 -0400 Subject: [PATCH 2/7] writeback: Avoid softlockup when switching many inodes jira SECO-535 bugfix: writeback softlockups commit-author Jan Kara commit 66c14dccd810d42ec5c73bb8a9177489dfd62278 process_inode_switch_wbs_work() can be switching over 100 inodes to a different cgroup. Since switching an inode requires counting all dirty & under-writeback pages in the address space of each inode, this can take a significant amount of time. Add a possibility to reschedule after processing each inode to avoid softlockups. Acked-by: Tejun Heo Signed-off-by: Jan Kara Signed-off-by: Christian Brauner (cherry picked from commit 66c14dccd810d42ec5c73bb8a9177489dfd62278) Signed-off-by: Jonathan Maple --- fs/fs-writeback.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 1748434997e1b..b1536fef3b976 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -501,6 +501,7 @@ static void process_inode_switch_wbs(struct bdi_writeback *new_wb, */ down_read(&bdi->wb_switch_rwsem); + inodep = isw->inodes; /* * By the time control reaches here, RCU grace period has passed * since I_WB_SWITCH assertion and all wb stat update transactions @@ -511,6 +512,7 @@ static void process_inode_switch_wbs(struct bdi_writeback *new_wb, * gives us exclusion against all wb related operations on @inode * including IO list manipulations and stat updates. */ +relock: if (old_wb < new_wb) { spin_lock(&old_wb->list_lock); spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING); @@ -519,10 +521,17 @@ static void process_inode_switch_wbs(struct bdi_writeback *new_wb, spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING); } - for (inodep = isw->inodes; *inodep; inodep++) { + while (*inodep) { WARN_ON_ONCE((*inodep)->i_wb != old_wb); if (inode_do_switch_wbs(*inodep, old_wb, new_wb)) nr_switched++; + inodep++; + if (*inodep && need_resched()) { + spin_unlock(&new_wb->list_lock); + spin_unlock(&old_wb->list_lock); + cond_resched(); + goto relock; + } } spin_unlock(&new_wb->list_lock); From dbce4855cce73eaf15b637f4848feedac82ecda5 Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Thu, 25 Jun 2026 12:58:55 -0400 Subject: [PATCH 3/7] writeback: Avoid excessively long inode switching times jira SECO-535 bugfix: writeback softlockups commit-author Jan Kara commit 9a6ebbdbd41235ea3bc0c4f39e2076599b8113cc With lazytime mount option enabled we can be switching many dirty inodes on cgroup exit to the parent cgroup. The numbers observed in practice when systemd slice of a large cron job exits can easily reach hundreds of thousands or millions. The logic in inode_do_switch_wbs() which sorts the inode into appropriate place in b_dirty list of the target wb however has linear complexity in the number of dirty inodes thus overall time complexity of switching all the inodes is quadratic leading to workers being pegged for hours consuming 100% of the CPU and switching inodes to the parent wb. Simple reproducer of the issue: FILES=10000 # Filesystem mounted with lazytime mount option MNT=/mnt/ echo "Creating files and switching timestamps" for (( j = 0; j < 50; j ++ )); do mkdir $MNT/dir$j for (( i = 0; i < $FILES; i++ )); do echo "foo" >$MNT/dir$j/file$i done touch -a -t 202501010000 $MNT/dir$j/file* done wait echo "Syncing and flushing" sync echo 3 >/proc/sys/vm/drop_caches echo "Reading all files from a cgroup" mkdir /sys/fs/cgroup/unified/mycg1 || exit echo $$ >/sys/fs/cgroup/unified/mycg1/cgroup.procs || exit for (( j = 0; j < 50; j ++ )); do cat /mnt/dir$j/file* >/dev/null & done wait echo "Switching wbs" # Now rmdir the cgroup after the script exits We need to maintain b_dirty list ordering to keep writeback happy so instead of sorting inode into appropriate place just append it at the end of the list and clobber dirtied_time_when. This may result in inode writeback starting later after cgroup switch however cgroup switches are rare so it shouldn't matter much. Since the cgroup had write access to the inode, there are no practical concerns of the possible DoS issues. Acked-by: Tejun Heo Signed-off-by: Jan Kara Signed-off-by: Christian Brauner (cherry picked from commit 9a6ebbdbd41235ea3bc0c4f39e2076599b8113cc) Signed-off-by: Jonathan Maple --- fs/fs-writeback.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index b1536fef3b976..f52097917aba4 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -446,22 +446,23 @@ static bool inode_do_switch_wbs(struct inode *inode, * Transfer to @new_wb's IO list if necessary. If the @inode is dirty, * the specific list @inode was on is ignored and the @inode is put on * ->b_dirty which is always correct including from ->b_dirty_time. - * The transfer preserves @inode->dirtied_when ordering. If the @inode - * was clean, it means it was on the b_attached list, so move it onto - * the b_attached list of @new_wb. + * If the @inode was clean, it means it was on the b_attached list, so + * move it onto the b_attached list of @new_wb. */ if (!list_empty(&inode->i_io_list)) { inode->i_wb = new_wb; if (inode->i_state & I_DIRTY_ALL) { - struct inode *pos; - - list_for_each_entry(pos, &new_wb->b_dirty, i_io_list) - if (time_after_eq(inode->dirtied_when, - pos->dirtied_when)) - break; + /* + * We need to keep b_dirty list sorted by + * dirtied_time_when. However properly sorting the + * inode in the list gets too expensive when switching + * many inodes. So just attach inode at the end of the + * dirty list and clobber the dirtied_time_when. + */ + inode->dirtied_time_when = jiffies; inode_io_list_move_locked(inode, new_wb, - pos->i_io_list.prev); + &new_wb->b_dirty); } else { inode_cgwb_move_to_attached(inode, new_wb); } From f7488082f863b3cfda182e6792ffcb0380020882 Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Thu, 25 Jun 2026 13:00:21 -0400 Subject: [PATCH 4/7] writeback: Add tracepoint to track pending inode switches jira SECO-535 bugfix: writeback softlockups commit-author Jan Kara commit 0cee64c547e3c9cda646af3e075a64f445ee8148 Add trace_inode_switch_wbs_queue tracepoint to allow insight into how many inodes are queued to switch their bdi_writeback structure. Acked-by: Tejun Heo Signed-off-by: Jan Kara Signed-off-by: Christian Brauner (cherry picked from commit 0cee64c547e3c9cda646af3e075a64f445ee8148) Signed-off-by: Jonathan Maple --- fs/fs-writeback.c | 2 ++ include/trace/events/writeback.h | 29 +++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index f52097917aba4..a9b477d6662a7 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -668,6 +668,7 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) isw->inodes[0] = inode; + trace_inode_switch_wbs_queue(inode->i_wb, new_wb, 1); wb_queue_isw(new_wb, isw); return; @@ -753,6 +754,7 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb) return restart; } + trace_inode_switch_wbs_queue(wb, new_wb, nr); wb_queue_isw(new_wb, isw); return restart; diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index a261e86e61fac..e71531f01a52b 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -213,6 +213,35 @@ TRACE_EVENT(inode_foreign_history, ) ); +TRACE_EVENT(inode_switch_wbs_queue, + + TP_PROTO(struct bdi_writeback *old_wb, struct bdi_writeback *new_wb, + unsigned int count), + + TP_ARGS(old_wb, new_wb, count), + + TP_STRUCT__entry( + __array(char, name, 32) + __field(ino_t, old_cgroup_ino) + __field(ino_t, new_cgroup_ino) + __field(unsigned int, count) + ), + + TP_fast_assign( + strscpy_pad(__entry->name, bdi_dev_name(old_wb->bdi), 32); + __entry->old_cgroup_ino = __trace_wb_assign_cgroup(old_wb); + __entry->new_cgroup_ino = __trace_wb_assign_cgroup(new_wb); + __entry->count = count; + ), + + TP_printk("bdi %s: old_cgroup_ino=%lu new_cgroup_ino=%lu count=%u", + __entry->name, + (unsigned long)__entry->old_cgroup_ino, + (unsigned long)__entry->new_cgroup_ino, + __entry->count + ) +); + TRACE_EVENT(inode_switch_wbs, TP_PROTO(struct inode *inode, struct bdi_writeback *old_wb, From 75278254d08fad795002fd64e70000d87f80d427 Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Thu, 25 Jun 2026 13:17:39 -0400 Subject: [PATCH 5/7] writeback: Fix use after free in inode_switch_wbs_work_fn() jira SECO-535 bugfix: writeback softlockups commit-author Jan Kara commit 6689f01d6740cf358932b3e97ee968c6099800d9 inode_switch_wbs_work_fn() has a loop like: wb_get(new_wb); while (1) { list = llist_del_all(&new_wb->switch_wbs_ctxs); /* Nothing to do? */ if (!list) break; ... process the items ... } Now adding of items to the list looks like: wb_queue_isw() if (llist_add(&isw->list, &wb->switch_wbs_ctxs)) queue_work(isw_wq, &wb->switch_work); Because inode_switch_wbs_work_fn() loops when processing isw items, it can happen that wb->switch_work is pending while wb->switch_wbs_ctxs is empty. This is a problem because in that case wb can get freed (no isw items -> no wb reference) while the work is still pending causing use-after-free issues. We cannot just fix this by cancelling work when freeing wb because that could still trigger problematic 0 -> 1 transitions on wb refcount due to wb_get() in inode_switch_wbs_work_fn(). It could be all handled with more careful code but that seems unnecessarily complex so let's avoid that until it is proven that the looping actually brings practical benefit. Just remove the loop from inode_switch_wbs_work_fn() instead. That way when wb_queue_isw() queues work, we are guaranteed we have added the first item to wb->switch_wbs_ctxs and nobody is going to remove it (and drop the wb reference it holds) until the queued work runs. Fixes: e1b849cfa6b6 ("writeback: Avoid contention on wb->list_lock when switching inodes") CC: stable@vger.kernel.org Signed-off-by: Jan Kara Link: https://patch.msgid.link/20260413093618.17244-2-jack@suse.cz Acked-by: Tejun Heo Signed-off-by: Christian Brauner (cherry picked from commit 6689f01d6740cf358932b3e97ee968c6099800d9) Signed-off-by: Jonathan Maple --- fs/fs-writeback.c | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index a9b477d6662a7..4a27ce85bb882 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -559,28 +559,30 @@ void inode_switch_wbs_work_fn(struct work_struct *work) struct inode_switch_wbs_context *isw, *next_isw; struct llist_node *list; + list = llist_del_all(&new_wb->switch_wbs_ctxs); /* - * Grab out reference to wb so that it cannot get freed under us + * Nothing to do? That would be a problem as references held by isw + * items protect wb from freeing... + */ + if (WARN_ON_ONCE(!list)) + return; + + /* + * Grab our reference to wb so that it cannot get freed under us * after we process all the isw items. */ wb_get(new_wb); - while (1) { - list = llist_del_all(&new_wb->switch_wbs_ctxs); - /* Nothing to do? */ - if (!list) - break; - /* - * In addition to synchronizing among switchers, I_WB_SWITCH - * tells the RCU protected stat update paths to grab the i_page - * lock so that stat transfer can synchronize against them. - * Let's continue after I_WB_SWITCH is guaranteed to be - * visible. - */ - synchronize_rcu(); + /* + * In addition to synchronizing among switchers, I_WB_SWITCH + * tells the RCU protected stat update paths to grab the i_page + * lock so that stat transfer can synchronize against them. + * Let's continue after I_WB_SWITCH is guaranteed to be + * visible. + */ + synchronize_rcu(); - llist_for_each_entry_safe(isw, next_isw, list, list) - process_inode_switch_wbs(new_wb, isw); - } + llist_for_each_entry_safe(isw, next_isw, list, list) + process_inode_switch_wbs(new_wb, isw); wb_put(new_wb); } From dcf9b03477d0ea8f21afcbd913e8e6acdddf1f3d Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Thu, 25 Jun 2026 13:25:25 -0400 Subject: [PATCH 6/7] writeback: fix race between cgroup_writeback_umount() and inode_switch_wbs() jira SECO-535 bugfix: writeback softlockups commit-author Baokun Li commit cba38ec4cbd3a7b8b942a8d52531a05be8a9ff0d When a container exits, the following BUG_ON() is occasionally triggered: ================================================================== VFS: Busy inodes after unmount of sdb (ext4) ------------[ cut here ]------------ kernel BUG at fs/super.c:695! CPU: 3 PID: 6 Comm: containerd-shim Tainted: G OE K 6.6 #1 pstate: 63400009 (nZCv daif +PAN -UAO +TCO +DIT -SSBS BTYPE=--) pc : generic_shutdown_super+0xf0/0x100 lr : generic_shutdown_super+0xf0/0x100 Call trace: generic_shutdown_super+0xf0/0x100 kill_block_super+0x20/0x48 ext4_kill_sb+0x28/0x60 deactivate_locked_super+0x54/0x130 deactivate_super+0x84/0xa0 cleanup_mnt+0xa4/0x140 __cleanup_mnt+0x18/0x28 task_work_run+0x78/0xe0 do_notify_resume+0x204/0x240 ================================================================== The root cause is a race between cgroup_writeback_umount() and inode_switch_wbs()/cleanup_offline_cgwb(). There is a window between inode_prepare_wbs_switch() returning true and the subsequent wb_queue_isw() call. Following is the process that triggers the issue: CPU A (umount) | CPU B (writeback) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ inode_switch_wbs/cleanup_offline_cgwb atomic_inc(&isw_nr_in_flight) inode_prepare_wbs_switch -> passes SB_ACTIVE check __iget(inode) generic_shutdown_super sb->s_flags &= ~SB_ACTIVE cgroup_writeback_umount(sb) smp_mb() atomic_read(&isw_nr_in_flight) rcu_barrier() -> no pending RCU callbacks flush_workqueue(isw_wq) -> nothing queued, returns evict_inodes(sb) -> Inode skipped as isw still holds a ref. sop->put_super(sb) /* destroys percpu counters */ -> VFS: Busy inodes after unmount! wb_queue_isw() queue_work(isw_wq, ...) /* later in work function */ inode_switch_wbs_work_fn process_inode_switch_wbs iput() -> evict percpu_counter_dec() // UAF! Fix this by extending the RCU read-side critical section in inode_switch_wbs() and cleanup_offline_cgwb() to cover from inode_prepare_wbs_switch() through wb_queue_isw(). Since there is no sleep in this window, rcu_read_lock() can be used. Then add a synchronize_rcu() in cgroup_writeback_umount() before the existing rcu_barrier(), so that all in-flight switchers that have passed the SB_ACTIVE check have completed queue_work() before flush_workqueue() is called. The existing rcu_barrier() is intentionally retained so this fix can be backported unchanged to stable kernels (5.10.y, 6.6.y, ...) that still queue switches via queue_rcu_work(). It is a no-op on current mainline (since commit e1b849cfa6b6 ("writeback: Avoid contention on wb->list_lock when switching inodes")) and is removed in a follow-up patch. Fixes: a1a0e23e4903 ("writeback: flush inode cgroup wb switches instead of pinning super_block") Cc: stable@vger.kernel.org Suggested-by: Jan Kara Link: https://lore.kernel.org/all/mxnjq2l6guusfchvauxr3v7c4bwjasybxlleqbbh4efloeqspz@iqylk76ohufz Reviewed-by: Jan Kara Signed-off-by: Baokun Li Link: https://patch.msgid.link/20260521095016.2791354-2-libaokun@linux.alibaba.com Acked-by: Tejun Heo Signed-off-by: Christian Brauner (Amutable) (cherry picked from commit cba38ec4cbd3a7b8b942a8d52531a05be8a9ff0d) Signed-off-by: Jonathan Maple --- fs/fs-writeback.c | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 4a27ce85bb882..31ba6630c4002 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -651,12 +651,19 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) atomic_inc(&isw_nr_in_flight); - /* find and pin the new wb */ + /* + * Paired with synchronize_rcu() in cgroup_writeback_umount(): + * holding rcu_read_lock across inode_prepare_wbs_switch() + * (covering the SB_ACTIVE check and the inode grab) and + * wb_queue_isw() ensures synchronize_rcu() cannot return until + * the work is queued, so the subsequent flush_workqueue() will + * wait for the switch. + */ rcu_read_lock(); + /* find and pin the new wb */ memcg_css = css_from_id(new_wb_id, &memory_cgrp_subsys); if (memcg_css && !css_tryget(memcg_css)) memcg_css = NULL; - rcu_read_unlock(); if (!memcg_css) goto out_free; @@ -672,9 +679,11 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) trace_inode_switch_wbs_queue(inode->i_wb, new_wb, 1); wb_queue_isw(new_wb, isw); + rcu_read_unlock(); return; out_free: + rcu_read_unlock(); atomic_dec(&isw_nr_in_flight); if (new_wb) wb_put(new_wb); @@ -733,6 +742,14 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb) new_wb = &wb->bdi->wb; /* wb_get() is noop for bdi's wb */ nr = 0; + /* + * Paired with synchronize_rcu() in cgroup_writeback_umount(). + * Holding rcu_read_lock across the SB_ACTIVE check, the inode grab + * and wb_queue_isw() ensures synchronize_rcu() cannot return until + * the work is queued, so the subsequent flush_workqueue() will wait + * for the switch. + */ + rcu_read_lock(); spin_lock(&wb->list_lock); /* * In addition to the inodes that have completed writeback, also switch @@ -750,6 +767,7 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb) /* no attached inodes? bail out */ if (nr == 0) { + rcu_read_unlock(); atomic_dec(&isw_nr_in_flight); wb_put(new_wb); kfree(isw); @@ -758,6 +776,7 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb) trace_inode_switch_wbs_queue(wb, new_wb, nr); wb_queue_isw(new_wb, isw); + rcu_read_unlock(); return restart; } @@ -1195,6 +1214,14 @@ void cgroup_writeback_umount(struct super_block *sb) smp_mb(); if (atomic_read(&isw_nr_in_flight)) { + /* + * Paired with rcu_read_lock() in inode_switch_wbs() and + * cleanup_offline_cgwb(). synchronize_rcu() waits for any + * in-flight switcher that already passed the SB_ACTIVE check + * to finish queueing its work, so flush_workqueue() below + * will then drain it. + */ + synchronize_rcu(); /* * Use rcu_barrier() to wait for all pending callbacks to * ensure that all in-flight wb switches are in the workqueue. From 398154cf909d07f04df44dd7ba62b8630760b759 Mon Sep 17 00:00:00 2001 From: Jonathan Maple Date: Thu, 25 Jun 2026 13:26:07 -0400 Subject: [PATCH 7/7] writeback: drop now-unnecessary rcu_barrier() in cgroup_writeback_umount() jira SECO-535 bugfix: writeback softlockups commit-author Baokun Li commit e90a6d668e26e00a72df2d09c173b563468f09c9 Commit e1b849cfa6b6 ("writeback: Avoid contention on wb->list_lock when switching inodes") replaced the queue_rcu_work() based scheduling of inode wb switches with a plain queue_work(). Since then no switcher goes through call_rcu(), so rcu_barrier() in cgroup_writeback_umount() has no callbacks of its own to wait for. It still drains unrelated call_rcu() callbacks from other subsystems on busy systems, which incidentally slows umount down; drop it. Fixes: e1b849cfa6b6 ("writeback: Avoid contention on wb->list_lock when switching inodes") Reviewed-by: Jan Kara Signed-off-by: Baokun Li Link: https://patch.msgid.link/20260521095016.2791354-3-libaokun@linux.alibaba.com Acked-by: Tejun Heo Signed-off-by: Christian Brauner (Amutable) (cherry picked from commit e90a6d668e26e00a72df2d09c173b563468f09c9) Signed-off-by: Jonathan Maple --- fs/fs-writeback.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 31ba6630c4002..ee32cc9b9075a 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1222,11 +1222,6 @@ void cgroup_writeback_umount(struct super_block *sb) * will then drain it. */ synchronize_rcu(); - /* - * Use rcu_barrier() to wait for all pending callbacks to - * ensure that all in-flight wb switches are in the workqueue. - */ - rcu_barrier(); flush_workqueue(isw_wq); } }