Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
167 changes: 115 additions & 52 deletions fs/fs-writeback.c
Original file line number Diff line number Diff line change
Expand Up @@ -369,7 +369,8 @@ static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
}

struct inode_switch_wbs_context {
struct rcu_work work;
/* List of queued switching contexts for the wb */
struct llist_node list;

/*
* Multiple inodes can be switched at once. The switching procedure
Expand All @@ -379,7 +380,6 @@ struct inode_switch_wbs_context {
* array embedded into struct inode_switch_wbs_context. Otherwise
* an inode could be left in a non-consistent state.
*/
struct bdi_writeback *new_wb;
struct inode *inodes[];
};

Expand Down Expand Up @@ -446,22 +446,23 @@ static bool inode_do_switch_wbs(struct inode *inode,
* Transfer to @new_wb's IO list if necessary. If the @inode is dirty,
* the specific list @inode was on is ignored and the @inode is put on
* ->b_dirty which is always correct including from ->b_dirty_time.
* The transfer preserves @inode->dirtied_when ordering. If the @inode
* was clean, it means it was on the b_attached list, so move it onto
* the b_attached list of @new_wb.
* If the @inode was clean, it means it was on the b_attached list, so
* move it onto the b_attached list of @new_wb.
*/
if (!list_empty(&inode->i_io_list)) {
inode->i_wb = new_wb;

if (inode->i_state & I_DIRTY_ALL) {
struct inode *pos;

list_for_each_entry(pos, &new_wb->b_dirty, i_io_list)
if (time_after_eq(inode->dirtied_when,
pos->dirtied_when))
break;
/*
* We need to keep b_dirty list sorted by
* dirtied_time_when. However properly sorting the
* inode in the list gets too expensive when switching
* many inodes. So just attach inode at the end of the
* dirty list and clobber the dirtied_time_when.
*/
inode->dirtied_time_when = jiffies;
inode_io_list_move_locked(inode, new_wb,
pos->i_io_list.prev);
&new_wb->b_dirty);
} else {
inode_cgwb_move_to_attached(inode, new_wb);
}
Expand All @@ -487,13 +488,11 @@ static bool inode_do_switch_wbs(struct inode *inode,
return switched;
}

static void inode_switch_wbs_work_fn(struct work_struct *work)
static void process_inode_switch_wbs(struct bdi_writeback *new_wb,
struct inode_switch_wbs_context *isw)
{
struct inode_switch_wbs_context *isw =
container_of(to_rcu_work(work), struct inode_switch_wbs_context, work);
struct backing_dev_info *bdi = inode_to_bdi(isw->inodes[0]);
struct bdi_writeback *old_wb = isw->inodes[0]->i_wb;
struct bdi_writeback *new_wb = isw->new_wb;
unsigned long nr_switched = 0;
struct inode **inodep;

Expand All @@ -503,6 +502,7 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
*/
down_read(&bdi->wb_switch_rwsem);

inodep = isw->inodes;
/*
* By the time control reaches here, RCU grace period has passed
* since I_WB_SWITCH assertion and all wb stat update transactions
Expand All @@ -513,6 +513,7 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
* gives us exclusion against all wb related operations on @inode
* including IO list manipulations and stat updates.
*/
relock:
if (old_wb < new_wb) {
spin_lock(&old_wb->list_lock);
spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING);
Expand All @@ -521,10 +522,17 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING);
}

for (inodep = isw->inodes; *inodep; inodep++) {
while (*inodep) {
WARN_ON_ONCE((*inodep)->i_wb != old_wb);
if (inode_do_switch_wbs(*inodep, old_wb, new_wb))
nr_switched++;
inodep++;
if (*inodep && need_resched()) {
spin_unlock(&new_wb->list_lock);
spin_unlock(&old_wb->list_lock);
cond_resched();
goto relock;
}
}

spin_unlock(&new_wb->list_lock);
Expand All @@ -544,6 +552,40 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
atomic_dec(&isw_nr_in_flight);
}

void inode_switch_wbs_work_fn(struct work_struct *work)
{
struct bdi_writeback *new_wb = container_of(work, struct bdi_writeback,
switch_work);
struct inode_switch_wbs_context *isw, *next_isw;
struct llist_node *list;

list = llist_del_all(&new_wb->switch_wbs_ctxs);
/*
* Nothing to do? That would be a problem as references held by isw
* items protect wb from freeing...
*/
if (WARN_ON_ONCE(!list))
return;

/*
* Grab our reference to wb so that it cannot get freed under us
* after we process all the isw items.
*/
wb_get(new_wb);
/*
* In addition to synchronizing among switchers, I_WB_SWITCH
* tells the RCU protected stat update paths to grab the i_page
* lock so that stat transfer can synchronize against them.
* Let's continue after I_WB_SWITCH is guaranteed to be
* visible.
*/
synchronize_rcu();

llist_for_each_entry_safe(isw, next_isw, list, list)
process_inode_switch_wbs(new_wb, isw);
wb_put(new_wb);
}

static bool inode_prepare_wbs_switch(struct inode *inode,
struct bdi_writeback *new_wb)
{
Expand Down Expand Up @@ -573,6 +615,13 @@ static bool inode_prepare_wbs_switch(struct inode *inode,
return true;
}

static void wb_queue_isw(struct bdi_writeback *wb,
struct inode_switch_wbs_context *isw)
{
if (llist_add(&isw->list, &wb->switch_wbs_ctxs))
queue_work(isw_wq, &wb->switch_work);
}

/**
* inode_switch_wbs - change the wb association of an inode
* @inode: target inode
Expand All @@ -586,6 +635,7 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
struct backing_dev_info *bdi = inode_to_bdi(inode);
struct cgroup_subsys_state *memcg_css;
struct inode_switch_wbs_context *isw;
struct bdi_writeback *new_wb = NULL;

/* noop if seems to be already in progress */
if (inode->i_state & I_WB_SWITCH)
Expand All @@ -601,49 +651,53 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)

atomic_inc(&isw_nr_in_flight);

/* find and pin the new wb */
/*
* Paired with synchronize_rcu() in cgroup_writeback_umount():
* holding rcu_read_lock across inode_prepare_wbs_switch()
* (covering the SB_ACTIVE check and the inode grab) and
* wb_queue_isw() ensures synchronize_rcu() cannot return until
* the work is queued, so the subsequent flush_workqueue() will
* wait for the switch.
*/
rcu_read_lock();
/* find and pin the new wb */
memcg_css = css_from_id(new_wb_id, &memory_cgrp_subsys);
if (memcg_css && !css_tryget(memcg_css))
memcg_css = NULL;
rcu_read_unlock();
if (!memcg_css)
goto out_free;

isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
css_put(memcg_css);
if (!isw->new_wb)
if (!new_wb)
goto out_free;

if (!inode_prepare_wbs_switch(inode, isw->new_wb))
if (!inode_prepare_wbs_switch(inode, new_wb))
goto out_free;

isw->inodes[0] = inode;

/*
* In addition to synchronizing among switchers, I_WB_SWITCH tells
* the RCU protected stat update paths to grab the i_page
* lock so that stat transfer can synchronize against them.
* Let's continue after I_WB_SWITCH is guaranteed to be visible.
*/
INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn);
queue_rcu_work(isw_wq, &isw->work);
trace_inode_switch_wbs_queue(inode->i_wb, new_wb, 1);
wb_queue_isw(new_wb, isw);
rcu_read_unlock();
return;

out_free:
rcu_read_unlock();
atomic_dec(&isw_nr_in_flight);
if (isw->new_wb)
wb_put(isw->new_wb);
if (new_wb)
wb_put(new_wb);
kfree(isw);
}

static bool isw_prepare_wbs_switch(struct inode_switch_wbs_context *isw,
static bool isw_prepare_wbs_switch(struct bdi_writeback *new_wb,
struct inode_switch_wbs_context *isw,
struct list_head *list, int *nr)
{
struct inode *inode;

list_for_each_entry(inode, list, i_io_list) {
if (!inode_prepare_wbs_switch(inode, isw->new_wb))
if (!inode_prepare_wbs_switch(inode, new_wb))
continue;

isw->inodes[*nr] = inode;
Expand All @@ -667,6 +721,7 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb)
{
struct cgroup_subsys_state *memcg_css;
struct inode_switch_wbs_context *isw;
struct bdi_writeback *new_wb;
int nr;
bool restart = false;

Expand All @@ -679,14 +734,22 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb)

for (memcg_css = wb->memcg_css->parent; memcg_css;
memcg_css = memcg_css->parent) {
isw->new_wb = wb_get_create(wb->bdi, memcg_css, GFP_KERNEL);
if (isw->new_wb)
new_wb = wb_get_create(wb->bdi, memcg_css, GFP_KERNEL);
if (new_wb)
break;
}
if (unlikely(!isw->new_wb))
isw->new_wb = &wb->bdi->wb; /* wb_get() is noop for bdi's wb */
if (unlikely(!new_wb))
new_wb = &wb->bdi->wb; /* wb_get() is noop for bdi's wb */

nr = 0;
/*
* Paired with synchronize_rcu() in cgroup_writeback_umount().
* Holding rcu_read_lock across the SB_ACTIVE check, the inode grab
* and wb_queue_isw() ensures synchronize_rcu() cannot return until
* the work is queued, so the subsequent flush_workqueue() will wait
* for the switch.
*/
rcu_read_lock();
spin_lock(&wb->list_lock);
/*
* In addition to the inodes that have completed writeback, also switch
Expand All @@ -696,27 +759,24 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb)
* bandwidth restrictions, as writeback of inode metadata is not
* accounted for.
*/
restart = isw_prepare_wbs_switch(isw, &wb->b_attached, &nr);
restart = isw_prepare_wbs_switch(new_wb, isw, &wb->b_attached, &nr);
if (!restart)
restart = isw_prepare_wbs_switch(isw, &wb->b_dirty_time, &nr);
restart = isw_prepare_wbs_switch(new_wb, isw, &wb->b_dirty_time,
&nr);
spin_unlock(&wb->list_lock);

/* no attached inodes? bail out */
if (nr == 0) {
rcu_read_unlock();
atomic_dec(&isw_nr_in_flight);
wb_put(isw->new_wb);
wb_put(new_wb);
kfree(isw);
return restart;
}

/*
* In addition to synchronizing among switchers, I_WB_SWITCH tells
* the RCU protected stat update paths to grab the i_page
* lock so that stat transfer can synchronize against them.
* Let's continue after I_WB_SWITCH is guaranteed to be visible.
*/
INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn);
queue_rcu_work(isw_wq, &isw->work);
trace_inode_switch_wbs_queue(wb, new_wb, nr);
wb_queue_isw(new_wb, isw);
rcu_read_unlock();

return restart;
}
Expand Down Expand Up @@ -1155,10 +1215,13 @@ void cgroup_writeback_umount(struct super_block *sb)

if (atomic_read(&isw_nr_in_flight)) {
/*
* Use rcu_barrier() to wait for all pending callbacks to
* ensure that all in-flight wb switches are in the workqueue.
* Paired with rcu_read_lock() in inode_switch_wbs() and
* cleanup_offline_cgwb(). synchronize_rcu() waits for any
* in-flight switcher that already passed the SB_ACTIVE check
* to finish queueing its work, so flush_workqueue() below
* will then drain it.
*/
rcu_barrier();
synchronize_rcu();
flush_workqueue(isw_wq);
}
}
Expand Down
7 changes: 7 additions & 0 deletions include/linux/backing-dev-defs.h
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,13 @@ struct bdi_writeback {
struct work_struct release_work;
struct rcu_head rcu;
};

RH_KABI_EXTEND(struct work_struct switch_work) /* work used to perform
* inode switching to
* this wb */
RH_KABI_EXTEND(struct llist_head switch_wbs_ctxs) /* queued contexts for
* writeback switching
* */
#endif
};

Expand Down
2 changes: 2 additions & 0 deletions include/linux/writeback.h
Original file line number Diff line number Diff line change
Expand Up @@ -293,6 +293,8 @@ static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio)
bio_associate_blkg_from_css(bio, wbc->wb->blkcg_css);
}

void inode_switch_wbs_work_fn(struct work_struct *work);

#else /* CONFIG_CGROUP_WRITEBACK */

static inline void inode_attach_wb(struct inode *inode, struct folio *folio)
Expand Down
29 changes: 29 additions & 0 deletions include/trace/events/writeback.h
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,35 @@ TRACE_EVENT(inode_foreign_history,
)
);

TRACE_EVENT(inode_switch_wbs_queue,

TP_PROTO(struct bdi_writeback *old_wb, struct bdi_writeback *new_wb,
unsigned int count),

TP_ARGS(old_wb, new_wb, count),

TP_STRUCT__entry(
__array(char, name, 32)
__field(ino_t, old_cgroup_ino)
__field(ino_t, new_cgroup_ino)
__field(unsigned int, count)
),

TP_fast_assign(
strscpy_pad(__entry->name, bdi_dev_name(old_wb->bdi), 32);
__entry->old_cgroup_ino = __trace_wb_assign_cgroup(old_wb);
__entry->new_cgroup_ino = __trace_wb_assign_cgroup(new_wb);
__entry->count = count;
),

TP_printk("bdi %s: old_cgroup_ino=%lu new_cgroup_ino=%lu count=%u",
__entry->name,
(unsigned long)__entry->old_cgroup_ino,
(unsigned long)__entry->new_cgroup_ino,
__entry->count
)
);

TRACE_EVENT(inode_switch_wbs,

TP_PROTO(struct inode *inode, struct bdi_writeback *old_wb,
Expand Down
Loading