rcu/tree: Defer kvfree_rcu() allocation to a clean context
[ Upstream commit 56292e8609e39537297a7468dda4d87b9bd81d6a ]
The current memory-allocation interface causes the following difficulties
for kvfree_rcu():
a) If built with CONFIG_PROVE_RAW_LOCK_NESTING, lockdep will
complain about a violation of the nesting rules, as in "BUG: Invalid
wait context". This Kconfig option checks for proper raw_spinlock
vs. spinlock nesting; in particular, it is not legal to acquire a
spinlock_t while holding a raw_spinlock_t.
This is a problem because kfree_rcu() uses raw_spinlock_t whereas the
page allocator internally takes spinlock_t locks to access its zones
(see also the sketch after this list). The code can also be broken
from a higher-level point of view:
<snip>
raw_spin_lock(&some_lock);
kfree_rcu(some_pointer, some_field_offset);
<snip>
b) If built with CONFIG_PREEMPT_RT, spinlock_t is converted into a
sleeping lock. This means that invoking the page allocator from
atomic context results in "BUG: scheduling while atomic".
c) Please note that call_rcu() is already invoked from raw atomic context,
so it is only reasonable to expect that kfree_rcu() and kvfree_rcu()
will also be called from raw atomic context.
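For illustration, the nesting that (a) and (b) complain about boils down to
the following minimal sketch. "some_lock" and the function around it are made
up here; the pattern simply mirrors the old fallback of calling the page
allocator while krcp->lock (a raw_spinlock_t) is held:
<snip>
static DEFINE_RAW_SPINLOCK(some_lock);

static void nesting_example(void)
{
        unsigned long flags, page;

        raw_spin_lock_irqsave(&some_lock, flags);
        /*
         * The page allocator takes spinlock_t zone locks in here, i.e.
         * spinlock_t nested inside raw_spinlock_t: "BUG: Invalid wait
         * context" under CONFIG_PROVE_RAW_LOCK_NESTING, and a sleeping
         * lock taken in atomic context under CONFIG_PREEMPT_RT.
         */
        page = __get_free_page(GFP_NOWAIT | __GFP_NOWARN);
        if (page)
                free_page(page);
        raw_spin_unlock_irqrestore(&some_lock, flags);
}
<snip>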
This commit therefore defers page allocation to a clean context using the
combination of an hrtimer and a workqueue. The hrtimer stage is required
in order to avoid deadlocks with the scheduler. This deferred allocation
is required only when kvfree_rcu()'s per-CPU page cache is empty.
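Reduced to its essentials, the deferral has the shape sketched below. The
structure and function names here are illustrative stand-ins; the real patch
extends struct kfree_rcu_cpu and adds run_page_cache_worker(),
schedule_page_work_fn() and fill_page_cache_func(), see the hunks further
down. The hrtimer hop addresses the scheduler-deadlock concern above:
queue_work() can end up waking a worker thread and therefore taking scheduler
locks, so it is issued from the hrtimer's hard-IRQ expiry rather than
directly from the kvfree_rcu() call site.
<snip>
#include <linux/atomic.h>
#include <linux/gfp.h>
#include <linux/hrtimer.h>
#include <linux/workqueue.h>

/* Illustrative stand-in for the fields added to struct kfree_rcu_cpu. */
struct refill_state {
        struct work_struct work;        /* refills the cache, may sleep */
        atomic_t in_progress;           /* at most one refill at a time */
        struct hrtimer timer;           /* atomic context -> workqueue bridge */
};

/* Step 2: hrtimer expiry runs in hard-IRQ context and only queues the work. */
static enum hrtimer_restart refill_timer_fn(struct hrtimer *t)
{
        struct refill_state *rs = container_of(t, struct refill_state, timer);

        queue_work(system_highpri_wq, &rs->work);
        return HRTIMER_NORESTART;
}

/* Step 3: workqueue context may sleep, so GFP_KERNEL allocation is legal. */
static void refill_work_fn(struct work_struct *work)
{
        struct refill_state *rs = container_of(work, struct refill_state, work);
        unsigned long page = __get_free_page(GFP_KERNEL | __GFP_NOWARN);

        if (page)
                free_page(page);        /* the real code caches the page instead */

        atomic_set(&rs->in_progress, 0);
}

/*
 * Step 1: called from the raw atomic path when the per-CPU cache is empty.
 * At init time rs->work is bound via INIT_WORK(&rs->work, refill_work_fn),
 * as the patch does in kfree_rcu_batch_init().
 */
static void kick_refill(struct refill_state *rs)
{
        if (!atomic_xchg(&rs->in_progress, 1)) {
                hrtimer_init(&rs->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
                rs->timer.function = refill_timer_fn;
                hrtimer_start(&rs->timer, 0, HRTIMER_MODE_REL);
        }
}
<snip>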
Link: https://lore.kernel.org/lkml/20200630164543.4mdcf6zb4zfclhln@linutronix.de/
Fixes: 3042f83f19 ("rcu: Support reclaim for head-less object")
Reported-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
parent 5cacd18c52
commit 9b81af9c84
kernel/rcu/tree.c

@@ -177,7 +177,7 @@ module_param(rcu_unlock_delay, int, 0444);
  * per-CPU. Object size is equal to one page. This value
  * can be changed at boot time.
  */
-static int rcu_min_cached_objs = 2;
+static int rcu_min_cached_objs = 5;
 module_param(rcu_min_cached_objs, int, 0444);
 
 /* Retrieve RCU kthreads priority for rcutorture */
@@ -3087,6 +3087,9 @@ struct kfree_rcu_cpu_work {
  * In order to save some per-cpu space the list is singular.
  * Even though it is lockless an access has to be protected by the
  * per-cpu lock.
+ * @page_cache_work: A work to refill the cache when it is empty
+ * @work_in_progress: Indicates that page_cache_work is running
+ * @hrtimer: A hrtimer for scheduling a page_cache_work
  * @nr_bkv_objs: number of allocated objects at @bkvcache.
  *
  * This is a per-CPU structure. The reason that it is not included in
@@ -3103,6 +3106,11 @@ struct kfree_rcu_cpu {
         bool monitor_todo;
         bool initialized;
         int count;
+
+        struct work_struct page_cache_work;
+        atomic_t work_in_progress;
+        struct hrtimer hrtimer;
+
         struct llist_head bkvcache;
         int nr_bkv_objs;
 };
@@ -3220,10 +3228,10 @@ static void kfree_rcu_work(struct work_struct *work)
                 }
                 rcu_lock_release(&rcu_callback_map);
 
-                krcp = krc_this_cpu_lock(&flags);
+                raw_spin_lock_irqsave(&krcp->lock, flags);
                 if (put_cached_bnode(krcp, bkvhead[i]))
                         bkvhead[i] = NULL;
-                krc_this_cpu_unlock(krcp, flags);
+                raw_spin_unlock_irqrestore(&krcp->lock, flags);
 
                 if (bkvhead[i])
                         free_page((unsigned long) bkvhead[i]);
@@ -3350,6 +3358,57 @@ static void kfree_rcu_monitor(struct work_struct *work)
         raw_spin_unlock_irqrestore(&krcp->lock, flags);
 }
 
+static enum hrtimer_restart
+schedule_page_work_fn(struct hrtimer *t)
+{
+        struct kfree_rcu_cpu *krcp =
+                container_of(t, struct kfree_rcu_cpu, hrtimer);
+
+        queue_work(system_highpri_wq, &krcp->page_cache_work);
+        return HRTIMER_NORESTART;
+}
+
+static void fill_page_cache_func(struct work_struct *work)
+{
+        struct kvfree_rcu_bulk_data *bnode;
+        struct kfree_rcu_cpu *krcp =
+                container_of(work, struct kfree_rcu_cpu,
+                        page_cache_work);
+        unsigned long flags;
+        bool pushed;
+        int i;
+
+        for (i = 0; i < rcu_min_cached_objs; i++) {
+                bnode = (struct kvfree_rcu_bulk_data *)
+                        __get_free_page(GFP_KERNEL | __GFP_NOWARN);
+
+                if (bnode) {
+                        raw_spin_lock_irqsave(&krcp->lock, flags);
+                        pushed = put_cached_bnode(krcp, bnode);
+                        raw_spin_unlock_irqrestore(&krcp->lock, flags);
+
+                        if (!pushed) {
+                                free_page((unsigned long) bnode);
+                                break;
+                        }
+                }
+        }
+
+        atomic_set(&krcp->work_in_progress, 0);
+}
+
+static void
+run_page_cache_worker(struct kfree_rcu_cpu *krcp)
+{
+        if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
+                        !atomic_xchg(&krcp->work_in_progress, 1)) {
+                hrtimer_init(&krcp->hrtimer, CLOCK_MONOTONIC,
+                        HRTIMER_MODE_REL);
+                krcp->hrtimer.function = schedule_page_work_fn;
+                hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL);
+        }
+}
+
 static inline bool
 kvfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, void *ptr)
 {
@@ -3366,32 +3425,8 @@ kvfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, void *ptr)
         if (!krcp->bkvhead[idx] ||
                         krcp->bkvhead[idx]->nr_records == KVFREE_BULK_MAX_ENTR) {
                 bnode = get_cached_bnode(krcp);
-                if (!bnode) {
-                        /*
-                         * To keep this path working on raw non-preemptible
-                         * sections, prevent the optional entry into the
-                         * allocator as it uses sleeping locks. In fact, even
-                         * if the caller of kfree_rcu() is preemptible, this
-                         * path still is not, as krcp->lock is a raw spinlock.
-                         * With additional page pre-allocation in the works,
-                         * hitting this return is going to be much less likely.
-                         */
-                        if (IS_ENABLED(CONFIG_PREEMPT_RT))
-                                return false;
-
-                        /*
-                         * NOTE: For one argument of kvfree_rcu() we can
-                         * drop the lock and get the page in sleepable
-                         * context. That would allow to maintain an array
-                         * for the CONFIG_PREEMPT_RT as well if no cached
-                         * pages are available.
-                         */
-                        bnode = (struct kvfree_rcu_bulk_data *)
-                                __get_free_page(GFP_NOWAIT | __GFP_NOWARN);
-                }
-
                 /* Switch to emergency path. */
-                if (unlikely(!bnode))
+                if (!bnode)
                         return false;
 
                 /* Initialize the new block. */
@@ -3455,12 +3490,10 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
                 goto unlock_return;
         }
 
-        /*
-         * Under high memory pressure GFP_NOWAIT can fail,
-         * in that case the emergency path is maintained.
-         */
         success = kvfree_call_rcu_add_ptr_to_bulk(krcp, ptr);
         if (!success) {
+                run_page_cache_worker(krcp);
+
                 if (head == NULL)
                         // Inline if kvfree_rcu(one_arg) call.
                         goto unlock_return;
@@ -4452,24 +4485,14 @@ static void __init kfree_rcu_batch_init(void)
 
         for_each_possible_cpu(cpu) {
                 struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
-                struct kvfree_rcu_bulk_data *bnode;
 
                 for (i = 0; i < KFREE_N_BATCHES; i++) {
                         INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
                         krcp->krw_arr[i].krcp = krcp;
                 }
 
-                for (i = 0; i < rcu_min_cached_objs; i++) {
-                        bnode = (struct kvfree_rcu_bulk_data *)
-                                __get_free_page(GFP_NOWAIT | __GFP_NOWARN);
-
-                        if (bnode)
-                                put_cached_bnode(krcp, bnode);
-                        else
-                                pr_err("Failed to preallocate for %d CPU!\n", cpu);
-                }
-
                 INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
+                INIT_WORK(&krcp->page_cache_work, fill_page_cache_func);
                 krcp->initialized = true;
         }
         if (register_shrinker(&kfree_rcu_shrinker))