rcu/tree: Defer kvfree_rcu() allocation to a clean context

[ Upstream commit 56292e8609e39537297a7468dda4d87b9bd81d6a ]

The current memmory-allocation interface causes the following difficulties
for kvfree_rcu():

a) If built with CONFIG_PROVE_RAW_LOCK_NESTING, the lockdep will
   complain about violation of the nesting rules, as in "BUG: Invalid
   wait context".  This Kconfig option checks for proper raw_spinlock
   vs. spinlock nesting, in particular, it is not legal to acquire a
   spinlock_t while holding a raw_spinlock_t.

   This is a problem because kfree_rcu() uses raw_spinlock_t whereas the
   "page allocator" internally deals with spinlock_t to access to its
   zones. The code also can be broken from higher level of view:
   <snip>
       raw_spin_lock(&some_lock);
       kfree_rcu(some_pointer, some_field_offset);
   <snip>

b) If built with CONFIG_PREEMPT_RT, spinlock_t is converted into
   sleeplock.  This means that invoking the page allocator from atomic
   contexts results in "BUG: scheduling while atomic".

c) Please note that call_rcu() is already invoked from raw atomic context,
   so it is only reasonable to expaect that kfree_rcu() and kvfree_rcu()
   will also be called from atomic raw context.

This commit therefore defers page allocation to a clean context using the
combination of an hrtimer and a workqueue.  The hrtimer stage is required
in order to avoid deadlocks with the scheduler.  This deferred allocation
is required only when kvfree_rcu()'s per-CPU page cache is empty.

Link: https://lore.kernel.org/lkml/20200630164543.4mdcf6zb4zfclhln@linutronix.de/
Fixes: 3042f83f19 ("rcu: Support reclaim for head-less object")
Reported-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
This commit is contained in:
Uladzislau Rezki (Sony) 2020-10-29 17:50:04 +01:00 committed by Greg Kroah-Hartman
parent 5cacd18c52
commit 9b81af9c84

View File

@ -177,7 +177,7 @@ module_param(rcu_unlock_delay, int, 0444);
* per-CPU. Object size is equal to one page. This value
* can be changed at boot time.
*/
static int rcu_min_cached_objs = 2;
static int rcu_min_cached_objs = 5;
module_param(rcu_min_cached_objs, int, 0444);
/* Retrieve RCU kthreads priority for rcutorture */
@ -3087,6 +3087,9 @@ struct kfree_rcu_cpu_work {
* In order to save some per-cpu space the list is singular.
* Even though it is lockless an access has to be protected by the
* per-cpu lock.
* @page_cache_work: A work to refill the cache when it is empty
* @work_in_progress: Indicates that page_cache_work is running
* @hrtimer: A hrtimer for scheduling a page_cache_work
* @nr_bkv_objs: number of allocated objects at @bkvcache.
*
* This is a per-CPU structure. The reason that it is not included in
@ -3103,6 +3106,11 @@ struct kfree_rcu_cpu {
bool monitor_todo;
bool initialized;
int count;
struct work_struct page_cache_work;
atomic_t work_in_progress;
struct hrtimer hrtimer;
struct llist_head bkvcache;
int nr_bkv_objs;
};
@ -3220,10 +3228,10 @@ static void kfree_rcu_work(struct work_struct *work)
}
rcu_lock_release(&rcu_callback_map);
krcp = krc_this_cpu_lock(&flags);
raw_spin_lock_irqsave(&krcp->lock, flags);
if (put_cached_bnode(krcp, bkvhead[i]))
bkvhead[i] = NULL;
krc_this_cpu_unlock(krcp, flags);
raw_spin_unlock_irqrestore(&krcp->lock, flags);
if (bkvhead[i])
free_page((unsigned long) bkvhead[i]);
@ -3350,6 +3358,57 @@ static void kfree_rcu_monitor(struct work_struct *work)
raw_spin_unlock_irqrestore(&krcp->lock, flags);
}
static enum hrtimer_restart
schedule_page_work_fn(struct hrtimer *t)
{
struct kfree_rcu_cpu *krcp =
container_of(t, struct kfree_rcu_cpu, hrtimer);
queue_work(system_highpri_wq, &krcp->page_cache_work);
return HRTIMER_NORESTART;
}
static void fill_page_cache_func(struct work_struct *work)
{
struct kvfree_rcu_bulk_data *bnode;
struct kfree_rcu_cpu *krcp =
container_of(work, struct kfree_rcu_cpu,
page_cache_work);
unsigned long flags;
bool pushed;
int i;
for (i = 0; i < rcu_min_cached_objs; i++) {
bnode = (struct kvfree_rcu_bulk_data *)
__get_free_page(GFP_KERNEL | __GFP_NOWARN);
if (bnode) {
raw_spin_lock_irqsave(&krcp->lock, flags);
pushed = put_cached_bnode(krcp, bnode);
raw_spin_unlock_irqrestore(&krcp->lock, flags);
if (!pushed) {
free_page((unsigned long) bnode);
break;
}
}
}
atomic_set(&krcp->work_in_progress, 0);
}
static void
run_page_cache_worker(struct kfree_rcu_cpu *krcp)
{
if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
!atomic_xchg(&krcp->work_in_progress, 1)) {
hrtimer_init(&krcp->hrtimer, CLOCK_MONOTONIC,
HRTIMER_MODE_REL);
krcp->hrtimer.function = schedule_page_work_fn;
hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL);
}
}
static inline bool
kvfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, void *ptr)
{
@ -3366,32 +3425,8 @@ kvfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, void *ptr)
if (!krcp->bkvhead[idx] ||
krcp->bkvhead[idx]->nr_records == KVFREE_BULK_MAX_ENTR) {
bnode = get_cached_bnode(krcp);
if (!bnode) {
/*
* To keep this path working on raw non-preemptible
* sections, prevent the optional entry into the
* allocator as it uses sleeping locks. In fact, even
* if the caller of kfree_rcu() is preemptible, this
* path still is not, as krcp->lock is a raw spinlock.
* With additional page pre-allocation in the works,
* hitting this return is going to be much less likely.
*/
if (IS_ENABLED(CONFIG_PREEMPT_RT))
return false;
/*
* NOTE: For one argument of kvfree_rcu() we can
* drop the lock and get the page in sleepable
* context. That would allow to maintain an array
* for the CONFIG_PREEMPT_RT as well if no cached
* pages are available.
*/
bnode = (struct kvfree_rcu_bulk_data *)
__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
}
/* Switch to emergency path. */
if (unlikely(!bnode))
if (!bnode)
return false;
/* Initialize the new block. */
@ -3455,12 +3490,10 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
goto unlock_return;
}
/*
* Under high memory pressure GFP_NOWAIT can fail,
* in that case the emergency path is maintained.
*/
success = kvfree_call_rcu_add_ptr_to_bulk(krcp, ptr);
if (!success) {
run_page_cache_worker(krcp);
if (head == NULL)
// Inline if kvfree_rcu(one_arg) call.
goto unlock_return;
@ -4452,24 +4485,14 @@ static void __init kfree_rcu_batch_init(void)
for_each_possible_cpu(cpu) {
struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
struct kvfree_rcu_bulk_data *bnode;
for (i = 0; i < KFREE_N_BATCHES; i++) {
INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
krcp->krw_arr[i].krcp = krcp;
}
for (i = 0; i < rcu_min_cached_objs; i++) {
bnode = (struct kvfree_rcu_bulk_data *)
__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
if (bnode)
put_cached_bnode(krcp, bnode);
else
pr_err("Failed to preallocate for %d CPU!\n", cpu);
}
INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
INIT_WORK(&krcp->page_cache_work, fill_page_cache_func);
krcp->initialized = true;
}
if (register_shrinker(&kfree_rcu_shrinker))