From: Paul E. McKenney <paulmck@kernel.org>
Date: Fri, 24 Jan 2020 18:37:27 +0000 (-0800)
Subject: Merge branches 'doc.2019.12.10a', 'exp.2019.12.09a', 'fixes.2020.01.24a', 'kfree_rcu.2020.01.24a', 'list.2020.01.10a', 'preempt.2020.01.24a' and 'torture.2019.12.09a' into HEAD
X-Git-Url: http://git.maquefel.me/?a=commitdiff_plain;h=0e247386d9ed5ab8b7dad010cf4b183efeb1e47d;p=linux.git

Merge branches 'doc.2019.12.10a', 'exp.2019.12.09a', 'fixes.2020.01.24a', 'kfree_rcu.2020.01.24a', 'list.2020.01.10a', 'preempt.2020.01.24a' and 'torture.2019.12.09a' into HEAD

doc.2019.12.10a: Documentation updates
exp.2019.12.09a: Expedited grace-period updates
fixes.2020.01.24a: Miscellaneous fixes
kfree_rcu.2020.01.24a: Batch kfree_rcu() work
list.2020.01.10a: RCU-protected-list updates
preempt.2020.01.24a: Preemptible RCU updates
torture.2019.12.09a: Torture-test updates
---

0e247386d9ed5ab8b7dad010cf4b183efeb1e47d
diff --cc kernel/rcu/tree.c
index 1694a6b57ad8c,6145e08a14072,878f62f218e90,31d2d9255d959,1694a6b57ad8c,b0e0612392a96,1694a6b57ad8c..d91c9156fab2e
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@@@@@@@ -2683,12 -2684,12 -2669,12 -2691,165 -2683,12 -2689,12 -2683,12 +2684,165 @@@@@@@@ void call_rcu(struct rcu_head *head, rc
       }
       EXPORT_SYMBOL_GPL(call_rcu);
       
+++ +++
+++ +++/* Maximum number of jiffies to wait before draining a batch. */
+++ +++#define KFREE_DRAIN_JIFFIES (HZ / 50)
+++ +++#define KFREE_N_BATCHES 2
+++ +++
+++ +++/**
+++ +++ * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests
+++ +++ * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period
+++ +++ * @head_free: List of kfree_rcu() objects waiting for a grace period
+++ +++ * @krcp: Pointer to @kfree_rcu_cpu structure
+++ +++ */
+++ +++
+++ +++struct kfree_rcu_cpu_work {
+++ +++	struct rcu_work rcu_work;
+++ +++	struct rcu_head *head_free;
+++ +++	struct kfree_rcu_cpu *krcp;
+++ +++};
+++ +++
+++ +++/**
+++ +++ * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period
+++ +++ * @head: List of kfree_rcu() objects not yet waiting for a grace period
+++ +++ * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period
+++ +++ * @lock: Synchronize access to this structure
+++ +++ * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES
+++ +++ * @monitor_todo: Tracks whether a @monitor_work delayed work is pending
+++ +++ * @initialized: The @lock and @rcu_work fields have been initialized
+++ +++ *
+++ +++ * This is a per-CPU structure.  The reason that it is not included in
+++ +++ * the rcu_data structure is to permit this code to be extracted from
+++ +++ * the RCU files.  Such extraction could allow further optimization of
+++ +++ * the interactions with the slab allocators.
+++ +++ */
+++ +++struct kfree_rcu_cpu {
+++ +++	struct rcu_head *head;
+++ +++	struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
+++ +++	spinlock_t lock;
+++ +++	struct delayed_work monitor_work;
+++ +++	bool monitor_todo;
+++ +++	bool initialized;
+++ +++};
+++ +++
+++ +++static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc);
+++ +++
+++ +++/*
+++ +++ * This function is invoked in workqueue context after a grace period.
+++ +++ * It frees all the objects queued on ->head_free.
+++ +++ */
+++ +++static void kfree_rcu_work(struct work_struct *work)
+++ +++{
+++ +++	unsigned long flags;
+++ +++	struct rcu_head *head, *next;
+++ +++	struct kfree_rcu_cpu *krcp;
+++ +++	struct kfree_rcu_cpu_work *krwp;
+++ +++
+++ +++	krwp = container_of(to_rcu_work(work),
+++ +++			    struct kfree_rcu_cpu_work, rcu_work);
+++ +++	krcp = krwp->krcp;
+++ +++	spin_lock_irqsave(&krcp->lock, flags);
+++ +++	head = krwp->head_free;
+++ +++	krwp->head_free = NULL;
+++ +++	spin_unlock_irqrestore(&krcp->lock, flags);
+++ +++
+++ +++	// List "head" is now private, so traverse locklessly.
+++ +++	for (; head; head = next) {
+++ +++		unsigned long offset = (unsigned long)head->func;
+++ +++
+++ +++		next = head->next;
+++ +++		// Potentially optimize with kfree_bulk in future.
+++ +++		debug_rcu_head_unqueue(head);
+++ +++		rcu_lock_acquire(&rcu_callback_map);
+++ +++		trace_rcu_invoke_kfree_callback(rcu_state.name, head, offset);
+++ +++
+++ +++		if (!WARN_ON_ONCE(!__is_kfree_rcu_offset(offset))) {
+++ +++			/* Could be optimized with kfree_bulk() in future. */
+++ +++			kfree((void *)head - offset);
+++ +++		}
+++ +++
+++ +++		rcu_lock_release(&rcu_callback_map);
+++ +++		cond_resched_tasks_rcu_qs();
+++ +++	}
+++ +++}
+++ +++
  +  + /*
--  - - * Queue an RCU callback for lazy invocation after a grace period.
--  - - * This will likely be later named something like "call_rcu_lazy()",
--  - - * but this change will require some way of tagging the lazy RCU
--  - - * callbacks in the list of pending callbacks. Until then, this
--  - - * function may only be called from __kfree_rcu().
+++ +++ * Schedule the kfree batch RCU work to run in workqueue context after a GP.
+++ +++ *
+++ +++ * This function is invoked by kfree_rcu_monitor() when the KFREE_DRAIN_JIFFIES
+++ +++ * timeout has been reached.
+++ +++ */
+++ +++static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp)
+++ +++{
+++ +++	int i;
+++ +++	struct kfree_rcu_cpu_work *krwp = NULL;
+++ +++
+++ +++	lockdep_assert_held(&krcp->lock);
+++ +++	for (i = 0; i < KFREE_N_BATCHES; i++)
+++ +++		if (!krcp->krw_arr[i].head_free) {
+++ +++			krwp = &(krcp->krw_arr[i]);
+++ +++			break;
+++ +++		}
+++ +++
+++ +++	// If a previous RCU batch is in progress, we cannot immediately
+++ +++	// queue another one, so return false to tell caller to retry.
+++ +++	if (!krwp)
+++ +++		return false;
+++ +++
+++ +++	krwp->head_free = krcp->head;
+++ +++	krcp->head = NULL;
+++ +++	INIT_RCU_WORK(&krwp->rcu_work, kfree_rcu_work);
+++ +++	queue_rcu_work(system_wq, &krwp->rcu_work);
+++ +++	return true;
+++ +++}
+++ +++
+++ +++static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp,
+++ +++					  unsigned long flags)
+++ +++{
+++ +++	// Attempt to start a new batch.
+++ +++	krcp->monitor_todo = false;
+++ +++	if (queue_kfree_rcu_work(krcp)) {
+++ +++		// Success! Our job is done here.
+++ +++		spin_unlock_irqrestore(&krcp->lock, flags);
+++ +++		return;
+++ +++	}
+++ +++
+++ +++	// Previous RCU batch still in progress, try again later.
+++ +++	krcp->monitor_todo = true;
+++ +++	schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
+++ +++	spin_unlock_irqrestore(&krcp->lock, flags);
+++ +++}
+++ +++
++  +++/*
  -     * Queue an RCU callback for lazy invocation after a grace period.
  -     * This will likely be later named something like "call_rcu_lazy()",
  -     * but this change will require some way of tagging the lazy RCU
  -     * callbacks in the list of pending callbacks. Until then, this
  -     * function may only be called from __kfree_rcu().
+++ +++ * This function is invoked after the KFREE_DRAIN_JIFFIES timeout.
+++ +++ * It invokes kfree_rcu_drain_unlock() to attempt to start another batch.
+++ +++ */
+++ +++static void kfree_rcu_monitor(struct work_struct *work)
+++ +++{
+++ +++	unsigned long flags;
+++ +++	struct kfree_rcu_cpu *krcp = container_of(work, struct kfree_rcu_cpu,
+++ +++						 monitor_work.work);
+++ +++
+++ +++	spin_lock_irqsave(&krcp->lock, flags);
+++ +++	if (krcp->monitor_todo)
+++ +++		kfree_rcu_drain_unlock(krcp, flags);
+++ +++	else
+++ +++		spin_unlock_irqrestore(&krcp->lock, flags);
+++ +++}
+++ +++
+++ + +/*
     -  * Queue an RCU callback for lazy invocation after a grace period.
     -  * This will likely be later named something like "call_rcu_lazy()",
     -  * but this change will require some way of tagging the lazy RCU
     -  * callbacks in the list of pending callbacks. Until then, this
     -  * function may only be called from __kfree_rcu().
+++ +++ * Queue a request for lazy invocation of kfree() after a grace period.
+++ +++ *
+++ +++ * Each kfree_call_rcu() request is added to a batch.  The batch is drained
+++ +++ * every KFREE_DRAIN_JIFFIES jiffies, at which point all objects in the
+++ +++ * batch are kfree'd in workqueue context.  This allows us to:
+++ +++ *
+++ +++ * 1.	Batch requests together to reduce the number of grace periods during
+++ +++ *	heavy kfree_rcu() load.
+++ +++ *
+++ +++ * 2.	Make it possible to use kfree_bulk() on a large number of
+++ +++ *	kfree_rcu() requests, thus reducing cache misses and the per-object
+++ +++ *	overhead of kfree().
        */
       void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
       {
@@@@@@@@ -2696,11 -2697,11 -2682,11 -2886,31 -2696,11 -2702,11 -2696,11 +2879,31 @@@@@@@@ unlock_return
       }
       EXPORT_SYMBOL_GPL(kfree_call_rcu);
       
+++ +++void __init kfree_rcu_scheduler_running(void)
+++ +++{
+++ +++	int cpu;
+++ +++	unsigned long flags;
+++ +++
+++ +++	for_each_online_cpu(cpu) {
+++ +++		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
+++ +++
+++ +++		spin_lock_irqsave(&krcp->lock, flags);
+++ +++		if (!krcp->head || krcp->monitor_todo) {
+++ +++			spin_unlock_irqrestore(&krcp->lock, flags);
+++ +++			continue;
+++ +++		}
+++ +++		krcp->monitor_todo = true;
+++ +++		schedule_delayed_work_on(cpu, &krcp->monitor_work,
+++ +++					 KFREE_DRAIN_JIFFIES);
+++ +++		spin_unlock_irqrestore(&krcp->lock, flags);
+++ +++	}
+++ +++}
+++ +++
       /*
        * During early boot, any blocking grace-period wait automatically
----- - * implies a grace period.  Later on, this is never the case for PREEMPT.
+++++ + * implies a grace period.  Later on, this is never the case for PREEMPTION.
        *
----- - * Howevr, because a context switch is a grace period for !PREEMPT, any
+++++ + * However, because a context switch is a grace period for !PREEMPTION, any
        * blocking grace-period wait automatically implies a grace period if
        * there is only one CPU online at any point time during execution of
        * either synchronize_rcu() or synchronize_rcu_expedited().  It is OK to
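
For context on how callers reach the batched path in the tree.c hunk above: kfree_rcu(ptr, field) expands into a kfree_call_rcu() invocation whose callback slot carries the byte offset of the rcu_head within the enclosing structure, which is the offset that kfree_rcu_work() later subtracts before calling kfree().  The sketch below is a minimal caller-side illustration, not part of this commit; the struct and function names (foo, foo_release) are hypothetical.

	#include <linux/rcupdate.h>	/* kfree_rcu(), struct rcu_head */
	#include <linux/slab.h>		/* kmalloc()/kfree() */

	struct foo {
		int data;
		struct rcu_head rh;	/* storage consumed by kfree_rcu() */
	};

	static void foo_release(struct foo *fp)
	{
		/*
		 * Queues fp on this CPU's krc batch.  Once the batch is
		 * handed to a kfree_rcu_cpu_work slot and a grace period
		 * has elapsed, the workqueue handler frees the object via
		 * kfree((void *)&fp->rh - offsetof(struct foo, rh)).
		 */
		kfree_rcu(fp, rh);
	}

Under heavy load, many such foo_release() calls on a CPU share a single grace period and a single workqueue pass, which is the batching benefit described in the kfree_call_rcu() comment above.
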
diff --cc kernel/rcu/tree.h
index 055c31781d3ae,f9253ed406ba4,ce90c68c184b4,15405420b40c1,055c31781d3ae,055c31781d3ae,055c31781d3ae..0c87e4c161c2f
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@@@@@@@ -182,8 -182,9 -181,8 -182,7 -182,8 -182,8 -182,8 +181,8 @@@@@@@@ struct rcu_data 
       	bool rcu_need_heavy_qs;		/* GP old, so heavy quiescent state! */
       	bool rcu_urgent_qs;		/* GP old need light quiescent state. */
       	bool rcu_forced_tick;		/* Forced tick to provide QS. */
+ +++++	bool rcu_forced_tick_exp;	/*   ... provide QS to expedited GP. */
       #ifdef CONFIG_RCU_FAST_NO_HZ
--- ---	bool all_lazy;			/* All CPU's CBs lazy at idle start? */
       	unsigned long last_accelerate;	/* Last jiffy CBs were accelerated. */
       	unsigned long last_advance_all;	/* Last jiffy CBs were all advanced. */
       	int tick_nohz_enabled_snap;	/* Previously seen value from sysfs. */