blk-cgroup: Optimize blkcg_rstat_flush()

author Waiman Long <longman@redhat.com>

Sat, 5 Nov 2022 00:59:01 +0000 (20:59 -0400)

committer Jens Axboe <axboe@kernel.dk>

Wed, 16 Nov 2022 23:58:44 +0000 (16:58 -0700)
author Waiman Long <longman@redhat.com>
Sat, 5 Nov 2022 00:59:01 +0000 (20:59 -0400)
committer Jens Axboe <axboe@kernel.dk>
Wed, 16 Nov 2022 23:58:44 +0000 (16:58 -0700)
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c

index af8a4d2d1fd15d3f8b314e7941d21e604bc26c06..3e03c0d132537b561b43857fefdafae6bdbcbc30 100644 (file)
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -59,6 +59,37 @@ static struct workqueue_struct *blkcg_punt_bio_wq;
  
  #define BLKG_DESTROY_BATCH_SIZE  64
  
+/*
+ * Lockless lists for tracking IO stats updates
+ *
+ * New IO stats are stored in the percpu iostat_cpu within blkcg_gq (blkg).
+ * There are multiple blkg's (one for each block device) attached to each
+ * blkcg. The rstat code keeps track of which cpu has IO stats updated,
+ * but it doesn't know which blkg has the updated stats. If there are many
+ * block devices in a system, the cost of iterating all the blkg's to flush
+ * out the IO stats can be high. To reduce such overhead, a set of percpu
+ * lockless lists (lhead) per blkcg is used to track the set of recently
+ * updated iostat_cpu's since the last flush. An iostat_cpu will be put
+ * onto the lockless list on the update side [blk_cgroup_bio_start()] if
+ * not there yet and then removed when being flushed [blkcg_rstat_flush()].
+ * A reference to the blkg is taken when its iostat_cpu is queued and
+ * dropped once it has been flushed, to protect against blkg removal.
+ *
+ * Return: 0 if successful or -ENOMEM if the percpu allocation fails.
+ */
+static int init_blkcg_llists(struct blkcg *blkcg)
+{
+       int cpu;
+
+       blkcg->lhead = alloc_percpu_gfp(struct llist_head, GFP_KERNEL);
+       if (!blkcg->lhead)
+               return -ENOMEM;
+
+       for_each_possible_cpu(cpu)
+               init_llist_head(per_cpu_ptr(blkcg->lhead, cpu));
+       return 0;
+}
+
  /**
   * blkcg_css - find the current css
   *
@@ -236,8 +267,10 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk,
         blkg->blkcg = blkcg;
  
         u64_stats_init(&blkg->iostat.sync);
-       for_each_possible_cpu(cpu)
+       for_each_possible_cpu(cpu) {
                 u64_stats_init(&per_cpu_ptr(blkg->iostat_cpu, cpu)->sync);
+               per_cpu_ptr(blkg->iostat_cpu, cpu)->blkg = blkg;
+       }
  
         for (i = 0; i < BLKCG_MAX_POLS; i++) {
                 struct blkcg_policy *pol = blkcg_policy[i];
@@ -827,7 +860,9 @@ static void blkcg_iostat_update(struct blkcg_gq *blkg, struct blkg_iostat *cur,
  static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
  {
         struct blkcg *blkcg = css_to_blkcg(css);
-       struct blkcg_gq *blkg;
+       struct llist_head *lhead = per_cpu_ptr(blkcg->lhead, cpu);
+       struct llist_node *lnode;
+       struct blkg_iostat_set *bisc, *next_bisc;
  
         /* Root-level stats are sourced from system-wide IO stats */
         if (!cgroup_parent(css->cgroup))
@@ -835,12 +870,21 @@ static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
  
         rcu_read_lock();
  
-       hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
+       lnode = llist_del_all(lhead);
+       if (!lnode)
+               goto out;
+
+       /*
+        * Iterate only the iostat_cpu's queued in the lockless list.
+        */
+       llist_for_each_entry_safe(bisc, next_bisc, lnode, lnode) {
+               struct blkcg_gq *blkg = bisc->blkg;
                 struct blkcg_gq *parent = blkg->parent;
-               struct blkg_iostat_set *bisc = per_cpu_ptr(blkg->iostat_cpu, cpu);
                 struct blkg_iostat cur;
                 unsigned int seq;
  
+               WRITE_ONCE(bisc->lqueued, false);
+
                 /* fetch the current per-cpu values */
                 do {
                         seq = u64_stats_fetch_begin(&bisc->sync);
@@ -853,8 +897,10 @@ static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
                 if (parent && parent->parent)
                         blkcg_iostat_update(parent, &blkg->iostat.cur,
                                             &blkg->iostat.last);
+               percpu_ref_put(&blkg->refcnt);
         }
  
+out:
         rcu_read_unlock();
  }
  
@@ -1132,6 +1178,7 @@ static void blkcg_css_free(struct cgroup_subsys_state *css)
  
         mutex_unlock(&blkcg_pol_mutex);
  
+       free_percpu(blkcg->lhead);
         kfree(blkcg);
  }
  
@@ -1151,6 +1198,9 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
                         goto unlock;
         }
  
+       if (init_blkcg_llists(blkcg))
+               goto free_blkcg;
+
         for (i = 0; i < BLKCG_MAX_POLS ; i++) {
                 struct blkcg_policy *pol = blkcg_policy[i];
                 struct blkcg_policy_data *cpd;
@@ -1191,7 +1241,8 @@ free_pd_blkcg:
         for (i--; i >= 0; i--)
                 if (blkcg->cpd[i])
                         blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);
-
+       free_percpu(blkcg->lhead);
+free_blkcg:
         if (blkcg != &blkcg_root)
                 kfree(blkcg);
  unlock:
@@ -1939,6 +1990,7 @@ static int blk_cgroup_io_type(struct bio *bio)
  
  void blk_cgroup_bio_start(struct bio *bio)
  {
+       struct blkcg *blkcg = bio->bi_blkg->blkcg;
         int rwd = blk_cgroup_io_type(bio), cpu;
         struct blkg_iostat_set *bis;
         unsigned long flags;
@@ -1957,9 +2009,21 @@ void blk_cgroup_bio_start(struct bio *bio)
         }
         bis->cur.ios[rwd]++;
  
+       /*
+        * If the iostat_cpu isn't in a lockless list, put it into the
+        * list to indicate that a stat update is pending.
+        */
+       if (!READ_ONCE(bis->lqueued)) {
+               struct llist_head *lhead = this_cpu_ptr(blkcg->lhead);
+
+               llist_add(&bis->lnode, lhead);
+               WRITE_ONCE(bis->lqueued, true);
+               percpu_ref_get(&bis->blkg->refcnt);
+       }
+
         u64_stats_update_end_irqrestore(&bis->sync, flags);
         if (cgroup_subsys_on_dfl(io_cgrp_subsys))
-               cgroup_rstat_updated(bio->bi_blkg->blkcg->css.cgroup, cpu);
+               cgroup_rstat_updated(blkcg->css.cgroup, cpu);
         put_cpu();
  }
  
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h

index aa2b286bc825fbdcb3bc404dc106f158a6e750dd..1e94e404eaa80a70068e3fd55fe22bcf517a813f 100644 (file)
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -18,6 +18,7 @@
  #include <linux/cgroup.h>
  #include <linux/kthread.h>
  #include <linux/blk-mq.h>
+#include <linux/llist.h>
  
  struct blkcg_gq;
  struct blkg_policy_data;
@@ -43,6 +44,9 @@ struct blkg_iostat {
  
  struct blkg_iostat_set {
         struct u64_stats_sync           sync;
+       struct blkcg_gq                *blkg;          /* blkg this set belongs to */
+       struct llist_node               lnode;          /* node on a blkcg->lhead list */
+       int                             lqueued;        /* queued in llist */
         struct blkg_iostat              cur;
         struct blkg_iostat              last;
  };
@@ -97,6 +101,12 @@ struct blkcg {
         struct blkcg_policy_data        *cpd[BLKCG_MAX_POLS];
  
         struct list_head                all_blkcgs_node;
+
+       /*
+        * List of updated percpu blkg_iostat_set's since the last flush.
+        */
+       struct llist_head __percpu      *lhead;
+
  #ifdef CONFIG_BLK_CGROUP_FC_APPID
         char                            fc_app_id[FC_APPID_LEN];
  #endif
author	Waiman Long <longman@redhat.com>
	Sat, 5 Nov 2022 00:59:01 +0000 (20:59 -0400)
committer	Jens Axboe <axboe@kernel.dk>
	Wed, 16 Nov 2022 23:58:44 +0000 (16:58 -0700)
block/blk-cgroup.c		patch \| blob \| history
block/blk-cgroup.h		patch \| blob \| history