command.  A value of '0' means write-same is not supported by this
 device.
 
+wbt_lat_usec (RW)
+-----------------
+If the device is registered for writeback throttling, then this file shows
+the target minimum read latency, in microseconds. If this latency is
+exceeded in a given window of time, then the writeback throttling will
+start scaling back writes.
+
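+For example, assuming a device named sdX that has been registered for
+throttling, a 75 msec latency target could be set with:
+
+  echo 75000 > /sys/block/sdX/queue/wbt_lat_usec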
 
 Jens Axboe <jens.axboe@oracle.com>, February 2009
 
 
        See Documentation/block/cmdline-partition.txt for more information.
 
+config BLK_WBT
+       bool "Enable support for block device writeback throttling"
+       default n
+       ---help---
+       Enabling this option allows the block layer to throttle buffered
+       background writeback from the VM, making it smoother and reducing
+       its impact on foreground operations. The throttling is done
+       dynamically, using an algorithm loosely based on CoDel and factoring
+       in the realtime performance of the disk.
+
+config BLK_WBT_SQ
+       bool "Single queue writeback throttling"
+       default n
+       depends on BLK_WBT
+       ---help---
+       Enable writeback throttling by default on legacy single queue devices.
+
+config BLK_WBT_MQ
+       bool "Multiqueue writeback throttling"
+       default y
+       depends on BLK_WBT
+       ---help---
+       Enable writeback throttling by default on multiqueue devices.
+       Multiqueue currently doesn't have support for IO scheduling, so
+       enabling this option is recommended.
+
 menu "Partition Types"
 
 source "block/partitions/Kconfig"
 
 
 #include "blk.h"
 #include "blk-mq.h"
+#include "blk-wbt.h"
 
 EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
 EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
 
 fail:
        blk_free_flush_queue(q->fq);
+       wbt_exit(q);
        return NULL;
 }
 EXPORT_SYMBOL(blk_init_allocated_queue);
        blk_delete_timer(rq);
        blk_clear_rq_complete(rq);
        trace_block_rq_requeue(q, rq);
+       wbt_requeue(q->rq_wb, &rq->issue_stat);
 
        if (rq->rq_flags & RQF_QUEUED)
                blk_queue_end_tag(q, rq);
        /* this is a bio leak */
        WARN_ON(req->bio != NULL);
 
+       wbt_done(q->rq_wb, &req->issue_stat);
+
        /*
         * Request may not have originated from ll_rw_blk. If not,
         * it didn't come out of our reserved rq pools
        int el_ret, where = ELEVATOR_INSERT_SORT;
        struct request *req;
        unsigned int request_count = 0;
+       unsigned int wb_acct;
 
        /*
         * low level driver can indicate that it wants pages above a
        }
 
 get_rq:
+       wb_acct = wbt_wait(q->rq_wb, bio, q->queue_lock);
+
        /*
         * Grab a free request. This might sleep but cannot fail.
         * Returns with the queue unlocked.
         */
        req = get_request(q, bio->bi_opf, bio, GFP_NOIO);
        if (IS_ERR(req)) {
+               __wbt_done(q->rq_wb, wb_acct);
                bio->bi_error = PTR_ERR(req);
                bio_endio(bio);
                goto out_unlock;
        }
 
+       wbt_track(&req->issue_stat, wb_acct);
+
        /*
         * After dropping the lock and possibly sleeping here, our request
         * may now be mergeable after it had proven unmergeable (above).
        if (test_bit(QUEUE_FLAG_STATS, &req->q->queue_flags)) {
                blk_stat_set_issue_time(&req->issue_stat);
                req->rq_flags |= RQF_STATS;
+               wbt_issue(req->q->rq_wb, &req->issue_stat);
        }
 
        /*
 
        blk_account_io_done(req);
 
-       if (req->end_io)
+       if (req->end_io) {
+               wbt_done(req->q->rq_wb, &req->issue_stat);
                req->end_io(req, error);
-       else {
+       } else {
                if (blk_bidi_rq(req))
                        __blk_put_request(req->next_rq->q, req->next_rq);
 
 
 #include "blk-mq.h"
 #include "blk-mq-tag.h"
 #include "blk-stat.h"
+#include "blk-wbt.h"
 
 static DEFINE_MUTEX(all_q_mutex);
 static LIST_HEAD(all_q_list);
 
        if (rq->rq_flags & RQF_MQ_INFLIGHT)
                atomic_dec(&hctx->nr_active);
+
+       wbt_done(q->rq_wb, &rq->issue_stat);
        rq->rq_flags = 0;
 
        clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
        blk_account_io_done(rq);
 
        if (rq->end_io) {
+               wbt_done(rq->q->rq_wb, &rq->issue_stat);
                rq->end_io(rq, error);
        } else {
                if (unlikely(blk_bidi_rq(rq)))
        if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
                blk_stat_set_issue_time(&rq->issue_stat);
                rq->rq_flags |= RQF_STATS;
+               wbt_issue(q->rq_wb, &rq->issue_stat);
        }
 
        blk_add_timer(rq);
        struct request_queue *q = rq->q;
 
        trace_block_rq_requeue(q, rq);
+       wbt_requeue(q->rq_wb, &rq->issue_stat);
 
        if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
                if (q->dma_drain_size && blk_rq_bytes(rq))
        struct blk_plug *plug;
        struct request *same_queue_rq = NULL;
        blk_qc_t cookie;
+       unsigned int wb_acct;
 
        blk_queue_bounce(q, &bio);
 
            blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq))
                return BLK_QC_T_NONE;
 
+       wb_acct = wbt_wait(q->rq_wb, bio, NULL);
+
        rq = blk_mq_map_request(q, bio, &data);
-       if (unlikely(!rq))
+       if (unlikely(!rq)) {
+               __wbt_done(q->rq_wb, wb_acct);
                return BLK_QC_T_NONE;
+       }
+
+       wbt_track(&rq->issue_stat, wb_acct);
 
        cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num);
 
        struct blk_mq_alloc_data data;
        struct request *rq;
        blk_qc_t cookie;
+       unsigned int wb_acct;
 
        blk_queue_bounce(q, &bio);
 
        } else
                request_count = blk_plug_queued_count(q);
 
+       wb_acct = wbt_wait(q->rq_wb, bio, NULL);
+
        rq = blk_mq_map_request(q, bio, &data);
-       if (unlikely(!rq))
+       if (unlikely(!rq)) {
+               __wbt_done(q->rq_wb, wb_acct);
                return BLK_QC_T_NONE;
+       }
+
+       wbt_track(&rq->issue_stat, wb_acct);
 
        cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num);
 
        list_del_init(&q->all_q_node);
        mutex_unlock(&all_q_mutex);
 
+       wbt_exit(q);
+
        blk_mq_del_queue_tag_set(q);
 
        blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
 
 #include <linux/gfp.h>
 
 #include "blk.h"
+#include "blk-wbt.h"
 
 unsigned long blk_max_low_pfn;
 EXPORT_SYMBOL(blk_max_low_pfn);
 void blk_set_queue_depth(struct request_queue *q, unsigned int depth)
 {
        q->queue_depth = depth;
+       wbt_set_queue_depth(q->rq_wb, depth);
 }
 EXPORT_SYMBOL(blk_set_queue_depth);
 
        else
                queue_flag_clear(QUEUE_FLAG_FUA, q);
        spin_unlock_irq(q->queue_lock);
+
+       wbt_set_write_cache(q->rq_wb, test_bit(QUEUE_FLAG_WC, &q->queue_flags));
 }
 EXPORT_SYMBOL_GPL(blk_queue_write_cache);
 
 
 
 #include "blk.h"
 #include "blk-mq.h"
+#include "blk-wbt.h"
 
 struct queue_sysfs_entry {
        struct attribute attr;
        return count;
 }
 
+/*
+ * Parse an unsigned 64-bit value from a sysfs attribute write.
+ */
+static ssize_t queue_var_store64(u64 *var, const char *page)
+{
+       int err;
+       u64 v;
+
+       err = kstrtou64(page, 10, &v);
+       if (err < 0)
+               return err;
+
+       *var = v;
+       return 0;
+}
+
 static ssize_t queue_requests_show(struct request_queue *q, char *page)
 {
        return queue_var_show(q->nr_requests, (page));
        return ret;
 }
 
+static ssize_t queue_wb_lat_show(struct request_queue *q, char *page)
+{
+       if (!q->rq_wb)
+               return -EINVAL;
+
+       return sprintf(page, "%llu\n", div_u64(q->rq_wb->min_lat_nsec, 1000));
+}
+
+static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page,
+                                 size_t count)
+{
+       ssize_t ret;
+       u64 val;
+
+       if (!q->rq_wb)
+               return -EINVAL;
+
+       ret = queue_var_store64(&val, page);
+       if (ret < 0)
+               return ret;
+
+       q->rq_wb->min_lat_nsec = val * 1000ULL;
+       wbt_update_limits(q->rq_wb);
+       return count;
+}
+
 static ssize_t queue_wc_show(struct request_queue *q, char *page)
 {
        if (test_bit(QUEUE_FLAG_WC, &q->queue_flags))
        .show = queue_stats_show,
 };
 
+static struct queue_sysfs_entry queue_wb_lat_entry = {
+       .attr = {.name = "wbt_lat_usec", .mode = S_IRUGO | S_IWUSR },
+       .show = queue_wb_lat_show,
+       .store = queue_wb_lat_store,
+};
+
 static struct attribute *default_attrs[] = {
        &queue_requests_entry.attr,
        &queue_ra_entry.attr,
        &queue_wc_entry.attr,
        &queue_dax_entry.attr,
        &queue_stats_entry.attr,
+       &queue_wb_lat_entry.attr,
        NULL,
 };
 
        struct request_queue *q =
                container_of(kobj, struct request_queue, kobj);
 
+       wbt_exit(q);
        bdi_exit(&q->backing_dev_info);
        blkcg_exit_queue(q);
 
        .release        = blk_release_queue,
 };
 
+/*
+ * Wrappers around the block layer request stats, handed to the writeback
+ * throttling code through wb_stat_ops.
+ */
+static void blk_wb_stat_get(void *data, struct blk_rq_stat *stat)
+{
+       blk_queue_stat_get(data, stat);
+}
+
+static void blk_wb_stat_clear(void *data)
+{
+       blk_stat_clear(data);
+}
+
+static bool blk_wb_stat_is_current(struct blk_rq_stat *stat)
+{
+       return blk_stat_is_current(stat);
+}
+
+static struct wb_stat_ops wb_stat_ops = {
+       .get            = blk_wb_stat_get,
+       .is_current     = blk_wb_stat_is_current,
+       .clear          = blk_wb_stat_clear,
+};
+
+/*
+ * Enable writeback throttling for this queue, unless support for its queue
+ * type was left out via the BLK_WBT_SQ/BLK_WBT_MQ config options.
+ */
+static void blk_wb_init(struct request_queue *q)
+{
+#ifndef CONFIG_BLK_WBT_MQ
+       if (q->mq_ops)
+               return;
+#endif
+#ifndef CONFIG_BLK_WBT_SQ
+       if (q->request_fn)
+               return;
+#endif
+
+       /*
+        * If this fails, we don't get throttling
+        */
+       wbt_init(q, &wb_stat_ops);
+}
+
 int blk_register_queue(struct gendisk *disk)
 {
        int ret;
        if (q->mq_ops)
                blk_mq_register_dev(dev, q);
 
+       blk_wb_init(q);
+
        if (!q->request_fn)
                return 0;
 
 
 #include <linux/blktrace_api.h>
 #include <linux/blk-cgroup.h>
 #include "blk.h"
+#include "blk-wbt.h"
 
 /*
  * tunables
        struct cfq_data *cfqd = cic_to_cfqd(cic);
        struct cfq_queue *cfqq;
        uint64_t serial_nr;
+       bool nonroot_cg;
 
        rcu_read_lock();
        serial_nr = bio_blkcg(bio)->css.serial_nr;
+       nonroot_cg = bio_blkcg(bio) != &blkcg_root;
        rcu_read_unlock();
 
        /*
        if (unlikely(!cfqd) || likely(cic->blkcg_serial_nr == serial_nr))
                return;
 
+       /*
+        * If we have a non-root cgroup, we can depend on that to
+        * do proper throttling of writes. Turn off wbt for that
+        * case.
+        */
+       if (nonroot_cg) {
+               struct request_queue *q = cfqd->queue;
+
+               wbt_disable(q->rq_wb);
+       }
+
        /*
         * Drop reference to queues.  New queues will be assigned in new
         * group upon arrival of fresh requests.
 
 struct blkcg_gq;
 struct blk_flush_queue;
 struct pr_ops;
+struct rq_wb;
 
 #define BLKDEV_MIN_RQ  4
 #define BLKDEV_MAX_RQ  128     /* Default maximum */
        int                     nr_rqs[2];      /* # allocated [a]sync rqs */
        int                     nr_rqs_elvpriv; /* # allocated rqs w/ elvpriv */
 
+       struct rq_wb            *rq_wb;
+
        /*
         * If blkcg is not used, @q->root_rl serves all requests.  If blkcg
         * is used, root blkg allocates from @q->root_rl and all other