@@ ... @@ struct throtl_data
        unsigned int scale;
 
-       struct latency_bucket tmp_buckets[LATENCY_BUCKET_SIZE];
-       struct avg_latency_bucket avg_buckets[LATENCY_BUCKET_SIZE];
-       struct latency_bucket __percpu *latency_buckets;
+       struct latency_bucket tmp_buckets[2][LATENCY_BUCKET_SIZE];
+       struct avg_latency_bucket avg_buckets[2][LATENCY_BUCKET_SIZE];
+       struct latency_bucket __percpu *latency_buckets[2];
        unsigned long last_calculate_time;
        unsigned long filtered_latency;
 
@@ ... @@
 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
 static void throtl_update_latency_buckets(struct throtl_data *td)
 {
-       struct avg_latency_bucket avg_latency[LATENCY_BUCKET_SIZE];
-       int i, cpu;
-       unsigned long last_latency = 0;
-       unsigned long latency;
+       struct avg_latency_bucket avg_latency[2][LATENCY_BUCKET_SIZE];
+       int i, cpu, rw;
+       unsigned long last_latency[2] = { 0 };
+       unsigned long latency[2];
 
        if (!blk_queue_nonrot(td->queue))
                return;
        td->last_calculate_time = jiffies;
 
        memset(avg_latency, 0, sizeof(avg_latency));
-       for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
-               struct latency_bucket *tmp = &td->tmp_buckets[i];
-
-               for_each_possible_cpu(cpu) {
-                       struct latency_bucket *bucket;
-
-                       /* this isn't race free, but ok in practice */
-                       bucket = per_cpu_ptr(td->latency_buckets, cpu);
-                       tmp->total_latency += bucket[i].total_latency;
-                       tmp->samples += bucket[i].samples;
-                       bucket[i].total_latency = 0;
-                       bucket[i].samples = 0;
-               }
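+       /* aggregate the per-cpu buckets into tmp_buckets, one pass per direction */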
+       for (rw = READ; rw <= WRITE; rw++) {
+               for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
+                       struct latency_bucket *tmp = &td->tmp_buckets[rw][i];
+
+                       for_each_possible_cpu(cpu) {
+                               struct latency_bucket *bucket;
+
+                               /* this isn't race free, but ok in practice */
+                               bucket = per_cpu_ptr(td->latency_buckets[rw],
+                                       cpu);
+                               tmp->total_latency += bucket[i].total_latency;
+                               tmp->samples += bucket[i].samples;
+                               bucket[i].total_latency = 0;
+                               bucket[i].samples = 0;
+                       }
 
-               if (tmp->samples >= 32) {
-                       int samples = tmp->samples;
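+                       /* require at least 32 samples before trusting the average */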
+                       if (tmp->samples >= 32) {
+                               int samples = tmp->samples;
 
-                       latency = tmp->total_latency;
+                               latency[rw] = tmp->total_latency;
 
-                       tmp->total_latency = 0;
-                       tmp->samples = 0;
-                       latency /= samples;
-                       if (latency == 0)
-                               continue;
-                       avg_latency[i].latency = latency;
+                               tmp->total_latency = 0;
+                               tmp->samples = 0;
+                               latency[rw] /= samples;
+                               if (latency[rw] == 0)
+                                       continue;
+                               avg_latency[rw][i].latency = latency[rw];
+                       }
                }
        }
 
-       for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
-               if (!avg_latency[i].latency) {
-                       if (td->avg_buckets[i].latency < last_latency)
-                               td->avg_buckets[i].latency = last_latency;
-                       continue;
-               }
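+       /* fold this round's averages into the per-direction long-term estimates */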
+       for (rw = READ; rw <= WRITE; rw++) {
+               for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
+                       if (!avg_latency[rw][i].latency) {
+                               if (td->avg_buckets[rw][i].latency < last_latency[rw])
+                                       td->avg_buckets[rw][i].latency =
+                                               last_latency[rw];
+                               continue;
+                       }
 
-               if (!td->avg_buckets[i].valid)
-                       latency = avg_latency[i].latency;
-               else
-                       latency = (td->avg_buckets[i].latency * 7 +
-                               avg_latency[i].latency) >> 3;
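+                       /* EWMA: (7 * old + new sample) / 8 once the bucket is valid */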
+                       if (!td->avg_buckets[rw][i].valid)
+                               latency[rw] = avg_latency[rw][i].latency;
+                       else
+                               latency[rw] = (td->avg_buckets[rw][i].latency * 7 +
+                                       avg_latency[rw][i].latency) >> 3;
 
-               td->avg_buckets[i].latency = max(latency, last_latency);
-               td->avg_buckets[i].valid = true;
-               last_latency = td->avg_buckets[i].latency;
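+                       /* bucket latencies never decrease as bucket size grows */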
+                       td->avg_buckets[rw][i].latency = max(latency[rw],
+                               last_latency[rw]);
+                       td->avg_buckets[rw][i].valid = true;
+                       last_latency[rw] = td->avg_buckets[rw][i].latency;
+               }
        }
 
        for (i = 0; i < LATENCY_BUCKET_SIZE; i++)
                throtl_log(&td->service_queue,
-                       "Latency bucket %d: latency=%ld, valid=%d", i,
-                       td->avg_buckets[i].latency, td->avg_buckets[i].valid);
+                       "Latency bucket %d: read latency=%ld, read valid=%d, "
+                       "write latency=%ld, write valid=%d", i,
+                       td->avg_buckets[READ][i].latency,
+                       td->avg_buckets[READ][i].valid,
+                       td->avg_buckets[WRITE][i].latency,
+                       td->avg_buckets[WRITE][i].valid);
 }
 #else
 static inline void throtl_update_latency_buckets(struct throtl_data *td)
@@ ... @@ static void throtl_track_latency(struct throtl_data *td, sector_t size,
        struct latency_bucket *latency;
        int index;
 
-       if (!td || td->limit_index != LIMIT_LOW || op != REQ_OP_READ ||
+       if (!td || td->limit_index != LIMIT_LOW ||
+           !(op == REQ_OP_READ || op == REQ_OP_WRITE) ||
            !blk_queue_nonrot(td->queue))
                return;
 
        index = request_bucket_index(size);
 
-       latency = get_cpu_ptr(td->latency_buckets);
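+       /* REQ_OP_READ == 0 and REQ_OP_WRITE == 1, so op indexes the buckets */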
+       latency = get_cpu_ptr(td->latency_buckets[op]);
        latency[index].total_latency += time;
        latency[index].samples++;
-       put_cpu_ptr(td->latency_buckets);
+       put_cpu_ptr(td->latency_buckets[op]);
 }
 
@@ ... @@ void blk_throtl_stat_add(struct request *rq, u64 time_ns)
        unsigned long finish_time;
        unsigned long start_time;
        unsigned long lat;
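+       /* bio direction selects the avg_buckets row checked below */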
+       int rw = bio_data_dir(bio);
 
        tg = bio->bi_cg_private;
        if (!tg)
 
@@ ... @@ void blk_throtl_bio_endio(struct bio *bio)
                bucket = request_bucket_index(
                        blk_stat_size(&bio->bi_issue_stat));
-               threshold = tg->td->avg_buckets[bucket].latency +
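+               /* per-direction average latency plus this group's target */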
+               threshold = tg->td->avg_buckets[rw][bucket].latency +
                        tg->latency_target;
                if (lat > threshold)
                        tg->bad_bio_cnt++;
@@ ... @@ int blk_throtl_init(struct request_queue *q)
        td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
        if (!td)
                return -ENOMEM;
-       td->latency_buckets = __alloc_percpu(sizeof(struct latency_bucket) *
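+       /* allocate separate per-cpu bucket arrays for reads and writes */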
+       td->latency_buckets[READ] = __alloc_percpu(sizeof(struct latency_bucket) *
+               LATENCY_BUCKET_SIZE, __alignof__(u64));
+       if (!td->latency_buckets[READ]) {
+               kfree(td);
+               return -ENOMEM;
+       }
+       td->latency_buckets[WRITE] = __alloc_percpu(sizeof(struct latency_bucket) *
                LATENCY_BUCKET_SIZE, __alignof__(u64));
-       if (!td->latency_buckets) {
+       if (!td->latency_buckets[WRITE]) {
+               free_percpu(td->latency_buckets[READ]);
                kfree(td);
                return -ENOMEM;
        }
@@ ... @@ int blk_throtl_init(struct request_queue *q)
        /* activate policy */
        ret = blkcg_activate_policy(q, &blkcg_policy_throtl);
        if (ret) {
-               free_percpu(td->latency_buckets);
+               free_percpu(td->latency_buckets[READ]);
+               free_percpu(td->latency_buckets[WRITE]);
                kfree(td);
        }
        return ret;
@@ ... @@ void blk_throtl_exit(struct request_queue *q)
        BUG_ON(!q->td);
        throtl_shutdown_wq(q);
        blkcg_deactivate_policy(q, &blkcg_policy_throtl);
-       free_percpu(q->td->latency_buckets);
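+       /* free both per-direction arrays allocated in blk_throtl_init() */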
+       free_percpu(q->td->latency_buckets[READ]);
+       free_percpu(q->td->latency_buckets[WRITE]);
        kfree(q->td);
 }
 
@@ ... @@ void blk_throtl_register_queue(struct request_queue *q)
        } else {
                td->throtl_slice = DFL_THROTL_SLICE_HD;
                td->filtered_latency = LATENCY_FILTERED_HD;
-               for (i = 0; i < LATENCY_BUCKET_SIZE; i++)
-                       td->avg_buckets[i].latency = DFL_HD_BASELINE_LATENCY;
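+               /* seed both directions with the HD baseline latency */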
+               for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
+                       td->avg_buckets[READ][i].latency = DFL_HD_BASELINE_LATENCY;
+                       td->avg_buckets[WRITE][i].latency = DFL_HD_BASELINE_LATENCY;
+               }
        }
 #ifndef CONFIG_BLK_DEV_THROTTLING_LOW
        /* if no low limit, use previous default */