block/blk-iolatency.c

   1 /*
   2  * Block rq-qos base io controller
   3  *
   4  * This works similar to wbt with a few exceptions
   5  *
   6  * - It's bio based, so the latency covers the whole block layer in addition to
   7  *   the actual io.
   8  * - We will throttle all IO that comes in here if we need to.
   9  * - We use the mean latency over the 100ms window.  This is because writes can
  10  *   be particularly fast, which could give us a false sense of the impact of
  11  *   other workloads on our protected workload.
  12  * - By default there's no throttling, we set the queue_depth to UINT_MAX so
  13  *   that we can have as many outstanding bio's as we're allowed to.  Only at
  14  *   throttle time do we pay attention to the actual queue depth.
  15  *
  16  * The hierarchy works like the cpu controller does, we track the latency at
  17  * every configured node, and each configured node has its own independent
  18  * queue depth.  This means that we only care about our latency targets at the
  19  * peer level.  Some group at the bottom of the hierarchy isn't going to affect
  20  * a group at the end of some other path if we're only configured at leaf level.
  21  *
  22  * Consider the following
  23  *
  24  *                   root blkg
  25  *             /                     \
  26  *        fast (target=5ms)     slow (target=10ms)
  27  *         /     \                  /        \
  28  *       a        b          normal(15ms)   unloved
  29  *
  30  * "a" and "b" have no target, but their combined io under "fast" cannot exceed
  31  * an average latency of 5ms.  If it does then we will throttle the "slow"
  32  * group.  In the case of "normal", if it exceeds its 15ms target, we will
  33  * throttle "unloved", but nobody else.
  34  *
  35  * In this example "fast", "slow", and "normal" will be the only groups actually
  36  * accounting their io latencies.  We have to walk up the hierarchy to the root
  37  * on every submit and complete so we can do the appropriate stat recording and
  38  * adjust the queue depth of ourselves if needed.
  39  *
  40  * There are 2 ways we throttle IO.
  41  *
  42  * 1) Queue depth throttling.  As we throttle down we will adjust the maximum
  43  * number of IO's we're allowed to have in flight.  This starts at (u64)-1 down
  44  * to 1.  If the group is only ever submitting IO for itself then this is the
  45  * only way we throttle.
  46  *
  47  * 2) Induced delay throttling.  This is for the case that a group is generating
  48  * IO that has to be issued by the root cg to avoid priority inversion. So think
  49  * REQ_META or REQ_SWAP.  If we are already at qd == 1 and we're getting a lot
  50  * of work done for us on behalf of the root cg and are being asked to scale
  51  * down more then we induce a latency at userspace return.  We accumulate the
  52  * total amount of time we need to be punished by doing
  53  *
  54  * total_time += min_lat_nsec - actual_io_completion
  55  *
  56  * and then at throttle time will do
  57  *
  58  * throttle_time = min(total_time, NSEC_PER_SEC)
  59  *
  60  * This induced delay will throttle back the activity that is generating the
  61  * root cg issued io's, whether that's some metadata intensive operation or the
  62  * group is using so much memory that it is pushing us into swap.
  63  *
  64  * Copyright (C) 2018 Josef Bacik
  65  */
  66 #include <linux/kernel.h>
  67 #include <linux/blk_types.h>
  68 #include <linux/backing-dev.h>
  69 #include <linux/module.h>
  70 #include <linux/timer.h>
  71 #include <linux/memcontrol.h>
  72 #include <linux/sched/loadavg.h>
  73 #include <linux/sched/signal.h>
  74 #include <trace/events/block.h>
  75 #include <linux/blk-mq.h>
  76 #include "blk-rq-qos.h"
  77 #include "blk-stat.h"
  78 #include "blk.h"
  79
  80 #define DEFAULT_SCALE_COOKIE 1000000U
  81
  82 static struct blkcg_policy blkcg_policy_iolatency;
  83 struct iolatency_grp;
  84
  85 struct blk_iolatency {
  86         struct rq_qos rqos;
  87         struct timer_list timer;
  88
  89         /*
  90          * ->enabled is the master enable switch gating the throttling logic and
  91          * inflight tracking. The number of cgroups which have iolat enabled is
  92          * tracked in ->enable_cnt, and ->enable is flipped on/off accordingly
  93          * from ->enable_work with the request_queue frozen. For details, See
  94          * blkiolatency_enable_work_fn().
  95          */
  96         bool enabled;
  97         atomic_t enable_cnt;
  98         struct work_struct enable_work;
  99 };
 100
 101 static inline struct blk_iolatency *BLKIOLATENCY(struct rq_qos *rqos)
 102 {
 103         return container_of(rqos, struct blk_iolatency, rqos);
 104 }
 105
 106 struct child_latency_info {
 107         spinlock_t lock;
 108
 109         /* Last time we adjusted the scale of everybody. */
 110         u64 last_scale_event;
 111
 112         /* The latency that we missed. */
 113         u64 scale_lat;
 114
 115         /* Total io's from all of our children for the last summation. */
 116         u64 nr_samples;
 117
 118         /* The guy who actually changed the latency numbers. */
 119         struct iolatency_grp *scale_grp;
 120
 121         /* Cookie to tell if we need to scale up or down. */
 122         atomic_t scale_cookie;
 123 };
 124
 125 struct iolatency_grp {
 126         struct blkg_policy_data pd;
 127         struct blk_rq_stat __percpu *stats;
 128         struct blk_iolatency *blkiolat;
 129         struct rq_depth rq_depth;
 130         struct rq_wait rq_wait;
 131         atomic64_t window_start;
 132         atomic_t scale_cookie;
 133         u64 min_lat_nsec;
 134         u64 cur_win_nsec;
 135
 136         /* total running average of our io latency. */
 137         u64 lat_avg;
 138
 139         /* Our current number of IO's for the last summation. */
 140         u64 nr_samples;
 141
 142         struct child_latency_info child_lat;
 143 };
 144
 145 #define BLKIOLATENCY_MIN_WIN_SIZE (100 * NSEC_PER_MSEC)
 146 #define BLKIOLATENCY_MAX_WIN_SIZE NSEC_PER_SEC
 147 /*
 148  * These are the constants used to fake the fixed-point moving average
 149  * calculation just like load average.  The call to CALC_LOAD folds
 150  * (FIXED_1 (2048) - exp_factor) * new_sample into lat_avg.  The sampling
 151  * window size is bucketed to try to approximately calculate average
 152  * latency such that 1/exp (decay rate) is [1 min, 2.5 min) when windows
 153  * elapse immediately.  Note, windows only elapse with IO activity.  Idle
 154  * periods extend the most recent window.
 155  */
 156 #define BLKIOLATENCY_NR_EXP_FACTORS 5
 157 #define BLKIOLATENCY_EXP_BUCKET_SIZE (BLKIOLATENCY_MAX_WIN_SIZE / \
 158                                       (BLKIOLATENCY_NR_EXP_FACTORS - 1))
 159 static const u64 iolatency_exp_factors[BLKIOLATENCY_NR_EXP_FACTORS] = {
 160         2045, // exp(1/600) - 600 samples
 161         2039, // exp(1/240) - 240 samples
 162         2031, // exp(1/120) - 120 samples
 163         2023, // exp(1/80)  - 80 samples
 164         2014, // exp(1/60)  - 60 samples
 165 };
 166
 167 static inline struct iolatency_grp *pd_to_lat(struct blkg_policy_data *pd)
 168 {
 169         return pd ? container_of(pd, struct iolatency_grp, pd) : NULL;
 170 }
 171
 172 static inline struct iolatency_grp *blkg_to_lat(struct blkcg_gq *blkg)
 173 {
 174         return pd_to_lat(blkg_to_pd(blkg, &blkcg_policy_iolatency));
 175 }
 176
 177 static inline struct blkcg_gq *lat_to_blkg(struct iolatency_grp *iolat)
 178 {
 179         return pd_to_blkg(&iolat->pd);
 180 }
 181
 182 static inline bool iolatency_may_queue(struct iolatency_grp *iolat,
 183                                        wait_queue_entry_t *wait,
 184                                        bool first_block)
 185 {
 186         struct rq_wait *rqw = &iolat->rq_wait;
 187
 188         if (first_block && waitqueue_active(&rqw->wait) &&
 189             rqw->wait.head.next != &wait->entry)
 190                 return false;
 191         return rq_wait_inc_below(rqw, iolat->rq_depth.max_depth);
 192 }
 193
 194 static void __blkcg_iolatency_throttle(struct rq_qos *rqos,
 195                                        struct iolatency_grp *iolat,
 196                                        spinlock_t *lock, bool issue_as_root,
 197                                        bool use_memdelay)
 198         __releases(lock)
 199         __acquires(lock)
 200 {
 201         struct rq_wait *rqw = &iolat->rq_wait;
 202         unsigned use_delay = atomic_read(&lat_to_blkg(iolat)->use_delay);
 203         DEFINE_WAIT(wait);
 204         bool first_block = true;
 205
 206         if (use_delay)
 207                 blkcg_schedule_throttle(rqos->q, use_memdelay);
 208
 209         /*
 210          * To avoid priority inversions we want to just take a slot if we are
 211          * issuing as root.  If we're being killed off there's no point in
 212          * delaying things, we may have been killed by OOM so throttling may
 213          * make recovery take even longer, so just let the IO's through so the
 214          * task can go away.
 215          */
 216         if (issue_as_root || fatal_signal_pending(current)) {
 217                 atomic_inc(&rqw->inflight);
 218                 return;
 219         }
 220
 221         if (iolatency_may_queue(iolat, &wait, first_block))
 222                 return;
 223
 224         do {
 225                 prepare_to_wait_exclusive(&rqw->wait, &wait,
 226                                           TASK_UNINTERRUPTIBLE);
 227
 228                 if (iolatency_may_queue(iolat, &wait, first_block))
 229                         break;
 230                 first_block = false;
 231
 232                 if (lock) {
 233                         spin_unlock_irq(lock);
 234                         io_schedule();
 235                         spin_lock_irq(lock);
 236                 } else {
 237                         io_schedule();
 238                 }
 239         } while (1);
 240
 241         finish_wait(&rqw->wait, &wait);
 242 }
 243
 244 #define SCALE_DOWN_FACTOR 2
 245 #define SCALE_UP_FACTOR 4
 246
 247 static inline unsigned long scale_amount(unsigned long qd, bool up)
 248 {
 249         return max(up ? qd >> SCALE_UP_FACTOR : qd >> SCALE_DOWN_FACTOR, 1UL);
 250 }
 251
 252 /*
 253  * We scale the qd down faster than we scale up, so we need to use this helper
 254  * to adjust the scale_cookie accordingly so we don't prematurely get
 255  * scale_cookie at DEFAULT_SCALE_COOKIE and unthrottle too much.
 256  *
 257  * Each group has their own local copy of the last scale cookie they saw, so if
 258  * the global scale cookie goes up or down they know which way they need to go
 259  * based on their last knowledge of it.
 260  */
 261 static void scale_cookie_change(struct blk_iolatency *blkiolat,
 262                                 struct child_latency_info *lat_info,
 263                                 bool up)
 264 {
 265         unsigned long qd = blk_queue_depth(blkiolat->rqos.q);
 266         unsigned long scale = scale_amount(qd, up);
 267         unsigned long old = atomic_read(&lat_info->scale_cookie);
 268         unsigned long max_scale = qd << 1;
 269         unsigned long diff = 0;
 270
 271         if (old < DEFAULT_SCALE_COOKIE)
 272                 diff = DEFAULT_SCALE_COOKIE - old;
 273
 274         if (up) {
 275                 if (scale + old > DEFAULT_SCALE_COOKIE)
 276                         atomic_set(&lat_info->scale_cookie,
 277                                    DEFAULT_SCALE_COOKIE);
 278                 else if (diff > qd)
 279                         atomic_inc(&lat_info->scale_cookie);
 280                 else
 281                         atomic_add(scale, &lat_info->scale_cookie);
 282         } else {
 283                 /*
 284                  * We don't want to dig a hole so deep that it takes us hours to
 285                  * dig out of it.  Just enough that we don't throttle/unthrottle
 286                  * with jagged workloads but can still unthrottle once pressure
 287                  * has sufficiently dissipated.
 288                  */
 289                 if (diff > qd) {
 290                         if (diff < max_scale)
 291                                 atomic_dec(&lat_info->scale_cookie);
 292                 } else {
 293                         atomic_sub(scale, &lat_info->scale_cookie);
 294                 }
 295         }
 296 }
 297
 298 /*
 299  * Change the queue depth of the iolatency_grp.  We add/subtract 1/16th of the
 300  * queue depth at a time so we don't get wild swings and hopefully dial in to
 301  * fairer distribution of the overall queue depth.
 302  */
 303 static void scale_change(struct iolatency_grp *iolat, bool up)
 304 {
 305         unsigned long qd = blk_queue_depth(iolat->blkiolat->rqos.q);
 306         unsigned long scale = scale_amount(qd, up);
 307         unsigned long old = iolat->rq_depth.max_depth;
 308         bool changed = false;
 309
 310         if (old > qd)
 311                 old = qd;
 312
 313         if (up) {
 314                 if (old == 1 && blkcg_unuse_delay(lat_to_blkg(iolat)))
 315                         return;
 316
 317                 if (old < qd) {
 318                         changed = true;
 319                         old += scale;
 320                         old = min(old, qd);
 321                         iolat->rq_depth.max_depth = old;
 322                         wake_up_all(&iolat->rq_wait.wait);
 323                 }
 324         } else if (old > 1) {
 325                 old >>= 1;
 326                 changed = true;
 327                 iolat->rq_depth.max_depth = max(old, 1UL);
 328         }
 329 }
 330
 331 /* Check our parent and see if the scale cookie has changed. */
 332 static void check_scale_change(struct iolatency_grp *iolat)
 333 {
 334         struct iolatency_grp *parent;
 335         struct child_latency_info *lat_info;
 336         unsigned int cur_cookie;
 337         unsigned int our_cookie = atomic_read(&iolat->scale_cookie);
 338         u64 scale_lat;
 339         unsigned int old;
 340         int direction = 0;
 341
 342         if (lat_to_blkg(iolat)->parent == NULL)
 343                 return;
 344
 345         parent = blkg_to_lat(lat_to_blkg(iolat)->parent);
 346         if (!parent)
 347                 return;
 348
 349         lat_info = &parent->child_lat;
 350         cur_cookie = atomic_read(&lat_info->scale_cookie);
 351         scale_lat = READ_ONCE(lat_info->scale_lat);
 352
 353         if (cur_cookie < our_cookie)
 354                 direction = -1;
 355         else if (cur_cookie > our_cookie)
 356                 direction = 1;
 357         else
 358                 return;
 359
 360         old = atomic_cmpxchg(&iolat->scale_cookie, our_cookie, cur_cookie);
 361
 362         /* Somebody beat us to the punch, just bail. */
 363         if (old != our_cookie)
 364                 return;
 365
 366         if (direction < 0 && iolat->min_lat_nsec) {
 367                 u64 samples_thresh;
 368
 369                 if (!scale_lat || iolat->min_lat_nsec <= scale_lat)
 370                         return;
 371
 372                 /*
 373                  * Sometimes high priority groups are their own worst enemy, so
 374                  * instead of taking it out on some poor other group that did 5%
 375                  * or less of the IO's for the last summation just skip this
 376                  * scale down event.
 377                  */
 378                 samples_thresh = lat_info->nr_samples * 5;
 379                 samples_thresh = div64_u64(samples_thresh, 100);
 380                 if (iolat->nr_samples <= samples_thresh)
 381                         return;
 382         }
 383
 384         /* We're as low as we can go. */
 385         if (iolat->rq_depth.max_depth == 1 && direction < 0) {
 386                 blkcg_use_delay(lat_to_blkg(iolat));
 387                 return;
 388         }
 389
 390         /* We're back to the default cookie, unthrottle all the things. */
 391         if (cur_cookie == DEFAULT_SCALE_COOKIE) {
 392                 blkcg_clear_delay(lat_to_blkg(iolat));
 393                 iolat->rq_depth.max_depth = UINT_MAX;
 394                 wake_up_all(&iolat->rq_wait.wait);
 395                 return;
 396         }
 397
 398         scale_change(iolat, direction > 0);
 399 }
 400
 401 static void blkcg_iolatency_throttle(struct rq_qos *rqos, struct bio *bio,
 402                                      spinlock_t *lock)
 403 {
 404         struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos);
 405         struct blkcg *blkcg;
 406         struct blkcg_gq *blkg;
 407         struct request_queue *q = rqos->q;
 408         bool issue_as_root = bio_issue_as_root_blkg(bio);
 409
 410         if (!blkiolat->enabled)
 411                 return;
 412
 413         rcu_read_lock();
 414         blkcg = bio_blkcg(bio);
 415         bio_associate_blkcg(bio, &blkcg->css);
 416         blkg = blkg_lookup(blkcg, q);
 417         if (unlikely(!blkg)) {
 418                 if (!lock)
 419                         spin_lock_irq(q->queue_lock);
 420                 blkg = blkg_lookup_create(blkcg, q);
 421                 if (IS_ERR(blkg))
 422                         blkg = NULL;
 423                 if (!lock)
 424                         spin_unlock_irq(q->queue_lock);
 425         }
 426         if (!blkg)
 427                 goto out;
 428
 429         bio_issue_init(&bio->bi_issue, bio_sectors(bio));
 430         bio_associate_blkg(bio, blkg);
 431 out:
 432         rcu_read_unlock();
 433         while (blkg && blkg->parent) {
 434                 struct iolatency_grp *iolat = blkg_to_lat(blkg);
 435                 if (!iolat) {
 436                         blkg = blkg->parent;
 437                         continue;
 438                 }
 439
 440                 check_scale_change(iolat);
 441                 __blkcg_iolatency_throttle(rqos, iolat, lock, issue_as_root,
 442                                      (bio->bi_opf & REQ_SWAP) == REQ_SWAP);
 443                 blkg = blkg->parent;
 444         }
 445         if (!timer_pending(&blkiolat->timer))
 446                 mod_timer(&blkiolat->timer, jiffies + HZ);
 447 }
 448
 449 static void iolatency_record_time(struct iolatency_grp *iolat,
 450                                   struct bio_issue *issue, u64 now,
 451                                   bool issue_as_root)
 452 {
 453         struct blk_rq_stat *rq_stat;
 454         u64 start = bio_issue_time(issue);
 455         u64 req_time;
 456
 457         /*
 458          * Have to do this so we are truncated to the correct time that our
 459          * issue is truncated to.
 460          */
 461         now = __bio_issue_time(now);
 462
 463         if (now <= start)
 464                 return;
 465
 466         req_time = now - start;
 467
 468         /*
 469          * We don't want to count issue_as_root bio's in the cgroups latency
 470          * statistics as it could skew the numbers downwards.
 471          */
 472         if (unlikely(issue_as_root && iolat->rq_depth.max_depth != UINT_MAX)) {
 473                 u64 sub = iolat->min_lat_nsec;
 474                 if (req_time < sub)
 475                         blkcg_add_delay(lat_to_blkg(iolat), now, sub - req_time);
 476                 return;
 477         }
 478
 479         rq_stat = get_cpu_ptr(iolat->stats);
 480         blk_rq_stat_add(rq_stat, req_time);
 481         put_cpu_ptr(rq_stat);
 482 }
 483
 484 #define BLKIOLATENCY_MIN_ADJUST_TIME (500 * NSEC_PER_MSEC)
 485 #define BLKIOLATENCY_MIN_GOOD_SAMPLES 5
 486
 487 static void iolatency_check_latencies(struct iolatency_grp *iolat, u64 now)
 488 {
 489         struct blkcg_gq *blkg = lat_to_blkg(iolat);
 490         struct iolatency_grp *parent;
 491         struct child_latency_info *lat_info;
 492         struct blk_rq_stat stat;
 493         unsigned long flags;
 494         int cpu, exp_idx;
 495
 496         blk_rq_stat_init(&stat);
 497         preempt_disable();
 498         for_each_online_cpu(cpu) {
 499                 struct blk_rq_stat *s;
 500                 s = per_cpu_ptr(iolat->stats, cpu);
 501                 blk_rq_stat_sum(&stat, s);
 502                 blk_rq_stat_init(s);
 503         }
 504         preempt_enable();
 505
 506         parent = blkg_to_lat(blkg->parent);
 507         if (!parent)
 508                 return;
 509
 510         lat_info = &parent->child_lat;
 511
 512         /*
 513          * CALC_LOAD takes in a number stored in fixed point representation.
 514          * Because we are using this for IO time in ns, the values stored
 515          * are significantly larger than the FIXED_1 denominator (2048).
 516          * Therefore, rounding errors in the calculation are negligible and
 517          * can be ignored.
 518          */
 519         exp_idx = min_t(int, BLKIOLATENCY_NR_EXP_FACTORS - 1,
 520                         div64_u64(iolat->cur_win_nsec,
 521                                   BLKIOLATENCY_EXP_BUCKET_SIZE));
 522         CALC_LOAD(iolat->lat_avg, iolatency_exp_factors[exp_idx], stat.mean);
 523
 524         /* Everything is ok and we don't need to adjust the scale. */
 525         if (stat.mean <= iolat->min_lat_nsec &&
 526             atomic_read(&lat_info->scale_cookie) == DEFAULT_SCALE_COOKIE)
 527                 return;
 528
 529         /* Somebody beat us to the punch, just bail. */
 530         spin_lock_irqsave(&lat_info->lock, flags);
 531         lat_info->nr_samples -= iolat->nr_samples;
 532         lat_info->nr_samples += stat.nr_samples;
 533         iolat->nr_samples = stat.nr_samples;
 534
 535         if ((lat_info->last_scale_event >= now ||
 536             now - lat_info->last_scale_event < BLKIOLATENCY_MIN_ADJUST_TIME) &&
 537             lat_info->scale_lat <= iolat->min_lat_nsec)
 538                 goto out;
 539
 540         if (stat.mean <= iolat->min_lat_nsec &&
 541             stat.nr_samples >= BLKIOLATENCY_MIN_GOOD_SAMPLES) {
 542                 if (lat_info->scale_grp == iolat) {
 543                         lat_info->last_scale_event = now;
 544                         scale_cookie_change(iolat->blkiolat, lat_info, true);
 545                 }
 546         } else if (stat.mean > iolat->min_lat_nsec) {
 547                 lat_info->last_scale_event = now;
 548                 if (!lat_info->scale_grp ||
 549                     lat_info->scale_lat > iolat->min_lat_nsec) {
 550                         WRITE_ONCE(lat_info->scale_lat, iolat->min_lat_nsec);
 551                         lat_info->scale_grp = iolat;
 552                 }
 553                 scale_cookie_change(iolat->blkiolat, lat_info, false);
 554         }
 555 out:
 556         spin_unlock_irqrestore(&lat_info->lock, flags);
 557 }
 558
 559 static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio)
 560 {
 561         struct blkcg_gq *blkg;
 562         struct rq_wait *rqw;
 563         struct iolatency_grp *iolat;
 564         u64 window_start;
 565         u64 now = ktime_to_ns(ktime_get());
 566         bool issue_as_root = bio_issue_as_root_blkg(bio);
 567         int inflight = 0;
 568
 569         blkg = bio->bi_blkg;
 570         if (!blkg)
 571                 return;
 572
 573         iolat = blkg_to_lat(bio->bi_blkg);
 574         if (!iolat)
 575                 return;
 576
 577         if (!iolat->blkiolat->enabled)
 578                 return;
 579
 580         while (blkg && blkg->parent) {
 581                 iolat = blkg_to_lat(blkg);
 582                 if (!iolat) {
 583                         blkg = blkg->parent;
 584                         continue;
 585                 }
 586                 rqw = &iolat->rq_wait;
 587
 588                 inflight = atomic_dec_return(&rqw->inflight);
 589                 WARN_ON_ONCE(inflight < 0);
 590                 /*
 591                  * If bi_status is BLK_STS_AGAIN, the bio wasn't actually
 592                  * submitted, so do not account for it.
 593                  */
 594                 if (iolat->min_lat_nsec && bio->bi_status != BLK_STS_AGAIN) {
 595                         iolatency_record_time(iolat, &bio->bi_issue, now,
 596                                               issue_as_root);
 597                         window_start = atomic64_read(&iolat->window_start);
 598                         if (now > window_start &&
 599                             (now - window_start) >= iolat->cur_win_nsec) {
 600                                 if (atomic64_cmpxchg(&iolat->window_start,
 601                                              window_start, now) == window_start)
 602                                         iolatency_check_latencies(iolat, now);
 603                         }
 604                 }
 605                 wake_up(&rqw->wait);
 606                 blkg = blkg->parent;
 607         }
 608 }
 609
 610 static void blkcg_iolatency_exit(struct rq_qos *rqos)
 611 {
 612         struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos);
 613
 614         del_timer_sync(&blkiolat->timer);
 615         flush_work(&blkiolat->enable_work);
 616         blkcg_deactivate_policy(rqos->q, &blkcg_policy_iolatency);
 617         kfree(blkiolat);
 618 }
 619
 620 static struct rq_qos_ops blkcg_iolatency_ops = {
 621         .throttle = blkcg_iolatency_throttle,
 622         .done_bio = blkcg_iolatency_done_bio,
 623         .exit = blkcg_iolatency_exit,
 624 };
 625
 626 static void blkiolatency_timer_fn(struct timer_list *t)
 627 {
 628         struct blk_iolatency *blkiolat = from_timer(blkiolat, t, timer);
 629         struct blkcg_gq *blkg;
 630         struct cgroup_subsys_state *pos_css;
 631         u64 now = ktime_to_ns(ktime_get());
 632
 633         rcu_read_lock();
 634         blkg_for_each_descendant_pre(blkg, pos_css,
 635                                      blkiolat->rqos.q->root_blkg) {
 636                 struct iolatency_grp *iolat;
 637                 struct child_latency_info *lat_info;
 638                 unsigned long flags;
 639                 u64 cookie;
 640
 641                 /*
 642                  * We could be exiting, don't access the pd unless we have a
 643                  * ref on the blkg.
 644                  */
 645                 if (!blkg_try_get(blkg))
 646                         continue;
 647
 648                 iolat = blkg_to_lat(blkg);
 649                 if (!iolat)
 650                         goto next;
 651
 652                 lat_info = &iolat->child_lat;
 653                 cookie = atomic_read(&lat_info->scale_cookie);
 654
 655                 if (cookie >= DEFAULT_SCALE_COOKIE)
 656                         goto next;
 657
 658                 spin_lock_irqsave(&lat_info->lock, flags);
 659                 if (lat_info->last_scale_event >= now)
 660                         goto next_lock;
 661
 662                 /*
 663                  * We scaled down but don't have a scale_grp, scale up and carry
 664                  * on.
 665                  */
 666                 if (lat_info->scale_grp == NULL) {
 667                         scale_cookie_change(iolat->blkiolat, lat_info, true);
 668                         goto next_lock;
 669                 }
 670
 671                 /*
 672                  * It's been 5 seconds since our last scale event, clear the
 673                  * scale grp in case the group that needed the scale down isn't
 674                  * doing any IO currently.
 675                  */
 676                 if (now - lat_info->last_scale_event >=
 677                     ((u64)NSEC_PER_SEC * 5))
 678                         lat_info->scale_grp = NULL;
 679 next_lock:
 680                 spin_unlock_irqrestore(&lat_info->lock, flags);
 681 next:
 682                 blkg_put(blkg);
 683         }
 684         rcu_read_unlock();
 685 }
 686
 687 /**
 688  * blkiolatency_enable_work_fn - Enable or disable iolatency on the device
 689  * @work: enable_work of the blk_iolatency of interest
 690  *
 691  * iolatency needs to keep track of the number of in-flight IOs per cgroup. This
 692  * is relatively expensive as it involves walking up the hierarchy twice for
 693  * every IO. Thus, if iolatency is not enabled in any cgroup for the device, we
 694  * want to disable the in-flight tracking.
 695  *
 696  * We have to make sure that the counting is balanced - we don't want to leak
 697  * the in-flight counts by disabling accounting in the completion path while IOs
 698  * are in flight. This is achieved by ensuring that no IO is in flight by
 699  * freezing the queue while flipping ->enabled. As this requires a sleepable
 700  * context, ->enabled flipping is punted to this work function.
 701  */
 702 static void blkiolatency_enable_work_fn(struct work_struct *work)
 703 {
 704         struct blk_iolatency *blkiolat = container_of(work, struct blk_iolatency,
 705                                                       enable_work);
 706         bool enabled;
 707
 708         /*
 709          * There can only be one instance of this function running for @blkiolat
 710          * and it's guaranteed to be executed at least once after the latest
 711          * ->enabled_cnt modification. Acting on the latest ->enable_cnt is
 712          * sufficient.
 713          *
 714          * Also, we know @blkiolat is safe to access as ->enable_work is flushed
 715          * in blkcg_iolatency_exit().
 716          */
 717         enabled = atomic_read(&blkiolat->enable_cnt);
 718         if (enabled != blkiolat->enabled) {
 719                 blk_mq_freeze_queue(blkiolat->rqos.q);
 720                 blkiolat->enabled = enabled;
 721                 blk_mq_unfreeze_queue(blkiolat->rqos.q);
 722         }
 723 }
 724
 725 int blk_iolatency_init(struct request_queue *q)
 726 {
 727         struct blk_iolatency *blkiolat;
 728         struct rq_qos *rqos;
 729         int ret;
 730
 731         blkiolat = kzalloc(sizeof(*blkiolat), GFP_KERNEL);
 732         if (!blkiolat)
 733                 return -ENOMEM;
 734
 735         rqos = &blkiolat->rqos;
 736         rqos->id = RQ_QOS_CGROUP;
 737         rqos->ops = &blkcg_iolatency_ops;
 738         rqos->q = q;
 739
 740         rq_qos_add(q, rqos);
 741
 742         ret = blkcg_activate_policy(q, &blkcg_policy_iolatency);
 743         if (ret) {
 744                 rq_qos_del(q, rqos);
 745                 kfree(blkiolat);
 746                 return ret;
 747         }
 748
 749         timer_setup(&blkiolat->timer, blkiolatency_timer_fn, 0);
 750         INIT_WORK(&blkiolat->enable_work, blkiolatency_enable_work_fn);
 751
 752         return 0;
 753 }
 754
 755 static void iolatency_set_min_lat_nsec(struct blkcg_gq *blkg, u64 val)
 756 {
 757         struct iolatency_grp *iolat = blkg_to_lat(blkg);
 758         struct blk_iolatency *blkiolat = iolat->blkiolat;
 759         u64 oldval = iolat->min_lat_nsec;
 760
 761         iolat->min_lat_nsec = val;
 762         iolat->cur_win_nsec = max_t(u64, val << 4, BLKIOLATENCY_MIN_WIN_SIZE);
 763         iolat->cur_win_nsec = min_t(u64, iolat->cur_win_nsec,
 764                                     BLKIOLATENCY_MAX_WIN_SIZE);
 765
 766         if (!oldval && val) {
 767                 if (atomic_inc_return(&blkiolat->enable_cnt) == 1)
 768                         schedule_work(&blkiolat->enable_work);
 769         }
 770         if (oldval && !val) {
 771                 blkcg_clear_delay(blkg);
 772                 if (atomic_dec_return(&blkiolat->enable_cnt) == 0)
 773                         schedule_work(&blkiolat->enable_work);
 774         }
 775 }
 776
 777 static void iolatency_clear_scaling(struct blkcg_gq *blkg)
 778 {
 779         if (blkg->parent) {
 780                 struct iolatency_grp *iolat = blkg_to_lat(blkg->parent);
 781                 struct child_latency_info *lat_info;
 782                 if (!iolat)
 783                         return;
 784
 785                 lat_info = &iolat->child_lat;
 786                 spin_lock(&lat_info->lock);
 787                 atomic_set(&lat_info->scale_cookie, DEFAULT_SCALE_COOKIE);
 788                 lat_info->last_scale_event = 0;
 789                 lat_info->scale_grp = NULL;
 790                 lat_info->scale_lat = 0;
 791                 spin_unlock(&lat_info->lock);
 792         }
 793 }
 794
 795 static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf,
 796                              size_t nbytes, loff_t off)
 797 {
 798         struct blkcg *blkcg = css_to_blkcg(of_css(of));
 799         struct blkcg_gq *blkg;
 800         struct blk_iolatency *blkiolat;
 801         struct blkg_conf_ctx ctx;
 802         struct iolatency_grp *iolat;
 803         char *p, *tok;
 804         u64 lat_val = 0;
 805         u64 oldval;
 806         int ret;
 807
 808         ret = blkg_conf_prep(blkcg, &blkcg_policy_iolatency, buf, &ctx);
 809         if (ret)
 810                 return ret;
 811
 812         iolat = blkg_to_lat(ctx.blkg);
 813         blkiolat = iolat->blkiolat;
 814         p = ctx.body;
 815
 816         ret = -EINVAL;
 817         while ((tok = strsep(&p, " "))) {
 818                 char key[16];
 819                 char val[21];   /* 18446744073709551616 */
 820
 821                 if (sscanf(tok, "%15[^=]=%20s", key, val) != 2)
 822                         goto out;
 823
 824                 if (!strcmp(key, "target")) {
 825                         u64 v;
 826
 827                         if (!strcmp(val, "max"))
 828                                 lat_val = 0;
 829                         else if (sscanf(val, "%llu", &v) == 1)
 830                                 lat_val = v * NSEC_PER_USEC;
 831                         else
 832                                 goto out;
 833                 } else {
 834                         goto out;
 835                 }
 836         }
 837
 838         /* Walk up the tree to see if our new val is lower than it should be. */
 839         blkg = ctx.blkg;
 840         oldval = iolat->min_lat_nsec;
 841
 842         iolatency_set_min_lat_nsec(blkg, lat_val);
 843         if (oldval != iolat->min_lat_nsec)
 844                 iolatency_clear_scaling(blkg);
 845         ret = 0;
 846 out:
 847         blkg_conf_finish(&ctx);
 848         return ret ?: nbytes;
 849 }
 850
 851 static u64 iolatency_prfill_limit(struct seq_file *sf,
 852                                   struct blkg_policy_data *pd, int off)
 853 {
 854         struct iolatency_grp *iolat = pd_to_lat(pd);
 855         const char *dname = blkg_dev_name(pd->blkg);
 856
 857         if (!dname || !iolat->min_lat_nsec)
 858                 return 0;
 859         seq_printf(sf, "%s target=%llu\n",
 860                    dname, div_u64(iolat->min_lat_nsec, NSEC_PER_USEC));
 861         return 0;
 862 }
 863
 864 static int iolatency_print_limit(struct seq_file *sf, void *v)
 865 {
 866         blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
 867                           iolatency_prfill_limit,
 868                           &blkcg_policy_iolatency, seq_cft(sf)->private, false);
 869         return 0;
 870 }
 871
 872 static size_t iolatency_pd_stat(struct blkg_policy_data *pd, char *buf,
 873                                 size_t size)
 874 {
 875         struct iolatency_grp *iolat = pd_to_lat(pd);
 876         unsigned long long avg_lat = div64_u64(iolat->lat_avg, NSEC_PER_USEC);
 877         unsigned long long cur_win = div64_u64(iolat->cur_win_nsec, NSEC_PER_MSEC);
 878
 879         if (iolat->rq_depth.max_depth == UINT_MAX)
 880                 return scnprintf(buf, size, " depth=max avg_lat=%llu win=%llu",
 881                                  avg_lat, cur_win);
 882
 883         return scnprintf(buf, size, " depth=%u avg_lat=%llu win=%llu",
 884                          iolat->rq_depth.max_depth, avg_lat, cur_win);
 885 }
 886
 887
 888 static struct blkg_policy_data *iolatency_pd_alloc(gfp_t gfp, int node)
 889 {
 890         struct iolatency_grp *iolat;
 891
 892         iolat = kzalloc_node(sizeof(*iolat), gfp, node);
 893         if (!iolat)
 894                 return NULL;
 895         iolat->stats = __alloc_percpu_gfp(sizeof(struct blk_rq_stat),
 896                                        __alignof__(struct blk_rq_stat), gfp);
 897         if (!iolat->stats) {
 898                 kfree(iolat);
 899                 return NULL;
 900         }
 901         return &iolat->pd;
 902 }
 903
 904 static void iolatency_pd_init(struct blkg_policy_data *pd)
 905 {
 906         struct iolatency_grp *iolat = pd_to_lat(pd);
 907         struct blkcg_gq *blkg = lat_to_blkg(iolat);
 908         struct rq_qos *rqos = blkcg_rq_qos(blkg->q);
 909         struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos);
 910         u64 now = ktime_to_ns(ktime_get());
 911         int cpu;
 912
 913         for_each_possible_cpu(cpu) {
 914                 struct blk_rq_stat *stat;
 915                 stat = per_cpu_ptr(iolat->stats, cpu);
 916                 blk_rq_stat_init(stat);
 917         }
 918
 919         rq_wait_init(&iolat->rq_wait);
 920         spin_lock_init(&iolat->child_lat.lock);
 921         iolat->rq_depth.queue_depth = blk_queue_depth(blkg->q);
 922         iolat->rq_depth.max_depth = UINT_MAX;
 923         iolat->rq_depth.default_depth = iolat->rq_depth.queue_depth;
 924         iolat->blkiolat = blkiolat;
 925         iolat->cur_win_nsec = 100 * NSEC_PER_MSEC;
 926         atomic64_set(&iolat->window_start, now);
 927
 928         /*
 929          * We init things in list order, so the pd for the parent may not be
 930          * init'ed yet for whatever reason.
 931          */
 932         if (blkg->parent && blkg_to_pd(blkg->parent, &blkcg_policy_iolatency)) {
 933                 struct iolatency_grp *parent = blkg_to_lat(blkg->parent);
 934                 atomic_set(&iolat->scale_cookie,
 935                            atomic_read(&parent->child_lat.scale_cookie));
 936         } else {
 937                 atomic_set(&iolat->scale_cookie, DEFAULT_SCALE_COOKIE);
 938         }
 939
 940         atomic_set(&iolat->child_lat.scale_cookie, DEFAULT_SCALE_COOKIE);
 941 }
 942
 /*
  * Offline callback: drop the group's latency target (set it to 0) and reset
  * the scaling state its parent keeps for its children.
  */
 static void iolatency_pd_offline(struct blkg_policy_data *pd)
 {
         struct blkcg_gq *blkg = lat_to_blkg(pd_to_lat(pd));

         iolatency_set_min_lat_nsec(blkg, 0);
         iolatency_clear_scaling(blkg);
 }
 951
 952 static void iolatency_pd_free(struct blkg_policy_data *pd)
 953 {
 954         struct iolatency_grp *iolat = pd_to_lat(pd);
 955         free_percpu(iolat->stats);
 956         kfree(iolat);
 957 }
 958
 959 static struct cftype iolatency_files[] = {
 960         {
 961                 .name = "latency",
 962                 .flags = CFTYPE_NOT_ON_ROOT,
 963                 .seq_show = iolatency_print_limit,
 964                 .write = iolatency_set_limit,
 965         },
 966         {}
 967 };
 968
 969 static struct blkcg_policy blkcg_policy_iolatency = {
 970         .dfl_cftypes    = iolatency_files,
 971         .pd_alloc_fn    = iolatency_pd_alloc,
 972         .pd_init_fn     = iolatency_pd_init,
 973         .pd_offline_fn  = iolatency_pd_offline,
 974         .pd_free_fn     = iolatency_pd_free,
 975         .pd_stat_fn     = iolatency_pd_stat,
 976 };
 977
 978 static int __init iolatency_init(void)
 979 {
 980         return blkcg_policy_register(&blkcg_policy_iolatency);
 981 }
 982
 983 static void __exit iolatency_exit(void)
 984 {
 985         return blkcg_policy_unregister(&blkcg_policy_iolatency);
 986 }
 987
 988 module_init(iolatency_init);
 989 module_exit(iolatency_exit);