diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index c195365c9817..d33dd6be1d9c 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1613,16 +1613,24 @@ static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now) static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay) { unsigned long pflags; + bool clamp; u64 now = ktime_to_ns(ktime_get()); u64 exp; u64 delay_nsec = 0; int tok; while (blkg->parent) { - if (atomic_read(&blkg->use_delay)) { + int use_delay = atomic_read(&blkg->use_delay); + + if (use_delay) { + u64 this_delay; + blkcg_scale_delay(blkg, now); - delay_nsec = max_t(u64, delay_nsec, - atomic64_read(&blkg->delay_nsec)); + this_delay = atomic64_read(&blkg->delay_nsec); + if (this_delay > delay_nsec) { + delay_nsec = this_delay; + clamp = use_delay > 0; + } } blkg = blkg->parent; } @@ -1634,10 +1642,13 @@ static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay) * Let's not sleep for all eternity if we've amassed a huge delay. * Swapping or metadata IO can accumulate 10's of seconds worth of * delay, and we want userspace to be able to do _something_ so cap the - * delays at 1 second. If there's 10's of seconds worth of delay then - * the tasks will be delayed for 1 second for every syscall. + * delays at 0.25s. If there's 10's of seconds worth of delay then the + * tasks will be delayed for 0.25 second for every syscall. If + * blkcg_set_delay() was used as indicated by negative use_delay, the + * caller is responsible for regulating the range. */ - delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC); + if (clamp) + delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC); if (use_memdelay) psi_memstall_enter(&pflags); diff --git a/block/blk-iocost.c b/block/blk-iocost.c index d2b69d87f3e7..9366527d8c12 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -224,20 +224,12 @@ enum { MARGIN_MIN_PCT = 10, MARGIN_LOW_PCT = 20, MARGIN_TARGET_PCT = 50, - MARGIN_MAX_PCT = 100, INUSE_ADJ_STEP_PCT = 25, /* Have some play in timer operations */ TIMER_SLACK_PCT = 1, - /* - * vtime can wrap well within a reasonable uptime when vrate is - * consistently raised. Don't trust recorded cgroup vtime if the - * period counter indicates that it's older than 5mins. - */ - VTIME_VALID_DUR = 300 * USEC_PER_SEC, - /* 1/64k is granular enough and can easily be handled w/ u32 */ WEIGHT_ONE = 1 << 16, @@ -270,6 +262,38 @@ enum { /* unbusy hysterisis */ UNBUSY_THR_PCT = 75, + /* + * The effect of delay is indirect and non-linear and a huge amount of + * future debt can accumulate abruptly while unthrottled. Linearly scale + * up delay as debt is going up and then let it decay exponentially. + * This gives us quick ramp ups while delay is accumulating and long + * tails which can help reducing the frequency of debt explosions on + * unthrottle. The parameters are experimentally determined. + * + * The delay mechanism provides adequate protection and behavior in many + * cases. However, this is far from ideal and falls shorts on both + * fronts. The debtors are often throttled too harshly costing a + * significant level of fairness and possibly total work while the + * protection against their impacts on the system can be choppy and + * unreliable. + * + * The shortcoming primarily stems from the fact that, unlike for page + * cache, the kernel doesn't have well-defined back-pressure propagation + * mechanism and policies for anonymous memory. Fully addressing this + * issue will likely require substantial improvements in the area. + */ + MIN_DELAY_THR_PCT = 500, + MAX_DELAY_THR_PCT = 25000, + MIN_DELAY = 250, + MAX_DELAY = 250 * USEC_PER_MSEC, + + /* + * Halve debts if total usage keeps staying under 25% w/o any shortages + * for over 100ms. + */ + DEBT_BUSY_USAGE_PCT = 25, + DEBT_REDUCTION_IDLE_DUR = 100 * USEC_PER_MSEC, + /* don't let cmds which take a very long time pin lagging for too long */ MAX_LAGGING_PERIODS = 10, @@ -363,7 +387,6 @@ struct ioc_margins { s64 min; s64 low; s64 target; - s64 max; }; struct ioc_missed { @@ -400,6 +423,8 @@ struct ioc { enum ioc_running running; atomic64_t vtime_rate; + u64 vtime_base_rate; + s64 vtime_err; seqcount_spinlock_t period_seqcount; u64 period_at; /* wallclock starttime */ @@ -411,6 +436,9 @@ struct ioc { bool weights_updated; atomic_t hweight_gen; /* for lazy hweights */ + /* the last time debt cancel condition wasn't met */ + u64 debt_busy_at; + u64 autop_too_fast_at; u64 autop_too_slow_at; int autop_idx; @@ -473,6 +501,10 @@ struct ioc_gq { atomic64_t done_vtime; u64 abs_vdebt; + /* current delay in effect and when it started */ + u64 delay; + u64 delay_at; + /* * The period this iocg was last active in. Used for deactivation * and invalidating `vtime`. @@ -495,7 +527,6 @@ struct ioc_gq { struct wait_queue_head waitq; struct hrtimer waitq_timer; - struct hrtimer delay_timer; /* timestamp at the latest activation */ u64 activated_at; @@ -722,12 +753,11 @@ static void ioc_refresh_margins(struct ioc *ioc) { struct ioc_margins *margins = &ioc->margins; u32 period_us = ioc->period_us; - u64 vrate = atomic64_read(&ioc->vtime_rate); + u64 vrate = ioc->vtime_base_rate; margins->min = (period_us * MARGIN_MIN_PCT / 100) * vrate; margins->low = (period_us * MARGIN_LOW_PCT / 100) * vrate; margins->target = (period_us * MARGIN_TARGET_PCT / 100) * vrate; - margins->max = (period_us * MARGIN_MAX_PCT / 100) * vrate; } /* latency Qos params changed, update period_us and all the dependent params */ @@ -793,8 +823,7 @@ static int ioc_autop_idx(struct ioc *ioc) return idx; /* step up/down based on the vrate */ - vrate_pct = div64_u64(atomic64_read(&ioc->vtime_rate) * 100, - VTIME_PER_USEC); + vrate_pct = div64_u64(ioc->vtime_base_rate * 100, VTIME_PER_USEC); now_ns = ktime_get_ns(); if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) { @@ -902,6 +931,43 @@ static bool ioc_refresh_params(struct ioc *ioc, bool force) return true; } +/* + * When an iocg accumulates too much vtime or gets deactivated, we throw away + * some vtime, which lowers the overall device utilization. As the exact amount + * which is being thrown away is known, we can compensate by accelerating the + * vrate accordingly so that the extra vtime generated in the current period + * matches what got lost. + */ +static void ioc_refresh_vrate(struct ioc *ioc, struct ioc_now *now) +{ + s64 pleft = ioc->period_at + ioc->period_us - now->now; + s64 vperiod = ioc->period_us * ioc->vtime_base_rate; + s64 vcomp, vcomp_min, vcomp_max; + + lockdep_assert_held(&ioc->lock); + + /* we need some time left in this period */ + if (pleft <= 0) + goto done; + + /* + * Calculate how much vrate should be adjusted to offset the error. + * Limit the amount of adjustment and deduct the adjusted amount from + * the error. + */ + vcomp = -div64_s64(ioc->vtime_err, pleft); + vcomp_min = -(ioc->vtime_base_rate >> 1); + vcomp_max = ioc->vtime_base_rate; + vcomp = clamp(vcomp, vcomp_min, vcomp_max); + + ioc->vtime_err += vcomp * pleft; + + atomic64_set(&ioc->vtime_rate, ioc->vtime_base_rate + vcomp); +done: + /* bound how much error can accumulate */ + ioc->vtime_err = clamp(ioc->vtime_err, -vperiod, vperiod); +} + /* take a snapshot of the current [v]time and vrate */ static void ioc_now(struct ioc *ioc, struct ioc_now *now) { @@ -1114,8 +1180,8 @@ static void weight_updated(struct ioc_gq *iocg, struct ioc_now *now) static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now) { struct ioc *ioc = iocg->ioc; - u64 last_period, cur_period, max_period_delta; - u64 vtime, vmin; + u64 last_period, cur_period; + u64 vtime, vtarget; int i; /* @@ -1154,21 +1220,15 @@ static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now) goto fail_unlock; /* - * vtime may wrap when vrate is raised substantially due to - * underestimated IO costs. Look at the period and ignore its - * vtime if the iocg has been idle for too long. Also, cap the - * budget it can start with to the margin. + * Always start with the target budget. On deactivation, we throw away + * anything above it. */ - max_period_delta = DIV64_U64_ROUND_UP(VTIME_VALID_DUR, ioc->period_us); + vtarget = now->vnow - ioc->margins.target; vtime = atomic64_read(&iocg->vtime); - vmin = now->vnow - ioc->margins.max; - if (last_period + max_period_delta < cur_period || - time_before64(vtime, vmin)) { - atomic64_add(vmin - vtime, &iocg->vtime); - atomic64_add(vmin - vtime, &iocg->done_vtime); - vtime = vmin; - } + atomic64_add(vtarget - vtime, &iocg->vtime); + atomic64_add(vtarget - vtime, &iocg->done_vtime); + vtime = vtarget; /* * Activate, propagate weight and start period timer if not @@ -1188,6 +1248,7 @@ static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now) if (ioc->running == IOC_IDLE) { ioc->running = IOC_RUNNING; + ioc->debt_busy_at = now->now; ioc_start_period(ioc, now); } @@ -1204,58 +1265,51 @@ static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now) { struct ioc *ioc = iocg->ioc; struct blkcg_gq *blkg = iocg_to_blkg(iocg); - u64 vtime = atomic64_read(&iocg->vtime); - u64 delta_ns, expires, oexpires; + u64 tdelta, delay, new_delay; + s64 vover, vover_pct; u32 hwa; lockdep_assert_held(&iocg->waitq.lock); - /* debt-adjust vtime */ - current_hweight(iocg, &hwa, NULL); - vtime += abs_cost_to_cost(iocg->abs_vdebt, hwa); + /* calculate the current delay in effect - 1/2 every second */ + tdelta = now->now - iocg->delay_at; + if (iocg->delay) + delay = iocg->delay >> div64_u64(tdelta, USEC_PER_SEC); + else + delay = 0; - /* - * Clear or maintain depending on the overage. Non-zero vdebt is what - * guarantees that @iocg is online and future iocg_kick_delay() will - * clear use_delay. Don't leave it on when there's no vdebt. - */ - if (!iocg->abs_vdebt || time_before_eq64(vtime, now->vnow)) { + /* calculate the new delay from the debt amount */ + current_hweight(iocg, &hwa, NULL); + vover = atomic64_read(&iocg->vtime) + + abs_cost_to_cost(iocg->abs_vdebt, hwa) - now->vnow; + vover_pct = div64_s64(100 * vover, + ioc->period_us * ioc->vtime_base_rate); + + if (vover_pct <= MIN_DELAY_THR_PCT) + new_delay = 0; + else if (vover_pct >= MAX_DELAY_THR_PCT) + new_delay = MAX_DELAY; + else + new_delay = MIN_DELAY + + div_u64((MAX_DELAY - MIN_DELAY) * + (vover_pct - MIN_DELAY_THR_PCT), + MAX_DELAY_THR_PCT - MIN_DELAY_THR_PCT); + + /* pick the higher one and apply */ + if (new_delay > delay) { + iocg->delay = new_delay; + iocg->delay_at = now->now; + delay = new_delay; + } + + if (delay >= MIN_DELAY) { + blkcg_set_delay(blkg, delay * NSEC_PER_USEC); + return true; + } else { + iocg->delay = 0; blkcg_clear_delay(blkg); return false; } - if (!atomic_read(&blkg->use_delay) && - time_before_eq64(vtime, now->vnow + ioc->margins.target)) - return false; - - /* use delay */ - delta_ns = DIV64_U64_ROUND_UP(vtime - now->vnow, - now->vrate) * NSEC_PER_USEC; - blkcg_set_delay(blkg, delta_ns); - expires = now->now_ns + delta_ns; - - /* if already active and close enough, don't bother */ - oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->delay_timer)); - if (hrtimer_is_queued(&iocg->delay_timer) && - abs(oexpires - expires) <= ioc->timer_slack_ns) - return true; - - hrtimer_start_range_ns(&iocg->delay_timer, ns_to_ktime(expires), - ioc->timer_slack_ns, HRTIMER_MODE_ABS); - return true; -} - -static enum hrtimer_restart iocg_delay_timer_fn(struct hrtimer *timer) -{ - struct ioc_gq *iocg = container_of(timer, struct ioc_gq, delay_timer); - struct ioc_now now; - unsigned long flags; - - spin_lock_irqsave(&iocg->waitq.lock, flags); - ioc_now(iocg->ioc, &now); - iocg_kick_delay(iocg, &now); - spin_unlock_irqrestore(&iocg->waitq.lock, flags); - - return HRTIMER_NORESTART; } static void iocg_incur_debt(struct ioc_gq *iocg, u64 abs_cost, @@ -1356,10 +1410,11 @@ static void iocg_kick_waitq(struct ioc_gq *iocg, bool pay_debt, atomic64_add(vpay, &iocg->done_vtime); iocg_pay_debt(iocg, abs_vpay, now); vbudget -= vpay; - - iocg_kick_delay(iocg, now); } + if (iocg->abs_vdebt || iocg->delay) + iocg_kick_delay(iocg, now); + /* * Debt can still be outstanding if we haven't paid all yet or the * caller raced and called without @pay_debt. Shouldn't wake up waiters @@ -1389,7 +1444,8 @@ static void iocg_kick_waitq(struct ioc_gq *iocg, bool pay_debt, /* determine next wakeup, add a timer margin to guarantee chunking */ vshortage = -ctx.vbudget; expires = now->now_ns + - DIV64_U64_ROUND_UP(vshortage, now->vrate) * NSEC_PER_USEC; + DIV64_U64_ROUND_UP(vshortage, ioc->vtime_base_rate) * + NSEC_PER_USEC; expires += ioc->timer_slack_ns; /* if already active and close enough, don't bother */ @@ -1504,6 +1560,7 @@ static void iocg_build_inner_walk(struct ioc_gq *iocg, /* collect per-cpu counters and propagate the deltas to the parent */ static void iocg_flush_stat_one(struct ioc_gq *iocg, struct ioc_now *now) { + struct ioc *ioc = iocg->ioc; struct iocg_stat new_stat; u64 abs_vusage = 0; u64 vusage_delta; @@ -1519,7 +1576,7 @@ static void iocg_flush_stat_one(struct ioc_gq *iocg, struct ioc_now *now) vusage_delta = abs_vusage - iocg->last_stat_abs_vusage; iocg->last_stat_abs_vusage = abs_vusage; - iocg->usage_delta_us = div64_u64(vusage_delta, now->vrate); + iocg->usage_delta_us = div64_u64(vusage_delta, ioc->vtime_base_rate); iocg->local_stat.usage_us += iocg->usage_delta_us; new_stat.usage_us = @@ -1561,8 +1618,8 @@ static void iocg_flush_stat(struct list_head *target_iocgs, struct ioc_now *now) * capacity. @hwm is the upper bound and used to signal no donation. This * function also throws away @iocg's excess budget. */ -static u32 hweight_after_donation(struct ioc_gq *iocg, u32 hwm, u32 usage, - struct ioc_now *now) +static u32 hweight_after_donation(struct ioc_gq *iocg, u32 old_hwi, u32 hwm, + u32 usage, struct ioc_now *now) { struct ioc *ioc = iocg->ioc; u64 vtime = atomic64_read(&iocg->vtime); @@ -1577,12 +1634,13 @@ static u32 hweight_after_donation(struct ioc_gq *iocg, u32 hwm, u32 usage, time_after64(vtime, now->vnow - ioc->margins.min)) return hwm; - /* throw away excess above max */ - excess = now->vnow - vtime - ioc->margins.max; + /* throw away excess above target */ + excess = now->vnow - vtime - ioc->margins.target; if (excess > 0) { atomic64_add(excess, &iocg->vtime); atomic64_add(excess, &iocg->done_vtime); vtime += excess; + ioc->vtime_err -= div64_u64(excess * old_hwi, WEIGHT_ONE); } /* @@ -1861,6 +1919,12 @@ static void transfer_surpluses(struct list_head *surpluses, struct ioc_now *now) inuse = DIV64_U64_ROUND_UP( parent->child_adjusted_sum * iocg->hweight_after_donation, parent->hweight_inuse); + + TRACE_IOCG_PATH(inuse_transfer, iocg, now, + iocg->inuse, inuse, + iocg->hweight_inuse, + iocg->hweight_after_donation); + __propagate_weights(iocg, iocg->active, inuse, true, now); } @@ -1875,7 +1939,8 @@ static void ioc_timer_fn(struct timer_list *timer) struct ioc_gq *iocg, *tiocg; struct ioc_now now; LIST_HEAD(surpluses); - int nr_shortages = 0, nr_lagging = 0; + int nr_debtors = 0, nr_shortages = 0, nr_lagging = 0; + u64 usage_us_sum = 0; u32 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM]; u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM]; u32 missed_ppm[2], rq_wait_pct; @@ -1906,16 +1971,37 @@ static void ioc_timer_fn(struct timer_list *timer) */ list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) { if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt && - !iocg_is_idle(iocg)) + !iocg->delay && !iocg_is_idle(iocg)) continue; spin_lock(&iocg->waitq.lock); - if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt) { + if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt || + iocg->delay) { /* might be oversleeping vtime / hweight changes, kick */ iocg_kick_waitq(iocg, true, &now); + if (iocg->abs_vdebt) + nr_debtors++; } else if (iocg_is_idle(iocg)) { /* no waiter and idle, deactivate */ + u64 vtime = atomic64_read(&iocg->vtime); + s64 excess; + + /* + * @iocg has been inactive for a full duration and will + * have a high budget. Account anything above target as + * error and throw away. On reactivation, it'll start + * with the target budget. + */ + excess = now.vnow - vtime - ioc->margins.target; + if (excess > 0) { + u32 old_hwi; + + current_hweight(iocg, NULL, &old_hwi); + ioc->vtime_err -= div64_u64(excess * old_hwi, + WEIGHT_ONE); + } + __propagate_weights(iocg, 0, 0, false, &now); list_del_init(&iocg->active_list); } @@ -1956,11 +2042,12 @@ static void ioc_timer_fn(struct timer_list *timer) * high-latency completions appearing as idle. */ usage_us = iocg->usage_delta_us; + usage_us_sum += usage_us; if (vdone != vtime) { u64 inflight_us = DIV64_U64_ROUND_UP( cost_to_abs_cost(vtime - vdone, hw_inuse), - now.vrate); + ioc->vtime_base_rate); usage_us = max(usage_us, inflight_us); } @@ -1980,21 +2067,25 @@ static void ioc_timer_fn(struct timer_list *timer) if (hw_inuse < hw_active || (!waitqueue_active(&iocg->waitq) && time_before64(vtime, now.vnow - ioc->margins.low))) { - u32 hwa, hwm, new_hwi; + u32 hwa, old_hwi, hwm, new_hwi; /* * Already donating or accumulated enough to start. * Determine the donation amount. */ - current_hweight(iocg, &hwa, NULL); + current_hweight(iocg, &hwa, &old_hwi); hwm = current_hweight_max(iocg); - new_hwi = hweight_after_donation(iocg, hwm, usage, - &now); + new_hwi = hweight_after_donation(iocg, old_hwi, hwm, + usage, &now); if (new_hwi < hwm) { iocg->hweight_donating = hwa; iocg->hweight_after_donation = new_hwi; list_add(&iocg->surplus_list, &surpluses); } else { + TRACE_IOCG_PATH(inuse_shortage, iocg, &now, + iocg->inuse, iocg->active, + iocg->hweight_inuse, new_hwi); + __propagate_weights(iocg, iocg->active, iocg->active, true, &now); nr_shortages++; @@ -2014,6 +2105,38 @@ static void ioc_timer_fn(struct timer_list *timer) list_for_each_entry_safe(iocg, tiocg, &surpluses, surplus_list) list_del_init(&iocg->surplus_list); + /* + * A low weight iocg can amass a large amount of debt, for example, when + * anonymous memory gets reclaimed aggressively. If the system has a lot + * of memory paired with a slow IO device, the debt can span multiple + * seconds or more. If there are no other subsequent IO issuers, the + * in-debt iocg may end up blocked paying its debt while the IO device + * is idle. + * + * The following protects against such pathological cases. If the device + * has been sufficiently idle for a substantial amount of time, the + * debts are halved. The criteria are on the conservative side as we + * want to resolve the rare extreme cases without impacting regular + * operation by forgiving debts too readily. + */ + if (nr_shortages || + div64_u64(100 * usage_us_sum, now.now - ioc->period_at) >= + DEBT_BUSY_USAGE_PCT) + ioc->debt_busy_at = now.now; + + if (nr_debtors && + now.now - ioc->debt_busy_at >= DEBT_REDUCTION_IDLE_DUR) { + list_for_each_entry(iocg, &ioc->active_iocgs, active_list) { + if (iocg->abs_vdebt) { + spin_lock(&iocg->waitq.lock); + iocg->abs_vdebt /= 2; + iocg_kick_waitq(iocg, true, &now); + spin_unlock(&iocg->waitq.lock); + } + } + ioc->debt_busy_at = now.now; + } + /* * If q is getting clogged or we're missing too much, we're issuing * too much IO and should lower vtime rate. If we're not missing @@ -2061,7 +2184,7 @@ static void ioc_timer_fn(struct timer_list *timer) ioc->busy_level = clamp(ioc->busy_level, -1000, 1000); if (ioc->busy_level > 0 || (ioc->busy_level < 0 && !nr_lagging)) { - u64 vrate = atomic64_read(&ioc->vtime_rate); + u64 vrate = ioc->vtime_base_rate; u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max; /* rq_wait signal is always reliable, ignore user vrate_min */ @@ -2098,7 +2221,7 @@ static void ioc_timer_fn(struct timer_list *timer) trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct, nr_lagging, nr_shortages); - atomic64_set(&ioc->vtime_rate, vrate); + ioc->vtime_base_rate = vrate; ioc_refresh_margins(ioc); } else if (ioc->busy_level != prev_busy_level || nr_lagging) { trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate), @@ -2119,8 +2242,11 @@ static void ioc_timer_fn(struct timer_list *timer) ioc_start_period(ioc, &now); } else { ioc->busy_level = 0; + ioc->vtime_err = 0; ioc->running = IOC_IDLE; } + + ioc_refresh_vrate(ioc, &now); } spin_unlock_irq(&ioc->lock); @@ -2132,11 +2258,13 @@ static u64 adjust_inuse_and_calc_cost(struct ioc_gq *iocg, u64 vtime, struct ioc *ioc = iocg->ioc; struct ioc_margins *margins = &ioc->margins; u32 adj_step = DIV_ROUND_UP(iocg->active * INUSE_ADJ_STEP_PCT, 100); + u32 __maybe_unused old_inuse = iocg->inuse, __maybe_unused old_hwi; u32 hwi; s64 margin; u64 cost, new_inuse; current_hweight(iocg, NULL, &hwi); + old_hwi = hwi; cost = abs_cost_to_cost(abs_cost, hwi); margin = now->vnow - vtime - cost; @@ -2171,6 +2299,10 @@ static u64 adjust_inuse_and_calc_cost(struct ioc_gq *iocg, u64 vtime, iocg->inuse != iocg->active); spin_unlock_irq(&ioc->lock); + + TRACE_IOCG_PATH(inuse_adjust, iocg, now, + old_inuse, iocg->inuse, old_hwi, hwi); + return cost; } @@ -2559,6 +2691,7 @@ static int blk_iocost_init(struct request_queue *q) INIT_LIST_HEAD(&ioc->active_iocgs); ioc->running = IOC_IDLE; + ioc->vtime_base_rate = VTIME_PER_USEC; atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC); seqcount_spinlock_init(&ioc->period_seqcount, &ioc->lock); ioc->period_at = ktime_to_us(ktime_get()); @@ -2641,8 +2774,6 @@ static void ioc_pd_init(struct blkg_policy_data *pd) init_waitqueue_head(&iocg->waitq); hrtimer_init(&iocg->waitq_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); iocg->waitq_timer.function = iocg_waitq_timer_fn; - hrtimer_init(&iocg->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - iocg->delay_timer.function = iocg_delay_timer_fn; iocg->level = blkg->blkcg->css.cgroup->level; @@ -2679,7 +2810,6 @@ static void ioc_pd_free(struct blkg_policy_data *pd) spin_unlock_irqrestore(&ioc->lock, flags); hrtimer_cancel(&iocg->waitq_timer); - hrtimer_cancel(&iocg->delay_timer); } free_percpu(iocg->pcpu_stat); kfree(iocg); @@ -2696,7 +2826,7 @@ static size_t ioc_pd_stat(struct blkg_policy_data *pd, char *buf, size_t size) if (iocg->level == 0) { unsigned vp10k = DIV64_U64_ROUND_CLOSEST( - atomic64_read(&ioc->vtime_rate) * 10000, + ioc->vtime_base_rate * 10000, VTIME_PER_USEC); pos += scnprintf(buf + pos, size - pos, " cost.vrate=%u.%02u", vp10k / 100, vp10k % 100); diff --git a/include/trace/events/iocost.h b/include/trace/events/iocost.h index ee024fe8fef6..b350860d2e71 100644 --- a/include/trace/events/iocost.h +++ b/include/trace/events/iocost.h @@ -95,7 +95,7 @@ DECLARE_EVENT_CLASS(iocg_inuse_update, ) ); -DEFINE_EVENT(iocg_inuse_update, iocost_inuse_takeback, +DEFINE_EVENT(iocg_inuse_update, iocost_inuse_shortage, TP_PROTO(struct ioc_gq *iocg, const char *path, struct ioc_now *now, u32 old_inuse, u32 new_inuse, @@ -105,7 +105,7 @@ DEFINE_EVENT(iocg_inuse_update, iocost_inuse_takeback, old_hw_inuse, new_hw_inuse) ); -DEFINE_EVENT(iocg_inuse_update, iocost_inuse_giveaway, +DEFINE_EVENT(iocg_inuse_update, iocost_inuse_transfer, TP_PROTO(struct ioc_gq *iocg, const char *path, struct ioc_now *now, u32 old_inuse, u32 new_inuse, @@ -115,7 +115,7 @@ DEFINE_EVENT(iocg_inuse_update, iocost_inuse_giveaway, old_hw_inuse, new_hw_inuse) ); -DEFINE_EVENT(iocg_inuse_update, iocost_inuse_reset, +DEFINE_EVENT(iocg_inuse_update, iocost_inuse_adjust, TP_PROTO(struct ioc_gq *iocg, const char *path, struct ioc_now *now, u32 old_inuse, u32 new_inuse,