// SPDX-License-Identifier: GPL-2.0
/*
 * buffered writeback throttling. loosely based on CoDel. We can't drop
 * packets for IO scheduling, so the logic is something like this:
 *
 * - Monitor latencies in a defined window of time.
 * - If the minimum latency in the above window exceeds some target, increment
 *   scaling step and scale down queue depth by a factor of 2x. The monitoring
 *   window is then shrunk to 100 / sqrt(scaling step + 1).
 * - For any window where we don't have solid data on what the latencies
 *   look like, retain status quo.
 * - If latencies look good, decrement scaling step.
 * - If we're only doing writes, allow the scaling step to go negative. This
 *   will temporarily boost write performance, snapping back to a stable
 *   scaling step of 0 if reads show up or the heavy writers finish. Unlike
 *   positive scaling steps where we shrink the monitoring window, a negative
 *   scaling step retains the default step==0 window size.
 *
 * Copyright (C) 2016 Jens Axboe
 *
 */
#include <linux/kernel.h>
#include <linux/blk_types.h>
#include <linux/slab.h>
#include <linux/backing-dev.h>
#include <linux/swap.h>
/*
 * If the current state is WBT_STATE_ON/OFF_DEFAULT, it can be changed to
 * any other state; if the current state is WBT_STATE_ON/OFF_MANUAL, it can
 * only be changed to WBT_STATE_OFF/ON_MANUAL (sysfs overrides stick).
 */
enum {
	WBT_STATE_ON_DEFAULT	= 1,	/* on by default */
	WBT_STATE_ON_MANUAL	= 2,	/* on manually by sysfs */
	WBT_STATE_OFF_DEFAULT	= 3,	/* off by default */
	WBT_STATE_OFF_MANUAL	= 4,	/* off manually by sysfs */
};
/*
 * NOTE(review): the span below is damaged by extraction. "struct rq_wb"
 * opens but its member list runs directly into what was originally a
 * separate anonymous enum (RWB_MIN_WRITE_SAMPLES, RWB_UNKNOWN_BUMP), and
 * several tokens are fused (e.g. "unsignedint"). Several struct fields
 * appear to be missing entirely. Code left byte-identical; restore the
 * full struct and enum definitions from the upstream file before building.
 */
struct rq_wb { /* * Settings that govern how we throttle
 */ unsignedint wb_background; /* background writeback */ unsignedint wb_normal; /* normal writeback */
short enable_state; /* WBT_STATE_* */
/* * Number of consecutive periods where we don't have enough * information to make a firm scale up/down decision.
 */ unsignedint unknown_cnt;
/* * Disregard stats, if we don't meet this minimum
 */
/* minimum write-sample count before stats are trusted -- see stat_sample_valid() */
RWB_MIN_WRITE_SAMPLES = 3,
/* * If we have this number of consecutive windows without enough * information to scale up or down, slowly return to center state * (step == 0).
 */
RWB_UNKNOWN_BUMP = 5,
};
staticvoid wb_timestamp(struct rq_wb *rwb, unsignedlong *var)
{ if (rwb_enabled(rwb)) { constunsignedlong cur = jiffies;
if (cur != *var)
*var = cur;
}
}
/*
 * NOTE(review): extraction damage -- the wb_recent_wait() header below is
 * immediately followed by the interior of a *different* function (a wake-up
 * path using wb_acct/limit/inflight/rqw, none of which are declared here).
 * wb_recent_wait()'s real body is missing. Code left byte-identical;
 * recover both functions from the upstream file.
 */
/* * If a task was rate throttled in balance_dirty_pages() within the last * second or so, use that to indicate a higher cleaning rate.
 */ staticbool wb_recent_wait(struct rq_wb *rwb)
{ struct backing_dev_info *bdi = rwb->rqos.disk->bdi;
/* * For discards, our limit is always the background. For writes, if * the device does write back caching, drop further down before we * wake people up.
 */ if (wb_acct & WBT_DISCARD)
limit = rwb->wb_background; elseif (blk_queue_write_cache(rwb->rqos.disk->queue) &&
!wb_recent_wait(rwb))
limit = 0; else
limit = rwb->wb_normal;
/* * Don't wake anyone up if we are above the normal limit.
 */ if (inflight && inflight >= limit) return;
/* wake only when someone is actually sleeping on the rq_wait queue */
if (wq_has_sleeper(&rqw->wait)) { int diff = limit - inflight;
/*
 * NOTE(review): wbt_done() below is truncated by extraction -- the
 * untracked/tracked accounting tail is missing and the braces do not
 * balance. Left byte-identical; recover the full body from upstream.
 */
/* * Called on completion of a request. Note that it's also called when * a request is merged, when the request gets freed.
 */ staticvoid wbt_done(struct rq_qos *rqos, struct request *rq)
{ struct rq_wb *rwb = RQWB(rqos);
/* clear the sync-issue hint if this request was the one being tracked */
if (!wbt_is_tracked(rq)) { if (wbt_is_read(rq)) { if (rwb->sync_cookie == rq) {
rwb->sync_issue = 0;
rwb->sync_cookie = NULL;
}
staticinlinebool stat_sample_valid(struct blk_rq_stat *stat)
{ /* * We need at least one read sample, and a minimum of * RWB_MIN_WRITE_SAMPLES. We require some write samples to know * that it's writes impacting us, and not just some sole read on * a device that is in a lower power state.
*/ return (stat[READ].nr_samples >= 1 &&
stat[WRITE].nr_samples >= RWB_MIN_WRITE_SAMPLES);
}
/*
 * NOTE(review): this is the interior of a latency-classification function
 * whose header was lost in extraction (it returns LAT_* values and uses
 * rwb/bdi/stat from the missing prologue). Left byte-identical.
 */
/* * If our stored sync issue exceeds the window size, or it * exceeds our min target AND we haven't logged any entries, * flag the latency as exceeded. wbt works off completion latencies, * but for a flooded device, a single sync IO can take a long time * to complete after being issued. If this time exceeds our * monitoring window AND we didn't see any other completions in that * window, then count that sync IO as a violation of the latency.
 */
thislat = rwb_sync_issue_lat(rwb); if (thislat > rwb->cur_win_nsec ||
(thislat > rwb->min_lat_nsec && !stat[READ].nr_samples)) {
trace_wbt_lat(bdi, thislat); return LAT_EXCEEDED;
}
/* * No read/write mix, if stat isn't valid
 */ if (!stat_sample_valid(stat)) { /* * If we had writes in this stat window and the window is * current, we're only doing writes. If a task recently * waited or still has writes in flights, consider us doing * just writes as well.
 */ if (stat[WRITE].nr_samples || wb_recent_wait(rwb) ||
wbt_inflight(rwb)) return LAT_UNKNOWN_WRITES; return LAT_UNKNOWN;
}
/* * If the 'min' latency exceeds our target, step down.
 */ if (stat[READ].min > rwb->min_lat_nsec) {
trace_wbt_lat(bdi, stat[READ].min);
trace_wbt_stat(bdi, stat); return LAT_EXCEEDED;
}
/*
 * NOTE(review): interior of the stat-window timer callback; the function
 * header and the declarations of rqd/status/inflight were lost in
 * extraction. Left byte-identical; recover the prologue from upstream.
 */
/* shrink the monitoring window as scale_step grows: win / sqrt(step + 1) */
if (rqd->scale_step > 0) { /* * We should speed this up, using some variant of a fast * integer inverse square root calculation. Since we only do * this for every window expiration, it's not a huge deal, * though.
 */
rwb->cur_win_nsec = div_u64(rwb->win_nsec << 4,
int_sqrt((rqd->scale_step + 1) << 8));
} else { /* * For step < 0, we don't want to increase/decrease the * window size.
 */
rwb->cur_win_nsec = rwb->win_nsec;
}
/* * If we exceeded the latency target, step down. If we did not, * step one level up. If we don't know enough to say either exceeded * or ok, then don't do anything.
 */ switch (status) { case LAT_EXCEEDED:
scale_down(rwb, true); break; case LAT_OK:
scale_up(rwb); break; case LAT_UNKNOWN_WRITES: /* * We don't have a valid read/write sample, but we do have * writes going on. Allow step to go negative, to increase * write performance.
 */
scale_up(rwb); break; case LAT_UNKNOWN: if (++rwb->unknown_cnt < RWB_UNKNOWN_BUMP) break; /* * We get here when previously scaled reduced depth, and we * currently don't have a valid read/write sample. For that * case, slowly return to center state (step == 0).
 */ if (rqd->scale_step > 0)
scale_up(rwb); elseif (rqd->scale_step < 0)
scale_down(rwb, false); break; default: break;
}
/* * Re-arm timer, if we have IO in flight
 */ if (rqd->scale_step || inflight)
rwb_arm_timer(rwb);
}
/*
 * NOTE(review): interior of the per-IO depth-limit selection function;
 * its header and the declaration of opf/limit were lost in extraction.
 * Left byte-identical; recover the prologue from upstream.
 */
/* discards are always throttled at the background depth */
if ((opf & REQ_OP_MASK) == REQ_OP_DISCARD) return rwb->wb_background;
/* * At this point we know it's a buffered write. If this is * swap trying to free memory, or REQ_SYNC is set, then * it's WB_SYNC_ALL writeback, and we'll use the max limit for * that. If the write is marked as a background write, then use * the idle limit, or go to normal if we haven't had competing * IO for a bit.
 */ if ((opf & REQ_HIPRIO) || wb_recent_wait(rwb))
limit = rwb->rq_depth.max_depth; elseif ((opf & REQ_BACKGROUND) || close_io(rwb)) { /* * If less than 100ms since we completed unrelated IO, * limit us to half the depth for background writeback.
 */
limit = rwb->wb_background;
} else
limit = rwb->wb_normal;
/*
 * NOTE(review): __wbt_wait() below is truncated by extraction -- the body
 * ends after the wbt_wait_data initializer, before the actual throttling
 * call. Left byte-identical; recover the rest from upstream.
 */
/* * Block if we will exceed our limit, or if we are currently waiting for * the timer to kick off queuing again.
 */ staticvoid __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct,
blk_opf_t opf)
{ struct rq_wait *rqw = get_rq_wait(rwb, wb_acct); struct wbt_wait_data data = {
.rwb = rwb,
.wb_acct = wb_acct,
.opf = opf,
};
/*
 * May sleep, if we have exceeded the writeback limits.
 *
 * Submission-side hook: untracked IO is let straight through (reads just
 * refresh the last-issue timestamp); tracked IO goes through __wbt_wait()
 * and may throttle. Ensures the stats timer is armed once tracked IO is
 * in flight.
 */
static void wbt_wait(struct rq_qos *rqos, struct bio *bio)
{
	struct rq_wb *rwb = RQWB(rqos);
	enum wbt_flags flags;

	flags = bio_to_wbt_flags(rwb, bio);
	if (!(flags & WBT_TRACKED)) {
		if (flags & WBT_READ)
			wb_timestamp(rwb, &rwb->last_issue);
		return;
	}

	__wbt_wait(rwb, flags, bio->bi_opf);

	if (!blk_stat_is_active(rwb->cb))
		rwb_arm_timer(rwb);
}
/*
 * NOTE(review): tail of the issue-side hook; the function header and any
 * earlier statements were lost in extraction. Left byte-identical.
 */
/* * Track sync issue, in case it takes a long time to complete. Allows us * to react quicker, if a sync IO takes a long time to complete. Note * that this is just a hint. The request can go away when it completes, * so it's important we never dereference it. We only use the address to * compare with, which is why we store the sync_issue time locally.
 */ if (wbt_is_read(rq) && !rwb->sync_issue) {
rwb->sync_cookie = rq;
rwb->sync_issue = rq->io_start_time_ns;
}
}
/*
 * NOTE(review): tail of wbt_enable_default(); the header and the
 * derivation of q/disk/enable were lost in extraction. Left byte-identical.
 */
/* Queue not registered? Maybe shutting down... */ if (!blk_queue_registered(q)) return;
/* only blk-mq queues get wbt set up here */
if (queue_is_mq(q) && enable)
wbt_init(disk);
}
EXPORT_SYMBOL_GPL(wbt_enable_default);
/*
 * Return the default target latency window for @q, in nanoseconds.
 */
u64 wbt_default_latency_nsec(struct request_queue *q)
{
	/*
	 * We default to 2msec for non-rotational storage, and 75msec
	 * for rotational storage.
	 */
	if (blk_queue_nonrot(q))
		return 2000000ULL;
	else
		return 75000000ULL;
}
/*
 * NOTE(review): wbt_data_dir() below is truncated -- the fallback return
 * and closing brace were lost in extraction (trailing non-source text
 * follows). Left byte-identical; recover the tail from upstream.
 */
staticint wbt_data_dir(conststruct request *rq)
{ constenum req_op op = req_op(rq);
if (op == REQ_OP_READ) return READ; elseif (op_is_write(op)) return WRITE;
/*
 * NOTE(review): the German text below is website boilerplate picked up
 * during extraction, not part of this source file. Translation: "The
 * information on this web page was carefully compiled to the best of our
 * knowledge. However, neither completeness, nor correctness, nor quality
 * of the provided information is guaranteed. Note: the colored syntax
 * display and the measurement are still experimental." Preserved here,
 * commented out so it cannot be mistaken for code:
 *
 * Die Informationen auf dieser Webseite wurden
 * nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
 * noch Qualität der bereit gestellten Informationen zugesichert.
 * Bemerkung:
 * Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.
 */