/* * Check if any of the ctx, dispatch list or elevator * have pending work in this hardware queue.
*/ staticbool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
{ return !list_empty_careful(&hctx->dispatch) ||
sbitmap_any_bit_set(&hctx->ctx_map) ||
blk_mq_sched_has_work(hctx);
}
/*
 * Flag @ctx as having pending work in @hctx.
 *
 * The bit index is ctx->index_hw[hctx->type]. The bit is tested first and
 * only written when it is not already set.
 */
static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
				     struct blk_mq_ctx *ctx)
{
	const int ctx_bit = ctx->index_hw[hctx->type];

	/* Already marked: skip the write. */
	if (sbitmap_test_bit(&hctx->ctx_map, ctx_bit))
		return;

	sbitmap_set_bit(&hctx->ctx_map, ctx_bit);
}
/*
 * blk_freeze_queue_start_non_owner - start freezing @q without an owner
 * @q: request queue
 *
 * Non-owner flavour of blk_freeze_queue_start(): the queue does not have
 * to be unfrozen by the task that froze it. Implemented by forwarding to
 * __blk_freeze_queue_start() with NULL as the second argument.
 *
 * This is fragile; avoid it whenever possible.
 */
void blk_freeze_queue_start_non_owner(struct request_queue *q)
{
	__blk_freeze_queue_start(q, NULL);
}
EXPORT_SYMBOL_GPL(blk_freeze_queue_start_non_owner);
/* * FIXME: replace the scsi_internal_device_*block_nowait() calls in the * mpt3sas driver such that this function can be removed.
*/ void blk_mq_quiesce_queue_nowait(struct request_queue *q)
{ unsignedlong flags;
spin_lock_irqsave(&q->queue_lock, flags); if (!q->quiesce_depth++)
blk_queue_flag_set(QUEUE_FLAG_QUIESCED, q);
spin_unlock_irqrestore(&q->queue_lock, flags);
}
EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);
/**
 * blk_mq_wait_quiesce_done() - wait until in-progress quiesce is done
 * @set: tag_set to wait on
 *
 * Waits for an SRCU grace period on @set->srcu when BLK_MQ_F_BLOCKING is
 * set in @set->flags, and for a regular RCU grace period otherwise.
 *
 * Note: it is the driver's responsibility to make sure that quiesce has
 * been started on one or more of the request_queues of the tag_set. This
 * function only waits for the quiesce on those request_queues that had
 * the quiesce flag set using blk_mq_quiesce_queue_nowait.
 */
void blk_mq_wait_quiesce_done(struct blk_mq_tag_set *set)
{
	if (!(set->flags & BLK_MQ_F_BLOCKING)) {
		synchronize_rcu();
		return;
	}

	synchronize_srcu(set->srcu);
}
EXPORT_SYMBOL_GPL(blk_mq_wait_quiesce_done);
/**
 * blk_mq_quiesce_queue() - wait until all ongoing dispatches have finished
 * @q: request queue.
 *
 * Marks the queue quiesced via blk_mq_quiesce_queue_nowait() and then, for
 * mq queues only, waits through blk_mq_wait_quiesce_done().
 *
 * Note: this function does not prevent the struct request end_io()
 * callback function from being invoked. Once this function has returned,
 * no dispatch can happen until the queue is unquiesced via
 * blk_mq_unquiesce_queue().
 */
void blk_mq_quiesce_queue(struct request_queue *q)
{
	blk_mq_quiesce_queue_nowait(q);

	/* Non-mq queues have nothing to wait for. */
	if (!queue_is_mq(q))
		return;

	blk_mq_wait_quiesce_done(q->tag_set);
}
EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue);
/* * blk_mq_unquiesce_queue() - counterpart of blk_mq_quiesce_queue() * @q: request queue. * * This function recovers queue into the state before quiescing * which is done by blk_mq_quiesce_queue.
*/ void blk_mq_unquiesce_queue(struct request_queue *q)
{ unsignedlong flags; bool run_queue = false;
/*
 * Record the allocation time once the allocated request is actually used.
 *
 * With CONFIG_BLK_RQ_ALLOC_TIME, rq->alloc_time_ns becomes @alloc_time_ns
 * when blk_queue_rq_alloc_time() is true for the request's queue and 0
 * otherwise. Without that config option the function body is empty.
 */
static inline void blk_mq_rq_time_init(struct request *rq, u64 alloc_time_ns)
{
#ifdef CONFIG_BLK_RQ_ALLOC_TIME
	rq->alloc_time_ns = blk_queue_rq_alloc_time(rq->q) ? alloc_time_ns : 0;
#endif
}
if (q->elevator) { /* * All requests use scheduler tags when an I/O scheduler is * enabled for the queue.
*/
data->rq_flags |= RQF_SCHED_TAGS;
/* * Flush/passthrough requests are special and go directly to the * dispatch list.
*/ if ((data->cmd_flags & REQ_OP_MASK) != REQ_OP_FLUSH &&
!blk_op_is_passthrough(data->cmd_flags)) { struct elevator_mq_ops *ops = &q->elevator->type->ops;
if (data->flags & BLK_MQ_REQ_RESERVED)
data->rq_flags |= RQF_RESV;
/* * Try batched alloc if we want more than 1 tag.
*/ if (data->nr_tags > 1) {
rq = __blk_mq_alloc_requests_batch(data); if (rq) {
blk_mq_rq_time_init(rq, alloc_time_ns); return rq;
}
data->nr_tags = 1;
}
/* * Waiting allocations only fail because of an inactive hctx. In that * case just retry the hctx assignment and tag allocation as CPU hotplug * should have migrated us to an online CPU by now.
*/
tag = blk_mq_get_tag(data); if (tag == BLK_MQ_NO_TAG) { if (data->flags & BLK_MQ_REQ_NOWAIT) return NULL; /* * Give up the CPU and sleep for a random short time to * ensure that thread using a realtime scheduling class * are migrated off the CPU, and thus off the hctx that * is going away.
*/
msleep(3); goto retry;
}
/* alloc_time includes depth and tag waits */ if (blk_queue_rq_alloc_time(q))
alloc_time_ns = blk_time_get_ns();
/* * If the tag allocator sleeps we could get an allocation for a * different hardware context. No need to complicate the low level * allocator for this for the rare use case of a command tied to * a specific queue.
*/ if (WARN_ON_ONCE(!(flags & BLK_MQ_REQ_NOWAIT)) ||
WARN_ON_ONCE(!(flags & BLK_MQ_REQ_RESERVED))) return ERR_PTR(-EINVAL);
if (hctx_idx >= q->nr_hw_queues) return ERR_PTR(-EIO);
ret = blk_queue_enter(q, flags); if (ret) return ERR_PTR(ret);
/* * Check if the hardware context is actually mapped to anything. * If not tell the caller that it should skip this queue.
*/
ret = -EXDEV;
data.hctx = xa_load(&q->hctx_table, hctx_idx); if (!blk_mq_hw_queue_mapped(data.hctx)) goto out_queue_exit;
cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask); if (cpu >= nr_cpu_ids) goto out_queue_exit;
data.ctx = __blk_mq_get_ctx(q, cpu);
if (q->elevator)
data.rq_flags |= RQF_SCHED_TAGS; else
blk_mq_tag_busy(data.hctx);
if (flags & BLK_MQ_REQ_RESERVED)
data.rq_flags |= RQF_RESV;
ret = -EWOULDBLOCK;
tag = blk_mq_get_tag(&data); if (tag == BLK_MQ_NO_TAG) goto out_queue_exit; if (!(data.rq_flags & RQF_SCHED_TAGS))
blk_mq_inc_active_requests(data.hctx);
rq = blk_mq_rq_ctx_init(&data, blk_mq_tags_from_data(&data), tag);
blk_mq_rq_time_init(rq, alloc_time_ns);
rq->__data_len = 0;
rq->__sector = (sector_t) -1;
rq->bio = rq->biotail = NULL; return rq;
if (rq->rq_flags & RQF_USE_SCHED) {
q->elevator->type->ops.finish_request(rq); /* * For postflush request that may need to be * completed twice, we should clear this flag * to avoid double finish_request() on the rq.
*/
rq->rq_flags &= ~RQF_USE_SCHED;
}
}
/* * Fully end IO on a request. Does not support partial completions, or * errors.
*/ staticvoid blk_complete_request(struct request *req)
{ constbool is_flush = (req->rq_flags & RQF_FLUSH_SEQ) != 0; int total_bytes = blk_rq_bytes(req); struct bio *bio = req->bio;
if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ)
blk_integrity_complete(req, total_bytes);
/* * Upper layers may call blk_crypto_evict_key() anytime after the last * bio_endio(). Therefore, the keyslot must be released before that.
*/
blk_crypto_rq_put_keyslot(req);
blk_account_io_completion(req, total_bytes);
do { struct bio *next = bio->bi_next;
/* Completion has already been traced */
bio_clear_flag(bio, BIO_TRACE_COMPLETION);
if (blk_req_bio_is_zone_append(req, bio))
blk_zone_append_update_request_bio(req, bio);
if (!is_flush)
bio_endio(bio);
bio = next;
} while (bio);
/* * Reset counters so that the request stacking driver * can find how many bytes remain in the request * later.
*/ if (!req->end_io) {
req->bio = NULL;
req->__data_len = 0;
}
}
/** * blk_update_request - Complete multiple bytes without completing the request * @req: the request being processed * @error: block status code * @nr_bytes: number of bytes to complete for @req * * Description: * Ends I/O on a number of bytes attached to @req, but doesn't complete * the request structure even if @req doesn't have leftover. * If @req has leftover, sets it up for the next range of segments. * * Passing the result of blk_rq_bytes() as @nr_bytes guarantees * %false return from this function. * * Note: * The RQF_SPECIAL_PAYLOAD flag is ignored on purpose in this function * except in the consistency check at the end of this function. * * Return: * %false - this request doesn't have any more data * %true - this request has more data
**/ bool blk_update_request(struct request *req, blk_status_t error, unsignedint nr_bytes)
{ bool is_flush = req->rq_flags & RQF_FLUSH_SEQ; bool quiet = req->rq_flags & RQF_QUIET; int total_bytes;
/* * Upper layers may call blk_crypto_evict_key() anytime after the last * bio_endio(). Therefore, the keyslot must be released before that.
*/ if (blk_crypto_rq_has_keyslot(req) && nr_bytes >= blk_rq_bytes(req))
__blk_crypto_rq_put_keyslot(req);
total_bytes = 0; while (req->bio) { struct bio *bio = req->bio; unsigned bio_bytes = min(bio->bi_iter.bi_size, nr_bytes);
if (unlikely(error))
bio->bi_status = error;
if (bio_bytes == bio->bi_iter.bi_size) {
req->bio = bio->bi_next;
} elseif (bio_is_zone_append(bio) && error == BLK_STS_OK) { /* * Partial zone append completions cannot be supported * as the BIO fragments may end up not being written * sequentially.
*/
bio->bi_status = BLK_STS_IOERR;
}
/* Completion has already been traced */
bio_clear_flag(bio, BIO_TRACE_COMPLETION); if (unlikely(quiet))
bio_set_flag(bio, BIO_QUIET);
bio_advance(bio, bio_bytes);
/* Don't actually finish bio if it's part of flush sequence */ if (!bio->bi_iter.bi_size) { if (blk_req_bio_is_zone_append(req, bio))
blk_zone_append_update_request_bio(req, bio); if (!is_flush)
bio_endio(bio);
}
total_bytes += bio_bytes;
nr_bytes -= bio_bytes;
if (!nr_bytes) break;
}
/* * completely done
*/ if (!req->bio) { /* * Reset counters so that the request stacking driver * can find how many bytes remain in the request * later.
*/
req->__data_len = 0; returnfalse;
}
req->__data_len -= total_bytes;
/* update sector only for requests with clear definition of sector */ if (!blk_rq_is_passthrough(req))
req->__sector += total_bytes >> 9;
/* mixed attributes always follow the first bio */ if (req->rq_flags & RQF_MIXED_MERGE) {
req->cmd_flags &= ~REQ_FAILFAST_MASK;
req->cmd_flags |= req->bio->bi_opf & REQ_FAILFAST_MASK;
}
if (!(req->rq_flags & RQF_SPECIAL_PAYLOAD)) { /* * If total number of sectors is less than the first segment * size, something has gone terribly wrong.
*/ if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) {
blk_dump_rq_flags(req, "request botched");
req->__data_len = blk_rq_cur_bytes(req);
}
/* recalculate the number of segments */
req->nr_phys_segments = blk_recalc_rq_segments(req);
}
/* * Account IO completion. flush_rq isn't accounted as a * normal IO on queueing nor completion. Accounting the * containing request is enough.
*/ if ((req->rq_flags & (RQF_IO_STAT|RQF_FLUSH_SEQ)) == RQF_IO_STAT) { constint sgrp = op_stat_group(req_op(req));
staticinlinebool blk_rq_passthrough_stats(struct request *req)
{ struct bio *bio = req->bio;
if (!blk_queue_passthrough_stat(req->q)) returnfalse;
/* Requests without a bio do not transfer data. */ if (!bio) returnfalse;
/* * Stats are accumulated in the bdev, so must have one attached to a * bio to track stats. Most drivers do not set the bdev for passthrough * requests, but nvme is one that will set it.
*/ if (!bio->bi_bdev) returnfalse;
/* * We don't know what a passthrough command does, but we know the * payload size and data direction. Ensuring the size is aligned to the * block size filters out most commands with payloads that don't * represent sector access.
*/ if (blk_rq_bytes(req) & (bdev_logical_block_size(bio->bi_bdev) - 1)) returnfalse; returntrue;
}
/* * All non-passthrough requests are created from a bio with one * exception: when a flush command that is part of a flush sequence * generated by the state machine in blk-flush.c is cloned onto the * lower device by dm-multipath we can get here without a bio.
*/ if (req->bio)
req->part = req->bio->bi_bdev; else
req->part = req->q->disk->part0;
staticinlinebool blk_mq_complete_need_ipi(struct request *rq)
{ int cpu = raw_smp_processor_id();
if (!IS_ENABLED(CONFIG_SMP) ||
!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) returnfalse; /* * With force threaded interrupts enabled, raising softirq from an SMP * function call will always result in waking the ksoftirqd thread. * This is probably worse than completing the request on a different * cache domain.
*/ if (force_irqthreads()) returnfalse;
/* same CPU or cache domain and capacity? Complete locally */ if (cpu == rq->mq_ctx->cpu ||
(!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags) &&
cpus_share_cache(cpu, rq->mq_ctx->cpu) &&
cpus_equal_capacity(cpu, rq->mq_ctx->cpu))) returnfalse;
/* don't try to IPI to an offline CPU */ return cpu_online(rq->mq_ctx->cpu);
}
/* * For request which hctx has only one ctx mapping, * or a polled request, always complete locally, * it's pointless to redirect the completion.
*/ if ((rq->mq_hctx->nr_ctx == 1 &&
rq->mq_ctx->cpu == raw_smp_processor_id()) ||
rq->cmd_flags & REQ_POLLED) returnfalse;
if (blk_mq_complete_need_ipi(rq)) {
blk_mq_complete_send_ipi(rq); returntrue;
}
/**
 * blk_mq_complete_request - end I/O on a request
 * @rq: the request being processed
 *
 * Description:
 *	Complete a request by scheduling the ->complete_rq operation. If
 *	blk_mq_complete_request_remote() reports that it took care of the
 *	request, nothing more is done; otherwise the driver's ->complete()
 *	callback is invoked directly.
 **/
void blk_mq_complete_request(struct request *rq)
{
	if (blk_mq_complete_request_remote(rq))
		return;

	rq->q->mq_ops->complete(rq);
}
EXPORT_SYMBOL(blk_mq_complete_request);
/**
 * blk_mq_start_request - Start processing a request
 * @rq: Pointer to request to be started
 *
 * Function used by device drivers to notify the block layer that a request
 * is going to be processed now, so blk layer can do proper initializations
 * such as starting the timeout timer.
 *
 * In the body visible here two things happen: integrity preparation for
 * WRITE requests that carry integrity data, and publishing the hardware
 * queue number as the poll cookie of a REQ_POLLED bio.
 *
 * NOTE(review): the local 'q' is never referenced and no timer is started
 * in the statements shown, although the description above mentions one.
 * Part of the body appears to be missing from this excerpt - confirm
 * against the full file before changing this function.
 */ void blk_mq_start_request(struct request *rq)
{ struct request_queue *q = rq->q;
/* Integrity data only needs preparing for writes. */
if (blk_integrity_rq(rq) && req_op(rq) == REQ_OP_WRITE)
blk_integrity_prepare(rq);
/* For a polled bio, record the hctx queue number in bi_cookie. */
if (rq->bio && rq->bio->bi_opf & REQ_POLLED)
WRITE_ONCE(rq->bio->bi_cookie, rq->mq_hctx->queue_num);
}
EXPORT_SYMBOL(blk_mq_start_request);
/*
 * Upper bound on the number of requests kept on a plug list.
 *
 * A plug that spans multiple queues is allowed twice
 * BLK_MAX_REQUEST_COUNT. This is important for md arrays to benefit from
 * merging requests.
 */
static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug)
{
	return plug->multiple_queues ? BLK_MAX_REQUEST_COUNT * 2 :
				       BLK_MAX_REQUEST_COUNT;
}
if (!plug->multiple_queues && last && last->q != rq->q)
plug->multiple_queues = true; /* * Any request allocated from sched tags can't be issued to * ->queue_rqs() directly
*/ if (!plug->has_elevator && (rq->rq_flags & RQF_SCHED_TAGS))
plug->has_elevator = true;
rq_list_add_tail(&plug->mq_list, rq);
plug->rq_count++;
}
/** * blk_execute_rq_nowait - insert a request to I/O scheduler for execution * @rq: request to insert * @at_head: insert request at head or tail of queue * * Description: * Insert a fully prepared request at the back of the I/O scheduler queue * for execution. Don't wait for completion. * * Note: * This function will invoke @done directly if the queue is dead.
*/ void blk_execute_rq_nowait(struct request *rq, bool at_head)
{ struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
/*
 * Return true when @rq is assigned to a hardware context whose type is
 * HCTX_TYPE_POLL, and false when it has no hardware context or one of a
 * different type.
 */
bool blk_rq_is_poll(struct request *rq)
{
	return rq->mq_hctx && rq->mq_hctx->type == HCTX_TYPE_POLL;
}
EXPORT_SYMBOL_GPL(blk_rq_is_poll);
/*
 * Poll the hardware context of @rq until @wait has been completed.
 *
 * The hctx is polled at least once; cond_resched() is called after every
 * poll, before the completion is checked.
 */
static void blk_rq_poll_completion(struct request *rq, struct completion *wait)
{
	for (;;) {
		blk_hctx_poll(rq->q, rq->mq_hctx, NULL, 0);
		cond_resched();

		if (completion_done(wait))
			break;
	}
}
/** * blk_execute_rq - insert a request into queue for execution * @rq: request to insert * @at_head: insert request at head or tail of queue * * Description: * Insert a fully prepared request at the back of the I/O scheduler queue * for execution and wait for completion. * Return: The blk_status_t result provided to blk_mq_end_request().
*/
blk_status_t blk_execute_rq(struct request *rq, bool at_head)
{ struct blk_mq_hw_ctx *hctx = rq->mq_hctx; struct blk_rq_wait wait = {
.done = COMPLETION_INITIALIZER_ONSTACK(wait.done),
};
while (!list_empty(&rq_list)) {
rq = list_entry(rq_list.next, struct request, queuelist);
list_del_init(&rq->queuelist); /* * If RQF_DONTPREP is set, the request has been started by the * driver already and might have driver-specific data allocated * already. Insert it into the hctx dispatch list to avoid * block layer merges for the request.
*/ if (rq->rq_flags & RQF_DONTPREP)
blk_mq_request_bypass_insert(rq, 0); else
blk_mq_insert_request(rq, BLK_MQ_INSERT_AT_HEAD);
}
staticbool blk_mq_rq_inflight(struct request *rq, void *priv)
{ /* * If we find a request that isn't idle we know the queue is busy * as it's checked in the iter. * Return false to stop the iteration. * * In case of queue quiesce, if one flush data request is completed, * don't count it as inflight given the flush sequence is suspended, * and the original flush data request is invisible to driver, just * like other pending requests because of quiesce
*/ if (blk_mq_request_started(rq) && !(blk_queue_quiesced(rq->q) &&
blk_is_flush_data_rq(rq) &&
blk_mq_request_completed(rq))) { bool *busy = priv;
/* * blk_mq_queue_tag_busy_iter() has locked the request, so it cannot * be reallocated underneath the timeout handler's processing, then * the expire check is reliable. If the request is not expired, then * it was completed and reallocated as a new request after returning * from blk_mq_check_expired().
*/ if (blk_mq_req_expired(rq, expired)) {
expired->has_timedout_rq = true; returnfalse;
} returntrue;
}
/* A deadlock might occur if a request is stuck requiring a * timeout at the same time a queue freeze is waiting * completion, since the timeout code would not be able to * acquire the queue reference here. * * That's why we don't use blk_queue_enter here; instead, we use * percpu_ref_tryget directly, because we need to be able to * obtain a reference even in the short window between the queue * starting to freeze, by dropping the first reference in * blk_freeze_queue_start, and the moment the last request is * consumed, marked by the instant q_usage_counter reaches * zero.
*/ if (!percpu_ref_tryget(&q->q_usage_counter)) return;
/* check if there is any timed-out request */
blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &expired); if (expired.has_timedout_rq) { /* * Before walking tags, we must ensure any submit started * before the current time has finished. Since the submit * uses srcu or rcu, wait for a synchronization point to * ensure all running submits have finished
*/
blk_mq_wait_quiesce_done(q->tag_set);
if (expired.next != 0) {
mod_timer(&q->timeout, expired.next);
} else { /* * Request timeouts are handled as a forward rolling timer. If * we end up here it means that no requests are pending and * also that no request has been pending for a while. Mark * each hctx as idle.
*/
queue_for_each_hw_ctx(q, hctx, i) { /* the hctx may be unmapped, so check it here */ if (blk_mq_hw_queue_mapped(hctx))
blk_mq_tag_idle(hctx);
}
}
blk_queue_exit(q);
}
/* * Process software queues that have been marked busy, splicing them * to the for-dispatch
*/ void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
{ struct flush_busy_ctx_data data = {
.hctx = hctx,
.list = list,
};
/*
 * Mark us waiting for a tag. For shared tags, this involves hooking us into
 * the tag wakeups. For non-shared tags, we can simply mark us needing a
 * restart. For both cases, take care to check the condition again after
 * marking us as waiting.
 *
 * Returns the result of the final blk_mq_get_driver_tag() attempt, or
 * false early when the dispatch wait entry is already queued.
 *
 * NOTE(review): in the statements visible here 'sbq' and 'wq' are never
 * assigned although sbq->ws_active and wq->lock are used, and the unlocks
 * of wq->lock and hctx->dispatch_wait_lock have no matching lock calls.
 * Part of the body appears to be missing from this excerpt - confirm
 * against the full file before changing this function.
 */ staticbool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx, struct request *rq)
{ struct sbitmap_queue *sbq; struct wait_queue_head *wq;
wait_queue_entry_t *wait; bool ret;
/* Neither flavour of shared tags: a restart mark is all that is needed. */
if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) &&
!(blk_mq_is_shared_tags(hctx->flags))) {
blk_mq_sched_mark_restart_hctx(hctx);
/* * It's possible that a tag was freed in the window between the * allocation failure and adding the hardware queue to the wait * queue. * * Don't clear RESTART here, someone else could have set it. * At most this will cost an extra queue run.
*/ return blk_mq_get_driver_tag(rq);
}
/* A non-empty entry means this hctx is already on a wait queue. */
wait = &hctx->dispatch_wait; if (!list_empty_careful(&wait->entry)) returnfalse;
/* * Add one explicit barrier since blk_mq_get_driver_tag() may * not imply barrier in case of failure. * * Order adding us to wait queue and allocating driver tag. * * The pair is the one implied in sbitmap_queue_wake_up() which * orders clearing sbitmap tag bits and waitqueue_active() in * __sbitmap_queue_wake_up(), since waitqueue_active() is lockless * * Otherwise, re-order of adding wait queue and getting driver tag * may cause __sbitmap_queue_wake_up() to wake up nothing because * the waitqueue_active() may not observe us in wait queue.
*/
smp_mb();
/* * It's possible that a tag was freed in the window between the * allocation failure and adding the hardware queue to the wait * queue.
*/
ret = blk_mq_get_driver_tag(rq); if (!ret) {
/* Still no tag: drop both locks and report failure. */
spin_unlock(&hctx->dispatch_wait_lock);
spin_unlock_irq(&wq->lock); returnfalse;
}
/* * We got a tag, remove ourselves from the wait queue to ensure * someone else gets the wakeup.
*/
list_del_init(&wait->entry);
atomic_dec(&sbq->ws_active);
spin_unlock(&hctx->dispatch_wait_lock);
spin_unlock_irq(&wq->lock);
returntrue;
}
#define BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT 8 #define BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR 4 /* * Update dispatch busy with the Exponential Weighted Moving Average(EWMA): * - EWMA is one simple way to compute running average value * - weight(7/8 and 1/8) is applied so that it can decrease exponentially * - take 4 as factor for avoiding to get too small(0) result, and this * factor doesn't matter because EWMA decreases exponentially
*/ staticvoid blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy)
{ unsignedint ewma;
if (need_budget) {
budget_token = blk_mq_get_dispatch_budget(rq->q); if (budget_token < 0) {
blk_mq_put_driver_tag(rq); return PREP_DISPATCH_NO_BUDGET;
}
blk_mq_set_rq_budget_token(rq, budget_token);
}
if (!blk_mq_get_driver_tag(rq)) { /* * The initial allocation attempt failed, so we need to * rerun the hardware queue when a tag is freed. The * waitqueue takes care of that. If the queue is run * before we add this entry back on the dispatch list, * we'll re-run it below.
*/ if (!blk_mq_mark_tag_wait(hctx, rq)) { /* * All budgets not got from this function will be put * together during handling partial dispatch
*/ if (need_budget)
blk_mq_put_dispatch_budget(rq->q, budget_token); return PREP_DISPATCH_NO_TAG;
}
}
return PREP_DISPATCH_OK;
}
/*
 * Release all allocated budgets before calling blk_mq_dispatch_rq_list.
 *
 * Every request on @list is visited through its queuelist member; a
 * request whose stored budget token is negative is skipped, any other
 * token is handed back with blk_mq_put_dispatch_budget().
 */
static void blk_mq_release_budgets(struct request_queue *q,
				   struct list_head *list)
{
	struct request *cur;

	list_for_each_entry(cur, list, queuelist) {
		const int token = blk_mq_get_rq_budget_token(cur);

		if (token < 0)
			continue;

		blk_mq_put_dispatch_budget(q, token);
	}
}
/*
 * blk_mq_commit_rqs will notify driver using bd->last that there is no
 * more requests. (See comment in struct blk_mq_ops for commit_rqs for
 * details)
 *
 * Nothing happens when the driver has no ->commit_rqs callback or when
 * @queued is zero; otherwise the unplug is traced and the callback runs.
 *
 * Attention, we should explicitly call this in unusual cases:
 *  1) did not queue everything initially scheduled to queue
 *  2) the last attempt to queue a request failed
 */
static void blk_mq_commit_rqs(struct blk_mq_hw_ctx *hctx, int queued,
			      bool from_schedule)
{
	struct request_queue *q = hctx->queue;

	if (!q->mq_ops->commit_rqs || !queued)
		return;

	trace_block_unplug(q, queued, !from_schedule);
	q->mq_ops->commit_rqs(hctx);
}
/* * Returns true if we did some work AND can potentially do more.
*/ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, bool get_budget)
{ enum prep_dispatch prep; struct request_queue *q = hctx->queue; struct request *rq; int queued;
blk_status_t ret = BLK_STS_OK; bool needs_resource = false;
if (list_empty(list)) returnfalse;
/* * Now process all the entries, sending them to the driver.
*/
queued = 0; do { struct blk_mq_queue_data bd;
ret = q->mq_ops->queue_rq(hctx, &bd); switch (ret) { case BLK_STS_OK:
queued++; break; case BLK_STS_RESOURCE:
needs_resource = true;
fallthrough; case BLK_STS_DEV_RESOURCE:
blk_mq_handle_dev_resource(rq, list); goto out; default:
blk_mq_end_request(rq, ret);
}
} while (!list_empty(list));
out: /* If we didn't flush the entire list, we could have told the driver * there was more coming, but that turned out to be a lie.
*/ if (!list_empty(list) || ret != BLK_STS_OK)
blk_mq_commit_rqs(hctx, queued, false);
/* * Any items that need requeuing? Stuff them into hctx->dispatch, * that is where we will continue on next queue run.
*/ if (!list_empty(list)) { bool needs_restart; /* For non-shared tags, the RESTART check will suffice */ bool no_tag = prep == PREP_DISPATCH_NO_TAG &&
((hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) ||
blk_mq_is_shared_tags(hctx->flags));
/* * If the caller allocated budgets, free the budgets of the * requests that have not yet been passed to the block driver.
*/ if (!get_budget)
blk_mq_release_budgets(q, list);
/* * Order adding requests to hctx->dispatch and checking * SCHED_RESTART flag. The pair of this smp_mb() is the one * in blk_mq_sched_restart(). Avoid restart code path to * miss the new added requests to hctx->dispatch, meantime * SCHED_RESTART is observed here.
*/
smp_mb();
/* * If SCHED_RESTART was set by the caller of this function and * it is no longer set that means that it was cleared by another * thread and hence that a queue rerun is needed. * * If 'no_tag' is set, that means that we failed getting * a driver tag with an I/O scheduler attached. If our dispatch * waitqueue is no longer active, ensure that we run the queue * AFTER adding our entries back to the list. * * If no I/O scheduler has been configured it is possible that * the hardware queue got stopped and restarted before requests * were pushed back onto the dispatch list. Rerun the queue to * avoid starvation. Notes: * - blk_mq_run_hw_queue() checks whether or not a queue has * been stopped before rerunning a queue. * - Some but not all block drivers stop a queue before * returning BLK_STS_RESOURCE. Two exceptions are scsi-mq * and dm-rq. * * If driver returns BLK_STS_RESOURCE and SCHED_RESTART * bit is set, run queue after a delay to avoid IO stalls * that could otherwise occur if the queue is idle. We'll do * similar if we couldn't get budget or couldn't lock a zone * and SCHED_RESTART is set.
*/
needs_restart = blk_mq_sched_needs_restart(hctx); if (prep == PREP_DISPATCH_NO_BUDGET)
needs_resource = true; if (!needs_restart ||
(no_tag && list_empty_careful(&hctx->dispatch_wait.entry)))
blk_mq_run_hw_queue(hctx, true); elseif (needs_resource)
blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY);
/*
 * Pick the first CPU of hctx->cpumask that is also in cpu_online_mask.
 * When the intersection is empty (result >= nr_cpu_ids), fall back to the
 * first CPU of hctx->cpumask regardless of its online state.
 */
static inline int blk_mq_first_mapped_cpu(struct blk_mq_hw_ctx *hctx)
{
	int online = cpumask_first_and(hctx->cpumask, cpu_online_mask);

	if (online < nr_cpu_ids)
		return online;

	return cpumask_first(hctx->cpumask);
}
/* * ->next_cpu is always calculated from hctx->cpumask, so simply use * it for speeding up the check
*/ staticbool blk_mq_hctx_empty_cpumask(struct blk_mq_hw_ctx *hctx)
{ return hctx->next_cpu >= nr_cpu_ids;
}
/*
 * It'd be great if the workqueue API had a way to pass
 * in a mask and had some smarts for more clever placement.
 * For now we just round-robin here, switching for every
 * BLK_MQ_CPU_WORK_BATCH queued items.
 *
 * Returns the CPU to queue run work on, or WORK_CPU_UNBOUND when the
 * queue has a single hardware queue, the hctx cpumask is treated as
 * empty, or the chosen CPU is still offline after one retry.
 *
 * NOTE(review): the 'select_cpu' label targeted by the goto below is not
 * present in the statements visible here, and no batching on
 * BLK_MQ_CPU_WORK_BATCH is shown. Part of the body appears to be missing
 * from this excerpt - confirm against the full file.
 */ staticint blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
{ bool tried = false; int next_cpu = hctx->next_cpu;
/* Switch to unbound if no allowable CPUs in this hctx */ if (hctx->queue->nr_hw_queues == 1 || blk_mq_hctx_empty_cpumask(hctx)) return WORK_CPU_UNBOUND;
/* * Do unbound schedule if we can't find a online CPU for this hctx, * and it should only happen in the path of handling CPU DEAD.
*/ if (!cpu_online(next_cpu)) { if (!tried) {
/* Retry the selection exactly once before giving up. */
tried = true; goto select_cpu;
}
/* * Make sure to re-select CPU next time once after CPUs * in hctx->cpumask become online again.
*/
hctx->next_cpu = next_cpu;
hctx->next_cpu_batch = 1; return WORK_CPU_UNBOUND;
}
/* Remember the choice in the hctx and hand it to the caller. */
hctx->next_cpu = next_cpu; return next_cpu;
}
/** * blk_mq_delay_run_hw_queue - Run a hardware queue asynchronously. * @hctx: Pointer to the hardware queue to run. * @msecs: Milliseconds of delay to wait before running the queue. * * Run a hardware queue asynchronously with a delay of @msecs.
*/ void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsignedlong msecs)
{ if (unlikely(blk_mq_hctx_stopped(hctx))) return;
kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work,
msecs_to_jiffies(msecs));
}
EXPORT_SYMBOL(blk_mq_delay_run_hw_queue);
/* * When queue is quiesced, we may be switching io scheduler, or * updating nr_hw_queues, or other things, and we can't run queue * any more, even blk_mq_hctx_has_pending() can't be called safely. * * And queue will be rerun in blk_mq_unquiesce_queue() if it is * quiesced.
*/
__blk_mq_run_dispatch_ops(hctx->queue, false,
need_run = !blk_queue_quiesced(hctx->queue) &&
blk_mq_hctx_has_pending(hctx)); return need_run;
}
/** * blk_mq_run_hw_queue - Start to run a hardware queue. * @hctx: Pointer to the hardware queue to run. * @async: If we want to run the queue asynchronously. * * Check if the request queue is not in a quiesced state and if there are * pending requests to be sent. If this is true, run the queue to send requests * to hardware.
*/ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
{ bool need_run;
/* * We can't run the queue inline with interrupts disabled.
*/
WARN_ON_ONCE(!async && in_interrupt());
need_run = blk_mq_hw_queue_need_run(hctx); if (!need_run) { unsignedlong flags;
/* * Synchronize with blk_mq_unquiesce_queue(), because we check * if hw queue is quiesced locklessly above, we need the use * ->queue_lock to make sure we see the up-to-date status to * not miss rerunning the hw queue.
*/
spin_lock_irqsave(&hctx->queue->queue_lock, flags);
need_run = blk_mq_hw_queue_need_run(hctx);
spin_unlock_irqrestore(&hctx->queue->queue_lock, flags);
if (!need_run) return;
}
if (async || !cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask)) {
blk_mq_delay_run_hw_queue(hctx, 0); return;
}
/* * Return prefered queue to dispatch from (if any) for non-mq aware IO * scheduler.
*/ staticstruct blk_mq_hw_ctx *blk_mq_get_sq_hctx(struct request_queue *q)
{ struct blk_mq_ctx *ctx = blk_mq_get_ctx(q); /* * If the IO scheduler does not respect hardware queues when * dispatching, we just don't bother with multiple HW queues and * dispatch from hctx for the current CPU since running multiple queues * just causes lock contention inside the scheduler and pointless cache * bouncing.
*/ struct blk_mq_hw_ctx *hctx = ctx->hctxs[HCTX_TYPE_DEFAULT];
if (!blk_mq_hctx_stopped(hctx)) return hctx; return NULL;
}
/**
 * blk_mq_run_hw_queues - Run all hardware queues in a request queue.
 * @q: Pointer to the request queue to run.
 * @async: If we want to run the queue asynchronously.
 *
 * Stopped hardware queues are skipped. When blk_queue_sq_sched() is true
 * for @q, a preferred hctx is looked up and the other hardware queues are
 * only run if their dispatch list is non-empty.
 */
void blk_mq_run_hw_queues(struct request_queue *q, bool async)
{
	struct blk_mq_hw_ctx *preferred = NULL;
	struct blk_mq_hw_ctx *hctx;
	unsigned long idx;

	if (blk_queue_sq_sched(q))
		preferred = blk_mq_get_sq_hctx(q);

	queue_for_each_hw_ctx(q, hctx, idx) {
		if (blk_mq_hctx_stopped(hctx))
			continue;

		/*
		 * Skip this hctx only if the IO scheduler prefers a
		 * different one and nothing here bypasses the scheduler.
		 */
		if (preferred && preferred != hctx &&
		    list_empty_careful(&hctx->dispatch))
			continue;

		blk_mq_run_hw_queue(hctx, async);
	}
}
EXPORT_SYMBOL(blk_mq_run_hw_queues);
/** * blk_mq_delay_run_hw_queues - Run all hardware queues asynchronously. * @q: Pointer to the request queue to run. * @msecs: Milliseconds of delay to wait before running the queues.
*/ void blk_mq_delay_run_hw_queues(struct request_queue *q, unsignedlong msecs)
{ struct blk_mq_hw_ctx *hctx, *sq_hctx; unsignedlong i;
sq_hctx = NULL; if (blk_queue_sq_sched(q))
sq_hctx = blk_mq_get_sq_hctx(q);
queue_for_each_hw_ctx(q, hctx, i) { if (blk_mq_hctx_stopped(hctx)) continue; /* * If there is already a run_work pending, leave the * pending delay untouched. Otherwise, a hctx can stall * if another hctx is re-delaying the other's work * before the work executes.
*/ if (delayed_work_pending(&hctx->run_work)) continue; /* * Dispatch from this hctx either if there's no hctx preferred * by IO scheduler or if it has requests that bypass the * scheduler.
*/ if (!sq_hctx || sq_hctx == hctx ||
!list_empty_careful(&hctx->dispatch))
blk_mq_delay_run_hw_queue(hctx, msecs);
}
}
EXPORT_SYMBOL(blk_mq_delay_run_hw_queues);
/* * This function is often used for pausing .queue_rq() by driver when * there isn't enough resource or some conditions aren't satisfied, and * BLK_STS_RESOURCE is usually returned. * * We do not guarantee that dispatch can be drained or blocked * after blk_mq_stop_hw_queue() returns. Please use * blk_mq_quiesce_queue() for that requirement.
*/ void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
{
cancel_delayed_work(&hctx->run_work);
/* * This function is often used for pausing .queue_rq() by driver when * there isn't enough resource or some conditions aren't satisfied, and * BLK_STS_RESOURCE is usually returned. * * We do not guarantee that dispatch can be drained or blocked * after blk_mq_stop_hw_queues() returns. Please use * blk_mq_quiesce_queue() for that requirement.
*/ void blk_mq_stop_hw_queues(struct request_queue *q)
{ struct blk_mq_hw_ctx *hctx; unsignedlong i;
/*
 * Restart @hctx if it is currently stopped: clear BLK_MQ_S_STOPPED and run
 * the hardware queue, forwarding @async to blk_mq_run_hw_queue(). A queue
 * that is not stopped is left alone.
 */
void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
{
	if (blk_mq_hctx_stopped(hctx)) {
		clear_bit(BLK_MQ_S_STOPPED, &hctx->state);

		/*
		 * The barrier pairs with the smp_mb() in
		 * blk_mq_hctx_stopped(): the cleared BLK_MQ_S_STOPPED bit
		 * must be ordered before the dispatch list is checked by
		 * the run below.
		 */
		smp_mb__after_atomic();

		blk_mq_run_hw_queue(hctx, async);
	}
}
EXPORT_SYMBOL_GPL(blk_mq_start_stopped_hw_queue);
/** * blk_mq_request_bypass_insert - Insert a request at dispatch list. * @rq: Pointer to request to be inserted. * @flags: BLK_MQ_INSERT_* * * Should only be used carefully, when the caller knows we want to * bypass a potential IO scheduler on the target device.
*/ staticvoid blk_mq_request_bypass_insert(struct request *rq, blk_insert_t flags)
{ struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
/* * Try to issue requests directly if the hw queue isn't busy to save an * extra enqueue & dequeue to the sw queue.
*/ if (!hctx->dispatch_busy && !run_queue_async) {
blk_mq_run_dispatch_ops(hctx->queue,
blk_mq_try_issue_list_directly(hctx, list)); if (list_empty(list)) goto out;
}
/* * preemption doesn't flush plug list, so it's possible ctx->cpu is * offline now
*/
list_for_each_entry(rq, list, queuelist) {
BUG_ON(rq->mq_ctx != ctx);
trace_block_rq_insert(rq); if (rq->cmd_flags & REQ_NOWAIT)
run_queue_async = true;
}
if (blk_rq_is_passthrough(rq)) { /* * Passthrough request have to be added to hctx->dispatch * directly. The device may be in a situation where it can't * handle FS request, and always returns BLK_STS_RESOURCE for * them, which gets them added to hctx->dispatch. * * If a passthrough request is required to unblock the queues, * and it is added to the scheduler queue, there is no chance to * dispatch it given we prioritize requests in hctx->dispatch.
*/
blk_mq_request_bypass_insert(rq, flags);
} elseif (req_op(rq) == REQ_OP_FLUSH) { /* * Firstly normal IO request is inserted to scheduler queue or * sw queue, meantime we add flush request to dispatch queue( * hctx->dispatch) directly and there is at most one in-flight * flush request for each hw queue, so it doesn't matter to add * flush request to tail or front of the dispatch queue. * * Secondly in case of NCQ, flush request belongs to non-NCQ * command, and queueing it will fail when there is any * in-flight normal IO request(NCQ command). When adding flush * rq to the front of hctx->dispatch, it is easier to introduce * extra time to flush rq's latency because of S_SCHED_RESTART * compared with adding to the tail of dispatch queue, then * chance of flush merge is increased, and less flush requests * will be issued to controller. It is observed that ~10% time * is saved in blktests block/004 on disk attached to AHCI/NCQ * drive when adding flush rq to the front of hctx->dispatch. * * Simply queue flush rq to the front of hctx->dispatch so that * intensive flush workloads can benefit in case of NCQ HW.
*/
blk_mq_request_bypass_insert(rq, BLK_MQ_INSERT_AT_HEAD);
} elseif (q->elevator) {
LIST_HEAD(list);
/* * For OK queue, we are done. For error, caller may kill it. * Any other error (busy), just add it to our list as we * previously would have done.
*/
ret = q->mq_ops->queue_rq(hctx, &bd); switch (ret) { case BLK_STS_OK:
blk_mq_update_dispatch_busy(hctx, false); break; case BLK_STS_RESOURCE: case BLK_STS_DEV_RESOURCE:
blk_mq_update_dispatch_busy(hctx, true);
__blk_mq_requeue_request(rq); break; default:
blk_mq_update_dispatch_busy(hctx, false); break;
}
return ret;
}
/*
 * Acquire both a dispatch budget token and a driver tag for @rq.
 *
 * Returns true when both were obtained (the budget token is recorded in @rq).
 * Returns false if no budget was available, or if the driver tag could not
 * be obtained, in which case the budget taken here is given back first.
 */
static bool blk_mq_get_budget_and_tag(struct request *rq)
{
	int budget_token;

	budget_token = blk_mq_get_dispatch_budget(rq->q);
	if (budget_token < 0)
		return false;

	blk_mq_set_rq_budget_token(rq, budget_token);
	if (!blk_mq_get_driver_tag(rq)) {
		/* Do not keep the budget when no tag is available. */
		blk_mq_put_dispatch_budget(rq->q, budget_token);
		return false;
	}

	return true;
}
/** * blk_mq_try_issue_directly - Try to send a request directly to device driver. * @hctx: Pointer of the associated hardware queue. * @rq: Pointer to request to be sent. * * If the device has enough resources to accept a new request now, send the * request directly to device driver. Else, insert at hctx->dispatch queue, so * we can try send it another time in the future. Requests inserted at this * queue have higher priority.
*/ staticvoid blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, struct request *rq)
{
blk_status_t ret;
/* * Peek first request and see if we have a ->queue_rqs() hook. * If we do, we can dispatch the whole list in one go. * We already know at this point that all requests belong to the * same queue, caller must ensure that's the case.
*/ if (q->mq_ops->queue_rqs) {
blk_mq_run_dispatch_ops(q, __blk_mq_flush_list(q, rqs)); if (rq_list_empty(rqs)) return;
}
/* * We may have been called recursively midway through handling * plug->mq_list via a schedule() in the driver's queue_rq() callback. * To avoid mq_list changing under our feet, clear rq_count early and * bail out specifically if rq_count is 0 rather than checking * whether the mq_list is empty.
*/ if (plug->rq_count == 0) return;
depth = plug->rq_count;
plug->rq_count = 0;
if (!plug->has_elevator && !from_schedule) { if (plug->multiple_queues) {
blk_mq_dispatch_multiple_queue_requests(&plug->mq_list); return;
}
blk_mq_dispatch_queue_requests(&plug->mq_list, depth); if (rq_list_empty(&plug->mq_list)) return;
}
do {
blk_mq_dispatch_list(&plug->mq_list, from_schedule);
} while (!rq_list_empty(&plug->mq_list));
}
staticvoid blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, struct list_head *list)
{ int queued = 0;
blk_status_t ret = BLK_STS_OK;
rq = __blk_mq_alloc_requests(&data); if (unlikely(!rq))
rq_qos_cleanup(q, bio); return rq;
}
/* * Check if there is a suitable cached request and return it.
*/ staticstruct request *blk_mq_peek_cached_request(struct blk_plug *plug, struct request_queue *q, blk_opf_t opf)
{ enum hctx_type type = blk_mq_get_hctx_type(opf); struct request *rq;
if (!plug) return NULL;
rq = rq_list_peek(&plug->cached_rqs); if (!rq || rq->q != q) return NULL; if (type != rq->mq_hctx->type &&
(type != HCTX_TYPE_READ || rq->mq_hctx->type != HCTX_TYPE_DEFAULT)) return NULL; if (op_is_flush(rq->cmd_flags) != op_is_flush(opf)) return NULL; return rq;
}
staticvoid blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug, struct bio *bio)
{ if (rq_list_pop(&plug->cached_rqs) != rq)
WARN_ON_ONCE(1);
/* * If any qos ->throttle() end up blocking, we will have flushed the * plug and hence killed the cached_rq list as well. Pop this entry * before we throttle.
*/
rq_qos_throttle(rq->q, bio);
/* .bi_sector of any zero sized bio need to be initialized */ if ((bio->bi_iter.bi_size & bs_mask) ||
((bio->bi_iter.bi_sector << SECTOR_SHIFT) & bs_mask)) returntrue; returnfalse;
}
/** * blk_mq_submit_bio - Create and send a request to block device. * @bio: Bio pointer. * * Builds up a request structure from @q and @bio and send to the device. The * request may not be queued directly to hardware if: * * This request can be merged with another one * * We want to place request at plug queue for possible future merging * * There is an IO scheduler active at this queue * * It will not queue the request if there is an error with the bio, or at the * request creation.
*/ void blk_mq_submit_bio(struct bio *bio)
{ struct request_queue *q = bdev_get_queue(bio->bi_bdev); struct blk_plug *plug = current->plug; constint is_sync = op_is_sync(bio->bi_opf); struct blk_mq_hw_ctx *hctx; unsignedint nr_segs; struct request *rq;
blk_status_t ret;
/* * If the plug has a cached request for this queue, try to use it.
*/
rq = blk_mq_peek_cached_request(plug, q, bio->bi_opf);
/* * A BIO that was released from a zone write plug has already been * through the preparation in this function, already holds a reference * on the queue usage counter, and is the only write BIO in-flight for * the target zone. Go straight to preparing a request for it.
*/ if (bio_zone_write_plugging(bio)) {
nr_segs = bio->__bi_nr_segments; if (rq)
blk_queue_exit(q); goto new_request;
}
/* * The cached request already holds a q_usage_counter reference and we * don't have to acquire a new one if we use it.
*/ if (!rq) { if (unlikely(bio_queue_enter(bio))) return;
}
/* * Device reconfiguration may change logical block size or reduce the * number of poll queues, so the checks for alignment and poll support * have to be done with queue usage counter held.
*/ if (unlikely(bio_unaligned(bio, q))) {
bio_io_error(bio); goto queue_exit;
}
queue_exit: /* * Don't drop the queue reference if we were trying to use a cached * request and thus didn't acquire one.
*/ if (!rq)
blk_queue_exit(q);
}
#ifdef CONFIG_BLK_MQ_STACKING
/**
 * blk_insert_cloned_request - Helper for stacking drivers to submit a request
 * @rq: the request being queued
 *
 * Return: BLK_STS_NOTSUPP if @rq is larger than a max-sectors limit of 0,
 * BLK_STS_IOERR if @rq exceeds the max-sectors or max-segments limit or if
 * should_fail_request() reports a failure, the status from
 * blk_crypto_rq_get_keyslot() if that is not BLK_STS_OK, and otherwise the
 * status of issuing @rq directly.
 */
blk_status_t blk_insert_cloned_request(struct request *rq)
{
	struct request_queue *q = rq->q;
	unsigned int max_sectors = blk_queue_get_max_sectors(rq);
	unsigned int max_segments = blk_rq_get_max_segments(rq);
	blk_status_t ret;

	if (blk_rq_sectors(rq) > max_sectors) {
		/*
		 * SCSI device does not have a good way to return if
		 * Write Same/Zero is actually supported. If a device rejects
		 * a non-read/write command (discard, write same, etc.) the
		 * low-level device driver will set the relevant queue limit to
		 * 0 to prevent blk-lib from issuing more of the offending
		 * operations. Commands queued prior to the queue limit being
		 * reset need to be completed with BLK_STS_NOTSUPP to avoid I/O
		 * errors being propagated to upper layers.
		 */
		if (max_sectors == 0)
			return BLK_STS_NOTSUPP;

		printk(KERN_ERR "%s: over max size limit. (%u > %u)\n",
			__func__, blk_rq_sectors(rq), max_sectors);
		return BLK_STS_IOERR;
	}

	/*
	 * The queue settings related to segment counting may differ from the
	 * original queue.
	 */
	rq->nr_phys_segments = blk_recalc_rq_segments(rq);
	if (rq->nr_phys_segments > max_segments) {
		printk(KERN_ERR "%s: over max segments limit. (%u > %u)\n",
			__func__, rq->nr_phys_segments, max_segments);
		return BLK_STS_IOERR;
	}

	if (q->disk && should_fail_request(q->disk->part0, blk_rq_bytes(rq)))
		return BLK_STS_IOERR;

	ret = blk_crypto_rq_get_keyslot(rq);
	if (ret != BLK_STS_OK)
		return ret;

	blk_account_io_start(rq);

	/*
	 * Since we have a scheduler attached on the top device,
	 * bypass a potential scheduler on the bottom device for
	 * insert.
	 */
	blk_mq_run_dispatch_ops(q,
			ret = blk_mq_request_issue_directly(rq, true));
	/* A failed issue ends the accounting that was started above. */
	if (ret)
		blk_account_io_done(rq, blk_time_get_ns());
	return ret;
}
EXPORT_SYMBOL_GPL(blk_insert_cloned_request);
/** * blk_rq_unprep_clone - Helper function to free all bios in a cloned request * @rq: the clone request to be cleaned up * * Description: * Free all bios in @rq for a cloned request.
*/ void blk_rq_unprep_clone(struct request *rq)
{ struct bio *bio;
while ((bio = rq->bio) != NULL) {
rq->bio = bio->bi_next;
/** * blk_rq_prep_clone - Helper function to setup clone request * @rq: the request to be setup * @rq_src: original request to be cloned * @bs: bio_set that bios for clone are allocated from * @gfp_mask: memory allocation mask for bio * @bio_ctr: setup function to be called for each clone bio. * Returns %0 for success, non %0 for failure. * @data: private data to be passed to @bio_ctr * * Description: * Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq. * Also, pages which the original bios are pointing to are not copied * and the cloned bios just point same pages. * So cloned bios must be completed before original bios, which means * the caller must complete @rq before @rq_src.
*/ int blk_rq_prep_clone(struct request *rq, struct request *rq_src, struct bio_set *bs, gfp_t gfp_mask, int (*bio_ctr)(struct bio *, struct bio *, void *), void *data)
{ struct bio *bio_src;
if (!bs)
bs = &fs_bio_set;
__rq_for_each_bio(bio_src, rq_src) { struct bio *bio = bio_alloc_clone(rq->q->disk->part0, bio_src,
gfp_mask, bs); if (!bio) goto free_and_out;
/* * Steal bios from a request and add them to a bio list. * The request must not have been partially completed before.
*/ void blk_steal_bios(struct bio_list *list, struct request *rq)
{ if (rq->bio) { if (list->tail)
list->tail->bi_next = rq->bio; else
list->head = rq->bio;
list->tail = rq->biotail;
/* called before freeing request pool in @tags */ staticvoid blk_mq_clear_rq_mapping(struct blk_mq_tags *drv_tags, struct blk_mq_tags *tags)
{ struct page *page; unsignedlong flags;
/* * There is no need to clear mapping if driver tags is not initialized * or the mapping belongs to the driver tags.
*/ if (!drv_tags || drv_tags == tags) return;
list_for_each_entry(page, &tags->page_list, lru) { unsignedlong start = (unsignedlong)page_address(page); unsignedlong end = start + order_to_size(page->private); int i;
for (i = 0; i < drv_tags->nr_tags; i++) { struct request *rq = drv_tags->rqs[i]; unsignedlong rq_addr = (unsignedlong)rq;
/* * Wait until all pending iteration is done. * * Request reference is cleared and it is guaranteed to be observed * after the ->lock is released.
*/
spin_lock_irqsave(&drv_tags->lock, flags);
spin_unlock_irqrestore(&drv_tags->lock, flags);
}
/* * rq_size is the size of the request plus driver payload, rounded * to the cacheline size
*/
rq_size = round_up(sizeof(struct request) + set->cmd_size,
cache_line_size());
left = rq_size * depth;
for (i = 0; i < depth; ) { int this_order = max_order; struct page *page; int to_do; void *p;
while (this_order && left < order_to_size(this_order - 1))
this_order--;
do {
page = alloc_pages_node(node,
GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO,
this_order); if (page) break; if (!this_order--) break; if (order_to_size(this_order) < rq_size) break;
} while (1);
staticbool blk_mq_hctx_has_online_cpu(struct blk_mq_hw_ctx *hctx, unsignedint this_cpu)
{ enum hctx_type type = hctx->type; int cpu;
/* * hctx->cpumask has to rule out isolated CPUs, but userspace still * might submit IOs on these isolated CPUs, so use the queue map to * check if all CPUs mapped to this hctx are offline
*/
for_each_online_cpu(cpu) { struct blk_mq_hw_ctx *h = blk_mq_map_queue_type(hctx->queue,
type, cpu);
if (h != hctx) continue;
/* this hctx has at least one online CPU */ if (this_cpu != cpu) returntrue;
}
if (blk_mq_hctx_has_online_cpu(hctx, cpu)) return 0;
/* * Prevent new request from being allocated on the current hctx. * * The smp_mb__after_atomic() Pairs with the implied barrier in * test_and_set_bit_lock in sbitmap_get(). Ensures the inactive flag is * seen once we return from the tag allocator.
*/
set_bit(BLK_MQ_S_INACTIVE, &hctx->state);
smp_mb__after_atomic();
/* * Try to grab a reference to the queue and wait for any outstanding * requests. If we could not grab a reference the queue has been * frozen and there are no requests.
*/ if (percpu_ref_tryget(&hctx->queue->q_usage_counter)) { while (blk_mq_hctx_has_requests(hctx))
msleep(5);
percpu_ref_put(&hctx->queue->q_usage_counter);
}
return 0;
}
/* * Check if one CPU is mapped to the specified hctx * * Isolated CPUs have been ruled out from hctx->cpumask, which is supposed * to be used for scheduling kworker only. For other usage, please call this * helper for checking if one CPU belongs to the specified hctx
*/ staticbool blk_mq_cpu_mapped_to_hctx(unsignedint cpu, conststruct blk_mq_hw_ctx *hctx)
{ struct blk_mq_hw_ctx *mapped_hctx = blk_mq_map_queue_type(hctx->queue,
hctx->type, cpu);
/* * Unregister cpuhp callbacks from exited hw queues * * Safe to call if this `request_queue` is live
*/ staticvoid blk_mq_remove_hw_queues_cpuhp(struct request_queue *q)
{
LIST_HEAD(hctx_list);
/* * Register cpuhp callbacks from all hw queues * * Safe to call if this `request_queue` is live
*/ staticvoid blk_mq_add_hw_queues_cpuhp(struct request_queue *q)
{ struct blk_mq_hw_ctx *hctx; unsignedlong i;
/* * Before freeing hw queue, clearing the flush request reference in * tags->rqs[] for avoiding potential UAF.
*/ staticvoid blk_mq_clear_flush_rq_mapping(struct blk_mq_tags *tags, unsignedint queue_depth, struct request *flush_rq)
{ int i; unsignedlong flags;
/* The hw queue may not be mapped yet */ if (!tags) return;
WARN_ON_ONCE(req_ref_read(flush_rq) != 0);
for (i = 0; i < queue_depth; i++)
cmpxchg(&tags->rqs[i], flush_rq, NULL);
/* * Wait until all pending iteration is done. * * Request reference is cleared and it is guaranteed to be observed * after the ->lock is released.
*/
spin_lock_irqsave(&tags->lock, flags);
spin_unlock_irqrestore(&tags->lock, flags);
}
/* hctx->ctxs will be freed in queue's release handler */ staticvoid blk_mq_exit_hctx(struct request_queue *q, struct blk_mq_tag_set *set, struct blk_mq_hw_ctx *hctx, unsignedint hctx_idx)
{ struct request *flush_rq = hctx->fq->flush_rq;
if (blk_mq_hw_queue_mapped(hctx))
blk_mq_tag_idle(hctx);
if (blk_queue_init_done(q))
blk_mq_clear_flush_rq_mapping(set->tags[hctx_idx],
set->queue_depth, flush_rq); if (set->ops->exit_request)
set->ops->exit_request(set, flush_rq, hctx_idx);
if (set->ops->exit_hctx)
set->ops->exit_hctx(hctx, hctx_idx);
/* * Allocate space for all possible cpus to avoid allocation at * runtime
*/
hctx->ctxs = kmalloc_array_node(nr_cpu_ids, sizeof(void *),
gfp, node); if (!hctx->ctxs) goto free_cpumask;
__ctx->cpu = i;
spin_lock_init(&__ctx->lock); for (k = HCTX_TYPE_DEFAULT; k < HCTX_MAX_TYPES; k++)
INIT_LIST_HEAD(&__ctx->rq_lists[k]);
__ctx->queue = q;
/* * Set local node, IFF we have more than one hw queue. If * not, we remain on the home node of the device
*/ for (j = 0; j < set->nr_maps; j++) {
hctx = blk_mq_map_queue_type(q, j, i); if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
hctx->numa_node = cpu_to_node(i);
}
}
}
/* * Map software to hardware queues. * * If the cpu isn't present, the cpu is mapped to first hctx.
*/
for_each_possible_cpu(i) {
ctx = per_cpu_ptr(q->queue_ctx, i); for (j = 0; j < set->nr_maps; j++) { if (!set->map[j].nr_queues) {
ctx->hctxs[j] = blk_mq_map_queue_type(q,
HCTX_TYPE_DEFAULT, i); continue;
}
hctx_idx = set->map[j].mq_map[i]; /* unmapped hw queue can be remapped after CPU topo changed */ if (!set->tags[hctx_idx] &&
!__blk_mq_alloc_map_and_rqs(set, hctx_idx)) { /* * If tags initialization fail for some hctx, * that hctx won't be brought online. In this * case, remap the current ctx to hctx[0] which * is guaranteed to always have tags allocated
*/
set->map[j].mq_map[i] = 0;
}
hctx = blk_mq_map_queue_type(q, j, i);
ctx->hctxs[j] = hctx; /* * If the CPU is already set in the mask, then we've * mapped this one already. This can happen if * devices share queues across queue maps.
*/ if (cpumask_test_cpu(i, hctx->cpumask)) continue;
/* * If no software queues are mapped to this hardware queue, * disable it and free the request entries.
*/ if (!hctx->nr_ctx) { /* Never unmap queue 0. We need it as a * fallback in case of a new remap fails * allocation
*/ if (i)
__blk_mq_free_map_and_rqs(set, i);
hctx->tags = NULL; continue;
}
hctx->tags = set->tags[i];
WARN_ON(!hctx->tags);
/* * Set the map size to the number of mapped software queues. * This is more accurate and more efficient than looping * over all possibly mapped software queues.
*/
sbitmap_resize(&hctx->ctx_map, hctx->nr_ctx);
/* * Rule out isolated CPUs from hctx->cpumask to avoid * running block kworker on isolated CPUs
*/
for_each_cpu(cpu, hctx->cpumask) { if (cpu_is_isolated(cpu))
cpumask_clear_cpu(cpu, hctx->cpumask);
}
/* * Caller needs to ensure that we're either frozen/quiesced, or that * the queue isn't live yet.
*/ staticvoid queue_set_hctx_shared(struct request_queue *q, bool shared)
{ struct blk_mq_hw_ctx *hctx; unsignedlong i;
/* * Check to see if we're transitioning to shared (from 1 to 2 queues).
*/ if (!list_empty(&set->tag_list) &&
!(set->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) {
set->flags |= BLK_MQ_F_TAG_QUEUE_SHARED; /* update existing queue */
blk_mq_update_tag_set_shared(set, true);
} if (set->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
queue_set_hctx_shared(q, true);
list_add_tail(&q->tag_set_list, &set->tag_list);
mutex_unlock(&set->tag_list_lock);
}
/* All allocations will be freed in release handler of q->mq_kobj */ staticint blk_mq_alloc_ctxs(struct request_queue *q)
{ struct blk_mq_ctxs *ctxs; int cpu;
ctxs = kzalloc(sizeof(*ctxs), GFP_KERNEL); if (!ctxs) return -ENOMEM;
ctxs->queue_ctx = alloc_percpu(struct blk_mq_ctx); if (!ctxs->queue_ctx) goto fail;
/*
 * It is the actual release handler for mq, but we do it from
 * request queue's release handler for avoiding use-after-free
 * and headache because q->mq_kobj shouldn't have been introduced,
 * but we can't group ctx/kctx kobj without it.
 */
void blk_mq_release(struct request_queue *q)
{
	struct blk_mq_hw_ctx *hctx, *next;
	/* NOTE(review): 'i' is not referenced in the body visible here. */
	unsigned long i;

	/* all hctx are in .unused_hctx_list now */
	list_for_each_entry_safe(hctx, next, &q->unused_hctx_list, hctx_list) {
		/* Unlink first, then drop the reference held on the kobject. */
		list_del_init(&hctx->hctx_list);
		kobject_put(&hctx->kobj);
	}

	xa_destroy(&q->hctx_table);

	/*
	 * release .mq_kobj and sw queue's kobject now because
	 * both share lifetime with request queue.
	 */
	blk_mq_sysfs_deinit(q);
}
if (!lim)
lim = &default_lim;
lim->features |= BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT; if (set->nr_maps > HCTX_TYPE_POLL)
lim->features |= BLK_FEAT_POLL;
q = blk_alloc_queue(lim, set->numa_node); if (IS_ERR(q)) return q;
q->queuedata = queuedata;
ret = blk_mq_init_allocated_queue(set, q); if (ret) {
blk_put_queue(q); return ERR_PTR(ret);
} return q;
}
EXPORT_SYMBOL(blk_mq_alloc_queue);
/** * blk_mq_destroy_queue - shutdown a request queue * @q: request queue to shutdown * * This shuts down a request queue allocated by blk_mq_alloc_queue(). All future * requests will be failed with -ENODEV. The caller is responsible for dropping * the reference from blk_mq_alloc_queue() by calling blk_put_queue(). * * Context: can sleep
*/ void blk_mq_destroy_queue(struct request_queue *q)
{
WARN_ON_ONCE(!queue_is_mq(q));
WARN_ON_ONCE(blk_queue_registered(q));
if (!blk_get_queue(q)) return NULL;
disk = __alloc_disk_node(q, NUMA_NO_NODE, lkclass); if (!disk)
blk_put_queue(q); return disk;
}
EXPORT_SYMBOL(blk_mq_alloc_disk_for_queue);
/*
 * Only hctx removed from cpuhp list can be reused.
 *
 * Returns true when both the cpuhp_online and cpuhp_dead nodes of @hctx are
 * unhashed.
 */
static bool blk_mq_hctx_is_reusable(struct blk_mq_hw_ctx *hctx)
{
	return hlist_unhashed(&hctx->cpuhp_online) &&
		hlist_unhashed(&hctx->cpuhp_dead);
}
staticstruct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx( struct blk_mq_tag_set *set, struct request_queue *q, int hctx_idx, int node)
{ struct blk_mq_hw_ctx *hctx = NULL, *tmp;
/* reuse dead hctx first */
spin_lock(&q->unused_hctx_lock);
list_for_each_entry(tmp, &q->unused_hctx_list, hctx_list) { if (tmp->numa_node == node && blk_mq_hctx_is_reusable(tmp)) {
hctx = tmp; break;
}
} if (hctx)
list_del_init(&hctx->hctx_list);
spin_unlock(&q->unused_hctx_lock);
if (!hctx)
hctx = blk_mq_alloc_hctx(q, set, node); if (!hctx) goto fail;
if (blk_mq_init_hctx(q, set, hctx, hctx_idx)) goto free_hctx;
for (i = 0; i < set->nr_hw_queues; i++) { int old_node; int node = blk_mq_get_hctx_node(set, i); struct blk_mq_hw_ctx *old_hctx = xa_load(&q->hctx_table, i);
if (!blk_mq_alloc_and_init_hctx(set, q, i, node)) { if (!old_hctx) break;
pr_warn("Allocate new hctx on node %d fails, fallback to previous one on node %d\n",
node, old_node);
hctx = blk_mq_alloc_and_init_hctx(set, q, i, old_node);
WARN_ON_ONCE(!hctx);
}
} /* * Increasing nr_hw_queues fails. Free the newly allocated * hctxs and keep the previous q->nr_hw_queues.
*/ if (i != set->nr_hw_queues) {
j = q->nr_hw_queues;
} else {
j = i;
q->nr_hw_queues = set->nr_hw_queues;
}
/*
 * Tear down the hardware queues of @q and detach @q from its tag set.
 * Tags can _not_ be used after returning from blk_mq_exit_queue.
 */
void blk_mq_exit_queue(struct request_queue *q)
{
	struct blk_mq_tag_set *tag_set = q->tag_set;

	/*
	 * Order matters: exiting the hw queues checks
	 * hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED, and removing the queue
	 * from the tag set afterwards may clear that flag in hctx->flags.
	 */
	blk_mq_exit_hw_queues(q, tag_set, tag_set->nr_hw_queues);
	blk_mq_del_queue_tag_set(q);
}
/*
 * Allocate the tag map and requests for every hardware queue of @set, plus
 * the shared tags when blk_mq_is_shared_tags() holds for the set.
 *
 * Returns 0 on success. On failure, everything allocated by this call is
 * released again and -ENOMEM is returned.
 */
static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
{
	int i;

	if (blk_mq_is_shared_tags(set->flags)) {
		set->shared_tags = blk_mq_alloc_map_and_rqs(set,
						BLK_MQ_NO_HCTX_IDX,
						set->queue_depth);
		if (!set->shared_tags)
			return -ENOMEM;
	}

	for (i = 0; i < set->nr_hw_queues; i++) {
		if (!__blk_mq_alloc_map_and_rqs(set, i))
			goto out_unwind;
		cond_resched();
	}

	return 0;

out_unwind:
	/* Index i itself failed; release only the ones before it. */
	while (--i >= 0)
		__blk_mq_free_map_and_rqs(set, i);

	if (blk_mq_is_shared_tags(set->flags)) {
		blk_mq_free_map_and_rqs(set, set->shared_tags,
					BLK_MQ_NO_HCTX_IDX);
	}

	return -ENOMEM;
}
/*
 * Allocate the request maps associated with this tag_set. Note that this
 * may reduce the depth asked for, if memory is tight. set->queue_depth
 * will be updated to reflect the allocated depth.
 *
 * Returns 0 on success and -ENOMEM if no acceptable depth could be allocated.
 */
static int blk_mq_alloc_set_map_and_rqs(struct blk_mq_tag_set *set)
{
	unsigned int depth;
	int err;

	/* Remember the requested depth so a reduction can be reported. */
	depth = set->queue_depth;
	do {
		err = __blk_mq_alloc_rq_maps(set);
		if (!err)
			break;

		/* Retry with half the depth, but never go below the minimum. */
		set->queue_depth >>= 1;
		if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) {
			err = -ENOMEM;
			break;
		}
	} while (set->queue_depth);

	if (!set->queue_depth || err) {
		pr_err("blk-mq: failed to allocate request map\n");
		return -ENOMEM;
	}

	if (depth != set->queue_depth)
		pr_info("blk-mq: reduced tag depth (%u -> %u)\n",
						depth, set->queue_depth);

	return 0;
}
staticvoid blk_mq_update_queue_map(struct blk_mq_tag_set *set)
{ /* * blk_mq_map_queues() and multiple .map_queues() implementations * expect that set->map[HCTX_TYPE_DEFAULT].nr_queues is set to the * number of hardware queues.
*/ if (set->nr_maps == 1)
set->map[HCTX_TYPE_DEFAULT].nr_queues = set->nr_hw_queues;
if (set->ops->map_queues) { int i;
/* * transport .map_queues is usually done in the following * way: * * for (queue = 0; queue < set->nr_hw_queues; queue++) { * mask = get_cpu_mask(queue) * for_each_cpu(cpu, mask) * set->map[x].mq_map[cpu] = queue; * } * * When we need to remap, the table has to be cleared for * killing stale mapping since one CPU may not be mapped * to any hw queue.
*/ for (i = 0; i < set->nr_maps; i++)
blk_mq_clear_mq_map(&set->map[i]);
for (i = set->nr_hw_queues; i < new_nr_hw_queues; i++) { if (!__blk_mq_alloc_map_and_rqs(set, i)) { while (--i >= set->nr_hw_queues)
__blk_mq_free_map_and_rqs(set, i); return -ENOMEM;
}
cond_resched();
}
/* * Alloc a tag set to be associated with one or more request queues. * May fail with EINVAL for various error conditions. May adjust the * requested depth down, if it's too large. In that case, the set * value will be stored in set->queue_depth.
*/ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
{ int i, ret;
/* * If a crashdump is active, then we are potentially in a very * memory constrained environment. Limit us to 64 tags to prevent * using too much memory.
*/ if (is_kdump_kernel())
set->queue_depth = min(64U, set->queue_depth);
/* * There is no use for more h/w queues than cpus if we just have * a single map
*/ if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids)
set->nr_hw_queues = nr_cpu_ids;
if (set->flags & BLK_MQ_F_BLOCKING) {
set->srcu = kmalloc(sizeof(*set->srcu), GFP_KERNEL); if (!set->srcu) return -ENOMEM;
ret = init_srcu_struct(set->srcu); if (ret) goto out_free_srcu;
}
init_rwsem(&set->update_nr_hwq_lock);
ret = -ENOMEM;
set->tags = kcalloc_node(set->nr_hw_queues, sizeof(struct blk_mq_tags *), GFP_KERNEL,
set->numa_node); if (!set->tags) goto out_cleanup_srcu;
for (i = 0; i < set->nr_maps; i++) {
set->map[i].mq_map = kcalloc_node(nr_cpu_ids, sizeof(set->map[i].mq_map[0]),
GFP_KERNEL, set->numa_node); if (!set->map[i].mq_map) goto out_free_mq_map;
set->map[i].nr_queues = set->nr_hw_queues;
}
blk_mq_update_queue_map(set);
ret = blk_mq_alloc_set_map_and_rqs(set); if (ret) goto out_free_mq_map;
if (blk_mq_is_shared_tags(set->flags)) { /* * Shared tags, for sched tags, we allocate max initially hence * tags can't grow, see blk_mq_alloc_sched_tags().
*/ if (q->elevator)
blk_mq_tag_update_sched_shared_tags(q, nr); else
blk_mq_tag_resize_shared_tags(set, nr);
} elseif (!q->elevator) { /* * Non-shared hardware tags, nr is already checked from * queue_requests_store() and tags can't grow.
*/
queue_for_each_hw_ctx(q, hctx, i) { if (!hctx->tags) continue;
sbitmap_queue_resize(&hctx->tags->bitmap_tags,
nr - hctx->tags->nr_reserved_tags);
}
} elseif (nr <= q->elevator->et->nr_requests) { /* Non-shared sched tags, and tags don't grow. */
queue_for_each_hw_ctx(q, hctx, i) { if (!hctx->sched_tags) continue;
sbitmap_queue_resize(&hctx->sched_tags->bitmap_tags,
nr - hctx->sched_tags->nr_reserved_tags);
}
} else { /* Non-shared sched tags, and tags grow */
queue_for_each_hw_ctx(q, hctx, i)
hctx->sched_tags = et->tags[i];
old_et = q->elevator->et;
q->elevator->et = et;
}
q->nr_requests = nr; if (q->elevator && q->elevator->type->ops.depth_updated)
q->elevator->type->ops.depth_updated(q);
blk_mq_unquiesce_queue(q); return old_et;
}
/*
 * Switch back to the elevator type stored in the xarray.
 *
 * The elevator type and elevator tags for @q are looked up by q->id in
 * @elv_tbl and @et_tbl respectively; either lookup may yield NULL.
 */
static void blk_mq_elv_switch_back(struct request_queue *q,
		struct xarray *elv_tbl, struct xarray *et_tbl)
{
	struct elevator_type *e = xa_load(elv_tbl, q->id);
	struct elevator_tags *t = xa_load(et_tbl, q->id);

	/* The elv_update_nr_hw_queues unfreezes the queue. */
	elv_update_nr_hw_queues(q, e, t);

	/* Drop the reference acquired in blk_mq_elv_switch_none. */
	if (e)
		elevator_put(e);
}
/*
 * Stores elevator type in xarray and set current elevator to none. It uses
 * q->id as an index to store the elevator type into the xarray.
 *
 * Returns 0 on success (including when @q has no elevator), or the error
 * from xa_insert() if the elevator type could not be stored.
 */
static int blk_mq_elv_switch_none(struct request_queue *q,
		struct xarray *elv_tbl)
{
	int ret = 0;

	/*
	 * Accessing q->elevator without holding q->elevator_lock is safe here
	 * because we're called from nr_hw_queue update which is protected by
	 * set->update_nr_hwq_lock in the writer context. So, scheduler update/
	 * switch code (which acquires the same lock in the reader context)
	 * can't run concurrently.
	 */
	if (q->elevator) {
		ret = xa_insert(elv_tbl, q->id, q->elevator->type, GFP_KERNEL);
		if (WARN_ON_ONCE(ret))
			return ret;

		/*
		 * Before we switch elevator to 'none', take a reference to
		 * the elevator module so that while nr_hw_queue update is
		 * running, no one can remove elevator module. We'd put the
		 * reference to elevator module later when we switch back
		 * elevator.
		 */
		__elevator_get(q->elevator->type);

		elevator_set_none(q);
	}
	return ret;
}
staticvoid __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
{ struct request_queue *q; int prev_nr_hw_queues = set->nr_hw_queues; unsignedint memflags; int i; struct xarray elv_tbl, et_tbl; bool queues_frozen = false;
lockdep_assert_held(&set->tag_list_lock);
if (set->nr_maps == 1 && nr_hw_queues > nr_cpu_ids)
nr_hw_queues = nr_cpu_ids; if (nr_hw_queues < 1) return; if (set->nr_maps == 1 && nr_hw_queues == set->nr_hw_queues) return;
memflags = memalloc_noio_save();
xa_init(&et_tbl); if (blk_mq_alloc_sched_tags_batch(&et_tbl, set, nr_hw_queues) < 0) goto out_memalloc_restore;
/* * Switch IO scheduler to 'none', cleaning up the data associated * with the previous scheduler. We will switch back once we are done * updating the new sw to hw queue mappings.
*/
list_for_each_entry(q, &set->tag_list, tag_set_list) if (blk_mq_elv_switch_none(q, &elv_tbl)) goto switch_back;
if (q->nr_hw_queues != set->nr_hw_queues) { int i = prev_nr_hw_queues;
pr_warn("Increasing nr_hw_queues to %d fails, fallback to %d\n",
nr_hw_queues, prev_nr_hw_queues); for (; i < set->nr_hw_queues; i++)
__blk_mq_free_map_and_rqs(set, i);
set->nr_hw_queues = prev_nr_hw_queues; goto fallback;
}
blk_mq_map_swqueue(q);
}
switch_back: /* The blk_mq_elv_switch_back unfreezes queue for us. */
list_for_each_entry(q, &set->tag_list, tag_set_list) { /* switch_back expects queue to be frozen */ if (!queues_frozen)
blk_mq_freeze_queue_nomemsave(q);
blk_mq_elv_switch_back(q, &elv_tbl, &et_tbl);
}