1 /* 2 * Block multiqueue core code 3 * 4 * Copyright (C) 2013-2014 Jens Axboe 5 * Copyright (C) 2013-2014 Christoph Hellwig 6 */ 7 #include <linux/kernel.h> 8 #include <linux/module.h> 9 #include <linux/backing-dev.h> 10 #include <linux/bio.h> 11 #include <linux/blkdev.h> 12 #include <linux/kmemleak.h> 13 #include <linux/mm.h> 14 #include <linux/init.h> 15 #include <linux/slab.h> 16 #include <linux/workqueue.h> 17 #include <linux/smp.h> 18 #include <linux/llist.h> 19 #include <linux/list_sort.h> 20 #include <linux/cpu.h> 21 #include <linux/cache.h> 22 #include <linux/sched/sysctl.h> 23 #include <linux/delay.h> 24 #include <linux/crash_dump.h> 25 26 #include <trace/events/block.h> 27 28 #include <linux/blk-mq.h> 29 #include "blk.h" 30 #include "blk-mq.h" 31 #include "blk-mq-tag.h" 32 33 static DEFINE_MUTEX(all_q_mutex); 34 static LIST_HEAD(all_q_list); 35 36 static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx); 37 38 /* 39 * Check if any of the ctx's have pending work in this hardware queue 40 */ 41 static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx) 42 { 43 unsigned int i; 44 45 for (i = 0; i < hctx->ctx_map.size; i++) 46 if (hctx->ctx_map.map[i].word) 47 return true; 48 49 return false; 50 } 51 52 static inline struct blk_align_bitmap *get_bm(struct blk_mq_hw_ctx *hctx, 53 struct blk_mq_ctx *ctx) 54 { 55 return &hctx->ctx_map.map[ctx->index_hw / hctx->ctx_map.bits_per_word]; 56 } 57 58 #define CTX_TO_BIT(hctx, ctx) \ 59 ((ctx)->index_hw & ((hctx)->ctx_map.bits_per_word - 1)) 60 61 /* 62 * Mark this ctx as having pending work in this hardware queue 63 */ 64 static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx, 65 struct blk_mq_ctx *ctx) 66 { 67 struct blk_align_bitmap *bm = get_bm(hctx, ctx); 68 69 if (!test_bit(CTX_TO_BIT(hctx, ctx), &bm->word)) 70 set_bit(CTX_TO_BIT(hctx, ctx), &bm->word); 71 } 72 73 static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx, 74 struct blk_mq_ctx *ctx) 75 { 76 struct blk_align_bitmap *bm = get_bm(hctx, ctx); 77 78 clear_bit(CTX_TO_BIT(hctx, ctx), &bm->word); 79 } 80 81 void blk_mq_freeze_queue_start(struct request_queue *q) 82 { 83 int freeze_depth; 84 85 freeze_depth = atomic_inc_return(&q->mq_freeze_depth); 86 if (freeze_depth == 1) { 87 percpu_ref_kill(&q->q_usage_counter); 88 blk_mq_run_hw_queues(q, false); 89 } 90 } 91 EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_start); 92 93 static void blk_mq_freeze_queue_wait(struct request_queue *q) 94 { 95 wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter)); 96 } 97 98 /* 99 * Guarantee no request is in use, so we can change any data structure of 100 * the queue afterward. 101 */ 102 void blk_freeze_queue(struct request_queue *q) 103 { 104 /* 105 * In the !blk_mq case we are only calling this to kill the 106 * q_usage_counter, otherwise this increases the freeze depth 107 * and waits for it to return to zero. For this reason there is 108 * no blk_unfreeze_queue(), and blk_freeze_queue() is not 109 * exported to drivers as the only user for unfreeze is blk_mq. 
110 */ 111 blk_mq_freeze_queue_start(q); 112 blk_mq_freeze_queue_wait(q); 113 } 114 115 void blk_mq_freeze_queue(struct request_queue *q) 116 { 117 /* 118 * ...just an alias to keep freeze and unfreeze actions balanced 119 * in the blk_mq_* namespace 120 */ 121 blk_freeze_queue(q); 122 } 123 EXPORT_SYMBOL_GPL(blk_mq_freeze_queue); 124 125 void blk_mq_unfreeze_queue(struct request_queue *q) 126 { 127 int freeze_depth; 128 129 freeze_depth = atomic_dec_return(&q->mq_freeze_depth); 130 WARN_ON_ONCE(freeze_depth < 0); 131 if (!freeze_depth) { 132 percpu_ref_reinit(&q->q_usage_counter); 133 wake_up_all(&q->mq_freeze_wq); 134 } 135 } 136 EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue); 137 138 void blk_mq_wake_waiters(struct request_queue *q) 139 { 140 struct blk_mq_hw_ctx *hctx; 141 unsigned int i; 142 143 queue_for_each_hw_ctx(q, hctx, i) 144 if (blk_mq_hw_queue_mapped(hctx)) 145 blk_mq_tag_wakeup_all(hctx->tags, true); 146 147 /* 148 * If we are called because the queue has now been marked as 149 * dying, we need to ensure that processes currently waiting on 150 * the queue are notified as well. 151 */ 152 wake_up_all(&q->mq_freeze_wq); 153 } 154 155 bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx) 156 { 157 return blk_mq_has_free_tags(hctx->tags); 158 } 159 EXPORT_SYMBOL(blk_mq_can_queue); 160 161 static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, 162 struct request *rq, int op, 163 unsigned int op_flags) 164 { 165 if (blk_queue_io_stat(q)) 166 op_flags |= REQ_IO_STAT; 167 168 INIT_LIST_HEAD(&rq->queuelist); 169 /* csd/requeue_work/fifo_time is initialized before use */ 170 rq->q = q; 171 rq->mq_ctx = ctx; 172 req_set_op_attrs(rq, op, op_flags); 173 /* do not touch atomic flags, it needs atomic ops against the timer */ 174 rq->cpu = -1; 175 INIT_HLIST_NODE(&rq->hash); 176 RB_CLEAR_NODE(&rq->rb_node); 177 rq->rq_disk = NULL; 178 rq->part = NULL; 179 rq->start_time = jiffies; 180 #ifdef CONFIG_BLK_CGROUP 181 rq->rl = NULL; 182 set_start_time_ns(rq); 183 rq->io_start_time_ns = 0; 184 #endif 185 rq->nr_phys_segments = 0; 186 #if defined(CONFIG_BLK_DEV_INTEGRITY) 187 rq->nr_integrity_segments = 0; 188 #endif 189 rq->special = NULL; 190 /* tag was already set */ 191 rq->errors = 0; 192 193 rq->cmd = rq->__cmd; 194 195 rq->extra_len = 0; 196 rq->sense_len = 0; 197 rq->resid_len = 0; 198 rq->sense = NULL; 199 200 INIT_LIST_HEAD(&rq->timeout_list); 201 rq->timeout = 0; 202 203 rq->end_io = NULL; 204 rq->end_io_data = NULL; 205 rq->next_rq = NULL; 206 207 ctx->rq_dispatched[rw_is_sync(op, op_flags)]++; 208 } 209 210 static struct request * 211 __blk_mq_alloc_request(struct blk_mq_alloc_data *data, int op, int op_flags) 212 { 213 struct request *rq; 214 unsigned int tag; 215 216 tag = blk_mq_get_tag(data); 217 if (tag != BLK_MQ_TAG_FAIL) { 218 rq = data->hctx->tags->rqs[tag]; 219 220 if (blk_mq_tag_busy(data->hctx)) { 221 rq->cmd_flags = REQ_MQ_INFLIGHT; 222 atomic_inc(&data->hctx->nr_active); 223 } 224 225 rq->tag = tag; 226 blk_mq_rq_ctx_init(data->q, data->ctx, rq, op, op_flags); 227 return rq; 228 } 229 230 return NULL; 231 } 232 233 struct request *blk_mq_alloc_request(struct request_queue *q, int rw, 234 unsigned int flags) 235 { 236 struct blk_mq_ctx *ctx; 237 struct blk_mq_hw_ctx *hctx; 238 struct request *rq; 239 struct blk_mq_alloc_data alloc_data; 240 int ret; 241 242 ret = blk_queue_enter(q, flags & BLK_MQ_REQ_NOWAIT); 243 if (ret) 244 return ERR_PTR(ret); 245 246 ctx = blk_mq_get_ctx(q); 247 hctx = q->mq_ops->map_queue(q, ctx->cpu); 248 
blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx); 249 250 rq = __blk_mq_alloc_request(&alloc_data, rw, 0); 251 if (!rq && !(flags & BLK_MQ_REQ_NOWAIT)) { 252 __blk_mq_run_hw_queue(hctx); 253 blk_mq_put_ctx(ctx); 254 255 ctx = blk_mq_get_ctx(q); 256 hctx = q->mq_ops->map_queue(q, ctx->cpu); 257 blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx); 258 rq = __blk_mq_alloc_request(&alloc_data, rw, 0); 259 ctx = alloc_data.ctx; 260 } 261 blk_mq_put_ctx(ctx); 262 if (!rq) { 263 blk_queue_exit(q); 264 return ERR_PTR(-EWOULDBLOCK); 265 } 266 267 rq->__data_len = 0; 268 rq->__sector = (sector_t) -1; 269 rq->bio = rq->biotail = NULL; 270 return rq; 271 } 272 EXPORT_SYMBOL(blk_mq_alloc_request); 273 274 struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int rw, 275 unsigned int flags, unsigned int hctx_idx) 276 { 277 struct blk_mq_hw_ctx *hctx; 278 struct blk_mq_ctx *ctx; 279 struct request *rq; 280 struct blk_mq_alloc_data alloc_data; 281 int ret; 282 283 /* 284 * If the tag allocator sleeps we could get an allocation for a 285 * different hardware context. No need to complicate the low level 286 * allocator for this for the rare use case of a command tied to 287 * a specific queue. 288 */ 289 if (WARN_ON_ONCE(!(flags & BLK_MQ_REQ_NOWAIT))) 290 return ERR_PTR(-EINVAL); 291 292 if (hctx_idx >= q->nr_hw_queues) 293 return ERR_PTR(-EIO); 294 295 ret = blk_queue_enter(q, true); 296 if (ret) 297 return ERR_PTR(ret); 298 299 /* 300 * Check if the hardware context is actually mapped to anything. 301 * If not tell the caller that it should skip this queue. 302 */ 303 hctx = q->queue_hw_ctx[hctx_idx]; 304 if (!blk_mq_hw_queue_mapped(hctx)) { 305 ret = -EXDEV; 306 goto out_queue_exit; 307 } 308 ctx = __blk_mq_get_ctx(q, cpumask_first(hctx->cpumask)); 309 310 blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx); 311 rq = __blk_mq_alloc_request(&alloc_data, rw, 0); 312 if (!rq) { 313 ret = -EWOULDBLOCK; 314 goto out_queue_exit; 315 } 316 317 return rq; 318 319 out_queue_exit: 320 blk_queue_exit(q); 321 return ERR_PTR(ret); 322 } 323 EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx); 324 325 static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, 326 struct blk_mq_ctx *ctx, struct request *rq) 327 { 328 const int tag = rq->tag; 329 struct request_queue *q = rq->q; 330 331 if (rq->cmd_flags & REQ_MQ_INFLIGHT) 332 atomic_dec(&hctx->nr_active); 333 rq->cmd_flags = 0; 334 335 clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); 336 blk_mq_put_tag(hctx, tag, &ctx->last_tag); 337 blk_queue_exit(q); 338 } 339 340 void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *hctx, struct request *rq) 341 { 342 struct blk_mq_ctx *ctx = rq->mq_ctx; 343 344 ctx->rq_completed[rq_is_sync(rq)]++; 345 __blk_mq_free_request(hctx, ctx, rq); 346 347 } 348 EXPORT_SYMBOL_GPL(blk_mq_free_hctx_request); 349 350 void blk_mq_free_request(struct request *rq) 351 { 352 struct blk_mq_hw_ctx *hctx; 353 struct request_queue *q = rq->q; 354 355 hctx = q->mq_ops->map_queue(q, rq->mq_ctx->cpu); 356 blk_mq_free_hctx_request(hctx, rq); 357 } 358 EXPORT_SYMBOL_GPL(blk_mq_free_request); 359 360 inline void __blk_mq_end_request(struct request *rq, int error) 361 { 362 blk_account_io_done(rq); 363 364 if (rq->end_io) { 365 rq->end_io(rq, error); 366 } else { 367 if (unlikely(blk_bidi_rq(rq))) 368 blk_mq_free_request(rq->next_rq); 369 blk_mq_free_request(rq); 370 } 371 } 372 EXPORT_SYMBOL(__blk_mq_end_request); 373 374 void blk_mq_end_request(struct request *rq, int error) 375 { 376 if (blk_update_request(rq, error, blk_rq_bytes(rq))) 
377 BUG(); 378 __blk_mq_end_request(rq, error); 379 } 380 EXPORT_SYMBOL(blk_mq_end_request); 381 382 static void __blk_mq_complete_request_remote(void *data) 383 { 384 struct request *rq = data; 385 386 rq->q->softirq_done_fn(rq); 387 } 388 389 static void blk_mq_ipi_complete_request(struct request *rq) 390 { 391 struct blk_mq_ctx *ctx = rq->mq_ctx; 392 bool shared = false; 393 int cpu; 394 395 if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) { 396 rq->q->softirq_done_fn(rq); 397 return; 398 } 399 400 cpu = get_cpu(); 401 if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags)) 402 shared = cpus_share_cache(cpu, ctx->cpu); 403 404 if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) { 405 rq->csd.func = __blk_mq_complete_request_remote; 406 rq->csd.info = rq; 407 rq->csd.flags = 0; 408 smp_call_function_single_async(ctx->cpu, &rq->csd); 409 } else { 410 rq->q->softirq_done_fn(rq); 411 } 412 put_cpu(); 413 } 414 415 static void __blk_mq_complete_request(struct request *rq) 416 { 417 struct request_queue *q = rq->q; 418 419 if (!q->softirq_done_fn) 420 blk_mq_end_request(rq, rq->errors); 421 else 422 blk_mq_ipi_complete_request(rq); 423 } 424 425 /** 426 * blk_mq_complete_request - end I/O on a request 427 * @rq: the request being processed 428 * 429 * Description: 430 * Ends all I/O on a request. It does not handle partial completions. 431 * The actual completion happens out-of-order, through a IPI handler. 432 **/ 433 void blk_mq_complete_request(struct request *rq, int error) 434 { 435 struct request_queue *q = rq->q; 436 437 if (unlikely(blk_should_fake_timeout(q))) 438 return; 439 if (!blk_mark_rq_complete(rq)) { 440 rq->errors = error; 441 __blk_mq_complete_request(rq); 442 } 443 } 444 EXPORT_SYMBOL(blk_mq_complete_request); 445 446 int blk_mq_request_started(struct request *rq) 447 { 448 return test_bit(REQ_ATOM_STARTED, &rq->atomic_flags); 449 } 450 EXPORT_SYMBOL_GPL(blk_mq_request_started); 451 452 void blk_mq_start_request(struct request *rq) 453 { 454 struct request_queue *q = rq->q; 455 456 trace_block_rq_issue(q, rq); 457 458 rq->resid_len = blk_rq_bytes(rq); 459 if (unlikely(blk_bidi_rq(rq))) 460 rq->next_rq->resid_len = blk_rq_bytes(rq->next_rq); 461 462 blk_add_timer(rq); 463 464 /* 465 * Ensure that ->deadline is visible before set the started 466 * flag and clear the completed flag. 467 */ 468 smp_mb__before_atomic(); 469 470 /* 471 * Mark us as started and clear complete. Complete might have been 472 * set if requeue raced with timeout, which then marked it as 473 * complete. So be sure to clear complete again when we start 474 * the request, otherwise we'll ignore the completion event. 475 */ 476 if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) 477 set_bit(REQ_ATOM_STARTED, &rq->atomic_flags); 478 if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) 479 clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags); 480 481 if (q->dma_drain_size && blk_rq_bytes(rq)) { 482 /* 483 * Make sure space for the drain appears. We know we can do 484 * this because max_hw_segments has been adjusted to be one 485 * fewer than the device can handle. 
486 */ 487 rq->nr_phys_segments++; 488 } 489 } 490 EXPORT_SYMBOL(blk_mq_start_request); 491 492 static void __blk_mq_requeue_request(struct request *rq) 493 { 494 struct request_queue *q = rq->q; 495 496 trace_block_rq_requeue(q, rq); 497 498 if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) { 499 if (q->dma_drain_size && blk_rq_bytes(rq)) 500 rq->nr_phys_segments--; 501 } 502 } 503 504 void blk_mq_requeue_request(struct request *rq) 505 { 506 __blk_mq_requeue_request(rq); 507 508 BUG_ON(blk_queued_rq(rq)); 509 blk_mq_add_to_requeue_list(rq, true); 510 } 511 EXPORT_SYMBOL(blk_mq_requeue_request); 512 513 static void blk_mq_requeue_work(struct work_struct *work) 514 { 515 struct request_queue *q = 516 container_of(work, struct request_queue, requeue_work); 517 LIST_HEAD(rq_list); 518 struct request *rq, *next; 519 unsigned long flags; 520 521 spin_lock_irqsave(&q->requeue_lock, flags); 522 list_splice_init(&q->requeue_list, &rq_list); 523 spin_unlock_irqrestore(&q->requeue_lock, flags); 524 525 list_for_each_entry_safe(rq, next, &rq_list, queuelist) { 526 if (!(rq->cmd_flags & REQ_SOFTBARRIER)) 527 continue; 528 529 rq->cmd_flags &= ~REQ_SOFTBARRIER; 530 list_del_init(&rq->queuelist); 531 blk_mq_insert_request(rq, true, false, false); 532 } 533 534 while (!list_empty(&rq_list)) { 535 rq = list_entry(rq_list.next, struct request, queuelist); 536 list_del_init(&rq->queuelist); 537 blk_mq_insert_request(rq, false, false, false); 538 } 539 540 /* 541 * Use the start variant of queue running here, so that running 542 * the requeue work will kick stopped queues. 543 */ 544 blk_mq_start_hw_queues(q); 545 } 546 547 void blk_mq_add_to_requeue_list(struct request *rq, bool at_head) 548 { 549 struct request_queue *q = rq->q; 550 unsigned long flags; 551 552 /* 553 * We abuse this flag that is otherwise used by the I/O scheduler to 554 * request head insertation from the workqueue. 
555 */ 556 BUG_ON(rq->cmd_flags & REQ_SOFTBARRIER); 557 558 spin_lock_irqsave(&q->requeue_lock, flags); 559 if (at_head) { 560 rq->cmd_flags |= REQ_SOFTBARRIER; 561 list_add(&rq->queuelist, &q->requeue_list); 562 } else { 563 list_add_tail(&rq->queuelist, &q->requeue_list); 564 } 565 spin_unlock_irqrestore(&q->requeue_lock, flags); 566 } 567 EXPORT_SYMBOL(blk_mq_add_to_requeue_list); 568 569 void blk_mq_cancel_requeue_work(struct request_queue *q) 570 { 571 cancel_work_sync(&q->requeue_work); 572 } 573 EXPORT_SYMBOL_GPL(blk_mq_cancel_requeue_work); 574 575 void blk_mq_kick_requeue_list(struct request_queue *q) 576 { 577 kblockd_schedule_work(&q->requeue_work); 578 } 579 EXPORT_SYMBOL(blk_mq_kick_requeue_list); 580 581 void blk_mq_abort_requeue_list(struct request_queue *q) 582 { 583 unsigned long flags; 584 LIST_HEAD(rq_list); 585 586 spin_lock_irqsave(&q->requeue_lock, flags); 587 list_splice_init(&q->requeue_list, &rq_list); 588 spin_unlock_irqrestore(&q->requeue_lock, flags); 589 590 while (!list_empty(&rq_list)) { 591 struct request *rq; 592 593 rq = list_first_entry(&rq_list, struct request, queuelist); 594 list_del_init(&rq->queuelist); 595 rq->errors = -EIO; 596 blk_mq_end_request(rq, rq->errors); 597 } 598 } 599 EXPORT_SYMBOL(blk_mq_abort_requeue_list); 600 601 struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag) 602 { 603 if (tag < tags->nr_tags) 604 return tags->rqs[tag]; 605 606 return NULL; 607 } 608 EXPORT_SYMBOL(blk_mq_tag_to_rq); 609 610 struct blk_mq_timeout_data { 611 unsigned long next; 612 unsigned int next_set; 613 }; 614 615 void blk_mq_rq_timed_out(struct request *req, bool reserved) 616 { 617 struct blk_mq_ops *ops = req->q->mq_ops; 618 enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER; 619 620 /* 621 * We know that complete is set at this point. If STARTED isn't set 622 * anymore, then the request isn't active and the "timeout" should 623 * just be ignored. This can happen due to the bitflag ordering. 624 * Timeout first checks if STARTED is set, and if it is, assumes 625 * the request is active. But if we race with completion, then 626 * we both flags will get cleared. So check here again, and ignore 627 * a timeout event with a request that isn't active. 628 */ 629 if (!test_bit(REQ_ATOM_STARTED, &req->atomic_flags)) 630 return; 631 632 if (ops->timeout) 633 ret = ops->timeout(req, reserved); 634 635 switch (ret) { 636 case BLK_EH_HANDLED: 637 __blk_mq_complete_request(req); 638 break; 639 case BLK_EH_RESET_TIMER: 640 blk_add_timer(req); 641 blk_clear_rq_complete(req); 642 break; 643 case BLK_EH_NOT_HANDLED: 644 break; 645 default: 646 printk(KERN_ERR "block: bad eh return: %d\n", ret); 647 break; 648 } 649 } 650 651 static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, 652 struct request *rq, void *priv, bool reserved) 653 { 654 struct blk_mq_timeout_data *data = priv; 655 656 if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) { 657 /* 658 * If a request wasn't started before the queue was 659 * marked dying, kill it here or it'll go unnoticed. 
660 */ 661 if (unlikely(blk_queue_dying(rq->q))) { 662 rq->errors = -EIO; 663 blk_mq_end_request(rq, rq->errors); 664 } 665 return; 666 } 667 668 if (time_after_eq(jiffies, rq->deadline)) { 669 if (!blk_mark_rq_complete(rq)) 670 blk_mq_rq_timed_out(rq, reserved); 671 } else if (!data->next_set || time_after(data->next, rq->deadline)) { 672 data->next = rq->deadline; 673 data->next_set = 1; 674 } 675 } 676 677 static void blk_mq_timeout_work(struct work_struct *work) 678 { 679 struct request_queue *q = 680 container_of(work, struct request_queue, timeout_work); 681 struct blk_mq_timeout_data data = { 682 .next = 0, 683 .next_set = 0, 684 }; 685 int i; 686 687 /* A deadlock might occur if a request is stuck requiring a 688 * timeout at the same time a queue freeze is waiting 689 * completion, since the timeout code would not be able to 690 * acquire the queue reference here. 691 * 692 * That's why we don't use blk_queue_enter here; instead, we use 693 * percpu_ref_tryget directly, because we need to be able to 694 * obtain a reference even in the short window between the queue 695 * starting to freeze, by dropping the first reference in 696 * blk_mq_freeze_queue_start, and the moment the last request is 697 * consumed, marked by the instant q_usage_counter reaches 698 * zero. 699 */ 700 if (!percpu_ref_tryget(&q->q_usage_counter)) 701 return; 702 703 blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data); 704 705 if (data.next_set) { 706 data.next = blk_rq_timeout(round_jiffies_up(data.next)); 707 mod_timer(&q->timeout, data.next); 708 } else { 709 struct blk_mq_hw_ctx *hctx; 710 711 queue_for_each_hw_ctx(q, hctx, i) { 712 /* the hctx may be unmapped, so check it here */ 713 if (blk_mq_hw_queue_mapped(hctx)) 714 blk_mq_tag_idle(hctx); 715 } 716 } 717 blk_queue_exit(q); 718 } 719 720 /* 721 * Reverse check our software queue for entries that we could potentially 722 * merge with. Currently includes a hand-wavy stop count of 8, to not spend 723 * too much time checking for merges. 
724 */ 725 static bool blk_mq_attempt_merge(struct request_queue *q, 726 struct blk_mq_ctx *ctx, struct bio *bio) 727 { 728 struct request *rq; 729 int checked = 8; 730 731 list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) { 732 int el_ret; 733 734 if (!checked--) 735 break; 736 737 if (!blk_rq_merge_ok(rq, bio)) 738 continue; 739 740 el_ret = blk_try_merge(rq, bio); 741 if (el_ret == ELEVATOR_BACK_MERGE) { 742 if (bio_attempt_back_merge(q, rq, bio)) { 743 ctx->rq_merged++; 744 return true; 745 } 746 break; 747 } else if (el_ret == ELEVATOR_FRONT_MERGE) { 748 if (bio_attempt_front_merge(q, rq, bio)) { 749 ctx->rq_merged++; 750 return true; 751 } 752 break; 753 } 754 } 755 756 return false; 757 } 758 759 /* 760 * Process software queues that have been marked busy, splicing them 761 * to the for-dispatch 762 */ 763 static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list) 764 { 765 struct blk_mq_ctx *ctx; 766 int i; 767 768 for (i = 0; i < hctx->ctx_map.size; i++) { 769 struct blk_align_bitmap *bm = &hctx->ctx_map.map[i]; 770 unsigned int off, bit; 771 772 if (!bm->word) 773 continue; 774 775 bit = 0; 776 off = i * hctx->ctx_map.bits_per_word; 777 do { 778 bit = find_next_bit(&bm->word, bm->depth, bit); 779 if (bit >= bm->depth) 780 break; 781 782 ctx = hctx->ctxs[bit + off]; 783 clear_bit(bit, &bm->word); 784 spin_lock(&ctx->lock); 785 list_splice_tail_init(&ctx->rq_list, list); 786 spin_unlock(&ctx->lock); 787 788 bit++; 789 } while (1); 790 } 791 } 792 793 /* 794 * Run this hardware queue, pulling any software queues mapped to it in. 795 * Note that this function currently has various problems around ordering 796 * of IO. In particular, we'd like FIFO behaviour on handling existing 797 * items on the hctx->dispatch list. Ignore that for now. 798 */ 799 static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) 800 { 801 struct request_queue *q = hctx->queue; 802 struct request *rq; 803 LIST_HEAD(rq_list); 804 LIST_HEAD(driver_list); 805 struct list_head *dptr; 806 int queued; 807 808 if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state))) 809 return; 810 811 WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) && 812 cpu_online(hctx->next_cpu)); 813 814 hctx->run++; 815 816 /* 817 * Touch any software queue that has pending entries. 818 */ 819 flush_busy_ctxs(hctx, &rq_list); 820 821 /* 822 * If we have previous entries on our dispatch list, grab them 823 * and stuff them at the front for more fair dispatch. 824 */ 825 if (!list_empty_careful(&hctx->dispatch)) { 826 spin_lock(&hctx->lock); 827 if (!list_empty(&hctx->dispatch)) 828 list_splice_init(&hctx->dispatch, &rq_list); 829 spin_unlock(&hctx->lock); 830 } 831 832 /* 833 * Start off with dptr being NULL, so we start the first request 834 * immediately, even if we have more pending. 835 */ 836 dptr = NULL; 837 838 /* 839 * Now process all the entries, sending them to the driver. 
840 */ 841 queued = 0; 842 while (!list_empty(&rq_list)) { 843 struct blk_mq_queue_data bd; 844 int ret; 845 846 rq = list_first_entry(&rq_list, struct request, queuelist); 847 list_del_init(&rq->queuelist); 848 849 bd.rq = rq; 850 bd.list = dptr; 851 bd.last = list_empty(&rq_list); 852 853 ret = q->mq_ops->queue_rq(hctx, &bd); 854 switch (ret) { 855 case BLK_MQ_RQ_QUEUE_OK: 856 queued++; 857 break; 858 case BLK_MQ_RQ_QUEUE_BUSY: 859 list_add(&rq->queuelist, &rq_list); 860 __blk_mq_requeue_request(rq); 861 break; 862 default: 863 pr_err("blk-mq: bad return on queue: %d\n", ret); 864 case BLK_MQ_RQ_QUEUE_ERROR: 865 rq->errors = -EIO; 866 blk_mq_end_request(rq, rq->errors); 867 break; 868 } 869 870 if (ret == BLK_MQ_RQ_QUEUE_BUSY) 871 break; 872 873 /* 874 * We've done the first request. If we have more than 1 875 * left in the list, set dptr to defer issue. 876 */ 877 if (!dptr && rq_list.next != rq_list.prev) 878 dptr = &driver_list; 879 } 880 881 if (!queued) 882 hctx->dispatched[0]++; 883 else if (queued < (1 << (BLK_MQ_MAX_DISPATCH_ORDER - 1))) 884 hctx->dispatched[ilog2(queued) + 1]++; 885 886 /* 887 * Any items that need requeuing? Stuff them into hctx->dispatch, 888 * that is where we will continue on next queue run. 889 */ 890 if (!list_empty(&rq_list)) { 891 spin_lock(&hctx->lock); 892 list_splice(&rq_list, &hctx->dispatch); 893 spin_unlock(&hctx->lock); 894 /* 895 * the queue is expected stopped with BLK_MQ_RQ_QUEUE_BUSY, but 896 * it's possible the queue is stopped and restarted again 897 * before this. Queue restart will dispatch requests. And since 898 * requests in rq_list aren't added into hctx->dispatch yet, 899 * the requests in rq_list might get lost. 900 * 901 * blk_mq_run_hw_queue() already checks the STOPPED bit 902 **/ 903 blk_mq_run_hw_queue(hctx, true); 904 } 905 } 906 907 /* 908 * It'd be great if the workqueue API had a way to pass 909 * in a mask and had some smarts for more clever placement. 910 * For now we just round-robin here, switching for every 911 * BLK_MQ_CPU_WORK_BATCH queued items. 
912 */ 913 static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) 914 { 915 if (hctx->queue->nr_hw_queues == 1) 916 return WORK_CPU_UNBOUND; 917 918 if (--hctx->next_cpu_batch <= 0) { 919 int cpu = hctx->next_cpu, next_cpu; 920 921 next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask); 922 if (next_cpu >= nr_cpu_ids) 923 next_cpu = cpumask_first(hctx->cpumask); 924 925 hctx->next_cpu = next_cpu; 926 hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; 927 928 return cpu; 929 } 930 931 return hctx->next_cpu; 932 } 933 934 void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) 935 { 936 if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state) || 937 !blk_mq_hw_queue_mapped(hctx))) 938 return; 939 940 if (!async) { 941 int cpu = get_cpu(); 942 if (cpumask_test_cpu(cpu, hctx->cpumask)) { 943 __blk_mq_run_hw_queue(hctx); 944 put_cpu(); 945 return; 946 } 947 948 put_cpu(); 949 } 950 951 kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx), 952 &hctx->run_work, 0); 953 } 954 955 void blk_mq_run_hw_queues(struct request_queue *q, bool async) 956 { 957 struct blk_mq_hw_ctx *hctx; 958 int i; 959 960 queue_for_each_hw_ctx(q, hctx, i) { 961 if ((!blk_mq_hctx_has_pending(hctx) && 962 list_empty_careful(&hctx->dispatch)) || 963 test_bit(BLK_MQ_S_STOPPED, &hctx->state)) 964 continue; 965 966 blk_mq_run_hw_queue(hctx, async); 967 } 968 } 969 EXPORT_SYMBOL(blk_mq_run_hw_queues); 970 971 void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx) 972 { 973 cancel_delayed_work(&hctx->run_work); 974 cancel_delayed_work(&hctx->delay_work); 975 set_bit(BLK_MQ_S_STOPPED, &hctx->state); 976 } 977 EXPORT_SYMBOL(blk_mq_stop_hw_queue); 978 979 void blk_mq_stop_hw_queues(struct request_queue *q) 980 { 981 struct blk_mq_hw_ctx *hctx; 982 int i; 983 984 queue_for_each_hw_ctx(q, hctx, i) 985 blk_mq_stop_hw_queue(hctx); 986 } 987 EXPORT_SYMBOL(blk_mq_stop_hw_queues); 988 989 void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx) 990 { 991 clear_bit(BLK_MQ_S_STOPPED, &hctx->state); 992 993 blk_mq_run_hw_queue(hctx, false); 994 } 995 EXPORT_SYMBOL(blk_mq_start_hw_queue); 996 997 void blk_mq_start_hw_queues(struct request_queue *q) 998 { 999 struct blk_mq_hw_ctx *hctx; 1000 int i; 1001 1002 queue_for_each_hw_ctx(q, hctx, i) 1003 blk_mq_start_hw_queue(hctx); 1004 } 1005 EXPORT_SYMBOL(blk_mq_start_hw_queues); 1006 1007 void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async) 1008 { 1009 struct blk_mq_hw_ctx *hctx; 1010 int i; 1011 1012 queue_for_each_hw_ctx(q, hctx, i) { 1013 if (!test_bit(BLK_MQ_S_STOPPED, &hctx->state)) 1014 continue; 1015 1016 clear_bit(BLK_MQ_S_STOPPED, &hctx->state); 1017 blk_mq_run_hw_queue(hctx, async); 1018 } 1019 } 1020 EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues); 1021 1022 static void blk_mq_run_work_fn(struct work_struct *work) 1023 { 1024 struct blk_mq_hw_ctx *hctx; 1025 1026 hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work); 1027 1028 __blk_mq_run_hw_queue(hctx); 1029 } 1030 1031 static void blk_mq_delay_work_fn(struct work_struct *work) 1032 { 1033 struct blk_mq_hw_ctx *hctx; 1034 1035 hctx = container_of(work, struct blk_mq_hw_ctx, delay_work.work); 1036 1037 if (test_and_clear_bit(BLK_MQ_S_STOPPED, &hctx->state)) 1038 __blk_mq_run_hw_queue(hctx); 1039 } 1040 1041 void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) 1042 { 1043 if (unlikely(!blk_mq_hw_queue_mapped(hctx))) 1044 return; 1045 1046 kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx), 1047 &hctx->delay_work, msecs_to_jiffies(msecs)); 1048 } 1049 
EXPORT_SYMBOL(blk_mq_delay_queue); 1050 1051 static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx, 1052 struct request *rq, 1053 bool at_head) 1054 { 1055 struct blk_mq_ctx *ctx = rq->mq_ctx; 1056 1057 trace_block_rq_insert(hctx->queue, rq); 1058 1059 if (at_head) 1060 list_add(&rq->queuelist, &ctx->rq_list); 1061 else 1062 list_add_tail(&rq->queuelist, &ctx->rq_list); 1063 } 1064 1065 static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, 1066 struct request *rq, bool at_head) 1067 { 1068 struct blk_mq_ctx *ctx = rq->mq_ctx; 1069 1070 __blk_mq_insert_req_list(hctx, rq, at_head); 1071 blk_mq_hctx_mark_pending(hctx, ctx); 1072 } 1073 1074 void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue, 1075 bool async) 1076 { 1077 struct blk_mq_ctx *ctx = rq->mq_ctx; 1078 struct request_queue *q = rq->q; 1079 struct blk_mq_hw_ctx *hctx; 1080 1081 hctx = q->mq_ops->map_queue(q, ctx->cpu); 1082 1083 spin_lock(&ctx->lock); 1084 __blk_mq_insert_request(hctx, rq, at_head); 1085 spin_unlock(&ctx->lock); 1086 1087 if (run_queue) 1088 blk_mq_run_hw_queue(hctx, async); 1089 } 1090 1091 static void blk_mq_insert_requests(struct request_queue *q, 1092 struct blk_mq_ctx *ctx, 1093 struct list_head *list, 1094 int depth, 1095 bool from_schedule) 1096 1097 { 1098 struct blk_mq_hw_ctx *hctx; 1099 1100 trace_block_unplug(q, depth, !from_schedule); 1101 1102 hctx = q->mq_ops->map_queue(q, ctx->cpu); 1103 1104 /* 1105 * preemption doesn't flush plug list, so it's possible ctx->cpu is 1106 * offline now 1107 */ 1108 spin_lock(&ctx->lock); 1109 while (!list_empty(list)) { 1110 struct request *rq; 1111 1112 rq = list_first_entry(list, struct request, queuelist); 1113 BUG_ON(rq->mq_ctx != ctx); 1114 list_del_init(&rq->queuelist); 1115 __blk_mq_insert_req_list(hctx, rq, false); 1116 } 1117 blk_mq_hctx_mark_pending(hctx, ctx); 1118 spin_unlock(&ctx->lock); 1119 1120 blk_mq_run_hw_queue(hctx, from_schedule); 1121 } 1122 1123 static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b) 1124 { 1125 struct request *rqa = container_of(a, struct request, queuelist); 1126 struct request *rqb = container_of(b, struct request, queuelist); 1127 1128 return !(rqa->mq_ctx < rqb->mq_ctx || 1129 (rqa->mq_ctx == rqb->mq_ctx && 1130 blk_rq_pos(rqa) < blk_rq_pos(rqb))); 1131 } 1132 1133 void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) 1134 { 1135 struct blk_mq_ctx *this_ctx; 1136 struct request_queue *this_q; 1137 struct request *rq; 1138 LIST_HEAD(list); 1139 LIST_HEAD(ctx_list); 1140 unsigned int depth; 1141 1142 list_splice_init(&plug->mq_list, &list); 1143 1144 list_sort(NULL, &list, plug_ctx_cmp); 1145 1146 this_q = NULL; 1147 this_ctx = NULL; 1148 depth = 0; 1149 1150 while (!list_empty(&list)) { 1151 rq = list_entry_rq(list.next); 1152 list_del_init(&rq->queuelist); 1153 BUG_ON(!rq->q); 1154 if (rq->mq_ctx != this_ctx) { 1155 if (this_ctx) { 1156 blk_mq_insert_requests(this_q, this_ctx, 1157 &ctx_list, depth, 1158 from_schedule); 1159 } 1160 1161 this_ctx = rq->mq_ctx; 1162 this_q = rq->q; 1163 depth = 0; 1164 } 1165 1166 depth++; 1167 list_add_tail(&rq->queuelist, &ctx_list); 1168 } 1169 1170 /* 1171 * If 'this_ctx' is set, we know we have entries to complete 1172 * on 'ctx_list'. Do those. 
1173 */ 1174 if (this_ctx) { 1175 blk_mq_insert_requests(this_q, this_ctx, &ctx_list, depth, 1176 from_schedule); 1177 } 1178 } 1179 1180 static void blk_mq_bio_to_request(struct request *rq, struct bio *bio) 1181 { 1182 init_request_from_bio(rq, bio); 1183 1184 blk_account_io_start(rq, 1); 1185 } 1186 1187 static inline bool hctx_allow_merges(struct blk_mq_hw_ctx *hctx) 1188 { 1189 return (hctx->flags & BLK_MQ_F_SHOULD_MERGE) && 1190 !blk_queue_nomerges(hctx->queue); 1191 } 1192 1193 static inline bool blk_mq_merge_queue_io(struct blk_mq_hw_ctx *hctx, 1194 struct blk_mq_ctx *ctx, 1195 struct request *rq, struct bio *bio) 1196 { 1197 if (!hctx_allow_merges(hctx) || !bio_mergeable(bio)) { 1198 blk_mq_bio_to_request(rq, bio); 1199 spin_lock(&ctx->lock); 1200 insert_rq: 1201 __blk_mq_insert_request(hctx, rq, false); 1202 spin_unlock(&ctx->lock); 1203 return false; 1204 } else { 1205 struct request_queue *q = hctx->queue; 1206 1207 spin_lock(&ctx->lock); 1208 if (!blk_mq_attempt_merge(q, ctx, bio)) { 1209 blk_mq_bio_to_request(rq, bio); 1210 goto insert_rq; 1211 } 1212 1213 spin_unlock(&ctx->lock); 1214 __blk_mq_free_request(hctx, ctx, rq); 1215 return true; 1216 } 1217 } 1218 1219 struct blk_map_ctx { 1220 struct blk_mq_hw_ctx *hctx; 1221 struct blk_mq_ctx *ctx; 1222 }; 1223 1224 static struct request *blk_mq_map_request(struct request_queue *q, 1225 struct bio *bio, 1226 struct blk_map_ctx *data) 1227 { 1228 struct blk_mq_hw_ctx *hctx; 1229 struct blk_mq_ctx *ctx; 1230 struct request *rq; 1231 int op = bio_data_dir(bio); 1232 int op_flags = 0; 1233 struct blk_mq_alloc_data alloc_data; 1234 1235 blk_queue_enter_live(q); 1236 ctx = blk_mq_get_ctx(q); 1237 hctx = q->mq_ops->map_queue(q, ctx->cpu); 1238 1239 if (rw_is_sync(bio_op(bio), bio->bi_opf)) 1240 op_flags |= REQ_SYNC; 1241 1242 trace_block_getrq(q, bio, op); 1243 blk_mq_set_alloc_data(&alloc_data, q, BLK_MQ_REQ_NOWAIT, ctx, hctx); 1244 rq = __blk_mq_alloc_request(&alloc_data, op, op_flags); 1245 if (unlikely(!rq)) { 1246 __blk_mq_run_hw_queue(hctx); 1247 blk_mq_put_ctx(ctx); 1248 trace_block_sleeprq(q, bio, op); 1249 1250 ctx = blk_mq_get_ctx(q); 1251 hctx = q->mq_ops->map_queue(q, ctx->cpu); 1252 blk_mq_set_alloc_data(&alloc_data, q, 0, ctx, hctx); 1253 rq = __blk_mq_alloc_request(&alloc_data, op, op_flags); 1254 ctx = alloc_data.ctx; 1255 hctx = alloc_data.hctx; 1256 } 1257 1258 hctx->queued++; 1259 data->hctx = hctx; 1260 data->ctx = ctx; 1261 return rq; 1262 } 1263 1264 static int blk_mq_direct_issue_request(struct request *rq, blk_qc_t *cookie) 1265 { 1266 int ret; 1267 struct request_queue *q = rq->q; 1268 struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, 1269 rq->mq_ctx->cpu); 1270 struct blk_mq_queue_data bd = { 1271 .rq = rq, 1272 .list = NULL, 1273 .last = 1 1274 }; 1275 blk_qc_t new_cookie = blk_tag_to_qc_t(rq->tag, hctx->queue_num); 1276 1277 /* 1278 * For OK queue, we are done. For error, kill it. Any other 1279 * error (busy), just add it to our list as we previously 1280 * would have done 1281 */ 1282 ret = q->mq_ops->queue_rq(hctx, &bd); 1283 if (ret == BLK_MQ_RQ_QUEUE_OK) { 1284 *cookie = new_cookie; 1285 return 0; 1286 } 1287 1288 __blk_mq_requeue_request(rq); 1289 1290 if (ret == BLK_MQ_RQ_QUEUE_ERROR) { 1291 *cookie = BLK_QC_T_NONE; 1292 rq->errors = -EIO; 1293 blk_mq_end_request(rq, rq->errors); 1294 return 0; 1295 } 1296 1297 return -1; 1298 } 1299 1300 /* 1301 * Multiple hardware queue variant. 
This will not use per-process plugs, 1302 * but will attempt to bypass the hctx queueing if we can go straight to 1303 * hardware for SYNC IO. 1304 */ 1305 static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) 1306 { 1307 const int is_sync = rw_is_sync(bio_op(bio), bio->bi_opf); 1308 const int is_flush_fua = bio->bi_opf & (REQ_PREFLUSH | REQ_FUA); 1309 struct blk_map_ctx data; 1310 struct request *rq; 1311 unsigned int request_count = 0; 1312 struct blk_plug *plug; 1313 struct request *same_queue_rq = NULL; 1314 blk_qc_t cookie; 1315 1316 blk_queue_bounce(q, &bio); 1317 1318 if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { 1319 bio_io_error(bio); 1320 return BLK_QC_T_NONE; 1321 } 1322 1323 blk_queue_split(q, &bio, q->bio_split); 1324 1325 if (!is_flush_fua && !blk_queue_nomerges(q) && 1326 blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq)) 1327 return BLK_QC_T_NONE; 1328 1329 rq = blk_mq_map_request(q, bio, &data); 1330 if (unlikely(!rq)) 1331 return BLK_QC_T_NONE; 1332 1333 cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num); 1334 1335 if (unlikely(is_flush_fua)) { 1336 blk_mq_bio_to_request(rq, bio); 1337 blk_insert_flush(rq); 1338 goto run_queue; 1339 } 1340 1341 plug = current->plug; 1342 /* 1343 * If the driver supports defer issued based on 'last', then 1344 * queue it up like normal since we can potentially save some 1345 * CPU this way. 1346 */ 1347 if (((plug && !blk_queue_nomerges(q)) || is_sync) && 1348 !(data.hctx->flags & BLK_MQ_F_DEFER_ISSUE)) { 1349 struct request *old_rq = NULL; 1350 1351 blk_mq_bio_to_request(rq, bio); 1352 1353 /* 1354 * We do limited pluging. If the bio can be merged, do that. 1355 * Otherwise the existing request in the plug list will be 1356 * issued. So the plug list will have one request at most 1357 */ 1358 if (plug) { 1359 /* 1360 * The plug list might get flushed before this. If that 1361 * happens, same_queue_rq is invalid and plug list is 1362 * empty 1363 */ 1364 if (same_queue_rq && !list_empty(&plug->mq_list)) { 1365 old_rq = same_queue_rq; 1366 list_del_init(&old_rq->queuelist); 1367 } 1368 list_add_tail(&rq->queuelist, &plug->mq_list); 1369 } else /* is_sync */ 1370 old_rq = rq; 1371 blk_mq_put_ctx(data.ctx); 1372 if (!old_rq) 1373 goto done; 1374 if (test_bit(BLK_MQ_S_STOPPED, &data.hctx->state) || 1375 blk_mq_direct_issue_request(old_rq, &cookie) != 0) 1376 blk_mq_insert_request(old_rq, false, true, true); 1377 goto done; 1378 } 1379 1380 if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { 1381 /* 1382 * For a SYNC request, send it to the hardware immediately. For 1383 * an ASYNC request, just ensure that we run it later on. The 1384 * latter allows for merging opportunities and more efficient 1385 * dispatching. 1386 */ 1387 run_queue: 1388 blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua); 1389 } 1390 blk_mq_put_ctx(data.ctx); 1391 done: 1392 return cookie; 1393 } 1394 1395 /* 1396 * Single hardware queue variant. This will attempt to use any per-process 1397 * plug for merging and IO deferral. 
1398 */ 1399 static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio) 1400 { 1401 const int is_sync = rw_is_sync(bio_op(bio), bio->bi_opf); 1402 const int is_flush_fua = bio->bi_opf & (REQ_PREFLUSH | REQ_FUA); 1403 struct blk_plug *plug; 1404 unsigned int request_count = 0; 1405 struct blk_map_ctx data; 1406 struct request *rq; 1407 blk_qc_t cookie; 1408 1409 blk_queue_bounce(q, &bio); 1410 1411 if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { 1412 bio_io_error(bio); 1413 return BLK_QC_T_NONE; 1414 } 1415 1416 blk_queue_split(q, &bio, q->bio_split); 1417 1418 if (!is_flush_fua && !blk_queue_nomerges(q)) { 1419 if (blk_attempt_plug_merge(q, bio, &request_count, NULL)) 1420 return BLK_QC_T_NONE; 1421 } else 1422 request_count = blk_plug_queued_count(q); 1423 1424 rq = blk_mq_map_request(q, bio, &data); 1425 if (unlikely(!rq)) 1426 return BLK_QC_T_NONE; 1427 1428 cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num); 1429 1430 if (unlikely(is_flush_fua)) { 1431 blk_mq_bio_to_request(rq, bio); 1432 blk_insert_flush(rq); 1433 goto run_queue; 1434 } 1435 1436 /* 1437 * A task plug currently exists. Since this is completely lockless, 1438 * utilize that to temporarily store requests until the task is 1439 * either done or scheduled away. 1440 */ 1441 plug = current->plug; 1442 if (plug) { 1443 blk_mq_bio_to_request(rq, bio); 1444 if (!request_count) 1445 trace_block_plug(q); 1446 1447 blk_mq_put_ctx(data.ctx); 1448 1449 if (request_count >= BLK_MAX_REQUEST_COUNT) { 1450 blk_flush_plug_list(plug, false); 1451 trace_block_plug(q); 1452 } 1453 1454 list_add_tail(&rq->queuelist, &plug->mq_list); 1455 return cookie; 1456 } 1457 1458 if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { 1459 /* 1460 * For a SYNC request, send it to the hardware immediately. For 1461 * an ASYNC request, just ensure that we run it later on. The 1462 * latter allows for merging opportunities and more efficient 1463 * dispatching. 1464 */ 1465 run_queue: 1466 blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua); 1467 } 1468 1469 blk_mq_put_ctx(data.ctx); 1470 return cookie; 1471 } 1472 1473 /* 1474 * Default mapping to a software queue, since we use one per CPU. 1475 */ 1476 struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, const int cpu) 1477 { 1478 return q->queue_hw_ctx[q->mq_map[cpu]]; 1479 } 1480 EXPORT_SYMBOL(blk_mq_map_queue); 1481 1482 static void blk_mq_free_rq_map(struct blk_mq_tag_set *set, 1483 struct blk_mq_tags *tags, unsigned int hctx_idx) 1484 { 1485 struct page *page; 1486 1487 if (tags->rqs && set->ops->exit_request) { 1488 int i; 1489 1490 for (i = 0; i < tags->nr_tags; i++) { 1491 if (!tags->rqs[i]) 1492 continue; 1493 set->ops->exit_request(set->driver_data, tags->rqs[i], 1494 hctx_idx, i); 1495 tags->rqs[i] = NULL; 1496 } 1497 } 1498 1499 while (!list_empty(&tags->page_list)) { 1500 page = list_first_entry(&tags->page_list, struct page, lru); 1501 list_del_init(&page->lru); 1502 /* 1503 * Remove kmemleak object previously allocated in 1504 * blk_mq_init_rq_map(). 
1505 */ 1506 kmemleak_free(page_address(page)); 1507 __free_pages(page, page->private); 1508 } 1509 1510 kfree(tags->rqs); 1511 1512 blk_mq_free_tags(tags); 1513 } 1514 1515 static size_t order_to_size(unsigned int order) 1516 { 1517 return (size_t)PAGE_SIZE << order; 1518 } 1519 1520 static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set, 1521 unsigned int hctx_idx) 1522 { 1523 struct blk_mq_tags *tags; 1524 unsigned int i, j, entries_per_page, max_order = 4; 1525 size_t rq_size, left; 1526 1527 tags = blk_mq_init_tags(set->queue_depth, set->reserved_tags, 1528 set->numa_node, 1529 BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags)); 1530 if (!tags) 1531 return NULL; 1532 1533 INIT_LIST_HEAD(&tags->page_list); 1534 1535 tags->rqs = kzalloc_node(set->queue_depth * sizeof(struct request *), 1536 GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY, 1537 set->numa_node); 1538 if (!tags->rqs) { 1539 blk_mq_free_tags(tags); 1540 return NULL; 1541 } 1542 1543 /* 1544 * rq_size is the size of the request plus driver payload, rounded 1545 * to the cacheline size 1546 */ 1547 rq_size = round_up(sizeof(struct request) + set->cmd_size, 1548 cache_line_size()); 1549 left = rq_size * set->queue_depth; 1550 1551 for (i = 0; i < set->queue_depth; ) { 1552 int this_order = max_order; 1553 struct page *page; 1554 int to_do; 1555 void *p; 1556 1557 while (this_order && left < order_to_size(this_order - 1)) 1558 this_order--; 1559 1560 do { 1561 page = alloc_pages_node(set->numa_node, 1562 GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO, 1563 this_order); 1564 if (page) 1565 break; 1566 if (!this_order--) 1567 break; 1568 if (order_to_size(this_order) < rq_size) 1569 break; 1570 } while (1); 1571 1572 if (!page) 1573 goto fail; 1574 1575 page->private = this_order; 1576 list_add_tail(&page->lru, &tags->page_list); 1577 1578 p = page_address(page); 1579 /* 1580 * Allow kmemleak to scan these pages as they contain pointers 1581 * to additional allocations like via ops->init_request(). 1582 */ 1583 kmemleak_alloc(p, order_to_size(this_order), 1, GFP_KERNEL); 1584 entries_per_page = order_to_size(this_order) / rq_size; 1585 to_do = min(entries_per_page, set->queue_depth - i); 1586 left -= to_do * rq_size; 1587 for (j = 0; j < to_do; j++) { 1588 tags->rqs[i] = p; 1589 if (set->ops->init_request) { 1590 if (set->ops->init_request(set->driver_data, 1591 tags->rqs[i], hctx_idx, i, 1592 set->numa_node)) { 1593 tags->rqs[i] = NULL; 1594 goto fail; 1595 } 1596 } 1597 1598 p += rq_size; 1599 i++; 1600 } 1601 } 1602 return tags; 1603 1604 fail: 1605 blk_mq_free_rq_map(set, tags, hctx_idx); 1606 return NULL; 1607 } 1608 1609 static void blk_mq_free_bitmap(struct blk_mq_ctxmap *bitmap) 1610 { 1611 kfree(bitmap->map); 1612 } 1613 1614 static int blk_mq_alloc_bitmap(struct blk_mq_ctxmap *bitmap, int node) 1615 { 1616 unsigned int bpw = 8, total, num_maps, i; 1617 1618 bitmap->bits_per_word = bpw; 1619 1620 num_maps = ALIGN(nr_cpu_ids, bpw) / bpw; 1621 bitmap->map = kzalloc_node(num_maps * sizeof(struct blk_align_bitmap), 1622 GFP_KERNEL, node); 1623 if (!bitmap->map) 1624 return -ENOMEM; 1625 1626 total = nr_cpu_ids; 1627 for (i = 0; i < num_maps; i++) { 1628 bitmap->map[i].depth = min(total, bitmap->bits_per_word); 1629 total -= bitmap->map[i].depth; 1630 } 1631 1632 return 0; 1633 } 1634 1635 /* 1636 * 'cpu' is going away. splice any existing rq_list entries from this 1637 * software queue to the hw queue dispatch list, and ensure that it 1638 * gets run. 
1639 */ 1640 static int blk_mq_hctx_cpu_offline(struct blk_mq_hw_ctx *hctx, int cpu) 1641 { 1642 struct blk_mq_ctx *ctx; 1643 LIST_HEAD(tmp); 1644 1645 ctx = __blk_mq_get_ctx(hctx->queue, cpu); 1646 1647 spin_lock(&ctx->lock); 1648 if (!list_empty(&ctx->rq_list)) { 1649 list_splice_init(&ctx->rq_list, &tmp); 1650 blk_mq_hctx_clear_pending(hctx, ctx); 1651 } 1652 spin_unlock(&ctx->lock); 1653 1654 if (list_empty(&tmp)) 1655 return NOTIFY_OK; 1656 1657 spin_lock(&hctx->lock); 1658 list_splice_tail_init(&tmp, &hctx->dispatch); 1659 spin_unlock(&hctx->lock); 1660 1661 blk_mq_run_hw_queue(hctx, true); 1662 return NOTIFY_OK; 1663 } 1664 1665 static int blk_mq_hctx_notify(void *data, unsigned long action, 1666 unsigned int cpu) 1667 { 1668 struct blk_mq_hw_ctx *hctx = data; 1669 1670 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) 1671 return blk_mq_hctx_cpu_offline(hctx, cpu); 1672 1673 /* 1674 * In case of CPU online, tags may be reallocated 1675 * in blk_mq_map_swqueue() after mapping is updated. 1676 */ 1677 1678 return NOTIFY_OK; 1679 } 1680 1681 /* hctx->ctxs will be freed in queue's release handler */ 1682 static void blk_mq_exit_hctx(struct request_queue *q, 1683 struct blk_mq_tag_set *set, 1684 struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) 1685 { 1686 unsigned flush_start_tag = set->queue_depth; 1687 1688 blk_mq_tag_idle(hctx); 1689 1690 if (set->ops->exit_request) 1691 set->ops->exit_request(set->driver_data, 1692 hctx->fq->flush_rq, hctx_idx, 1693 flush_start_tag + hctx_idx); 1694 1695 if (set->ops->exit_hctx) 1696 set->ops->exit_hctx(hctx, hctx_idx); 1697 1698 blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); 1699 blk_free_flush_queue(hctx->fq); 1700 blk_mq_free_bitmap(&hctx->ctx_map); 1701 } 1702 1703 static void blk_mq_exit_hw_queues(struct request_queue *q, 1704 struct blk_mq_tag_set *set, int nr_queue) 1705 { 1706 struct blk_mq_hw_ctx *hctx; 1707 unsigned int i; 1708 1709 queue_for_each_hw_ctx(q, hctx, i) { 1710 if (i == nr_queue) 1711 break; 1712 blk_mq_exit_hctx(q, set, hctx, i); 1713 } 1714 } 1715 1716 static void blk_mq_free_hw_queues(struct request_queue *q, 1717 struct blk_mq_tag_set *set) 1718 { 1719 struct blk_mq_hw_ctx *hctx; 1720 unsigned int i; 1721 1722 queue_for_each_hw_ctx(q, hctx, i) 1723 free_cpumask_var(hctx->cpumask); 1724 } 1725 1726 static int blk_mq_init_hctx(struct request_queue *q, 1727 struct blk_mq_tag_set *set, 1728 struct blk_mq_hw_ctx *hctx, unsigned hctx_idx) 1729 { 1730 int node; 1731 unsigned flush_start_tag = set->queue_depth; 1732 1733 node = hctx->numa_node; 1734 if (node == NUMA_NO_NODE) 1735 node = hctx->numa_node = set->numa_node; 1736 1737 INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn); 1738 INIT_DELAYED_WORK(&hctx->delay_work, blk_mq_delay_work_fn); 1739 spin_lock_init(&hctx->lock); 1740 INIT_LIST_HEAD(&hctx->dispatch); 1741 hctx->queue = q; 1742 hctx->queue_num = hctx_idx; 1743 hctx->flags = set->flags & ~BLK_MQ_F_TAG_SHARED; 1744 1745 blk_mq_init_cpu_notifier(&hctx->cpu_notifier, 1746 blk_mq_hctx_notify, hctx); 1747 blk_mq_register_cpu_notifier(&hctx->cpu_notifier); 1748 1749 hctx->tags = set->tags[hctx_idx]; 1750 1751 /* 1752 * Allocate space for all possible cpus to avoid allocation at 1753 * runtime 1754 */ 1755 hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *), 1756 GFP_KERNEL, node); 1757 if (!hctx->ctxs) 1758 goto unregister_cpu_notifier; 1759 1760 if (blk_mq_alloc_bitmap(&hctx->ctx_map, node)) 1761 goto free_ctxs; 1762 1763 hctx->nr_ctx = 0; 1764 1765 if (set->ops->init_hctx && 1766 set->ops->init_hctx(hctx, 
set->driver_data, hctx_idx)) 1767 goto free_bitmap; 1768 1769 hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size); 1770 if (!hctx->fq) 1771 goto exit_hctx; 1772 1773 if (set->ops->init_request && 1774 set->ops->init_request(set->driver_data, 1775 hctx->fq->flush_rq, hctx_idx, 1776 flush_start_tag + hctx_idx, node)) 1777 goto free_fq; 1778 1779 return 0; 1780 1781 free_fq: 1782 kfree(hctx->fq); 1783 exit_hctx: 1784 if (set->ops->exit_hctx) 1785 set->ops->exit_hctx(hctx, hctx_idx); 1786 free_bitmap: 1787 blk_mq_free_bitmap(&hctx->ctx_map); 1788 free_ctxs: 1789 kfree(hctx->ctxs); 1790 unregister_cpu_notifier: 1791 blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); 1792 1793 return -1; 1794 } 1795 1796 static void blk_mq_init_cpu_queues(struct request_queue *q, 1797 unsigned int nr_hw_queues) 1798 { 1799 unsigned int i; 1800 1801 for_each_possible_cpu(i) { 1802 struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i); 1803 struct blk_mq_hw_ctx *hctx; 1804 1805 memset(__ctx, 0, sizeof(*__ctx)); 1806 __ctx->cpu = i; 1807 spin_lock_init(&__ctx->lock); 1808 INIT_LIST_HEAD(&__ctx->rq_list); 1809 __ctx->queue = q; 1810 1811 /* If the cpu isn't online, the cpu is mapped to first hctx */ 1812 if (!cpu_online(i)) 1813 continue; 1814 1815 hctx = q->mq_ops->map_queue(q, i); 1816 1817 /* 1818 * Set local node, IFF we have more than one hw queue. If 1819 * not, we remain on the home node of the device 1820 */ 1821 if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE) 1822 hctx->numa_node = local_memory_node(cpu_to_node(i)); 1823 } 1824 } 1825 1826 static void blk_mq_map_swqueue(struct request_queue *q, 1827 const struct cpumask *online_mask) 1828 { 1829 unsigned int i; 1830 struct blk_mq_hw_ctx *hctx; 1831 struct blk_mq_ctx *ctx; 1832 struct blk_mq_tag_set *set = q->tag_set; 1833 1834 /* 1835 * Avoid others reading imcomplete hctx->cpumask through sysfs 1836 */ 1837 mutex_lock(&q->sysfs_lock); 1838 1839 queue_for_each_hw_ctx(q, hctx, i) { 1840 cpumask_clear(hctx->cpumask); 1841 hctx->nr_ctx = 0; 1842 } 1843 1844 /* 1845 * Map software to hardware queues 1846 */ 1847 for_each_possible_cpu(i) { 1848 /* If the cpu isn't online, the cpu is mapped to first hctx */ 1849 if (!cpumask_test_cpu(i, online_mask)) 1850 continue; 1851 1852 ctx = per_cpu_ptr(q->queue_ctx, i); 1853 hctx = q->mq_ops->map_queue(q, i); 1854 1855 cpumask_set_cpu(i, hctx->cpumask); 1856 ctx->index_hw = hctx->nr_ctx; 1857 hctx->ctxs[hctx->nr_ctx++] = ctx; 1858 } 1859 1860 mutex_unlock(&q->sysfs_lock); 1861 1862 queue_for_each_hw_ctx(q, hctx, i) { 1863 struct blk_mq_ctxmap *map = &hctx->ctx_map; 1864 1865 /* 1866 * If no software queues are mapped to this hardware queue, 1867 * disable it and free the request entries. 1868 */ 1869 if (!hctx->nr_ctx) { 1870 if (set->tags[i]) { 1871 blk_mq_free_rq_map(set, set->tags[i], i); 1872 set->tags[i] = NULL; 1873 } 1874 hctx->tags = NULL; 1875 continue; 1876 } 1877 1878 /* unmapped hw queue can be remapped after CPU topo changed */ 1879 if (!set->tags[i]) 1880 set->tags[i] = blk_mq_init_rq_map(set, i); 1881 hctx->tags = set->tags[i]; 1882 WARN_ON(!hctx->tags); 1883 1884 cpumask_copy(hctx->tags->cpumask, hctx->cpumask); 1885 /* 1886 * Set the map size to the number of mapped software queues. 1887 * This is more accurate and more efficient than looping 1888 * over all possibly mapped software queues. 
1889 */ 1890 map->size = DIV_ROUND_UP(hctx->nr_ctx, map->bits_per_word); 1891 1892 /* 1893 * Initialize batch roundrobin counts 1894 */ 1895 hctx->next_cpu = cpumask_first(hctx->cpumask); 1896 hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; 1897 } 1898 } 1899 1900 static void queue_set_hctx_shared(struct request_queue *q, bool shared) 1901 { 1902 struct blk_mq_hw_ctx *hctx; 1903 int i; 1904 1905 queue_for_each_hw_ctx(q, hctx, i) { 1906 if (shared) 1907 hctx->flags |= BLK_MQ_F_TAG_SHARED; 1908 else 1909 hctx->flags &= ~BLK_MQ_F_TAG_SHARED; 1910 } 1911 } 1912 1913 static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set, bool shared) 1914 { 1915 struct request_queue *q; 1916 1917 list_for_each_entry(q, &set->tag_list, tag_set_list) { 1918 blk_mq_freeze_queue(q); 1919 queue_set_hctx_shared(q, shared); 1920 blk_mq_unfreeze_queue(q); 1921 } 1922 } 1923 1924 static void blk_mq_del_queue_tag_set(struct request_queue *q) 1925 { 1926 struct blk_mq_tag_set *set = q->tag_set; 1927 1928 mutex_lock(&set->tag_list_lock); 1929 list_del_init(&q->tag_set_list); 1930 if (list_is_singular(&set->tag_list)) { 1931 /* just transitioned to unshared */ 1932 set->flags &= ~BLK_MQ_F_TAG_SHARED; 1933 /* update existing queue */ 1934 blk_mq_update_tag_set_depth(set, false); 1935 } 1936 mutex_unlock(&set->tag_list_lock); 1937 } 1938 1939 static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set, 1940 struct request_queue *q) 1941 { 1942 q->tag_set = set; 1943 1944 mutex_lock(&set->tag_list_lock); 1945 1946 /* Check to see if we're transitioning to shared (from 1 to 2 queues). */ 1947 if (!list_empty(&set->tag_list) && !(set->flags & BLK_MQ_F_TAG_SHARED)) { 1948 set->flags |= BLK_MQ_F_TAG_SHARED; 1949 /* update existing queue */ 1950 blk_mq_update_tag_set_depth(set, true); 1951 } 1952 if (set->flags & BLK_MQ_F_TAG_SHARED) 1953 queue_set_hctx_shared(q, true); 1954 list_add_tail(&q->tag_set_list, &set->tag_list); 1955 1956 mutex_unlock(&set->tag_list_lock); 1957 } 1958 1959 /* 1960 * It is the actual release handler for mq, but we do it from 1961 * request queue's release handler for avoiding use-after-free 1962 * and headache because q->mq_kobj shouldn't have been introduced, 1963 * but we can't group ctx/kctx kobj without it. 
1964 */ 1965 void blk_mq_release(struct request_queue *q) 1966 { 1967 struct blk_mq_hw_ctx *hctx; 1968 unsigned int i; 1969 1970 /* hctx kobj stays in hctx */ 1971 queue_for_each_hw_ctx(q, hctx, i) { 1972 if (!hctx) 1973 continue; 1974 kfree(hctx->ctxs); 1975 kfree(hctx); 1976 } 1977 1978 kfree(q->mq_map); 1979 q->mq_map = NULL; 1980 1981 kfree(q->queue_hw_ctx); 1982 1983 /* ctx kobj stays in queue_ctx */ 1984 free_percpu(q->queue_ctx); 1985 } 1986 1987 struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) 1988 { 1989 struct request_queue *uninit_q, *q; 1990 1991 uninit_q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node); 1992 if (!uninit_q) 1993 return ERR_PTR(-ENOMEM); 1994 1995 q = blk_mq_init_allocated_queue(set, uninit_q); 1996 if (IS_ERR(q)) 1997 blk_cleanup_queue(uninit_q); 1998 1999 return q; 2000 } 2001 EXPORT_SYMBOL(blk_mq_init_queue); 2002 2003 static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, 2004 struct request_queue *q) 2005 { 2006 int i, j; 2007 struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx; 2008 2009 blk_mq_sysfs_unregister(q); 2010 for (i = 0; i < set->nr_hw_queues; i++) { 2011 int node; 2012 2013 if (hctxs[i]) 2014 continue; 2015 2016 node = blk_mq_hw_queue_to_node(q->mq_map, i); 2017 hctxs[i] = kzalloc_node(sizeof(struct blk_mq_hw_ctx), 2018 GFP_KERNEL, node); 2019 if (!hctxs[i]) 2020 break; 2021 2022 if (!zalloc_cpumask_var_node(&hctxs[i]->cpumask, GFP_KERNEL, 2023 node)) { 2024 kfree(hctxs[i]); 2025 hctxs[i] = NULL; 2026 break; 2027 } 2028 2029 atomic_set(&hctxs[i]->nr_active, 0); 2030 hctxs[i]->numa_node = node; 2031 hctxs[i]->queue_num = i; 2032 2033 if (blk_mq_init_hctx(q, set, hctxs[i], i)) { 2034 free_cpumask_var(hctxs[i]->cpumask); 2035 kfree(hctxs[i]); 2036 hctxs[i] = NULL; 2037 break; 2038 } 2039 blk_mq_hctx_kobj_init(hctxs[i]); 2040 } 2041 for (j = i; j < q->nr_hw_queues; j++) { 2042 struct blk_mq_hw_ctx *hctx = hctxs[j]; 2043 2044 if (hctx) { 2045 if (hctx->tags) { 2046 blk_mq_free_rq_map(set, hctx->tags, j); 2047 set->tags[j] = NULL; 2048 } 2049 blk_mq_exit_hctx(q, set, hctx, j); 2050 free_cpumask_var(hctx->cpumask); 2051 kobject_put(&hctx->kobj); 2052 kfree(hctx->ctxs); 2053 kfree(hctx); 2054 hctxs[j] = NULL; 2055 2056 } 2057 } 2058 q->nr_hw_queues = i; 2059 blk_mq_sysfs_register(q); 2060 } 2061 2062 struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, 2063 struct request_queue *q) 2064 { 2065 /* mark the queue as mq asap */ 2066 q->mq_ops = set->ops; 2067 2068 q->queue_ctx = alloc_percpu(struct blk_mq_ctx); 2069 if (!q->queue_ctx) 2070 goto err_exit; 2071 2072 q->queue_hw_ctx = kzalloc_node(nr_cpu_ids * sizeof(*(q->queue_hw_ctx)), 2073 GFP_KERNEL, set->numa_node); 2074 if (!q->queue_hw_ctx) 2075 goto err_percpu; 2076 2077 q->mq_map = blk_mq_make_queue_map(set); 2078 if (!q->mq_map) 2079 goto err_map; 2080 2081 blk_mq_realloc_hw_ctxs(set, q); 2082 if (!q->nr_hw_queues) 2083 goto err_hctxs; 2084 2085 INIT_WORK(&q->timeout_work, blk_mq_timeout_work); 2086 blk_queue_rq_timeout(q, set->timeout ? 
set->timeout : 30 * HZ); 2087 2088 q->nr_queues = nr_cpu_ids; 2089 2090 q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT; 2091 2092 if (!(set->flags & BLK_MQ_F_SG_MERGE)) 2093 q->queue_flags |= 1 << QUEUE_FLAG_NO_SG_MERGE; 2094 2095 q->sg_reserved_size = INT_MAX; 2096 2097 INIT_WORK(&q->requeue_work, blk_mq_requeue_work); 2098 INIT_LIST_HEAD(&q->requeue_list); 2099 spin_lock_init(&q->requeue_lock); 2100 2101 if (q->nr_hw_queues > 1) 2102 blk_queue_make_request(q, blk_mq_make_request); 2103 else 2104 blk_queue_make_request(q, blk_sq_make_request); 2105 2106 /* 2107 * Do this after blk_queue_make_request() overrides it... 2108 */ 2109 q->nr_requests = set->queue_depth; 2110 2111 if (set->ops->complete) 2112 blk_queue_softirq_done(q, set->ops->complete); 2113 2114 blk_mq_init_cpu_queues(q, set->nr_hw_queues); 2115 2116 get_online_cpus(); 2117 mutex_lock(&all_q_mutex); 2118 2119 list_add_tail(&q->all_q_node, &all_q_list); 2120 blk_mq_add_queue_tag_set(set, q); 2121 blk_mq_map_swqueue(q, cpu_online_mask); 2122 2123 mutex_unlock(&all_q_mutex); 2124 put_online_cpus(); 2125 2126 return q; 2127 2128 err_hctxs: 2129 kfree(q->mq_map); 2130 err_map: 2131 kfree(q->queue_hw_ctx); 2132 err_percpu: 2133 free_percpu(q->queue_ctx); 2134 err_exit: 2135 q->mq_ops = NULL; 2136 return ERR_PTR(-ENOMEM); 2137 } 2138 EXPORT_SYMBOL(blk_mq_init_allocated_queue); 2139 2140 void blk_mq_free_queue(struct request_queue *q) 2141 { 2142 struct blk_mq_tag_set *set = q->tag_set; 2143 2144 mutex_lock(&all_q_mutex); 2145 list_del_init(&q->all_q_node); 2146 mutex_unlock(&all_q_mutex); 2147 2148 blk_mq_del_queue_tag_set(q); 2149 2150 blk_mq_exit_hw_queues(q, set, set->nr_hw_queues); 2151 blk_mq_free_hw_queues(q, set); 2152 } 2153 2154 /* Basically redo blk_mq_init_queue with queue frozen */ 2155 static void blk_mq_queue_reinit(struct request_queue *q, 2156 const struct cpumask *online_mask) 2157 { 2158 WARN_ON_ONCE(!atomic_read(&q->mq_freeze_depth)); 2159 2160 blk_mq_sysfs_unregister(q); 2161 2162 blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues, online_mask); 2163 2164 /* 2165 * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe 2166 * we should change hctx numa_node according to the new topology (this 2167 * involves freeing and re-allocating memory; is it worth doing?) 2168 */ 2169 2170 blk_mq_map_swqueue(q, online_mask); 2171 2172 blk_mq_sysfs_register(q); 2173 } 2174 2175 static int blk_mq_queue_reinit_notify(struct notifier_block *nb, 2176 unsigned long action, void *hcpu) 2177 { 2178 struct request_queue *q; 2179 int cpu = (unsigned long)hcpu; 2180 /* 2181 * New online cpumask which is going to be set in this hotplug event. 2182 * Declared static because cpu-hotplug operations are invoked one by 2183 * one, and dynamically allocating this could result in a failure. 2184 */ 2185 static struct cpumask online_new; 2186 2187 /* 2188 * Before a hot-added cpu starts handling requests, new mappings must 2189 * be established. Otherwise, requests in the hw queue might 2190 * never be dispatched. 2191 * 2192 * For example, suppose there is a single hw queue (hctx) and two CPU 2193 * queues (ctx0 for CPU0, and ctx1 for CPU1). 2194 * 2195 * Now CPU1 has just been onlined and a request is inserted into 2196 * ctx1->rq_list, setting bit0 in the pending bitmap because 2197 * ctx1->index_hw is still zero. 2198 * 2199 * Then, while running the hw queue, flush_busy_ctxs() finds bit0 2200 * set in the pending bitmap and tries to retrieve requests in 2201 * hctx->ctxs[0]->rq_list.
But hctx->ctxs[0] is a pointer to ctx0, 2202 * so the request in ctx1->rq_list is ignored. 2203 */ 2204 switch (action & ~CPU_TASKS_FROZEN) { 2205 case CPU_DEAD: 2206 case CPU_UP_CANCELED: 2207 cpumask_copy(&online_new, cpu_online_mask); 2208 break; 2209 case CPU_UP_PREPARE: 2210 cpumask_copy(&online_new, cpu_online_mask); 2211 cpumask_set_cpu(cpu, &online_new); 2212 break; 2213 default: 2214 return NOTIFY_OK; 2215 } 2216 2217 mutex_lock(&all_q_mutex); 2218 2219 /* 2220 * We need to freeze and reinit all existing queues. Freezing 2221 * involves a synchronous wait for an RCU grace period and doing it 2222 * one by one may take a long time. Start freezing all queues in 2223 * one swoop and then wait for the completions so that freezing can 2224 * take place in parallel. 2225 */ 2226 list_for_each_entry(q, &all_q_list, all_q_node) 2227 blk_mq_freeze_queue_start(q); 2228 list_for_each_entry(q, &all_q_list, all_q_node) { 2229 blk_mq_freeze_queue_wait(q); 2230 2231 /* 2232 * The timeout handler can't touch the hw queue during 2233 * reinitialization. 2234 */ 2235 del_timer_sync(&q->timeout); 2236 } 2237 2238 list_for_each_entry(q, &all_q_list, all_q_node) 2239 blk_mq_queue_reinit(q, &online_new); 2240 2241 list_for_each_entry(q, &all_q_list, all_q_node) 2242 blk_mq_unfreeze_queue(q); 2243 2244 mutex_unlock(&all_q_mutex); 2245 return NOTIFY_OK; 2246 } 2247 2248 static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) 2249 { 2250 int i; 2251 2252 for (i = 0; i < set->nr_hw_queues; i++) { 2253 set->tags[i] = blk_mq_init_rq_map(set, i); 2254 if (!set->tags[i]) 2255 goto out_unwind; 2256 } 2257 2258 return 0; 2259 2260 out_unwind: 2261 while (--i >= 0) 2262 blk_mq_free_rq_map(set, set->tags[i], i); 2263 2264 return -ENOMEM; 2265 } 2266 2267 /* 2268 * Allocate the request maps associated with this tag_set. Note that this 2269 * may reduce the depth asked for, if memory is tight. set->queue_depth 2270 * will be updated to reflect the allocated depth. 2271 */ 2272 static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) 2273 { 2274 unsigned int depth; 2275 int err; 2276 2277 depth = set->queue_depth; 2278 do { 2279 err = __blk_mq_alloc_rq_maps(set); 2280 if (!err) 2281 break; 2282 2283 set->queue_depth >>= 1; 2284 if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) { 2285 err = -ENOMEM; 2286 break; 2287 } 2288 } while (set->queue_depth); 2289 2290 if (!set->queue_depth || err) { 2291 pr_err("blk-mq: failed to allocate request map\n"); 2292 return -ENOMEM; 2293 } 2294 2295 if (depth != set->queue_depth) 2296 pr_info("blk-mq: reduced tag depth (%u -> %u)\n", 2297 depth, set->queue_depth); 2298 2299 return 0; 2300 } 2301 2302 struct cpumask *blk_mq_tags_cpumask(struct blk_mq_tags *tags) 2303 { 2304 return tags->cpumask; 2305 } 2306 EXPORT_SYMBOL_GPL(blk_mq_tags_cpumask); 2307 2308 /* 2309 * Allocate a tag set to be associated with one or more request queues. 2310 * May fail with EINVAL for various error conditions. May adjust the 2311 * requested depth down if it is too large. In that case, the adjusted 2312 * value will be stored in set->queue_depth.
2313 */ 2314 int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) 2315 { 2316 BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS); 2317 2318 if (!set->nr_hw_queues) 2319 return -EINVAL; 2320 if (!set->queue_depth) 2321 return -EINVAL; 2322 if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) 2323 return -EINVAL; 2324 2325 if (!set->ops->queue_rq || !set->ops->map_queue) 2326 return -EINVAL; 2327 2328 if (set->queue_depth > BLK_MQ_MAX_DEPTH) { 2329 pr_info("blk-mq: reduced tag depth to %u\n", 2330 BLK_MQ_MAX_DEPTH); 2331 set->queue_depth = BLK_MQ_MAX_DEPTH; 2332 } 2333 2334 /* 2335 * If a crashdump is active, then we are potentially in a very 2336 * memory constrained environment. Limit us to 1 queue and 2337 * 64 tags to prevent using too much memory. 2338 */ 2339 if (is_kdump_kernel()) { 2340 set->nr_hw_queues = 1; 2341 set->queue_depth = min(64U, set->queue_depth); 2342 } 2343 /* 2344 * There is no use for more h/w queues than cpus. 2345 */ 2346 if (set->nr_hw_queues > nr_cpu_ids) 2347 set->nr_hw_queues = nr_cpu_ids; 2348 2349 set->tags = kzalloc_node(nr_cpu_ids * sizeof(struct blk_mq_tags *), 2350 GFP_KERNEL, set->numa_node); 2351 if (!set->tags) 2352 return -ENOMEM; 2353 2354 if (blk_mq_alloc_rq_maps(set)) 2355 goto enomem; 2356 2357 mutex_init(&set->tag_list_lock); 2358 INIT_LIST_HEAD(&set->tag_list); 2359 2360 return 0; 2361 enomem: 2362 kfree(set->tags); 2363 set->tags = NULL; 2364 return -ENOMEM; 2365 } 2366 EXPORT_SYMBOL(blk_mq_alloc_tag_set); 2367 2368 void blk_mq_free_tag_set(struct blk_mq_tag_set *set) 2369 { 2370 int i; 2371 2372 for (i = 0; i < nr_cpu_ids; i++) { 2373 if (set->tags[i]) 2374 blk_mq_free_rq_map(set, set->tags[i], i); 2375 } 2376 2377 kfree(set->tags); 2378 set->tags = NULL; 2379 } 2380 EXPORT_SYMBOL(blk_mq_free_tag_set); 2381 2382 int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) 2383 { 2384 struct blk_mq_tag_set *set = q->tag_set; 2385 struct blk_mq_hw_ctx *hctx; 2386 int i, ret; 2387 2388 if (!set || nr > set->queue_depth) 2389 return -EINVAL; 2390 2391 ret = 0; 2392 queue_for_each_hw_ctx(q, hctx, i) { 2393 if (!hctx->tags) 2394 continue; 2395 ret = blk_mq_tag_update_depth(hctx->tags, nr); 2396 if (ret) 2397 break; 2398 } 2399 2400 if (!ret) 2401 q->nr_requests = nr; 2402 2403 return ret; 2404 } 2405 2406 void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues) 2407 { 2408 struct request_queue *q; 2409 2410 if (nr_hw_queues > nr_cpu_ids) 2411 nr_hw_queues = nr_cpu_ids; 2412 if (nr_hw_queues < 1 || nr_hw_queues == set->nr_hw_queues) 2413 return; 2414 2415 list_for_each_entry(q, &set->tag_list, tag_set_list) 2416 blk_mq_freeze_queue(q); 2417 2418 set->nr_hw_queues = nr_hw_queues; 2419 list_for_each_entry(q, &set->tag_list, tag_set_list) { 2420 blk_mq_realloc_hw_ctxs(set, q); 2421 2422 if (q->nr_hw_queues > 1) 2423 blk_queue_make_request(q, blk_mq_make_request); 2424 else 2425 blk_queue_make_request(q, blk_sq_make_request); 2426 2427 blk_mq_queue_reinit(q, cpu_online_mask); 2428 } 2429 2430 list_for_each_entry(q, &set->tag_list, tag_set_list) 2431 blk_mq_unfreeze_queue(q); 2432 } 2433 EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues); 2434 2435 void blk_mq_disable_hotplug(void) 2436 { 2437 mutex_lock(&all_q_mutex); 2438 } 2439 2440 void blk_mq_enable_hotplug(void) 2441 { 2442 mutex_unlock(&all_q_mutex); 2443 } 2444 2445 static int __init blk_mq_init(void) 2446 { 2447 blk_mq_cpu_init(); 2448 2449 hotcpu_notifier(blk_mq_queue_reinit_notify, 0); 2450 2451 return 0; 2452 } 2453 subsys_initcall(blk_mq_init); 2454
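/*
 * Illustrative sketch (hypothetical, not compiled): how a block driver
 * written against this version of the blk-mq API would typically wire up
 * a tag set and a request queue using blk_mq_alloc_tag_set() and
 * blk_mq_init_queue() above. All "my_*" names and the struct my_dev /
 * struct my_cmd types are invented for the example; only the blk-mq calls
 * themselves come from this file and the matching blk-mq.h, so details
 * such as the queue_rq() return codes should be checked against that
 * header rather than taken from this sketch.
 */
#if 0	/* illustrative only, never compiled */

struct my_cmd {
        /* per-request driver data, allocated via tag_set.cmd_size below */
        int result;
};

struct my_dev {
        struct blk_mq_tag_set tag_set;
        struct request_queue *queue;
};

static int my_queue_rq(struct blk_mq_hw_ctx *hctx,
                       const struct blk_mq_queue_data *bd)
{
        struct request *rq = bd->rq;

        blk_mq_start_request(rq);
        /* ...hand rq to the hardware here... */
        blk_mq_end_request(rq, 0);      /* complete immediately in this sketch */
        return BLK_MQ_RQ_QUEUE_OK;
}

static struct blk_mq_ops my_mq_ops = {
        .queue_rq       = my_queue_rq,
        .map_queue      = blk_mq_map_queue,     /* default ctx -> hctx mapping */
};

static int my_create_queue(struct my_dev *dev)
{
        int ret;

        memset(&dev->tag_set, 0, sizeof(dev->tag_set));
        dev->tag_set.ops           = &my_mq_ops;
        dev->tag_set.nr_hw_queues  = 1;
        dev->tag_set.queue_depth   = 64;   /* may be reduced, see blk_mq_alloc_rq_maps() */
        dev->tag_set.numa_node     = NUMA_NO_NODE;
        dev->tag_set.cmd_size      = sizeof(struct my_cmd);
        dev->tag_set.flags         = BLK_MQ_F_SHOULD_MERGE;

        ret = blk_mq_alloc_tag_set(&dev->tag_set);
        if (ret)
                return ret;

        dev->queue = blk_mq_init_queue(&dev->tag_set);
        if (IS_ERR(dev->queue)) {
                blk_mq_free_tag_set(&dev->tag_set);
                return PTR_ERR(dev->queue);
        }
        return 0;
}

static void my_destroy_queue(struct my_dev *dev)
{
        /* teardown order: the queue first, then the tag set it references */
        blk_cleanup_queue(dev->queue);
        blk_mq_free_tag_set(&dev->tag_set);
}

#endif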
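/*
 * Illustrative sketch (hypothetical, not compiled): creating a second
 * request queue from the same tag set is what drives the shared-tag
 * transition in blk_mq_add_queue_tag_set() above -- once set->tag_list
 * holds more than one queue, BLK_MQ_F_TAG_SHARED is set and propagated to
 * every hctx of every queue, with each queue frozen around the change.
 * The my_dev fields used here (tag_set, queue, queue2) and the "my_*"
 * function names are assumptions made for the example.
 */
#if 0	/* illustrative only, never compiled */

static int my_create_second_queue(struct my_dev *dev)
{
        /*
         * A second queue sharing dev->tag_set: blk_mq_add_queue_tag_set()
         * now sees a non-empty tag_list and marks the whole set shared.
         */
        dev->queue2 = blk_mq_init_queue(&dev->tag_set);
        if (IS_ERR(dev->queue2))
                return PTR_ERR(dev->queue2);
        return 0;
}

static void my_destroy_both_queues(struct my_dev *dev)
{
        /*
         * Tearing down one of the two queues leaves a singular tag_list,
         * so blk_mq_del_queue_tag_set() clears the shared flag again; the
         * tag set itself is freed only after both queues are gone.
         */
        blk_cleanup_queue(dev->queue2);
        blk_cleanup_queue(dev->queue);
        blk_mq_free_tag_set(&dev->tag_set);
}

#endif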