// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2017-2018 Christoph Hellwig.
 */

#include <linux/backing-dev.h>
#include <linux/moduleparam.h>
#include <linux/vmalloc.h>
#include <trace/events/block.h>
#include "nvme.h"

bool multipath = true;
module_param(multipath, bool, 0444);
MODULE_PARM_DESC(multipath,
	"turn on native support for multiple controllers per subsystem");

static const char *nvme_iopolicy_names[] = {
	[NVME_IOPOLICY_NUMA]	= "numa",
	[NVME_IOPOLICY_RR]	= "round-robin",
};

static int iopolicy = NVME_IOPOLICY_NUMA;

static int nvme_set_iopolicy(const char *val, const struct kernel_param *kp)
{
	if (!val)
		return -EINVAL;
	if (!strncmp(val, "numa", 4))
		iopolicy = NVME_IOPOLICY_NUMA;
	else if (!strncmp(val, "round-robin", 11))
		iopolicy = NVME_IOPOLICY_RR;
	else
		return -EINVAL;

	return 0;
}

static int nvme_get_iopolicy(char *buf, const struct kernel_param *kp)
{
	return sprintf(buf, "%s\n", nvme_iopolicy_names[iopolicy]);
}

module_param_call(iopolicy, nvme_set_iopolicy, nvme_get_iopolicy,
	&iopolicy, 0644);
MODULE_PARM_DESC(iopolicy,
	"Default multipath I/O policy; 'numa' (default) or 'round-robin'");

void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys)
{
	subsys->iopolicy = iopolicy;
}

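/*
 * Freeze handling for the multipath request queues of a subsystem: start the
 * freeze on every ns_head disk, wait for it to complete, and unfreeze again.
 * All three helpers expect the subsystem lock to be held.
 */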
void nvme_mpath_unfreeze(struct nvme_subsystem *subsys)
{
	struct nvme_ns_head *h;

	lockdep_assert_held(&subsys->lock);
	list_for_each_entry(h, &subsys->nsheads, entry)
		if (h->disk)
			blk_mq_unfreeze_queue(h->disk->queue);
}

void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys)
{
	struct nvme_ns_head *h;

	lockdep_assert_held(&subsys->lock);
	list_for_each_entry(h, &subsys->nsheads, entry)
		if (h->disk)
			blk_mq_freeze_queue_wait(h->disk->queue);
}

void nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
{
	struct nvme_ns_head *h;

	lockdep_assert_held(&subsys->lock);
	list_for_each_entry(h, &subsys->nsheads, entry)
		if (h->disk)
			blk_freeze_queue_start(h->disk->queue);
}

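/*
 * Fail over a request that completed with a path-related error: detach its
 * bios, retarget them at the ns_head disk and put them on the head's requeue
 * list so they can be resubmitted on another path.
 */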
void nvme_failover_req(struct request *req)
{
	struct nvme_ns *ns = req->q->queuedata;
	u16 status = nvme_req(req)->status & 0x7ff;
	unsigned long flags;
	struct bio *bio;

	nvme_mpath_clear_current_path(ns);

	/*
	 * If we got back an ANA error, we know the controller is alive but not
	 * ready to serve this namespace.  Kick off a re-read of the ANA
	 * information page, and just try any other available path for now.
	 */
	if (nvme_is_ana_error(status) && ns->ctrl->ana_log_buf) {
		set_bit(NVME_NS_ANA_PENDING, &ns->flags);
		queue_work(nvme_wq, &ns->ctrl->ana_work);
	}

	spin_lock_irqsave(&ns->head->requeue_lock, flags);
	for (bio = req->bio; bio; bio = bio->bi_next) {
		bio_set_dev(bio, ns->head->disk->part0);
		if (bio->bi_opf & REQ_POLLED) {
			bio->bi_opf &= ~REQ_POLLED;
			bio->bi_cookie = BLK_QC_T_NONE;
		}
		/*
		 * The alternate request queue that we may end up submitting
		 * the bio to may be frozen temporarily; in that case REQ_NOWAIT
		 * would fail the I/O immediately with EAGAIN to the issuer.
		 * We are not in the issuer context, which cannot block, so
		 * clear the flag to avoid spurious EAGAIN I/O failures.
		 */
		bio->bi_opf &= ~REQ_NOWAIT;
	}
	blk_steal_bios(&ns->head->requeue_list, req);
	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);

	blk_mq_end_request(req, 0);
	kblockd_schedule_work(&ns->head->requeue_work);
}

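/*
 * Start I/O statistics accounting on the ns_head disk for requests routed
 * through the multipath node; nvme_mpath_end_request() completes it.
 */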
void nvme_mpath_start_request(struct request *rq)
{
	struct nvme_ns *ns = rq->q->queuedata;
	struct gendisk *disk = ns->head->disk;

	if (!blk_queue_io_stat(disk->queue) || blk_rq_is_passthrough(rq))
		return;

	nvme_req(rq)->flags |= NVME_MPATH_IO_STATS;
	nvme_req(rq)->start_time = bdev_start_io_acct(disk->part0, req_op(rq),
						      jiffies);
}
EXPORT_SYMBOL_GPL(nvme_mpath_start_request);

void nvme_mpath_end_request(struct request *rq)
{
	struct nvme_ns *ns = rq->q->queuedata;

	if (!(nvme_req(rq)->flags & NVME_MPATH_IO_STATS))
		return;
	bdev_end_io_acct(ns->head->disk->part0, req_op(rq),
			 blk_rq_bytes(rq) >> SECTOR_SHIFT,
			 nvme_req(rq)->start_time);
}

void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns;

	down_read(&ctrl->namespaces_rwsem);
	list_for_each_entry(ns, &ctrl->namespaces, list) {
		if (!ns->head->disk)
			continue;
		kblockd_schedule_work(&ns->head->requeue_work);
		if (ctrl->state == NVME_CTRL_LIVE)
			disk_uevent(ns->head->disk, KOBJ_CHANGE);
	}
	up_read(&ctrl->namespaces_rwsem);
}

static const char *nvme_ana_state_names[] = {
	[0]				= "invalid state",
	[NVME_ANA_OPTIMIZED]		= "optimized",
	[NVME_ANA_NONOPTIMIZED]		= "non-optimized",
	[NVME_ANA_INACCESSIBLE]		= "inaccessible",
	[NVME_ANA_PERSISTENT_LOSS]	= "persistent-loss",
	[NVME_ANA_CHANGE]		= "change",
};

bool nvme_mpath_clear_current_path(struct nvme_ns *ns)
{
	struct nvme_ns_head *head = ns->head;
	bool changed = false;
	int node;

	if (!head)
		goto out;

	for_each_node(node) {
		if (ns == rcu_access_pointer(head->current_path[node])) {
			rcu_assign_pointer(head->current_path[node], NULL);
			changed = true;
		}
	}
out:
	return changed;
}

void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns;

	down_read(&ctrl->namespaces_rwsem);
	list_for_each_entry(ns, &ctrl->namespaces, list) {
		nvme_mpath_clear_current_path(ns);
		kblockd_schedule_work(&ns->head->requeue_work);
	}
	up_read(&ctrl->namespaces_rwsem);
}

void nvme_mpath_revalidate_paths(struct nvme_ns *ns)
{
	struct nvme_ns_head *head = ns->head;
	sector_t capacity = get_capacity(head->disk);
	int node;
	int srcu_idx;

	srcu_idx = srcu_read_lock(&head->srcu);
	list_for_each_entry_rcu(ns, &head->list, siblings) {
		if (capacity != get_capacity(ns->disk))
			clear_bit(NVME_NS_READY, &ns->flags);
	}
	srcu_read_unlock(&head->srcu, srcu_idx);

	for_each_node(node)
		rcu_assign_pointer(head->current_path[node], NULL);
	kblockd_schedule_work(&head->requeue_work);
}

static bool nvme_path_is_disabled(struct nvme_ns *ns)
{
	/*
	 * We don't treat NVME_CTRL_DELETING as a disabled path as I/O should
	 * still be able to complete assuming that the controller is connected.
	 * Otherwise it will fail immediately and return to the requeue list.
	 */
	if (ns->ctrl->state != NVME_CTRL_LIVE &&
	    ns->ctrl->state != NVME_CTRL_DELETING)
		return true;
	if (test_bit(NVME_NS_ANA_PENDING, &ns->flags) ||
	    !test_bit(NVME_NS_READY, &ns->flags))
		return true;
	return false;
}

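/*
 * Select a new current path for @node: prefer an ANA-optimized path, using
 * NUMA distance as the tie breaker when the "numa" I/O policy is active, and
 * fall back to a non-optimized path otherwise.  The result is cached in
 * head->current_path[node].
 */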
static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
{
	int found_distance = INT_MAX, fallback_distance = INT_MAX, distance;
	struct nvme_ns *found = NULL, *fallback = NULL, *ns;

	list_for_each_entry_rcu(ns, &head->list, siblings) {
		if (nvme_path_is_disabled(ns))
			continue;

		if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA)
			distance = node_distance(node, ns->ctrl->numa_node);
		else
			distance = LOCAL_DISTANCE;

		switch (ns->ana_state) {
		case NVME_ANA_OPTIMIZED:
			if (distance < found_distance) {
				found_distance = distance;
				found = ns;
			}
			break;
		case NVME_ANA_NONOPTIMIZED:
			if (distance < fallback_distance) {
				fallback_distance = distance;
				fallback = ns;
			}
			break;
		default:
			break;
		}
	}

	if (!found)
		found = fallback;
	if (found)
		rcu_assign_pointer(head->current_path[node], found);
	return found;
}

static struct nvme_ns *nvme_next_ns(struct nvme_ns_head *head,
		struct nvme_ns *ns)
{
	ns = list_next_or_null_rcu(&head->list, &ns->siblings, struct nvme_ns,
			siblings);
	if (ns)
		return ns;
	return list_first_or_null_rcu(&head->list, struct nvme_ns, siblings);
}

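/*
 * Round-robin path selection: start from the path after the current one and
 * pick the next usable path, preferring ANA-optimized paths over
 * non-optimized ones.
 */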
static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head,
		int node, struct nvme_ns *old)
{
	struct nvme_ns *ns, *found = NULL;

	if (list_is_singular(&head->list)) {
		if (nvme_path_is_disabled(old))
			return NULL;
		return old;
	}

	for (ns = nvme_next_ns(head, old);
	     ns && ns != old;
	     ns = nvme_next_ns(head, ns)) {
		if (nvme_path_is_disabled(ns))
			continue;

		if (ns->ana_state == NVME_ANA_OPTIMIZED) {
			found = ns;
			goto out;
		}
		if (ns->ana_state == NVME_ANA_NONOPTIMIZED)
			found = ns;
	}

	/*
	 * The loop above skips the current path for round-robin semantics.
	 * Fall back to the current path if either:
	 *  - no other optimized path found and current is optimized,
	 *  - no other usable path found and current is usable.
	 */
	if (!nvme_path_is_disabled(old) &&
	    (old->ana_state == NVME_ANA_OPTIMIZED ||
	     (!found && old->ana_state == NVME_ANA_NONOPTIMIZED)))
		return old;

	if (!found)
		return NULL;
out:
	rcu_assign_pointer(head->current_path[node], found);
	return found;
}

static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
{
	return ns->ctrl->state == NVME_CTRL_LIVE &&
		ns->ana_state == NVME_ANA_OPTIMIZED;
}

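/*
 * Return the path to submit I/O on: use the cached per-node current path if
 * it is still optimized, otherwise re-run path selection according to the
 * subsystem I/O policy.  Callers must be in a head->srcu read-side section.
 */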
inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
{
	int node = numa_node_id();
	struct nvme_ns *ns;

	ns = srcu_dereference(head->current_path[node], &head->srcu);
	if (unlikely(!ns))
		return __nvme_find_path(head, node);

	if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR)
		return nvme_round_robin_path(head, node, ns);
	if (unlikely(!nvme_path_is_optimized(ns)))
		return __nvme_find_path(head, node);
	return ns;
}

static bool nvme_available_path(struct nvme_ns_head *head)
{
	struct nvme_ns *ns;

	list_for_each_entry_rcu(ns, &head->list, siblings) {
		if (test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ns->ctrl->flags))
			continue;
		switch (ns->ctrl->state) {
		case NVME_CTRL_LIVE:
		case NVME_CTRL_RESETTING:
		case NVME_CTRL_CONNECTING:
			/* fallthru */
			return true;
		default:
			break;
		}
	}
	return false;
}

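/*
 * Submit a bio issued to the multipath node: pick a path and send the bio
 * down that path's queue, or park it on the requeue list while no path is
 * usable but at least one controller may still come back.
 */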
static void nvme_ns_head_submit_bio(struct bio *bio)
{
	struct nvme_ns_head *head = bio->bi_bdev->bd_disk->private_data;
	struct device *dev = disk_to_dev(head->disk);
	struct nvme_ns *ns;
	int srcu_idx;

	/*
	 * The namespace might be going away and the bio might be moved to a
	 * different queue via blk_steal_bios(), so we need to use the bio_split
	 * pool from the original queue to allocate the bvecs from.
	 */
	bio = bio_split_to_limits(bio);
	if (!bio)
		return;

	srcu_idx = srcu_read_lock(&head->srcu);
	ns = nvme_find_path(head);
	if (likely(ns)) {
		bio_set_dev(bio, ns->disk->part0);
		bio->bi_opf |= REQ_NVME_MPATH;
		trace_block_bio_remap(bio, disk_devt(ns->head->disk),
				      bio->bi_iter.bi_sector);
		submit_bio_noacct(bio);
	} else if (nvme_available_path(head)) {
		dev_warn_ratelimited(dev, "no usable path - requeuing I/O\n");

		spin_lock_irq(&head->requeue_lock);
		bio_list_add(&head->requeue_list, bio);
		spin_unlock_irq(&head->requeue_lock);
	} else {
		dev_warn_ratelimited(dev, "no available path - failing I/O\n");

		bio_io_error(bio);
	}

	srcu_read_unlock(&head->srcu, srcu_idx);
}

static int nvme_ns_head_open(struct gendisk *disk, blk_mode_t mode)
{
	if (!nvme_tryget_ns_head(disk->private_data))
		return -ENXIO;
	return 0;
}

static void nvme_ns_head_release(struct gendisk *disk)
{
	nvme_put_ns_head(disk->private_data);
}

#ifdef CONFIG_BLK_DEV_ZONED
static int nvme_ns_head_report_zones(struct gendisk *disk, sector_t sector,
		unsigned int nr_zones, report_zones_cb cb, void *data)
{
	struct nvme_ns_head *head = disk->private_data;
	struct nvme_ns *ns;
	int srcu_idx, ret = -EWOULDBLOCK;

	srcu_idx = srcu_read_lock(&head->srcu);
	ns = nvme_find_path(head);
	if (ns)
		ret = nvme_ns_report_zones(ns, sector, nr_zones, cb, data);
	srcu_read_unlock(&head->srcu, srcu_idx);
	return ret;
}
#else
#define nvme_ns_head_report_zones	NULL
#endif /* CONFIG_BLK_DEV_ZONED */

const struct block_device_operations nvme_ns_head_ops = {
	.owner		= THIS_MODULE,
	.submit_bio	= nvme_ns_head_submit_bio,
	.open		= nvme_ns_head_open,
	.release	= nvme_ns_head_release,
	.ioctl		= nvme_ns_head_ioctl,
	.compat_ioctl	= blkdev_compat_ptr_ioctl,
	.getgeo		= nvme_getgeo,
	.report_zones	= nvme_ns_head_report_zones,
	.pr_ops		= &nvme_pr_ops,
};

static inline struct nvme_ns_head *cdev_to_ns_head(struct cdev *cdev)
{
	return container_of(cdev, struct nvme_ns_head, cdev);
}

static int nvme_ns_head_chr_open(struct inode *inode, struct file *file)
{
	if (!nvme_tryget_ns_head(cdev_to_ns_head(inode->i_cdev)))
		return -ENXIO;
	return 0;
}

static int nvme_ns_head_chr_release(struct inode *inode, struct file *file)
{
	nvme_put_ns_head(cdev_to_ns_head(inode->i_cdev));
	return 0;
}

static const struct file_operations nvme_ns_head_chr_fops = {
	.owner		= THIS_MODULE,
	.open		= nvme_ns_head_chr_open,
	.release	= nvme_ns_head_chr_release,
	.unlocked_ioctl	= nvme_ns_head_chr_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
	.uring_cmd	= nvme_ns_head_chr_uring_cmd,
	.uring_cmd_iopoll = nvme_ns_chr_uring_cmd_iopoll,
};

static int nvme_add_ns_head_cdev(struct nvme_ns_head *head)
{
	int ret;

	head->cdev_device.parent = &head->subsys->dev;
	ret = dev_set_name(&head->cdev_device, "ng%dn%d",
			   head->subsys->instance, head->instance);
	if (ret)
		return ret;
	ret = nvme_cdev_add(&head->cdev, &head->cdev_device,
			    &nvme_ns_head_chr_fops, THIS_MODULE);
	return ret;
}

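/*
 * Resubmit all bios that were parked on the requeue list, e.g. because no
 * usable path was available at the time they were submitted.
 */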
static void nvme_requeue_work(struct work_struct *work)
{
	struct nvme_ns_head *head =
		container_of(work, struct nvme_ns_head, requeue_work);
	struct bio *bio, *next;

	spin_lock_irq(&head->requeue_lock);
	next = bio_list_get(&head->requeue_list);
	spin_unlock_irq(&head->requeue_lock);

	while ((bio = next) != NULL) {
		next = bio->bi_next;
		bio->bi_next = NULL;

		submit_bio_noacct(bio);
	}
}

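/*
 * Set up the multipath state for a new namespace head and, if the subsystem
 * can have multiple controllers, allocate the shared gendisk that I/O is
 * submitted to.
 */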
int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
{
	bool vwc = false;

	mutex_init(&head->lock);
	bio_list_init(&head->requeue_list);
	spin_lock_init(&head->requeue_lock);
	INIT_WORK(&head->requeue_work, nvme_requeue_work);

	/*
	 * Add a multipath node if the subsystem supports multiple controllers.
	 * We also do this for private namespaces as the namespace sharing flag
	 * could change after a rescan.
	 */
	if (!(ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) ||
	    !nvme_is_unique_nsid(ctrl, head) || !multipath)
		return 0;

	head->disk = blk_alloc_disk(ctrl->numa_node);
	if (!head->disk)
		return -ENOMEM;
	head->disk->fops = &nvme_ns_head_ops;
	head->disk->private_data = head;
	sprintf(head->disk->disk_name, "nvme%dn%d",
			ctrl->subsys->instance, head->instance);

	blk_queue_flag_set(QUEUE_FLAG_NONROT, head->disk->queue);
	blk_queue_flag_set(QUEUE_FLAG_NOWAIT, head->disk->queue);
	blk_queue_flag_set(QUEUE_FLAG_IO_STAT, head->disk->queue);
	/*
	 * This assumes all controllers that refer to a namespace either
	 * support poll queues or not.  That is not a strict guarantee,
	 * but if the assumption is wrong the effect is only suboptimal
	 * performance, not a correctness problem.
	 */
	if (ctrl->tagset->nr_maps > HCTX_TYPE_POLL &&
	    ctrl->tagset->map[HCTX_TYPE_POLL].nr_queues)
		blk_queue_flag_set(QUEUE_FLAG_POLL, head->disk->queue);

	/* set to a default value of 512 until the disk is validated */
	blk_queue_logical_block_size(head->disk->queue, 512);
	blk_set_stacking_limits(&head->disk->queue->limits);
	blk_queue_dma_alignment(head->disk->queue, 3);

	/* we need to propagate up the VWC settings */
	if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
		vwc = true;
	blk_queue_write_cache(head->disk->queue, vwc, vwc);
	return 0;
}

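/*
 * Mark a path as live: register the ns_head disk and character device on
 * first use, pre-populate the per-node current path cache for optimized
 * paths, and kick the requeue work so parked bios get resubmitted.
 */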
static void nvme_mpath_set_live(struct nvme_ns *ns)
{
	struct nvme_ns_head *head = ns->head;
	int rc;

	if (!head->disk)
		return;

	/*
	 * test_and_set_bit() is used because it is protecting against two nvme
	 * paths simultaneously calling device_add_disk() on the same namespace
	 * head.
	 */
	if (!test_and_set_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
		rc = device_add_disk(&head->subsys->dev, head->disk,
				     nvme_ns_id_attr_groups);
		if (rc) {
			clear_bit(NVME_NSHEAD_DISK_LIVE, &head->flags);
			return;
		}
		nvme_add_ns_head_cdev(head);
	}

	mutex_lock(&head->lock);
	if (nvme_path_is_optimized(ns)) {
		int node, srcu_idx;

		srcu_idx = srcu_read_lock(&head->srcu);
		for_each_node(node)
			__nvme_find_path(head, node);
		srcu_read_unlock(&head->srcu, srcu_idx);
	}
	mutex_unlock(&head->lock);

	synchronize_srcu(&head->srcu);
	kblockd_schedule_work(&head->requeue_work);
}

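/*
 * Walk the ANA log page group descriptors, sanity checking each one against
 * the buffer size and controller limits, and invoke @cb for every group.
 */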
static int nvme_parse_ana_log(struct nvme_ctrl *ctrl, void *data,
		int (*cb)(struct nvme_ctrl *ctrl, struct nvme_ana_group_desc *,
			void *))
{
	void *base = ctrl->ana_log_buf;
	size_t offset = sizeof(struct nvme_ana_rsp_hdr);
	int error, i;

	lockdep_assert_held(&ctrl->ana_lock);

	for (i = 0; i < le16_to_cpu(ctrl->ana_log_buf->ngrps); i++) {
		struct nvme_ana_group_desc *desc = base + offset;
		u32 nr_nsids;
		size_t nsid_buf_size;

		if (WARN_ON_ONCE(offset > ctrl->ana_log_size - sizeof(*desc)))
			return -EINVAL;

		nr_nsids = le32_to_cpu(desc->nnsids);
		nsid_buf_size = flex_array_size(desc, nsids, nr_nsids);

		if (WARN_ON_ONCE(desc->grpid == 0))
			return -EINVAL;
		if (WARN_ON_ONCE(le32_to_cpu(desc->grpid) > ctrl->anagrpmax))
			return -EINVAL;
		if (WARN_ON_ONCE(desc->state == 0))
			return -EINVAL;
		if (WARN_ON_ONCE(desc->state > NVME_ANA_CHANGE))
			return -EINVAL;

		offset += sizeof(*desc);
		if (WARN_ON_ONCE(offset > ctrl->ana_log_size - nsid_buf_size))
			return -EINVAL;

		error = cb(ctrl, desc, data);
		if (error)
			return error;

		offset += nsid_buf_size;
	}

	return 0;
}

static inline bool nvme_state_is_live(enum nvme_ana_state state)
{
	return state == NVME_ANA_OPTIMIZED || state == NVME_ANA_NONOPTIMIZED;
}

static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc,
		struct nvme_ns *ns)
{
	ns->ana_grpid = le32_to_cpu(desc->grpid);
	ns->ana_state = desc->state;
	clear_bit(NVME_NS_ANA_PENDING, &ns->flags);
	/*
	 * nvme_mpath_set_live() will trigger I/O to the multipath path device
	 * and in turn to this path device.  However we cannot accept this I/O
	 * if the controller is not live.  This may deadlock if called from
	 * nvme_mpath_init_identify() and the ctrl will never complete
	 * initialization, preventing I/O from completing.  For this case we
	 * will reprocess the ANA log page in nvme_mpath_update() once the
	 * controller is ready.
	 */
	if (nvme_state_is_live(ns->ana_state) &&
	    ns->ctrl->state == NVME_CTRL_LIVE)
		nvme_mpath_set_live(ns);
}

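/*
 * Apply one ANA group descriptor: the controller's namespace list and the
 * descriptor's NSID list are walked in lock step (both are kept in ascending
 * NSID order) to update the ANA state of every namespace in the group.
 */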
static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
		struct nvme_ana_group_desc *desc, void *data)
{
	u32 nr_nsids = le32_to_cpu(desc->nnsids), n = 0;
	unsigned *nr_change_groups = data;
	struct nvme_ns *ns;

	dev_dbg(ctrl->device, "ANA group %d: %s.\n",
			le32_to_cpu(desc->grpid),
			nvme_ana_state_names[desc->state]);

	if (desc->state == NVME_ANA_CHANGE)
		(*nr_change_groups)++;

	if (!nr_nsids)
		return 0;

	down_read(&ctrl->namespaces_rwsem);
	list_for_each_entry(ns, &ctrl->namespaces, list) {
		unsigned nsid;
again:
		nsid = le32_to_cpu(desc->nsids[n]);
		if (ns->head->ns_id < nsid)
			continue;
		if (ns->head->ns_id == nsid)
			nvme_update_ns_ana_state(desc, ns);
		if (++n == nr_nsids)
			break;
		if (ns->head->ns_id > nsid)
			goto again;
	}
	up_read(&ctrl->namespaces_rwsem);
	return 0;
}

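/*
 * Read the ANA log page from the controller and apply it.  If any group is
 * still in the change state, arm the ANATT timer so a stuck transition
 * eventually triggers a controller reset.
 */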
static int nvme_read_ana_log(struct nvme_ctrl *ctrl)
{
	u32 nr_change_groups = 0;
	int error;

	mutex_lock(&ctrl->ana_lock);
	error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_ANA, 0, NVME_CSI_NVM,
			ctrl->ana_log_buf, ctrl->ana_log_size, 0);
	if (error) {
		dev_warn(ctrl->device, "Failed to get ANA log: %d\n", error);
		goto out_unlock;
	}

	error = nvme_parse_ana_log(ctrl, &nr_change_groups,
			nvme_update_ana_state);
	if (error)
		goto out_unlock;

	/*
	 * In theory we should have an ANATT timer per group as they might enter
	 * the change state at different times.  But that is a lot of overhead
	 * just to protect against a target that keeps entering new change
	 * states while never finishing previous ones.  We'll still eventually
	 * time out once all groups are in change state, so this isn't a big
	 * deal.
	 *
	 * We also double the ANATT value to provide some slack for transports
	 * or AEN processing overhead.
	 */
	if (nr_change_groups)
		mod_timer(&ctrl->anatt_timer, ctrl->anatt * HZ * 2 + jiffies);
	else
		del_timer_sync(&ctrl->anatt_timer);
out_unlock:
	mutex_unlock(&ctrl->ana_lock);
	return error;
}

static void nvme_ana_work(struct work_struct *work)
{
	struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, ana_work);

	if (ctrl->state != NVME_CTRL_LIVE)
		return;

	nvme_read_ana_log(ctrl);
}

void nvme_mpath_update(struct nvme_ctrl *ctrl)
{
	u32 nr_change_groups = 0;

	if (!ctrl->ana_log_buf)
		return;

	mutex_lock(&ctrl->ana_lock);
	nvme_parse_ana_log(ctrl, &nr_change_groups, nvme_update_ana_state);
	mutex_unlock(&ctrl->ana_lock);
}

static void nvme_anatt_timeout(struct timer_list *t)
{
	struct nvme_ctrl *ctrl = from_timer(ctrl, t, anatt_timer);

	dev_info(ctrl->device, "ANATT timeout, resetting controller.\n");
	nvme_reset_ctrl(ctrl);
}

void nvme_mpath_stop(struct nvme_ctrl *ctrl)
{
	if (!nvme_ctrl_use_ana(ctrl))
		return;
	del_timer_sync(&ctrl->anatt_timer);
	cancel_work_sync(&ctrl->ana_work);
}

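/* sysfs: subsystem-level I/O policy and per-namespace ANA group/state */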
#define SUBSYS_ATTR_RW(_name, _mode, _show, _store)  \
	struct device_attribute subsys_attr_##_name =	\
		__ATTR(_name, _mode, _show, _store)

static ssize_t nvme_subsys_iopolicy_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct nvme_subsystem *subsys =
		container_of(dev, struct nvme_subsystem, dev);

	return sysfs_emit(buf, "%s\n",
			  nvme_iopolicy_names[READ_ONCE(subsys->iopolicy)]);
}

static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct nvme_subsystem *subsys =
		container_of(dev, struct nvme_subsystem, dev);
	int i;

	for (i = 0; i < ARRAY_SIZE(nvme_iopolicy_names); i++) {
		if (sysfs_streq(buf, nvme_iopolicy_names[i])) {
			WRITE_ONCE(subsys->iopolicy, i);
			return count;
		}
	}

	return -EINVAL;
}
SUBSYS_ATTR_RW(iopolicy, S_IRUGO | S_IWUSR,
		      nvme_subsys_iopolicy_show, nvme_subsys_iopolicy_store);

static ssize_t ana_grpid_show(struct device *dev, struct device_attribute *attr,
		char *buf)
{
	return sysfs_emit(buf, "%d\n", nvme_get_ns_from_dev(dev)->ana_grpid);
}
DEVICE_ATTR_RO(ana_grpid);

static ssize_t ana_state_show(struct device *dev, struct device_attribute *attr,
		char *buf)
{
	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);

	return sysfs_emit(buf, "%s\n", nvme_ana_state_names[ns->ana_state]);
}
DEVICE_ATTR_RO(ana_state);

static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl,
		struct nvme_ana_group_desc *desc, void *data)
{
	struct nvme_ana_group_desc *dst = data;

	if (desc->grpid != dst->grpid)
		return 0;

	*dst = *desc;
	return -ENXIO; /* just break out of the loop */
}

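/*
 * Hook a new path up to its ns_head: look up the namespace's ANA group
 * descriptor (or fall back to "optimized" when ANA is not used) and propagate
 * stable-writes and zone information to the multipath disk.
 */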
void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid)
{
	if (nvme_ctrl_use_ana(ns->ctrl)) {
		struct nvme_ana_group_desc desc = {
			.grpid = anagrpid,
			.state = 0,
		};

		mutex_lock(&ns->ctrl->ana_lock);
		ns->ana_grpid = le32_to_cpu(anagrpid);
		nvme_parse_ana_log(ns->ctrl, &desc, nvme_lookup_ana_group_desc);
		mutex_unlock(&ns->ctrl->ana_lock);
		if (desc.state) {
			/* found the group desc: update */
			nvme_update_ns_ana_state(&desc, ns);
		} else {
			/* group desc not found: trigger a re-read */
			set_bit(NVME_NS_ANA_PENDING, &ns->flags);
			queue_work(nvme_wq, &ns->ctrl->ana_work);
		}
	} else {
		ns->ana_state = NVME_ANA_OPTIMIZED;
		nvme_mpath_set_live(ns);
	}

	if (blk_queue_stable_writes(ns->queue) && ns->head->disk)
		blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES,
				   ns->head->disk->queue);
#ifdef CONFIG_BLK_DEV_ZONED
	if (blk_queue_is_zoned(ns->queue) && ns->head->disk)
		ns->head->disk->nr_zones = ns->disk->nr_zones;
#endif
}

void nvme_mpath_shutdown_disk(struct nvme_ns_head *head)
{
	if (!head->disk)
		return;
	kblockd_schedule_work(&head->requeue_work);
	if (test_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
		nvme_cdev_del(&head->cdev, &head->cdev_device);
		del_gendisk(head->disk);
	}
}

void nvme_mpath_remove_disk(struct nvme_ns_head *head)
{
	if (!head->disk)
		return;
	/* make sure all pending bios are cleaned up */
	kblockd_schedule_work(&head->requeue_work);
	flush_work(&head->requeue_work);
	put_disk(head->disk);
}

void nvme_mpath_init_ctrl(struct nvme_ctrl *ctrl)
{
	mutex_init(&ctrl->ana_lock);
	timer_setup(&ctrl->anatt_timer, nvme_anatt_timeout, 0);
	INIT_WORK(&ctrl->ana_work, nvme_ana_work);
}

int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
{
	size_t max_transfer_size = ctrl->max_hw_sectors << SECTOR_SHIFT;
	size_t ana_log_size;
	int error = 0;

	/* check if multipath is enabled and we have the capability */
	if (!multipath || !ctrl->subsys ||
	    !(ctrl->subsys->cmic & NVME_CTRL_CMIC_ANA))
		return 0;

	if (!ctrl->max_namespaces ||
	    ctrl->max_namespaces > le32_to_cpu(id->nn)) {
		dev_err(ctrl->device,
			"Invalid MNAN value %u\n", ctrl->max_namespaces);
		return -EINVAL;
	}

	ctrl->anacap = id->anacap;
	ctrl->anatt = id->anatt;
	ctrl->nanagrpid = le32_to_cpu(id->nanagrpid);
	ctrl->anagrpmax = le32_to_cpu(id->anagrpmax);

	ana_log_size = sizeof(struct nvme_ana_rsp_hdr) +
		ctrl->nanagrpid * sizeof(struct nvme_ana_group_desc) +
		ctrl->max_namespaces * sizeof(__le32);
	if (ana_log_size > max_transfer_size) {
		dev_err(ctrl->device,
			"ANA log page size (%zd) larger than MDTS (%zd).\n",
			ana_log_size, max_transfer_size);
		dev_err(ctrl->device, "disabling ANA support.\n");
		goto out_uninit;
	}
	if (ana_log_size > ctrl->ana_log_size) {
		nvme_mpath_stop(ctrl);
		nvme_mpath_uninit(ctrl);
		ctrl->ana_log_buf = kvmalloc(ana_log_size, GFP_KERNEL);
		if (!ctrl->ana_log_buf)
			return -ENOMEM;
	}
	ctrl->ana_log_size = ana_log_size;
	error = nvme_read_ana_log(ctrl);
	if (error)
		goto out_uninit;
	return 0;

out_uninit:
	nvme_mpath_uninit(ctrl);
	return error;
}

void nvme_mpath_uninit(struct nvme_ctrl *ctrl)
{
	kvfree(ctrl->ana_log_buf);
	ctrl->ana_log_buf = NULL;
	ctrl->ana_log_size = 0;
}