// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2017-2018 Christoph Hellwig.
 */

#include <linux/backing-dev.h>
#include <linux/moduleparam.h>
#include <trace/events/block.h>
#include "nvme.h"

static bool multipath = true;
module_param(multipath, bool, 0444);
MODULE_PARM_DESC(multipath,
	"turn on native support for multiple controllers per subsystem");

void nvme_mpath_unfreeze(struct nvme_subsystem *subsys)
{
	struct nvme_ns_head *h;

	lockdep_assert_held(&subsys->lock);
	list_for_each_entry(h, &subsys->nsheads, entry)
		if (h->disk)
			blk_mq_unfreeze_queue(h->disk->queue);
}

void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys)
{
	struct nvme_ns_head *h;

	lockdep_assert_held(&subsys->lock);
	list_for_each_entry(h, &subsys->nsheads, entry)
		if (h->disk)
			blk_mq_freeze_queue_wait(h->disk->queue);
}

void nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
{
	struct nvme_ns_head *h;

	lockdep_assert_held(&subsys->lock);
	list_for_each_entry(h, &subsys->nsheads, entry)
		if (h->disk)
			blk_freeze_queue_start(h->disk->queue);
}

/*
 * If multipathing is enabled we need to always use the subsystem instance
 * number for numbering our devices to avoid conflicts between subsystems that
 * have multiple controllers and thus use the multipath-aware subsystem node
 * and those that have a single controller and use the controller node
 * directly.
 */
void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
			struct nvme_ctrl *ctrl, int *flags)
{
	if (!multipath) {
		sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->head->instance);
	} else if (ns->head->disk) {
		sprintf(disk_name, "nvme%dc%dn%d", ctrl->subsys->instance,
				ctrl->instance, ns->head->instance);
		*flags = GENHD_FL_HIDDEN;
	} else {
		sprintf(disk_name, "nvme%dn%d", ctrl->subsys->instance,
				ns->head->instance);
	}
}

void nvme_failover_req(struct request *req)
{
	struct nvme_ns *ns = req->q->queuedata;
	u16 status = nvme_req(req)->status & 0x7ff;
	unsigned long flags;

	nvme_mpath_clear_current_path(ns);

	/*
	 * If we got back an ANA error, we know the controller is alive but not
	 * ready to serve this namespace.  Kick off a re-read of the ANA
	 * information page, and just try any other available path for now.
	 */
	if (nvme_is_ana_error(status) && ns->ctrl->ana_log_buf) {
		set_bit(NVME_NS_ANA_PENDING, &ns->flags);
		queue_work(nvme_wq, &ns->ctrl->ana_work);
	}

	spin_lock_irqsave(&ns->head->requeue_lock, flags);
	blk_steal_bios(&ns->head->requeue_list, req);
	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);

	blk_mq_end_request(req, 0);
	kblockd_schedule_work(&ns->head->requeue_work);
}

void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns;

	down_read(&ctrl->namespaces_rwsem);
	list_for_each_entry(ns, &ctrl->namespaces, list) {
		if (ns->head->disk)
			kblockd_schedule_work(&ns->head->requeue_work);
	}
	up_read(&ctrl->namespaces_rwsem);
}

static const char *nvme_ana_state_names[] = {
	[0]				= "invalid state",
	[NVME_ANA_OPTIMIZED]		= "optimized",
	[NVME_ANA_NONOPTIMIZED]		= "non-optimized",
	[NVME_ANA_INACCESSIBLE]		= "inaccessible",
	[NVME_ANA_PERSISTENT_LOSS]	= "persistent-loss",
	[NVME_ANA_CHANGE]		= "change",
};

bool nvme_mpath_clear_current_path(struct nvme_ns *ns)
{
	struct nvme_ns_head *head = ns->head;
	bool changed = false;
	int node;

	if (!head)
		goto out;

	for_each_node(node) {
		if (ns == rcu_access_pointer(head->current_path[node])) {
			rcu_assign_pointer(head->current_path[node], NULL);
			changed = true;
		}
	}
out:
	return changed;
}

void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns;

	down_read(&ctrl->namespaces_rwsem);
	list_for_each_entry(ns, &ctrl->namespaces, list) {
		nvme_mpath_clear_current_path(ns);
		kblockd_schedule_work(&ns->head->requeue_work);
	}
	up_read(&ctrl->namespaces_rwsem);
}

static bool nvme_path_is_disabled(struct nvme_ns *ns)
{
	/*
	 * We don't treat NVME_CTRL_DELETING as a disabled path as I/O should
	 * still be able to complete assuming that the controller is connected.
	 * Otherwise it will fail immediately and return to the requeue list.
	 */
	if (ns->ctrl->state != NVME_CTRL_LIVE &&
	    ns->ctrl->state != NVME_CTRL_DELETING)
		return true;
	if (test_bit(NVME_NS_ANA_PENDING, &ns->flags) ||
	    test_bit(NVME_NS_REMOVING, &ns->flags))
		return true;
	return false;
}

static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
{
	int found_distance = INT_MAX, fallback_distance = INT_MAX, distance;
	struct nvme_ns *found = NULL, *fallback = NULL, *ns;

	list_for_each_entry_rcu(ns, &head->list, siblings) {
		if (nvme_path_is_disabled(ns))
			continue;

		if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA)
			distance = node_distance(node, ns->ctrl->numa_node);
		else
			distance = LOCAL_DISTANCE;

		switch (ns->ana_state) {
		case NVME_ANA_OPTIMIZED:
			if (distance < found_distance) {
				found_distance = distance;
				found = ns;
			}
			break;
		case NVME_ANA_NONOPTIMIZED:
			if (distance < fallback_distance) {
				fallback_distance = distance;
				fallback = ns;
			}
			break;
		default:
			break;
		}
	}

	if (!found)
		found = fallback;
	if (found)
		rcu_assign_pointer(head->current_path[node], found);
	return found;
}

static struct nvme_ns *nvme_next_ns(struct nvme_ns_head *head,
		struct nvme_ns *ns)
{
	ns = list_next_or_null_rcu(&head->list, &ns->siblings, struct nvme_ns,
			siblings);
	if (ns)
		return ns;
	return list_first_or_null_rcu(&head->list, struct nvme_ns, siblings);
}

static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head,
		int node, struct nvme_ns *old)
{
	struct nvme_ns *ns, *found = NULL;

	if (list_is_singular(&head->list)) {
		if (nvme_path_is_disabled(old))
			return NULL;
		return old;
	}

	for (ns = nvme_next_ns(head, old);
	     ns && ns != old;
	     ns = nvme_next_ns(head, ns)) {
		if (nvme_path_is_disabled(ns))
			continue;

		if (ns->ana_state == NVME_ANA_OPTIMIZED) {
			found = ns;
			goto out;
		}
		if (ns->ana_state == NVME_ANA_NONOPTIMIZED)
			found = ns;
	}

	/*
	 * The loop above skips the current path for round-robin semantics.
	 * Fall back to the current path if either:
	 *  - no other optimized path found and current is optimized,
	 *  - no other usable path found and current is usable.
	 */
	if (!nvme_path_is_disabled(old) &&
	    (old->ana_state == NVME_ANA_OPTIMIZED ||
	     (!found && old->ana_state == NVME_ANA_NONOPTIMIZED)))
		return old;

	if (!found)
		return NULL;
out:
	rcu_assign_pointer(head->current_path[node], found);
	return found;
}

static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
{
	return ns->ctrl->state == NVME_CTRL_LIVE &&
		ns->ana_state == NVME_ANA_OPTIMIZED;
}

inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
{
	int node = numa_node_id();
	struct nvme_ns *ns;

	ns = srcu_dereference(head->current_path[node], &head->srcu);
	if (unlikely(!ns))
		return __nvme_find_path(head, node);

	if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR)
		return nvme_round_robin_path(head, node, ns);
	if (unlikely(!nvme_path_is_optimized(ns)))
		return __nvme_find_path(head, node);
	return ns;
}

static bool nvme_available_path(struct nvme_ns_head *head)
{
	struct nvme_ns *ns;

	list_for_each_entry_rcu(ns, &head->list, siblings) {
		switch (ns->ctrl->state) {
		case NVME_CTRL_LIVE:
		case NVME_CTRL_RESETTING:
		case NVME_CTRL_CONNECTING:
			/* fallthru */
			return true;
		default:
			break;
		}
	}
	return false;
}

blk_qc_t nvme_ns_head_submit_bio(struct bio *bio)
{
	struct nvme_ns_head *head = bio->bi_disk->private_data;
	struct device *dev = disk_to_dev(head->disk);
	struct nvme_ns *ns;
	blk_qc_t ret = BLK_QC_T_NONE;
	int srcu_idx;

	/*
	 * The namespace might be going away and the bio might be moved to a
	 * different queue via blk_steal_bios(), so we need to use the bio_split
	 * pool from the original queue to allocate the bvecs from.
	 */
	blk_queue_split(&bio);

	srcu_idx = srcu_read_lock(&head->srcu);
	ns = nvme_find_path(head);
	if (likely(ns)) {
		bio->bi_disk = ns->disk;
		bio->bi_opf |= REQ_NVME_MPATH;
		trace_block_bio_remap(bio->bi_disk->queue, bio,
				      disk_devt(ns->head->disk),
				      bio->bi_iter.bi_sector);
		ret = submit_bio_noacct(bio);
	} else if (nvme_available_path(head)) {
		dev_warn_ratelimited(dev, "no usable path - requeuing I/O\n");

		spin_lock_irq(&head->requeue_lock);
		bio_list_add(&head->requeue_list, bio);
		spin_unlock_irq(&head->requeue_lock);
	} else {
		dev_warn_ratelimited(dev, "no available path - failing I/O\n");

		bio->bi_status = BLK_STS_IOERR;
		bio_endio(bio);
	}

	srcu_read_unlock(&head->srcu, srcu_idx);
	return ret;
}

static void nvme_requeue_work(struct work_struct *work)
{
	struct nvme_ns_head *head =
		container_of(work, struct nvme_ns_head, requeue_work);
	struct bio *bio, *next;

	spin_lock_irq(&head->requeue_lock);
	next = bio_list_get(&head->requeue_list);
	spin_unlock_irq(&head->requeue_lock);

	while ((bio = next) != NULL) {
		next = bio->bi_next;
		bio->bi_next = NULL;

		/*
		 * Reset disk to the mpath node and resubmit to select a new
		 * path.
		 */
		bio->bi_disk = head->disk;
		submit_bio_noacct(bio);
	}
}

int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
{
	struct request_queue *q;
	bool vwc = false;

	mutex_init(&head->lock);
	bio_list_init(&head->requeue_list);
	spin_lock_init(&head->requeue_lock);
	INIT_WORK(&head->requeue_work, nvme_requeue_work);

	/*
	 * Add a multipath node if the subsystem supports multiple controllers.
	 * We also do this for private namespaces as the namespace sharing data
	 * could change after a rescan.
	 */
	if (!(ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) || !multipath)
		return 0;

	q = blk_alloc_queue(ctrl->numa_node);
	if (!q)
		goto out;
	blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
	/* set to a default value of 512 until the disk is validated */
	blk_queue_logical_block_size(q, 512);
	blk_set_stacking_limits(&q->limits);

	/* we need to propagate up the VWC settings */
	if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
		vwc = true;
	blk_queue_write_cache(q, vwc, vwc);

	head->disk = alloc_disk(0);
	if (!head->disk)
		goto out_cleanup_queue;
	head->disk->fops = &nvme_ns_head_ops;
	head->disk->private_data = head;
	head->disk->queue = q;
	head->disk->flags = GENHD_FL_EXT_DEVT;
	sprintf(head->disk->disk_name, "nvme%dn%d",
			ctrl->subsys->instance, head->instance);
	return 0;

out_cleanup_queue:
	blk_cleanup_queue(q);
out:
	return -ENOMEM;
}

static void nvme_mpath_set_live(struct nvme_ns *ns)
{
	struct nvme_ns_head *head = ns->head;

	if (!head->disk)
		return;

	if (!test_and_set_bit(NVME_NSHEAD_DISK_LIVE, &head->flags))
		device_add_disk(&head->subsys->dev, head->disk,
				nvme_ns_id_attr_groups);

	mutex_lock(&head->lock);
	if (nvme_path_is_optimized(ns)) {
		int node, srcu_idx;

		srcu_idx = srcu_read_lock(&head->srcu);
		for_each_node(node)
			__nvme_find_path(head, node);
		srcu_read_unlock(&head->srcu, srcu_idx);
	}
	mutex_unlock(&head->lock);

	synchronize_srcu(&head->srcu);
	kblockd_schedule_work(&head->requeue_work);
}

static int nvme_parse_ana_log(struct nvme_ctrl *ctrl, void *data,
		int (*cb)(struct nvme_ctrl *ctrl, struct nvme_ana_group_desc *,
			void *))
{
	void *base = ctrl->ana_log_buf;
	size_t offset = sizeof(struct nvme_ana_rsp_hdr);
	int error, i;

	lockdep_assert_held(&ctrl->ana_lock);

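	/*
	 * Walk each ANA group descriptor in the log page.  A descriptor is
	 * followed by its (possibly empty) list of namespace IDs, and the
	 * offset is bounds-checked against ana_log_size before each access.
	 */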
	for (i = 0; i < le16_to_cpu(ctrl->ana_log_buf->ngrps); i++) {
		struct nvme_ana_group_desc *desc = base + offset;
		u32 nr_nsids;
		size_t nsid_buf_size;

		if (WARN_ON_ONCE(offset > ctrl->ana_log_size - sizeof(*desc)))
			return -EINVAL;

		nr_nsids = le32_to_cpu(desc->nnsids);
		nsid_buf_size = nr_nsids * sizeof(__le32);

		if (WARN_ON_ONCE(desc->grpid == 0))
			return -EINVAL;
		if (WARN_ON_ONCE(le32_to_cpu(desc->grpid) > ctrl->anagrpmax))
			return -EINVAL;
		if (WARN_ON_ONCE(desc->state == 0))
			return -EINVAL;
		if (WARN_ON_ONCE(desc->state > NVME_ANA_CHANGE))
			return -EINVAL;

		offset += sizeof(*desc);
		if (WARN_ON_ONCE(offset > ctrl->ana_log_size - nsid_buf_size))
			return -EINVAL;

		error = cb(ctrl, desc, data);
		if (error)
			return error;

		offset += nsid_buf_size;
	}

	return 0;
}

static inline bool nvme_state_is_live(enum nvme_ana_state state)
{
	return state == NVME_ANA_OPTIMIZED || state == NVME_ANA_NONOPTIMIZED;
}

static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc,
		struct nvme_ns *ns)
{
	ns->ana_grpid = le32_to_cpu(desc->grpid);
	ns->ana_state = desc->state;
	clear_bit(NVME_NS_ANA_PENDING, &ns->flags);
	/*
	 * nvme_mpath_set_live() will trigger I/O to the multipath path device
	 * and in turn to this path device.  However we cannot accept this I/O
	 * if the controller is not live.  This may deadlock if called from
	 * nvme_mpath_init_identify() and the ctrl will never complete
	 * initialization, preventing I/O from completing.  For this case we
	 * will reprocess the ANA log page in nvme_mpath_update() once the
	 * controller is ready.
	 */
	if (nvme_state_is_live(ns->ana_state) &&
	    ns->ctrl->state == NVME_CTRL_LIVE)
		nvme_mpath_set_live(ns);
}

static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
		struct nvme_ana_group_desc *desc, void *data)
{
	u32 nr_nsids = le32_to_cpu(desc->nnsids), n = 0;
	unsigned *nr_change_groups = data;
	struct nvme_ns *ns;

	dev_dbg(ctrl->device, "ANA group %d: %s.\n",
			le32_to_cpu(desc->grpid),
			nvme_ana_state_names[desc->state]);

	if (desc->state == NVME_ANA_CHANGE)
		(*nr_change_groups)++;

	if (!nr_nsids)
		return 0;

	down_read(&ctrl->namespaces_rwsem);
	list_for_each_entry(ns, &ctrl->namespaces, list) {
		unsigned nsid;
again:
		nsid = le32_to_cpu(desc->nsids[n]);
		if (ns->head->ns_id < nsid)
			continue;
		if (ns->head->ns_id == nsid)
			nvme_update_ns_ana_state(desc, ns);
		if (++n == nr_nsids)
			break;
		if (ns->head->ns_id > nsid)
			goto again;
	}
	up_read(&ctrl->namespaces_rwsem);
	return 0;
}

static int nvme_read_ana_log(struct nvme_ctrl *ctrl)
{
	u32 nr_change_groups = 0;
	int error;

	mutex_lock(&ctrl->ana_lock);
	error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_ANA, 0, NVME_CSI_NVM,
			ctrl->ana_log_buf, ctrl->ana_log_size, 0);
	if (error) {
		dev_warn(ctrl->device, "Failed to get ANA log: %d\n", error);
		goto out_unlock;
	}

	error = nvme_parse_ana_log(ctrl, &nr_change_groups,
			nvme_update_ana_state);
	if (error)
		goto out_unlock;

	/*
	 * In theory we should have an ANATT timer per group as they might enter
	 * the change state at different times.  But that is a lot of overhead
	 * just to protect against a target that keeps entering new change
	 * states while never finishing previous ones.  But we'll still
	 * eventually time out once all groups are in change state, so this
	 * isn't a big deal.
	 *
	 * We also double the ANATT value to provide some slack for transports
	 * or AEN processing overhead.
	 */
	if (nr_change_groups)
		mod_timer(&ctrl->anatt_timer, ctrl->anatt * HZ * 2 + jiffies);
	else
		del_timer_sync(&ctrl->anatt_timer);
out_unlock:
	mutex_unlock(&ctrl->ana_lock);
	return error;
}

static void nvme_ana_work(struct work_struct *work)
{
	struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, ana_work);

	if (ctrl->state != NVME_CTRL_LIVE)
		return;

	nvme_read_ana_log(ctrl);
}

void nvme_mpath_update(struct nvme_ctrl *ctrl)
{
	u32 nr_change_groups = 0;

	if (!ctrl->ana_log_buf)
		return;

	mutex_lock(&ctrl->ana_lock);
	nvme_parse_ana_log(ctrl, &nr_change_groups, nvme_update_ana_state);
	mutex_unlock(&ctrl->ana_lock);
}

static void nvme_anatt_timeout(struct timer_list *t)
{
	struct nvme_ctrl *ctrl = from_timer(ctrl, t, anatt_timer);

	dev_info(ctrl->device, "ANATT timeout, resetting controller.\n");
	nvme_reset_ctrl(ctrl);
}

void nvme_mpath_stop(struct nvme_ctrl *ctrl)
{
	if (!nvme_ctrl_use_ana(ctrl))
		return;
	del_timer_sync(&ctrl->anatt_timer);
	cancel_work_sync(&ctrl->ana_work);
}

#define SUBSYS_ATTR_RW(_name, _mode, _show, _store)  \
	struct device_attribute subsys_attr_##_name =	\
		__ATTR(_name, _mode, _show, _store)

static const char *nvme_iopolicy_names[] = {
	[NVME_IOPOLICY_NUMA]	= "numa",
	[NVME_IOPOLICY_RR]	= "round-robin",
};

static ssize_t nvme_subsys_iopolicy_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct nvme_subsystem *subsys =
		container_of(dev, struct nvme_subsystem, dev);

	return sysfs_emit(buf, "%s\n",
			  nvme_iopolicy_names[READ_ONCE(subsys->iopolicy)]);
}

static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct nvme_subsystem *subsys =
		container_of(dev, struct nvme_subsystem, dev);
	int i;

	for (i = 0; i < ARRAY_SIZE(nvme_iopolicy_names); i++) {
		if (sysfs_streq(buf, nvme_iopolicy_names[i])) {
			WRITE_ONCE(subsys->iopolicy, i);
			return count;
		}
	}

	return -EINVAL;
}
SUBSYS_ATTR_RW(iopolicy, S_IRUGO | S_IWUSR,
		      nvme_subsys_iopolicy_show, nvme_subsys_iopolicy_store);

static ssize_t ana_grpid_show(struct device *dev, struct device_attribute *attr,
		char *buf)
{
	return sysfs_emit(buf, "%d\n", nvme_get_ns_from_dev(dev)->ana_grpid);
}
DEVICE_ATTR_RO(ana_grpid);

static ssize_t ana_state_show(struct device *dev, struct device_attribute *attr,
		char *buf)
{
	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);

	return sysfs_emit(buf, "%s\n", nvme_ana_state_names[ns->ana_state]);
}
DEVICE_ATTR_RO(ana_state);

static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl,
		struct nvme_ana_group_desc *desc, void *data)
{
	struct nvme_ana_group_desc *dst = data;

	if (desc->grpid != dst->grpid)
		return 0;

	*dst = *desc;
	return -ENXIO; /* just break out of the loop */
}

void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id)
{
	if (nvme_ctrl_use_ana(ns->ctrl)) {
		struct nvme_ana_group_desc desc = {
			.grpid = id->anagrpid,
			.state = 0,
		};

		mutex_lock(&ns->ctrl->ana_lock);
		ns->ana_grpid = le32_to_cpu(id->anagrpid);
		nvme_parse_ana_log(ns->ctrl, &desc, nvme_lookup_ana_group_desc);
		mutex_unlock(&ns->ctrl->ana_lock);
		if (desc.state) {
			/* found the group desc: update */
			nvme_update_ns_ana_state(&desc, ns);
		} else {
			/* group desc not found: trigger a re-read */
			set_bit(NVME_NS_ANA_PENDING, &ns->flags);
			queue_work(nvme_wq, &ns->ctrl->ana_work);
		}
	} else {
		ns->ana_state = NVME_ANA_OPTIMIZED;
		nvme_mpath_set_live(ns);
	}

	if (blk_queue_stable_writes(ns->queue) && ns->head->disk)
		blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES,
				   ns->head->disk->queue);
#ifdef CONFIG_BLK_DEV_ZONED
	if (blk_queue_is_zoned(ns->queue) && ns->head->disk)
		ns->head->disk->queue->nr_zones = ns->queue->nr_zones;
#endif
}

void nvme_mpath_remove_disk(struct nvme_ns_head *head)
{
	if (!head->disk)
		return;
	if (head->disk->flags & GENHD_FL_UP)
		del_gendisk(head->disk);
	blk_set_queue_dying(head->disk->queue);
	/* make sure all pending bios are cleaned up */
	kblockd_schedule_work(&head->requeue_work);
	flush_work(&head->requeue_work);
	blk_cleanup_queue(head->disk->queue);
	if (!test_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
		/*
		 * if device_add_disk wasn't called, prevent the disk release
		 * from putting a bogus reference on the request queue
		 */
		head->disk->queue = NULL;
	}
	put_disk(head->disk);
}

void nvme_mpath_init_ctrl(struct nvme_ctrl *ctrl)
{
	mutex_init(&ctrl->ana_lock);
	timer_setup(&ctrl->anatt_timer, nvme_anatt_timeout, 0);
	INIT_WORK(&ctrl->ana_work, nvme_ana_work);
}

int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
{
	size_t max_transfer_size = ctrl->max_hw_sectors << SECTOR_SHIFT;
	size_t ana_log_size;
	int error = 0;

	/* check if multipath is enabled and we have the capability */
	if (!multipath || !ctrl->subsys ||
	    !(ctrl->subsys->cmic & NVME_CTRL_CMIC_ANA))
		return 0;

	ctrl->anacap = id->anacap;
	ctrl->anatt = id->anatt;
	ctrl->nanagrpid = le32_to_cpu(id->nanagrpid);
	ctrl->anagrpmax = le32_to_cpu(id->anagrpmax);

	ana_log_size = sizeof(struct nvme_ana_rsp_hdr) +
		ctrl->nanagrpid * sizeof(struct nvme_ana_group_desc) +
		ctrl->max_namespaces * sizeof(__le32);
	if (ana_log_size > max_transfer_size) {
		dev_err(ctrl->device,
			"ANA log page size (%zd) larger than MDTS (%zd).\n",
			ana_log_size, max_transfer_size);
		dev_err(ctrl->device, "disabling ANA support.\n");
		goto out_uninit;
	}
	if (ana_log_size > ctrl->ana_log_size) {
		nvme_mpath_stop(ctrl);
		kfree(ctrl->ana_log_buf);
		ctrl->ana_log_buf = kmalloc(ana_log_size, GFP_KERNEL);
		if (!ctrl->ana_log_buf)
			return -ENOMEM;
	}
	ctrl->ana_log_size = ana_log_size;
	error = nvme_read_ana_log(ctrl);
	if (error)
		goto out_uninit;
	return 0;

out_uninit:
	nvme_mpath_uninit(ctrl);
	return error;
}

void nvme_mpath_uninit(struct nvme_ctrl *ctrl)
{
	kfree(ctrl->ana_log_buf);
	ctrl->ana_log_buf = NULL;
}