162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0 262306a36Sopenharmony_ci/* Copyright(c) 2016-2019 Intel Corporation. All rights reserved. */ 362306a36Sopenharmony_ci#include <linux/memremap.h> 462306a36Sopenharmony_ci#include <linux/pagemap.h> 562306a36Sopenharmony_ci#include <linux/memory.h> 662306a36Sopenharmony_ci#include <linux/module.h> 762306a36Sopenharmony_ci#include <linux/device.h> 862306a36Sopenharmony_ci#include <linux/pfn_t.h> 962306a36Sopenharmony_ci#include <linux/slab.h> 1062306a36Sopenharmony_ci#include <linux/dax.h> 1162306a36Sopenharmony_ci#include <linux/fs.h> 1262306a36Sopenharmony_ci#include <linux/mm.h> 1362306a36Sopenharmony_ci#include <linux/mman.h> 1462306a36Sopenharmony_ci#include <linux/memory-tiers.h> 1562306a36Sopenharmony_ci#include "dax-private.h" 1662306a36Sopenharmony_ci#include "bus.h" 1762306a36Sopenharmony_ci 1862306a36Sopenharmony_ci/* 1962306a36Sopenharmony_ci * Default abstract distance assigned to the NUMA node onlined 2062306a36Sopenharmony_ci * by DAX/kmem if the low level platform driver didn't initialize 2162306a36Sopenharmony_ci * one for this NUMA node. 2262306a36Sopenharmony_ci */ 2362306a36Sopenharmony_ci#define MEMTIER_DEFAULT_DAX_ADISTANCE (MEMTIER_ADISTANCE_DRAM * 5) 2462306a36Sopenharmony_ci 2562306a36Sopenharmony_ci/* Memory resource name used for add_memory_driver_managed(). */ 2662306a36Sopenharmony_cistatic const char *kmem_name; 2762306a36Sopenharmony_ci/* Set if any memory will remain added when the driver will be unloaded. */ 2862306a36Sopenharmony_cistatic bool any_hotremove_failed; 2962306a36Sopenharmony_ci 3062306a36Sopenharmony_cistatic int dax_kmem_range(struct dev_dax *dev_dax, int i, struct range *r) 3162306a36Sopenharmony_ci{ 3262306a36Sopenharmony_ci struct dev_dax_range *dax_range = &dev_dax->ranges[i]; 3362306a36Sopenharmony_ci struct range *range = &dax_range->range; 3462306a36Sopenharmony_ci 3562306a36Sopenharmony_ci /* memory-block align the hotplug range */ 3662306a36Sopenharmony_ci r->start = ALIGN(range->start, memory_block_size_bytes()); 3762306a36Sopenharmony_ci r->end = ALIGN_DOWN(range->end + 1, memory_block_size_bytes()) - 1; 3862306a36Sopenharmony_ci if (r->start >= r->end) { 3962306a36Sopenharmony_ci r->start = range->start; 4062306a36Sopenharmony_ci r->end = range->end; 4162306a36Sopenharmony_ci return -ENOSPC; 4262306a36Sopenharmony_ci } 4362306a36Sopenharmony_ci return 0; 4462306a36Sopenharmony_ci} 4562306a36Sopenharmony_ci 4662306a36Sopenharmony_cistruct dax_kmem_data { 4762306a36Sopenharmony_ci const char *res_name; 4862306a36Sopenharmony_ci int mgid; 4962306a36Sopenharmony_ci struct resource *res[]; 5062306a36Sopenharmony_ci}; 5162306a36Sopenharmony_ci 5262306a36Sopenharmony_cistatic struct memory_dev_type *dax_slowmem_type; 5362306a36Sopenharmony_cistatic int dev_dax_kmem_probe(struct dev_dax *dev_dax) 5462306a36Sopenharmony_ci{ 5562306a36Sopenharmony_ci struct device *dev = &dev_dax->dev; 5662306a36Sopenharmony_ci unsigned long total_len = 0; 5762306a36Sopenharmony_ci struct dax_kmem_data *data; 5862306a36Sopenharmony_ci int i, rc, mapped = 0; 5962306a36Sopenharmony_ci int numa_node; 6062306a36Sopenharmony_ci 6162306a36Sopenharmony_ci /* 6262306a36Sopenharmony_ci * Ensure good NUMA information for the persistent memory. 6362306a36Sopenharmony_ci * Without this check, there is a risk that slow memory 6462306a36Sopenharmony_ci * could be mixed in a node with faster memory, causing 6562306a36Sopenharmony_ci * unavoidable performance issues. 6662306a36Sopenharmony_ci */ 6762306a36Sopenharmony_ci numa_node = dev_dax->target_node; 6862306a36Sopenharmony_ci if (numa_node < 0) { 6962306a36Sopenharmony_ci dev_warn(dev, "rejecting DAX region with invalid node: %d\n", 7062306a36Sopenharmony_ci numa_node); 7162306a36Sopenharmony_ci return -EINVAL; 7262306a36Sopenharmony_ci } 7362306a36Sopenharmony_ci 7462306a36Sopenharmony_ci for (i = 0; i < dev_dax->nr_range; i++) { 7562306a36Sopenharmony_ci struct range range; 7662306a36Sopenharmony_ci 7762306a36Sopenharmony_ci rc = dax_kmem_range(dev_dax, i, &range); 7862306a36Sopenharmony_ci if (rc) { 7962306a36Sopenharmony_ci dev_info(dev, "mapping%d: %#llx-%#llx too small after alignment\n", 8062306a36Sopenharmony_ci i, range.start, range.end); 8162306a36Sopenharmony_ci continue; 8262306a36Sopenharmony_ci } 8362306a36Sopenharmony_ci total_len += range_len(&range); 8462306a36Sopenharmony_ci } 8562306a36Sopenharmony_ci 8662306a36Sopenharmony_ci if (!total_len) { 8762306a36Sopenharmony_ci dev_warn(dev, "rejecting DAX region without any memory after alignment\n"); 8862306a36Sopenharmony_ci return -EINVAL; 8962306a36Sopenharmony_ci } 9062306a36Sopenharmony_ci 9162306a36Sopenharmony_ci init_node_memory_type(numa_node, dax_slowmem_type); 9262306a36Sopenharmony_ci 9362306a36Sopenharmony_ci rc = -ENOMEM; 9462306a36Sopenharmony_ci data = kzalloc(struct_size(data, res, dev_dax->nr_range), GFP_KERNEL); 9562306a36Sopenharmony_ci if (!data) 9662306a36Sopenharmony_ci goto err_dax_kmem_data; 9762306a36Sopenharmony_ci 9862306a36Sopenharmony_ci data->res_name = kstrdup(dev_name(dev), GFP_KERNEL); 9962306a36Sopenharmony_ci if (!data->res_name) 10062306a36Sopenharmony_ci goto err_res_name; 10162306a36Sopenharmony_ci 10262306a36Sopenharmony_ci rc = memory_group_register_static(numa_node, PFN_UP(total_len)); 10362306a36Sopenharmony_ci if (rc < 0) 10462306a36Sopenharmony_ci goto err_reg_mgid; 10562306a36Sopenharmony_ci data->mgid = rc; 10662306a36Sopenharmony_ci 10762306a36Sopenharmony_ci for (i = 0; i < dev_dax->nr_range; i++) { 10862306a36Sopenharmony_ci struct resource *res; 10962306a36Sopenharmony_ci struct range range; 11062306a36Sopenharmony_ci 11162306a36Sopenharmony_ci rc = dax_kmem_range(dev_dax, i, &range); 11262306a36Sopenharmony_ci if (rc) 11362306a36Sopenharmony_ci continue; 11462306a36Sopenharmony_ci 11562306a36Sopenharmony_ci /* Region is permanently reserved if hotremove fails. */ 11662306a36Sopenharmony_ci res = request_mem_region(range.start, range_len(&range), data->res_name); 11762306a36Sopenharmony_ci if (!res) { 11862306a36Sopenharmony_ci dev_warn(dev, "mapping%d: %#llx-%#llx could not reserve region\n", 11962306a36Sopenharmony_ci i, range.start, range.end); 12062306a36Sopenharmony_ci /* 12162306a36Sopenharmony_ci * Once some memory has been onlined we can't 12262306a36Sopenharmony_ci * assume that it can be un-onlined safely. 12362306a36Sopenharmony_ci */ 12462306a36Sopenharmony_ci if (mapped) 12562306a36Sopenharmony_ci continue; 12662306a36Sopenharmony_ci rc = -EBUSY; 12762306a36Sopenharmony_ci goto err_request_mem; 12862306a36Sopenharmony_ci } 12962306a36Sopenharmony_ci data->res[i] = res; 13062306a36Sopenharmony_ci 13162306a36Sopenharmony_ci /* 13262306a36Sopenharmony_ci * Set flags appropriate for System RAM. Leave ..._BUSY clear 13362306a36Sopenharmony_ci * so that add_memory() can add a child resource. Do not 13462306a36Sopenharmony_ci * inherit flags from the parent since it may set new flags 13562306a36Sopenharmony_ci * unknown to us that will break add_memory() below. 13662306a36Sopenharmony_ci */ 13762306a36Sopenharmony_ci res->flags = IORESOURCE_SYSTEM_RAM; 13862306a36Sopenharmony_ci 13962306a36Sopenharmony_ci /* 14062306a36Sopenharmony_ci * Ensure that future kexec'd kernels will not treat 14162306a36Sopenharmony_ci * this as RAM automatically. 14262306a36Sopenharmony_ci */ 14362306a36Sopenharmony_ci rc = add_memory_driver_managed(data->mgid, range.start, 14462306a36Sopenharmony_ci range_len(&range), kmem_name, MHP_NID_IS_MGID); 14562306a36Sopenharmony_ci 14662306a36Sopenharmony_ci if (rc) { 14762306a36Sopenharmony_ci dev_warn(dev, "mapping%d: %#llx-%#llx memory add failed\n", 14862306a36Sopenharmony_ci i, range.start, range.end); 14962306a36Sopenharmony_ci remove_resource(res); 15062306a36Sopenharmony_ci kfree(res); 15162306a36Sopenharmony_ci data->res[i] = NULL; 15262306a36Sopenharmony_ci if (mapped) 15362306a36Sopenharmony_ci continue; 15462306a36Sopenharmony_ci goto err_request_mem; 15562306a36Sopenharmony_ci } 15662306a36Sopenharmony_ci mapped++; 15762306a36Sopenharmony_ci } 15862306a36Sopenharmony_ci 15962306a36Sopenharmony_ci dev_set_drvdata(dev, data); 16062306a36Sopenharmony_ci 16162306a36Sopenharmony_ci return 0; 16262306a36Sopenharmony_ci 16362306a36Sopenharmony_cierr_request_mem: 16462306a36Sopenharmony_ci memory_group_unregister(data->mgid); 16562306a36Sopenharmony_cierr_reg_mgid: 16662306a36Sopenharmony_ci kfree(data->res_name); 16762306a36Sopenharmony_cierr_res_name: 16862306a36Sopenharmony_ci kfree(data); 16962306a36Sopenharmony_cierr_dax_kmem_data: 17062306a36Sopenharmony_ci clear_node_memory_type(numa_node, dax_slowmem_type); 17162306a36Sopenharmony_ci return rc; 17262306a36Sopenharmony_ci} 17362306a36Sopenharmony_ci 17462306a36Sopenharmony_ci#ifdef CONFIG_MEMORY_HOTREMOVE 17562306a36Sopenharmony_cistatic void dev_dax_kmem_remove(struct dev_dax *dev_dax) 17662306a36Sopenharmony_ci{ 17762306a36Sopenharmony_ci int i, success = 0; 17862306a36Sopenharmony_ci int node = dev_dax->target_node; 17962306a36Sopenharmony_ci struct device *dev = &dev_dax->dev; 18062306a36Sopenharmony_ci struct dax_kmem_data *data = dev_get_drvdata(dev); 18162306a36Sopenharmony_ci 18262306a36Sopenharmony_ci /* 18362306a36Sopenharmony_ci * We have one shot for removing memory, if some memory blocks were not 18462306a36Sopenharmony_ci * offline prior to calling this function remove_memory() will fail, and 18562306a36Sopenharmony_ci * there is no way to hotremove this memory until reboot because device 18662306a36Sopenharmony_ci * unbind will succeed even if we return failure. 18762306a36Sopenharmony_ci */ 18862306a36Sopenharmony_ci for (i = 0; i < dev_dax->nr_range; i++) { 18962306a36Sopenharmony_ci struct range range; 19062306a36Sopenharmony_ci int rc; 19162306a36Sopenharmony_ci 19262306a36Sopenharmony_ci rc = dax_kmem_range(dev_dax, i, &range); 19362306a36Sopenharmony_ci if (rc) 19462306a36Sopenharmony_ci continue; 19562306a36Sopenharmony_ci 19662306a36Sopenharmony_ci rc = remove_memory(range.start, range_len(&range)); 19762306a36Sopenharmony_ci if (rc == 0) { 19862306a36Sopenharmony_ci remove_resource(data->res[i]); 19962306a36Sopenharmony_ci kfree(data->res[i]); 20062306a36Sopenharmony_ci data->res[i] = NULL; 20162306a36Sopenharmony_ci success++; 20262306a36Sopenharmony_ci continue; 20362306a36Sopenharmony_ci } 20462306a36Sopenharmony_ci any_hotremove_failed = true; 20562306a36Sopenharmony_ci dev_err(dev, 20662306a36Sopenharmony_ci "mapping%d: %#llx-%#llx cannot be hotremoved until the next reboot\n", 20762306a36Sopenharmony_ci i, range.start, range.end); 20862306a36Sopenharmony_ci } 20962306a36Sopenharmony_ci 21062306a36Sopenharmony_ci if (success >= dev_dax->nr_range) { 21162306a36Sopenharmony_ci memory_group_unregister(data->mgid); 21262306a36Sopenharmony_ci kfree(data->res_name); 21362306a36Sopenharmony_ci kfree(data); 21462306a36Sopenharmony_ci dev_set_drvdata(dev, NULL); 21562306a36Sopenharmony_ci /* 21662306a36Sopenharmony_ci * Clear the memtype association on successful unplug. 21762306a36Sopenharmony_ci * If not, we have memory blocks left which can be 21862306a36Sopenharmony_ci * offlined/onlined later. We need to keep memory_dev_type 21962306a36Sopenharmony_ci * for that. This implies this reference will be around 22062306a36Sopenharmony_ci * till next reboot. 22162306a36Sopenharmony_ci */ 22262306a36Sopenharmony_ci clear_node_memory_type(node, dax_slowmem_type); 22362306a36Sopenharmony_ci } 22462306a36Sopenharmony_ci} 22562306a36Sopenharmony_ci#else 22662306a36Sopenharmony_cistatic void dev_dax_kmem_remove(struct dev_dax *dev_dax) 22762306a36Sopenharmony_ci{ 22862306a36Sopenharmony_ci /* 22962306a36Sopenharmony_ci * Without hotremove purposely leak the request_mem_region() for the 23062306a36Sopenharmony_ci * device-dax range and return '0' to ->remove() attempts. The removal 23162306a36Sopenharmony_ci * of the device from the driver always succeeds, but the region is 23262306a36Sopenharmony_ci * permanently pinned as reserved by the unreleased 23362306a36Sopenharmony_ci * request_mem_region(). 23462306a36Sopenharmony_ci */ 23562306a36Sopenharmony_ci any_hotremove_failed = true; 23662306a36Sopenharmony_ci} 23762306a36Sopenharmony_ci#endif /* CONFIG_MEMORY_HOTREMOVE */ 23862306a36Sopenharmony_ci 23962306a36Sopenharmony_cistatic struct dax_device_driver device_dax_kmem_driver = { 24062306a36Sopenharmony_ci .probe = dev_dax_kmem_probe, 24162306a36Sopenharmony_ci .remove = dev_dax_kmem_remove, 24262306a36Sopenharmony_ci .type = DAXDRV_KMEM_TYPE, 24362306a36Sopenharmony_ci}; 24462306a36Sopenharmony_ci 24562306a36Sopenharmony_cistatic int __init dax_kmem_init(void) 24662306a36Sopenharmony_ci{ 24762306a36Sopenharmony_ci int rc; 24862306a36Sopenharmony_ci 24962306a36Sopenharmony_ci /* Resource name is permanently allocated if any hotremove fails. */ 25062306a36Sopenharmony_ci kmem_name = kstrdup_const("System RAM (kmem)", GFP_KERNEL); 25162306a36Sopenharmony_ci if (!kmem_name) 25262306a36Sopenharmony_ci return -ENOMEM; 25362306a36Sopenharmony_ci 25462306a36Sopenharmony_ci dax_slowmem_type = alloc_memory_type(MEMTIER_DEFAULT_DAX_ADISTANCE); 25562306a36Sopenharmony_ci if (IS_ERR(dax_slowmem_type)) { 25662306a36Sopenharmony_ci rc = PTR_ERR(dax_slowmem_type); 25762306a36Sopenharmony_ci goto err_dax_slowmem_type; 25862306a36Sopenharmony_ci } 25962306a36Sopenharmony_ci 26062306a36Sopenharmony_ci rc = dax_driver_register(&device_dax_kmem_driver); 26162306a36Sopenharmony_ci if (rc) 26262306a36Sopenharmony_ci goto error_dax_driver; 26362306a36Sopenharmony_ci 26462306a36Sopenharmony_ci return rc; 26562306a36Sopenharmony_ci 26662306a36Sopenharmony_cierror_dax_driver: 26762306a36Sopenharmony_ci put_memory_type(dax_slowmem_type); 26862306a36Sopenharmony_cierr_dax_slowmem_type: 26962306a36Sopenharmony_ci kfree_const(kmem_name); 27062306a36Sopenharmony_ci return rc; 27162306a36Sopenharmony_ci} 27262306a36Sopenharmony_ci 27362306a36Sopenharmony_cistatic void __exit dax_kmem_exit(void) 27462306a36Sopenharmony_ci{ 27562306a36Sopenharmony_ci dax_driver_unregister(&device_dax_kmem_driver); 27662306a36Sopenharmony_ci if (!any_hotremove_failed) 27762306a36Sopenharmony_ci kfree_const(kmem_name); 27862306a36Sopenharmony_ci put_memory_type(dax_slowmem_type); 27962306a36Sopenharmony_ci} 28062306a36Sopenharmony_ci 28162306a36Sopenharmony_ciMODULE_AUTHOR("Intel Corporation"); 28262306a36Sopenharmony_ciMODULE_LICENSE("GPL v2"); 28362306a36Sopenharmony_cimodule_init(dax_kmem_init); 28462306a36Sopenharmony_cimodule_exit(dax_kmem_exit); 28562306a36Sopenharmony_ciMODULE_ALIAS_DAX_DEVICE(0); 286