xref: /kernel/linux/linux-6.6/drivers/dax/kmem.c (revision 62306a36)
162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0
262306a36Sopenharmony_ci/* Copyright(c) 2016-2019 Intel Corporation. All rights reserved. */
362306a36Sopenharmony_ci#include <linux/memremap.h>
462306a36Sopenharmony_ci#include <linux/pagemap.h>
562306a36Sopenharmony_ci#include <linux/memory.h>
662306a36Sopenharmony_ci#include <linux/module.h>
762306a36Sopenharmony_ci#include <linux/device.h>
862306a36Sopenharmony_ci#include <linux/pfn_t.h>
962306a36Sopenharmony_ci#include <linux/slab.h>
1062306a36Sopenharmony_ci#include <linux/dax.h>
1162306a36Sopenharmony_ci#include <linux/fs.h>
1262306a36Sopenharmony_ci#include <linux/mm.h>
1362306a36Sopenharmony_ci#include <linux/mman.h>
1462306a36Sopenharmony_ci#include <linux/memory-tiers.h>
1562306a36Sopenharmony_ci#include "dax-private.h"
1662306a36Sopenharmony_ci#include "bus.h"
1762306a36Sopenharmony_ci
1862306a36Sopenharmony_ci/*
1962306a36Sopenharmony_ci * Default abstract distance assigned to the NUMA node onlined
2062306a36Sopenharmony_ci * by DAX/kmem if the low level platform driver didn't initialize
2162306a36Sopenharmony_ci * one for this NUMA node.
2262306a36Sopenharmony_ci */
2362306a36Sopenharmony_ci#define MEMTIER_DEFAULT_DAX_ADISTANCE	(MEMTIER_ADISTANCE_DRAM * 5)
2462306a36Sopenharmony_ci
2562306a36Sopenharmony_ci/* Memory resource name used for add_memory_driver_managed(). */
2662306a36Sopenharmony_cistatic const char *kmem_name;
2762306a36Sopenharmony_ci/* Set if any memory will remain added when the driver will be unloaded. */
2862306a36Sopenharmony_cistatic bool any_hotremove_failed;
2962306a36Sopenharmony_ci
3062306a36Sopenharmony_cistatic int dax_kmem_range(struct dev_dax *dev_dax, int i, struct range *r)
3162306a36Sopenharmony_ci{
3262306a36Sopenharmony_ci	struct dev_dax_range *dax_range = &dev_dax->ranges[i];
3362306a36Sopenharmony_ci	struct range *range = &dax_range->range;
3462306a36Sopenharmony_ci
3562306a36Sopenharmony_ci	/* memory-block align the hotplug range */
3662306a36Sopenharmony_ci	r->start = ALIGN(range->start, memory_block_size_bytes());
3762306a36Sopenharmony_ci	r->end = ALIGN_DOWN(range->end + 1, memory_block_size_bytes()) - 1;
3862306a36Sopenharmony_ci	if (r->start >= r->end) {
3962306a36Sopenharmony_ci		r->start = range->start;
4062306a36Sopenharmony_ci		r->end = range->end;
4162306a36Sopenharmony_ci		return -ENOSPC;
4262306a36Sopenharmony_ci	}
4362306a36Sopenharmony_ci	return 0;
4462306a36Sopenharmony_ci}
4562306a36Sopenharmony_ci
/*
 * Per-device state allocated by probe and stored as the device's
 * driver data.
 */
struct dax_kmem_data {
	/* kstrdup()'d device name, used as the name of each reserved region */
	const char *res_name;
	/* group id returned by memory_group_register_static() */
	int mgid;
	/*
	 * One slot per dev_dax range, indexed like dev_dax->ranges[].
	 * A slot is NULL when no region is held for that range.
	 */
	struct resource *res[];
};
5162306a36Sopenharmony_ci
5262306a36Sopenharmony_cistatic struct memory_dev_type *dax_slowmem_type;
5362306a36Sopenharmony_cistatic int dev_dax_kmem_probe(struct dev_dax *dev_dax)
5462306a36Sopenharmony_ci{
5562306a36Sopenharmony_ci	struct device *dev = &dev_dax->dev;
5662306a36Sopenharmony_ci	unsigned long total_len = 0;
5762306a36Sopenharmony_ci	struct dax_kmem_data *data;
5862306a36Sopenharmony_ci	int i, rc, mapped = 0;
5962306a36Sopenharmony_ci	int numa_node;
6062306a36Sopenharmony_ci
6162306a36Sopenharmony_ci	/*
6262306a36Sopenharmony_ci	 * Ensure good NUMA information for the persistent memory.
6362306a36Sopenharmony_ci	 * Without this check, there is a risk that slow memory
6462306a36Sopenharmony_ci	 * could be mixed in a node with faster memory, causing
6562306a36Sopenharmony_ci	 * unavoidable performance issues.
6662306a36Sopenharmony_ci	 */
6762306a36Sopenharmony_ci	numa_node = dev_dax->target_node;
6862306a36Sopenharmony_ci	if (numa_node < 0) {
6962306a36Sopenharmony_ci		dev_warn(dev, "rejecting DAX region with invalid node: %d\n",
7062306a36Sopenharmony_ci				numa_node);
7162306a36Sopenharmony_ci		return -EINVAL;
7262306a36Sopenharmony_ci	}
7362306a36Sopenharmony_ci
7462306a36Sopenharmony_ci	for (i = 0; i < dev_dax->nr_range; i++) {
7562306a36Sopenharmony_ci		struct range range;
7662306a36Sopenharmony_ci
7762306a36Sopenharmony_ci		rc = dax_kmem_range(dev_dax, i, &range);
7862306a36Sopenharmony_ci		if (rc) {
7962306a36Sopenharmony_ci			dev_info(dev, "mapping%d: %#llx-%#llx too small after alignment\n",
8062306a36Sopenharmony_ci					i, range.start, range.end);
8162306a36Sopenharmony_ci			continue;
8262306a36Sopenharmony_ci		}
8362306a36Sopenharmony_ci		total_len += range_len(&range);
8462306a36Sopenharmony_ci	}
8562306a36Sopenharmony_ci
8662306a36Sopenharmony_ci	if (!total_len) {
8762306a36Sopenharmony_ci		dev_warn(dev, "rejecting DAX region without any memory after alignment\n");
8862306a36Sopenharmony_ci		return -EINVAL;
8962306a36Sopenharmony_ci	}
9062306a36Sopenharmony_ci
9162306a36Sopenharmony_ci	init_node_memory_type(numa_node, dax_slowmem_type);
9262306a36Sopenharmony_ci
9362306a36Sopenharmony_ci	rc = -ENOMEM;
9462306a36Sopenharmony_ci	data = kzalloc(struct_size(data, res, dev_dax->nr_range), GFP_KERNEL);
9562306a36Sopenharmony_ci	if (!data)
9662306a36Sopenharmony_ci		goto err_dax_kmem_data;
9762306a36Sopenharmony_ci
9862306a36Sopenharmony_ci	data->res_name = kstrdup(dev_name(dev), GFP_KERNEL);
9962306a36Sopenharmony_ci	if (!data->res_name)
10062306a36Sopenharmony_ci		goto err_res_name;
10162306a36Sopenharmony_ci
10262306a36Sopenharmony_ci	rc = memory_group_register_static(numa_node, PFN_UP(total_len));
10362306a36Sopenharmony_ci	if (rc < 0)
10462306a36Sopenharmony_ci		goto err_reg_mgid;
10562306a36Sopenharmony_ci	data->mgid = rc;
10662306a36Sopenharmony_ci
10762306a36Sopenharmony_ci	for (i = 0; i < dev_dax->nr_range; i++) {
10862306a36Sopenharmony_ci		struct resource *res;
10962306a36Sopenharmony_ci		struct range range;
11062306a36Sopenharmony_ci
11162306a36Sopenharmony_ci		rc = dax_kmem_range(dev_dax, i, &range);
11262306a36Sopenharmony_ci		if (rc)
11362306a36Sopenharmony_ci			continue;
11462306a36Sopenharmony_ci
11562306a36Sopenharmony_ci		/* Region is permanently reserved if hotremove fails. */
11662306a36Sopenharmony_ci		res = request_mem_region(range.start, range_len(&range), data->res_name);
11762306a36Sopenharmony_ci		if (!res) {
11862306a36Sopenharmony_ci			dev_warn(dev, "mapping%d: %#llx-%#llx could not reserve region\n",
11962306a36Sopenharmony_ci					i, range.start, range.end);
12062306a36Sopenharmony_ci			/*
12162306a36Sopenharmony_ci			 * Once some memory has been onlined we can't
12262306a36Sopenharmony_ci			 * assume that it can be un-onlined safely.
12362306a36Sopenharmony_ci			 */
12462306a36Sopenharmony_ci			if (mapped)
12562306a36Sopenharmony_ci				continue;
12662306a36Sopenharmony_ci			rc = -EBUSY;
12762306a36Sopenharmony_ci			goto err_request_mem;
12862306a36Sopenharmony_ci		}
12962306a36Sopenharmony_ci		data->res[i] = res;
13062306a36Sopenharmony_ci
13162306a36Sopenharmony_ci		/*
13262306a36Sopenharmony_ci		 * Set flags appropriate for System RAM.  Leave ..._BUSY clear
13362306a36Sopenharmony_ci		 * so that add_memory() can add a child resource.  Do not
13462306a36Sopenharmony_ci		 * inherit flags from the parent since it may set new flags
13562306a36Sopenharmony_ci		 * unknown to us that will break add_memory() below.
13662306a36Sopenharmony_ci		 */
13762306a36Sopenharmony_ci		res->flags = IORESOURCE_SYSTEM_RAM;
13862306a36Sopenharmony_ci
13962306a36Sopenharmony_ci		/*
14062306a36Sopenharmony_ci		 * Ensure that future kexec'd kernels will not treat
14162306a36Sopenharmony_ci		 * this as RAM automatically.
14262306a36Sopenharmony_ci		 */
14362306a36Sopenharmony_ci		rc = add_memory_driver_managed(data->mgid, range.start,
14462306a36Sopenharmony_ci				range_len(&range), kmem_name, MHP_NID_IS_MGID);
14562306a36Sopenharmony_ci
14662306a36Sopenharmony_ci		if (rc) {
14762306a36Sopenharmony_ci			dev_warn(dev, "mapping%d: %#llx-%#llx memory add failed\n",
14862306a36Sopenharmony_ci					i, range.start, range.end);
14962306a36Sopenharmony_ci			remove_resource(res);
15062306a36Sopenharmony_ci			kfree(res);
15162306a36Sopenharmony_ci			data->res[i] = NULL;
15262306a36Sopenharmony_ci			if (mapped)
15362306a36Sopenharmony_ci				continue;
15462306a36Sopenharmony_ci			goto err_request_mem;
15562306a36Sopenharmony_ci		}
15662306a36Sopenharmony_ci		mapped++;
15762306a36Sopenharmony_ci	}
15862306a36Sopenharmony_ci
15962306a36Sopenharmony_ci	dev_set_drvdata(dev, data);
16062306a36Sopenharmony_ci
16162306a36Sopenharmony_ci	return 0;
16262306a36Sopenharmony_ci
16362306a36Sopenharmony_cierr_request_mem:
16462306a36Sopenharmony_ci	memory_group_unregister(data->mgid);
16562306a36Sopenharmony_cierr_reg_mgid:
16662306a36Sopenharmony_ci	kfree(data->res_name);
16762306a36Sopenharmony_cierr_res_name:
16862306a36Sopenharmony_ci	kfree(data);
16962306a36Sopenharmony_cierr_dax_kmem_data:
17062306a36Sopenharmony_ci	clear_node_memory_type(numa_node, dax_slowmem_type);
17162306a36Sopenharmony_ci	return rc;
17262306a36Sopenharmony_ci}
17362306a36Sopenharmony_ci
17462306a36Sopenharmony_ci#ifdef CONFIG_MEMORY_HOTREMOVE
17562306a36Sopenharmony_cistatic void dev_dax_kmem_remove(struct dev_dax *dev_dax)
17662306a36Sopenharmony_ci{
17762306a36Sopenharmony_ci	int i, success = 0;
17862306a36Sopenharmony_ci	int node = dev_dax->target_node;
17962306a36Sopenharmony_ci	struct device *dev = &dev_dax->dev;
18062306a36Sopenharmony_ci	struct dax_kmem_data *data = dev_get_drvdata(dev);
18162306a36Sopenharmony_ci
18262306a36Sopenharmony_ci	/*
18362306a36Sopenharmony_ci	 * We have one shot for removing memory, if some memory blocks were not
18462306a36Sopenharmony_ci	 * offline prior to calling this function remove_memory() will fail, and
18562306a36Sopenharmony_ci	 * there is no way to hotremove this memory until reboot because device
18662306a36Sopenharmony_ci	 * unbind will succeed even if we return failure.
18762306a36Sopenharmony_ci	 */
18862306a36Sopenharmony_ci	for (i = 0; i < dev_dax->nr_range; i++) {
18962306a36Sopenharmony_ci		struct range range;
19062306a36Sopenharmony_ci		int rc;
19162306a36Sopenharmony_ci
19262306a36Sopenharmony_ci		rc = dax_kmem_range(dev_dax, i, &range);
19362306a36Sopenharmony_ci		if (rc)
19462306a36Sopenharmony_ci			continue;
19562306a36Sopenharmony_ci
19662306a36Sopenharmony_ci		rc = remove_memory(range.start, range_len(&range));
19762306a36Sopenharmony_ci		if (rc == 0) {
19862306a36Sopenharmony_ci			remove_resource(data->res[i]);
19962306a36Sopenharmony_ci			kfree(data->res[i]);
20062306a36Sopenharmony_ci			data->res[i] = NULL;
20162306a36Sopenharmony_ci			success++;
20262306a36Sopenharmony_ci			continue;
20362306a36Sopenharmony_ci		}
20462306a36Sopenharmony_ci		any_hotremove_failed = true;
20562306a36Sopenharmony_ci		dev_err(dev,
20662306a36Sopenharmony_ci			"mapping%d: %#llx-%#llx cannot be hotremoved until the next reboot\n",
20762306a36Sopenharmony_ci				i, range.start, range.end);
20862306a36Sopenharmony_ci	}
20962306a36Sopenharmony_ci
21062306a36Sopenharmony_ci	if (success >= dev_dax->nr_range) {
21162306a36Sopenharmony_ci		memory_group_unregister(data->mgid);
21262306a36Sopenharmony_ci		kfree(data->res_name);
21362306a36Sopenharmony_ci		kfree(data);
21462306a36Sopenharmony_ci		dev_set_drvdata(dev, NULL);
21562306a36Sopenharmony_ci		/*
21662306a36Sopenharmony_ci		 * Clear the memtype association on successful unplug.
21762306a36Sopenharmony_ci		 * If not, we have memory blocks left which can be
21862306a36Sopenharmony_ci		 * offlined/onlined later. We need to keep memory_dev_type
21962306a36Sopenharmony_ci		 * for that. This implies this reference will be around
22062306a36Sopenharmony_ci		 * till next reboot.
22162306a36Sopenharmony_ci		 */
22262306a36Sopenharmony_ci		clear_node_memory_type(node, dax_slowmem_type);
22362306a36Sopenharmony_ci	}
22462306a36Sopenharmony_ci}
22562306a36Sopenharmony_ci#else
22662306a36Sopenharmony_cistatic void dev_dax_kmem_remove(struct dev_dax *dev_dax)
22762306a36Sopenharmony_ci{
22862306a36Sopenharmony_ci	/*
22962306a36Sopenharmony_ci	 * Without hotremove purposely leak the request_mem_region() for the
23062306a36Sopenharmony_ci	 * device-dax range and return '0' to ->remove() attempts. The removal
23162306a36Sopenharmony_ci	 * of the device from the driver always succeeds, but the region is
23262306a36Sopenharmony_ci	 * permanently pinned as reserved by the unreleased
23362306a36Sopenharmony_ci	 * request_mem_region().
23462306a36Sopenharmony_ci	 */
23562306a36Sopenharmony_ci	any_hotremove_failed = true;
23662306a36Sopenharmony_ci}
23762306a36Sopenharmony_ci#endif /* CONFIG_MEMORY_HOTREMOVE */
23862306a36Sopenharmony_ci
23962306a36Sopenharmony_cistatic struct dax_device_driver device_dax_kmem_driver = {
24062306a36Sopenharmony_ci	.probe = dev_dax_kmem_probe,
24162306a36Sopenharmony_ci	.remove = dev_dax_kmem_remove,
24262306a36Sopenharmony_ci	.type = DAXDRV_KMEM_TYPE,
24362306a36Sopenharmony_ci};
24462306a36Sopenharmony_ci
24562306a36Sopenharmony_cistatic int __init dax_kmem_init(void)
24662306a36Sopenharmony_ci{
24762306a36Sopenharmony_ci	int rc;
24862306a36Sopenharmony_ci
24962306a36Sopenharmony_ci	/* Resource name is permanently allocated if any hotremove fails. */
25062306a36Sopenharmony_ci	kmem_name = kstrdup_const("System RAM (kmem)", GFP_KERNEL);
25162306a36Sopenharmony_ci	if (!kmem_name)
25262306a36Sopenharmony_ci		return -ENOMEM;
25362306a36Sopenharmony_ci
25462306a36Sopenharmony_ci	dax_slowmem_type = alloc_memory_type(MEMTIER_DEFAULT_DAX_ADISTANCE);
25562306a36Sopenharmony_ci	if (IS_ERR(dax_slowmem_type)) {
25662306a36Sopenharmony_ci		rc = PTR_ERR(dax_slowmem_type);
25762306a36Sopenharmony_ci		goto err_dax_slowmem_type;
25862306a36Sopenharmony_ci	}
25962306a36Sopenharmony_ci
26062306a36Sopenharmony_ci	rc = dax_driver_register(&device_dax_kmem_driver);
26162306a36Sopenharmony_ci	if (rc)
26262306a36Sopenharmony_ci		goto error_dax_driver;
26362306a36Sopenharmony_ci
26462306a36Sopenharmony_ci	return rc;
26562306a36Sopenharmony_ci
26662306a36Sopenharmony_cierror_dax_driver:
26762306a36Sopenharmony_ci	put_memory_type(dax_slowmem_type);
26862306a36Sopenharmony_cierr_dax_slowmem_type:
26962306a36Sopenharmony_ci	kfree_const(kmem_name);
27062306a36Sopenharmony_ci	return rc;
27162306a36Sopenharmony_ci}
27262306a36Sopenharmony_ci
27362306a36Sopenharmony_cistatic void __exit dax_kmem_exit(void)
27462306a36Sopenharmony_ci{
27562306a36Sopenharmony_ci	dax_driver_unregister(&device_dax_kmem_driver);
27662306a36Sopenharmony_ci	if (!any_hotremove_failed)
27762306a36Sopenharmony_ci		kfree_const(kmem_name);
27862306a36Sopenharmony_ci	put_memory_type(dax_slowmem_type);
27962306a36Sopenharmony_ci}
28062306a36Sopenharmony_ci
28162306a36Sopenharmony_ciMODULE_AUTHOR("Intel Corporation");
28262306a36Sopenharmony_ciMODULE_LICENSE("GPL v2");
28362306a36Sopenharmony_cimodule_init(dax_kmem_init);
28462306a36Sopenharmony_cimodule_exit(dax_kmem_exit);
28562306a36Sopenharmony_ciMODULE_ALIAS_DAX_DEVICE(0);
286