linux/drivers/cxl/pmem.c

// SPDX-License-Identifier: GPL-2.0-only
/* Copyright(c) 2021 Intel Corporation. All rights reserved. */
#include <linux/libnvdimm.h>
#include <asm/unaligned.h>
#include <linux/device.h>
#include <linux/module.h>
#include <linux/ndctl.h>
#include <linux/async.h>
#include <linux/slab.h>
#include <linux/nd.h>
#include "cxlmem.h"
#include "cxl.h"

/*
 * Ordered workqueue for cxl nvdimm device arrival and departure
 * to coordinate bus rescans when a bridge arrives and trigger remove
 * operations when the bridge is removed.
 */
static struct workqueue_struct *cxl_pmem_wq;

static __read_mostly DECLARE_BITMAP(exclusive_cmds, CXL_MEM_COMMAND_ID_MAX);

static void clear_exclusive(void *cxlds)
{
	clear_exclusive_cxl_commands(cxlds, exclusive_cmds);
}

static void unregister_nvdimm(void *nvdimm)
{
	struct cxl_nvdimm *cxl_nvd = nvdimm_provider_data(nvdimm);
	struct cxl_nvdimm_bridge *cxl_nvb = cxl_nvd->bridge;
	struct cxl_pmem_region *cxlr_pmem;
	unsigned long index;

	device_lock(&cxl_nvb->dev);
	dev_set_drvdata(&cxl_nvd->dev, NULL);
	xa_for_each(&cxl_nvd->pmem_regions, index, cxlr_pmem) {
		get_device(&cxlr_pmem->dev);
		device_unlock(&cxl_nvb->dev);

		device_release_driver(&cxlr_pmem->dev);
		put_device(&cxlr_pmem->dev);

		device_lock(&cxl_nvb->dev);
	}
	device_unlock(&cxl_nvb->dev);

	nvdimm_delete(nvdimm);
	cxl_nvd->bridge = NULL;
}

static int cxl_nvdimm_probe(struct device *dev)
{
	struct cxl_nvdimm *cxl_nvd = to_cxl_nvdimm(dev);
	struct cxl_memdev *cxlmd = cxl_nvd->cxlmd;
	unsigned long flags = 0, cmd_mask = 0;
	struct cxl_dev_state *cxlds = cxlmd->cxlds;
	struct cxl_nvdimm_bridge *cxl_nvb;
	struct nvdimm *nvdimm;
	int rc;

	cxl_nvb = cxl_find_nvdimm_bridge(dev);
	if (!cxl_nvb)
		return -ENXIO;

	device_lock(&cxl_nvb->dev);
	if (!cxl_nvb->nvdimm_bus) {
		rc = -ENXIO;
		goto out;
	}

	set_exclusive_cxl_commands(cxlds, exclusive_cmds);
	rc = devm_add_action_or_reset(dev, clear_exclusive, cxlds);
	if (rc)
		goto out;

	set_bit(NDD_LABELING, &flags);
	set_bit(ND_CMD_GET_CONFIG_SIZE, &cmd_mask);
	set_bit(ND_CMD_GET_CONFIG_DATA, &cmd_mask);
	set_bit(ND_CMD_SET_CONFIG_DATA, &cmd_mask);
	nvdimm = nvdimm_create(cxl_nvb->nvdimm_bus, cxl_nvd, NULL, flags,
			       cmd_mask, 0, NULL);
	if (!nvdimm) {
		rc = -ENOMEM;
		goto out;
	}

	dev_set_drvdata(dev, nvdimm);
	cxl_nvd->bridge = cxl_nvb;
	rc = devm_add_action_or_reset(dev, unregister_nvdimm, nvdimm);
out:
	device_unlock(&cxl_nvb->dev);
	put_device(&cxl_nvb->dev);

	return rc;
}

static struct cxl_driver cxl_nvdimm_driver = {
	.name = "cxl_nvdimm",
	.probe = cxl_nvdimm_probe,
	.id = CXL_DEVICE_NVDIMM,
};

static int cxl_pmem_get_config_size(struct cxl_dev_state *cxlds,
				    struct nd_cmd_get_config_size *cmd,
				    unsigned int buf_len)
{
	if (sizeof(*cmd) > buf_len)
		return -EINVAL;

	*cmd = (struct nd_cmd_get_config_size) {
		 .config_size = cxlds->lsa_size,
		 .max_xfer = cxlds->payload_size - sizeof(struct cxl_mbox_set_lsa),
	};

	return 0;
}

static int cxl_pmem_get_config_data(struct cxl_dev_state *cxlds,
				    struct nd_cmd_get_config_data_hdr *cmd,
				    unsigned int buf_len)
{
	struct cxl_mbox_get_lsa get_lsa;
	int rc;

	if (sizeof(*cmd) > buf_len)
		return -EINVAL;
	if (struct_size(cmd, out_buf, cmd->in_length) > buf_len)
		return -EINVAL;

	get_lsa = (struct cxl_mbox_get_lsa) {
		.offset = cpu_to_le32(cmd->in_offset),
		.length = cpu_to_le32(cmd->in_length),
	};

	rc = cxl_mbox_send_cmd(cxlds, CXL_MBOX_OP_GET_LSA, &get_lsa,
			       sizeof(get_lsa), cmd->out_buf, cmd->in_length);
	cmd->status = 0;

	return rc;
}

static int cxl_pmem_set_config_data(struct cxl_dev_state *cxlds,
				    struct nd_cmd_set_config_hdr *cmd,
				    unsigned int buf_len)
{
	struct cxl_mbox_set_lsa *set_lsa;
	int rc;

	if (sizeof(*cmd) > buf_len)
		return -EINVAL;

	/* 4-byte status follows the input data in the payload */
	if (size_add(struct_size(cmd, in_buf, cmd->in_length), 4) > buf_len)
		return -EINVAL;

	set_lsa =
		kvzalloc(struct_size(set_lsa, data, cmd->in_length), GFP_KERNEL);
	if (!set_lsa)
		return -ENOMEM;

	*set_lsa = (struct cxl_mbox_set_lsa) {
		.offset = cpu_to_le32(cmd->in_offset),
	};
	memcpy(set_lsa->data, cmd->in_buf, cmd->in_length);

	rc = cxl_mbox_send_cmd(cxlds, CXL_MBOX_OP_SET_LSA, set_lsa,
			       struct_size(set_lsa, data, cmd->in_length),
			       NULL, 0);

	/*
	 * Set "firmware" status (4-packed bytes at the end of the input
	 * payload.
	 */
	put_unaligned(0, (u32 *) &cmd->in_buf[cmd->in_length]);
	kvfree(set_lsa);

	return rc;
}

static int cxl_pmem_nvdimm_ctl(struct nvdimm *nvdimm, unsigned int cmd,
			       void *buf, unsigned int buf_len)
{
	struct cxl_nvdimm *cxl_nvd = nvdimm_provider_data(nvdimm);
	unsigned long cmd_mask = nvdimm_cmd_mask(nvdimm);
	struct cxl_memdev *cxlmd = cxl_nvd->cxlmd;
	struct cxl_dev_state *cxlds = cxlmd->cxlds;

	if (!test_bit(cmd, &cmd_mask))
		return -ENOTTY;

	switch (cmd) {
	case ND_CMD_GET_CONFIG_SIZE:
		return cxl_pmem_get_config_size(cxlds, buf, buf_len);
	case ND_CMD_GET_CONFIG_DATA:
		return cxl_pmem_get_config_data(cxlds, buf, buf_len);
	case ND_CMD_SET_CONFIG_DATA:
		return cxl_pmem_set_config_data(cxlds, buf, buf_len);
	default:
		return -ENOTTY;
	}
}

static int cxl_pmem_ctl(struct nvdimm_bus_descriptor *nd_desc,
			struct nvdimm *nvdimm, unsigned int cmd, void *buf,
			unsigned int buf_len, int *cmd_rc)
{
	/*
	 * No firmware response to translate, let the transport error
	 * code take precedence.
	 */
	*cmd_rc = 0;

	if (!nvdimm)
		return -ENOTTY;
	return cxl_pmem_nvdimm_ctl(nvdimm, cmd, buf, buf_len);
}

static bool online_nvdimm_bus(struct cxl_nvdimm_bridge *cxl_nvb)
{
	if (cxl_nvb->nvdimm_bus)
		return true;
	cxl_nvb->nvdimm_bus =
		nvdimm_bus_register(&cxl_nvb->dev, &cxl_nvb->nd_desc);
	return cxl_nvb->nvdimm_bus != NULL;
}

static int cxl_nvdimm_release_driver(struct device *dev, void *cxl_nvb)
{
	struct cxl_nvdimm *cxl_nvd;

	if (!is_cxl_nvdimm(dev))
		return 0;

	cxl_nvd = to_cxl_nvdimm(dev);
	if (cxl_nvd->bridge != cxl_nvb)
		return 0;

	device_release_driver(dev);
	return 0;
}

static int cxl_pmem_region_release_driver(struct device *dev, void *cxl_nvb)
{
	struct cxl_pmem_region *cxlr_pmem;

	if (!is_cxl_pmem_region(dev))
		return 0;

	cxlr_pmem = to_cxl_pmem_region(dev);
	if (cxlr_pmem->bridge != cxl_nvb)
		return 0;

	device_release_driver(dev);
	return 0;
}

static void offline_nvdimm_bus(struct cxl_nvdimm_bridge *cxl_nvb,
			       struct nvdimm_bus *nvdimm_bus)
{
	if (!nvdimm_bus)
		return;

	/*
	 * Set the state of cxl_nvdimm devices to unbound / idle before
	 * nvdimm_bus_unregister() rips the nvdimm objects out from
	 * underneath them.
	 */
	bus_for_each_dev(&cxl_bus_type, NULL, cxl_nvb,
			 cxl_pmem_region_release_driver);
	bus_for_each_dev(&cxl_bus_type, NULL, cxl_nvb,
			 cxl_nvdimm_release_driver);
	nvdimm_bus_unregister(nvdimm_bus);
}

static void cxl_nvb_update_state(struct work_struct *work)
{
	struct cxl_nvdimm_bridge *cxl_nvb =
		container_of(work, typeof(*cxl_nvb), state_work);
	struct nvdimm_bus *victim_bus = NULL;
	bool release = false, rescan = false;

	device_lock(&cxl_nvb->dev);
	switch (cxl_nvb->state) {
	case CXL_NVB_ONLINE:
		if (!online_nvdimm_bus(cxl_nvb)) {
			dev_err(&cxl_nvb->dev,
				"failed to establish nvdimm bus\n");
			release = true;
		} else
			rescan = true;
		break;
	case CXL_NVB_OFFLINE:
	case CXL_NVB_DEAD:
		victim_bus = cxl_nvb->nvdimm_bus;
		cxl_nvb->nvdimm_bus = NULL;
		break;
	default:
		break;
	}
	device_unlock(&cxl_nvb->dev);

	if (release)
		device_release_driver(&cxl_nvb->dev);
	if (rescan) {
		int rc = bus_rescan_devices(&cxl_bus_type);

		dev_dbg(&cxl_nvb->dev, "rescan: %d\n", rc);
	}
	offline_nvdimm_bus(cxl_nvb, victim_bus);

	put_device(&cxl_nvb->dev);
}

static void cxl_nvdimm_bridge_state_work(struct cxl_nvdimm_bridge *cxl_nvb)
{
	/*
	 * Take a reference that the workqueue will drop if new work
	 * gets queued.
	 */
	get_device(&cxl_nvb->dev);
	if (!queue_work(cxl_pmem_wq, &cxl_nvb->state_work))
		put_device(&cxl_nvb->dev);
}

static void cxl_nvdimm_bridge_remove(struct device *dev)
{
	struct cxl_nvdimm_bridge *cxl_nvb = to_cxl_nvdimm_bridge(dev);

	if (cxl_nvb->state == CXL_NVB_ONLINE)
		cxl_nvb->state = CXL_NVB_OFFLINE;
	cxl_nvdimm_bridge_state_work(cxl_nvb);
}

static int cxl_nvdimm_bridge_probe(struct device *dev)
{
	struct cxl_nvdimm_bridge *cxl_nvb = to_cxl_nvdimm_bridge(dev);

	if (cxl_nvb->state == CXL_NVB_DEAD)
		return -ENXIO;

	if (cxl_nvb->state == CXL_NVB_NEW) {
		cxl_nvb->nd_desc = (struct nvdimm_bus_descriptor) {
			.provider_name = "CXL",
			.module = THIS_MODULE,
			.ndctl = cxl_pmem_ctl,
		};

		INIT_WORK(&cxl_nvb->state_work, cxl_nvb_update_state);
	}

	cxl_nvb->state = CXL_NVB_ONLINE;
	cxl_nvdimm_bridge_state_work(cxl_nvb);

	return 0;
}

static struct cxl_driver cxl_nvdimm_bridge_driver = {
	.name = "cxl_nvdimm_bridge",
	.probe = cxl_nvdimm_bridge_probe,
	.remove = cxl_nvdimm_bridge_remove,
	.id = CXL_DEVICE_NVDIMM_BRIDGE,
};

static int match_cxl_nvdimm(struct device *dev, void *data)
{
	return is_cxl_nvdimm(dev);
}

static void unregister_nvdimm_region(void *nd_region)
{
	nvdimm_region_delete(nd_region);
}

static int cxl_nvdimm_add_region(struct cxl_nvdimm *cxl_nvd,
				 struct cxl_pmem_region *cxlr_pmem)
{
	int rc;

	rc = xa_insert(&cxl_nvd->pmem_regions, (unsigned long)cxlr_pmem,
		       cxlr_pmem, GFP_KERNEL);
	if (rc)
		return rc;

	get_device(&cxlr_pmem->dev);
	return 0;
}

static void cxl_nvdimm_del_region(struct cxl_nvdimm *cxl_nvd,
				  struct cxl_pmem_region *cxlr_pmem)
{
	/*
	 * It is possible this is called without a corresponding
	 * cxl_nvdimm_add_region for @cxlr_pmem
	 */
	cxlr_pmem = xa_erase(&cxl_nvd->pmem_regions, (unsigned long)cxlr_pmem);
	if (cxlr_pmem)
		put_device(&cxlr_pmem->dev);
}

static void release_mappings(void *data)
{
	int i;
	struct cxl_pmem_region *cxlr_pmem = data;
	struct cxl_nvdimm_bridge *cxl_nvb = cxlr_pmem->bridge;

	device_lock(&cxl_nvb->dev);
	for (i = 0; i < cxlr_pmem->nr_mappings; i++) {
		struct cxl_pmem_region_mapping *m = &cxlr_pmem->mapping[i];
		struct cxl_nvdimm *cxl_nvd = m->cxl_nvd;

		cxl_nvdimm_del_region(cxl_nvd, cxlr_pmem);
	}
	device_unlock(&cxl_nvb->dev);
}

static void cxlr_pmem_remove_resource(void *res)
{
	remove_resource(res);
}

struct cxl_pmem_region_info {
	u64 offset;
	u64 serial;
};

static int cxl_pmem_region_probe(struct device *dev)
{
	struct nd_mapping_desc mappings[CXL_DECODER_MAX_INTERLEAVE];
	struct cxl_pmem_region *cxlr_pmem = to_cxl_pmem_region(dev);
	struct cxl_region *cxlr = cxlr_pmem->cxlr;
	struct cxl_pmem_region_info *info = NULL;
	struct cxl_nvdimm_bridge *cxl_nvb;
	struct nd_interleave_set *nd_set;
	struct nd_region_desc ndr_desc;
	struct cxl_nvdimm *cxl_nvd;
	struct nvdimm *nvdimm;
	struct resource *res;
	int rc, i = 0;

	cxl_nvb = cxl_find_nvdimm_bridge(&cxlr_pmem->mapping[0].cxlmd->dev);
	if (!cxl_nvb) {
		dev_dbg(dev, "bridge not found\n");
		return -ENXIO;
	}
	cxlr_pmem->bridge = cxl_nvb;

	device_lock(&cxl_nvb->dev);
	if (!cxl_nvb->nvdimm_bus) {
		dev_dbg(dev, "nvdimm bus not found\n");
		rc = -ENXIO;
		goto out_nvb;
	}

	memset(&mappings, 0, sizeof(mappings));
	memset(&ndr_desc, 0, sizeof(ndr_desc));

	res = devm_kzalloc(dev, sizeof(*res), GFP_KERNEL);
	if (!res) {
		rc = -ENOMEM;
		goto out_nvb;
	}

	res->name = "Persistent Memory";
	res->start = cxlr_pmem->hpa_range.start;
	res->end = cxlr_pmem->hpa_range.end;
	res->flags = IORESOURCE_MEM;
	res->desc = IORES_DESC_PERSISTENT_MEMORY;

	rc = insert_resource(&iomem_resource, res);
	if (rc)
		goto out_nvb;

	rc = devm_add_action_or_reset(dev, cxlr_pmem_remove_resource, res);
	if (rc)
		goto out_nvb;

	ndr_desc.res = res;
	ndr_desc.provider_data = cxlr_pmem;

	ndr_desc.numa_node = memory_add_physaddr_to_nid(res->start);
	ndr_desc.target_node = phys_to_target_node(res->start);
	if (ndr_desc.target_node == NUMA_NO_NODE) {
		ndr_desc.target_node = ndr_desc.numa_node;
		dev_dbg(&cxlr->dev, "changing target node from %d to %d",
			NUMA_NO_NODE, ndr_desc.target_node);
	}

	nd_set = devm_kzalloc(dev, sizeof(*nd_set), GFP_KERNEL);
	if (!nd_set) {
		rc = -ENOMEM;
		goto out_nvb;
	}

	ndr_desc.memregion = cxlr->id;
	set_bit(ND_REGION_CXL, &ndr_desc.flags);
	set_bit(ND_REGION_PERSIST_MEMCTRL, &ndr_desc.flags);

	info = kmalloc_array(cxlr_pmem->nr_mappings, sizeof(*info), GFP_KERNEL);
	if (!info) {
		rc = -ENOMEM;
		goto out_nvb;
	}

	rc = devm_add_action_or_reset(dev, release_mappings, cxlr_pmem);
	if (rc)
		goto out_nvd;

	for (i = 0; i < cxlr_pmem->nr_mappings; i++) {
		struct cxl_pmem_region_mapping *m = &cxlr_pmem->mapping[i];
		struct cxl_memdev *cxlmd = m->cxlmd;
		struct cxl_dev_state *cxlds = cxlmd->cxlds;
		struct device *d;

		d = device_find_child(&cxlmd->dev, NULL, match_cxl_nvdimm);
		if (!d) {
			dev_dbg(dev, "[%d]: %s: no cxl_nvdimm found\n", i,
				dev_name(&cxlmd->dev));
			rc = -ENODEV;
			goto out_nvd;
		}

		/* safe to drop ref now with bridge lock held */
		put_device(d);

		cxl_nvd = to_cxl_nvdimm(d);
		nvdimm = dev_get_drvdata(&cxl_nvd->dev);
		if (!nvdimm) {
			dev_dbg(dev, "[%d]: %s: no nvdimm found\n", i,
				dev_name(&cxlmd->dev));
			rc = -ENODEV;
			goto out_nvd;
		}

		/*
		 * Pin the region per nvdimm device as those may be released
		 * out-of-order with respect to the region, and a single nvdimm
		 * maybe associated with multiple regions
		 */
		rc = cxl_nvdimm_add_region(cxl_nvd, cxlr_pmem);
		if (rc)
			goto out_nvd;
		m->cxl_nvd = cxl_nvd;
		mappings[i] = (struct nd_mapping_desc) {
			.nvdimm = nvdimm,
			.start = m->start,
			.size = m->size,
			.position = i,
		};
		info[i].offset = m->start;
		info[i].serial = cxlds->serial;
	}
	ndr_desc.num_mappings = cxlr_pmem->nr_mappings;
	ndr_desc.mapping = mappings;

	/*
	 * TODO enable CXL labels which skip the need for 'interleave-set cookie'
	 */
	nd_set->cookie1 =
		nd_fletcher64(info, sizeof(*info) * cxlr_pmem->nr_mappings, 0);
	nd_set->cookie2 = nd_set->cookie1;
	ndr_desc.nd_set = nd_set;

	cxlr_pmem->nd_region =
		nvdimm_pmem_region_create(cxl_nvb->nvdimm_bus, &ndr_desc);
	if (!cxlr_pmem->nd_region) {
		rc = -ENOMEM;
		goto out_nvd;
	}

	rc = devm_add_action_or_reset(dev, unregister_nvdimm_region,
				      cxlr_pmem->nd_region);
out_nvd:
	kfree(info);
out_nvb:
	device_unlock(&cxl_nvb->dev);
	put_device(&cxl_nvb->dev);

	return rc;
}

static struct cxl_driver cxl_pmem_region_driver = {
	.name = "cxl_pmem_region",
	.probe = cxl_pmem_region_probe,
	.id = CXL_DEVICE_PMEM_REGION,
};

/*
 * Return all bridges to the CXL_NVB_NEW state to invalidate any
 * ->state_work referring to the now destroyed cxl_pmem_wq.
 */
static int cxl_nvdimm_bridge_reset(struct device *dev, void *data)
{
	struct cxl_nvdimm_bridge *cxl_nvb;

	if (!is_cxl_nvdimm_bridge(dev))
		return 0;

	cxl_nvb = to_cxl_nvdimm_bridge(dev);
	device_lock(dev);
	cxl_nvb->state = CXL_NVB_NEW;
	device_unlock(dev);

	return 0;
}

static void destroy_cxl_pmem_wq(void)
{
	destroy_workqueue(cxl_pmem_wq);
	bus_for_each_dev(&cxl_bus_type, NULL, NULL, cxl_nvdimm_bridge_reset);
}

static __init int cxl_pmem_init(void)
{
	int rc;

	set_bit(CXL_MEM_COMMAND_ID_SET_SHUTDOWN_STATE, exclusive_cmds);
	set_bit(CXL_MEM_COMMAND_ID_SET_LSA, exclusive_cmds);

	cxl_pmem_wq = alloc_ordered_workqueue("cxl_pmem", 0);
	if (!cxl_pmem_wq)
		return -ENXIO;

	rc = cxl_driver_register(&cxl_nvdimm_bridge_driver);
	if (rc)
		goto err_bridge;

	rc = cxl_driver_register(&cxl_nvdimm_driver);
	if (rc)
		goto err_nvdimm;

	rc = cxl_driver_register(&cxl_pmem_region_driver);
	if (rc)
		goto err_region;

	return 0;

err_region:
	cxl_driver_unregister(&cxl_nvdimm_driver);
err_nvdimm:
	cxl_driver_unregister(&cxl_nvdimm_bridge_driver);
err_bridge:
	destroy_cxl_pmem_wq();
	return rc;
}

static __exit void cxl_pmem_exit(void)
{
	cxl_driver_unregister(&cxl_pmem_region_driver);
	cxl_driver_unregister(&cxl_nvdimm_driver);
	cxl_driver_unregister(&cxl_nvdimm_bridge_driver);
	destroy_cxl_pmem_wq();
}

MODULE_LICENSE("GPL v2");
module_init(cxl_pmem_init);
module_exit(cxl_pmem_exit);
MODULE_IMPORT_NS(CXL);
MODULE_ALIAS_CXL(CXL_DEVICE_NVDIMM_BRIDGE);
MODULE_ALIAS_CXL(CXL_DEVICE_NVDIMM);
MODULE_ALIAS_CXL(CXL_DEVICE_PMEM_REGION);