Merge branch 'for-6.15/dirty-shutdown' into cxl-for-next2

Add support for Global Persistent Flush (GPF) and dirty shutdown
accounting.
This commit is contained in:
Dave Jiang 2025-03-14 16:11:42 -07:00
commit d781a45270
11 changed files with 274 additions and 1 deletions

View File

@ -604,3 +604,15 @@ Description:
See Documentation/ABI/stable/sysfs-devices-node. access0 provides
the number to the closest initiator and access1 provides the
number to the closest CPU.
What: /sys/bus/cxl/devices/nvdimm-bridge0/ndbusX/nmemY/cxl/dirty_shutdown
Date: Feb, 2025
KernelVersion: v6.15
Contact: linux-cxl@vger.kernel.org
Description:
(RO) The device dirty shutdown count value, which is the number
of times the device could have incurred potential data loss.
The count is persistent across power loss and wraps back to 0
upon overflow. If this file is not present, the device does not
have the necessary support for dirty tracking.

View File

@ -130,7 +130,7 @@ Mailbox commands
* [0] Switch CCI
* [3] Timestamp
* [1] PMEM labels
* [0] PMEM GPF / Dirty Shutdown
* [3] PMEM GPF / Dirty Shutdown
* [0] Scan Media
PMU

View File

@ -117,5 +117,6 @@ int cxl_port_get_switch_dport_bandwidth(struct cxl_port *port,
int cxl_ras_init(void);
void cxl_ras_exit(void);
int cxl_gpf_port_setup(struct device *dport_dev, struct cxl_port *port);
#endif /* __CXL_CORE_H__ */

View File

@ -1282,6 +1282,45 @@ int cxl_mem_dpa_fetch(struct cxl_memdev_state *mds, struct cxl_dpa_info *info)
}
EXPORT_SYMBOL_NS_GPL(cxl_mem_dpa_fetch, "CXL");
/*
 * cxl_get_dirty_count() - read the device's dirty shutdown count
 * @mds: memdev state whose mailbox is used to send the command
 * @count: set to the CPU-endian dirty shutdown count on success
 *
 * Sends CXL_MBOX_OP_GET_HEALTH_INFO and extracts the dirty_shutdown_cnt
 * field from the returned payload. @count is left untouched when the
 * mailbox command fails.
 *
 * Return: 0 on success, otherwise the error from cxl_internal_send_cmd().
 */
int cxl_get_dirty_count(struct cxl_memdev_state *mds, u32 *count)
{
	struct cxl_mbox_get_health_info_out health;
	struct cxl_mbox_cmd cmd = {
		.opcode = CXL_MBOX_OP_GET_HEALTH_INFO,
		.size_out = sizeof(health),
		.payload_out = &health,
	};
	int rc;

	rc = cxl_internal_send_cmd(&mds->cxlds.cxl_mbox, &cmd);
	if (rc)
		return rc;

	/* The payload field is little-endian on the wire. */
	*count = le32_to_cpu(health.dirty_shutdown_cnt);
	return 0;
}
EXPORT_SYMBOL_NS_GPL(cxl_get_dirty_count, "CXL");
int cxl_arm_dirty_shutdown(struct cxl_memdev_state *mds)
{
struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox;
struct cxl_mbox_cmd mbox_cmd;
struct cxl_mbox_set_shutdown_state_in in = {
.state = 1
};
mbox_cmd = (struct cxl_mbox_cmd) {
.opcode = CXL_MBOX_OP_SET_SHUTDOWN_STATE,
.size_in = sizeof(in),
.payload_in = &in,
};
return cxl_internal_send_cmd(cxl_mbox, &mbox_cmd);
}
EXPORT_SYMBOL_NS_GPL(cxl_arm_dirty_shutdown, "CXL");
int cxl_set_timestamp(struct cxl_memdev_state *mds)
{
struct cxl_mailbox *cxl_mbox = &mds->cxlds.cxl_mbox;

View File

@ -1054,3 +1054,100 @@ int cxl_pci_get_bandwidth(struct pci_dev *pdev, struct access_coordinate *c)
return 0;
}
/*
* Set max timeout such that platforms will optimize GPF flow to avoid
* the implied worst-case scenario delays. On a sane platform, all
* devices should always complete GPF within the energy budget of
* the GPF flow. The kernel does not have enough information to pick
* anything better than "maximize timeouts and hope it works".
*
* A misbehaving device could block forward progress of GPF for all
* the other devices, exhausting the energy budget of the platform.
* However, the spec seems to assume that moving on from slow-to-respond
* devices is a virtue. It is not possible to know whether, in actuality,
* the slow-to-respond device is *the* most critical device in the
* system to wait for.
*/
#define GPF_TIMEOUT_BASE_MAX 2
#define GPF_TIMEOUT_SCALE_MAX 7 /* 10 seconds */
/*
 * cxl_gpf_get_dvsec() - locate the GPF DVSEC of a PCI device
 * @dev: device to inspect; anything that is not a PCI device yields 0
 * @is_port: true to search for CXL_DVSEC_PORT_GPF, false for
 *           CXL_DVSEC_DEVICE_GPF
 *
 * Emits a warning on @dev when the requested DVSEC cannot be found.
 *
 * Return: the value returned by pci_find_dvsec_capability(), or 0 when
 * @dev is not a PCI device or the capability is absent.
 */
u16 cxl_gpf_get_dvsec(struct device *dev, bool is_port)
{
	const char *kind;
	u16 dvsec_id;
	u16 pos;

	if (!dev_is_pci(dev))
		return 0;

	if (is_port) {
		dvsec_id = CXL_DVSEC_PORT_GPF;
		kind = "Port";
	} else {
		dvsec_id = CXL_DVSEC_DEVICE_GPF;
		kind = "Device";
	}

	pos = pci_find_dvsec_capability(to_pci_dev(dev), PCI_VENDOR_ID_CXL,
					dvsec_id);
	if (pos)
		return pos;

	dev_warn(dev, "%s GPF DVSEC not present\n", kind);
	return 0;
}
EXPORT_SYMBOL_NS_GPL(cxl_gpf_get_dvsec, "CXL");
/*
 * update_gpf_port_dvsec() - program the maximum GPF timeout for one phase
 * @pdev: PCI device carrying the Port GPF DVSEC
 * @dvsec: config-space offset of the Port GPF DVSEC
 * @phase: GPF phase to program, 1 or 2
 *
 * Reads the phase control register and, unless the timeout base and scale
 * fields already hold GPF_TIMEOUT_BASE_MAX / GPF_TIMEOUT_SCALE_MAX, writes
 * those values back. Only the two timeout fields are modified; every other
 * bit of the control register is written back exactly as it was read.
 *
 * Return: 0 on success or when no update was needed, -EINVAL for an
 * unknown @phase, otherwise the error returned by the PCI config accessor.
 */
static int update_gpf_port_dvsec(struct pci_dev *pdev, int dvsec, int phase)
{
	u64 base, scale;
	int rc, offset;
	u16 ctrl;

	switch (phase) {
	case 1:
		offset = CXL_DVSEC_PORT_GPF_PHASE_1_CONTROL_OFFSET;
		base = CXL_DVSEC_PORT_GPF_PHASE_1_TMO_BASE_MASK;
		scale = CXL_DVSEC_PORT_GPF_PHASE_1_TMO_SCALE_MASK;
		break;
	case 2:
		offset = CXL_DVSEC_PORT_GPF_PHASE_2_CONTROL_OFFSET;
		base = CXL_DVSEC_PORT_GPF_PHASE_2_TMO_BASE_MASK;
		scale = CXL_DVSEC_PORT_GPF_PHASE_2_TMO_SCALE_MASK;
		break;
	default:
		return -EINVAL;
	}

	rc = pci_read_config_word(pdev, dvsec + offset, &ctrl);
	if (rc)
		return rc;

	/* Nothing to do when the maximum timeout is already programmed. */
	if (FIELD_GET(base, ctrl) == GPF_TIMEOUT_BASE_MAX &&
	    FIELD_GET(scale, ctrl) == GPF_TIMEOUT_SCALE_MAX)
		return 0;

	/*
	 * Read-modify-write: clear only the timeout fields so that the
	 * remaining bits of the register keep the value that was read,
	 * instead of being overwritten with zeroes.
	 */
	ctrl &= ~(base | scale);
	ctrl |= FIELD_PREP(base, GPF_TIMEOUT_BASE_MAX);
	ctrl |= FIELD_PREP(scale, GPF_TIMEOUT_SCALE_MAX);

	rc = pci_write_config_word(pdev, dvsec + offset, ctrl);
	if (!rc)
		pci_dbg(pdev, "Port GPF phase %d timeout: %d0 secs\n",
			phase, GPF_TIMEOUT_BASE_MAX);

	return rc;
}
int cxl_gpf_port_setup(struct device *dport_dev, struct cxl_port *port)
{
struct pci_dev *pdev;
if (!port)
return -EINVAL;
if (!port->gpf_dvsec) {
int dvsec;
dvsec = cxl_gpf_get_dvsec(dport_dev, true);
if (!dvsec)
return -EINVAL;
port->gpf_dvsec = dvsec;
}
pdev = to_pci_dev(dport_dev);
update_gpf_port_dvsec(pdev, port->gpf_dvsec, 1);
update_gpf_port_dvsec(pdev, port->gpf_dvsec, 2);
return 0;
}

View File

@ -1678,6 +1678,8 @@ int devm_cxl_enumerate_ports(struct cxl_memdev *cxlmd)
if (rc && rc != -EBUSY)
return rc;
cxl_gpf_port_setup(dport_dev, port);
/* Any more ports to add between this one and the root? */
if (!dev_is_cxl_root_child(&port->dev))
continue;

View File

@ -542,6 +542,7 @@ struct cxl_nvdimm {
struct device dev;
struct cxl_memdev *cxlmd;
u8 dev_id[CXL_DEV_ID_LEN]; /* for nvdimm, string of 'serial' */
u64 dirty_shutdowns;
};
struct cxl_pmem_region_mapping {
@ -589,6 +590,7 @@ struct cxl_dax_region {
* @cdat: Cached CDAT data
* @cdat_available: Should a CDAT attribute be available in sysfs
* @pci_latency: Upstream latency in picoseconds
* @gpf_dvsec: Cached GPF port DVSEC
*/
struct cxl_port {
struct device dev;
@ -612,6 +614,7 @@ struct cxl_port {
} cdat;
bool cdat_available;
long pci_latency;
int gpf_dvsec;
};
/**
@ -899,4 +902,6 @@ bool cxl_endpoint_decoder_reset_detected(struct cxl_port *port);
#define __mock static
#endif
u16 cxl_gpf_get_dvsec(struct device *dev, bool is_port);
#endif /* __CXL_H__ */

View File

@ -721,6 +721,23 @@ struct cxl_mbox_set_partition_info {
#define CXL_SET_PARTITION_IMMEDIATE_FLAG BIT(0)
/* Get Health Info Output Payload CXL 3.2 Spec 8.2.10.9.3.1 Table 8-148 */
/*
 * Output buffer for the CXL_MBOX_OP_GET_HEALTH_INFO mailbox command.
 * The structure is packed and its multi-byte fields are declared
 * little-endian (__le16 / __le32).
 */
struct cxl_mbox_get_health_info_out {
u8 health_status;
u8 media_status;
u8 additional_status;
u8 life_used;
__le16 device_temperature;
/* Converted with le32_to_cpu() and reported by cxl_get_dirty_count(). */
__le32 dirty_shutdown_cnt;
__le32 corrected_volatile_error_cnt;
__le32 corrected_persistent_error_cnt;
} __packed;
/* Set Shutdown State Input Payload CXL 3.2 Spec 8.2.10.9.3.5 Table 8-152 */
/*
 * Input buffer for the CXL_MBOX_OP_SET_SHUTDOWN_STATE mailbox command.
 * cxl_arm_dirty_shutdown() sends it with @state set to 1.
 */
struct cxl_mbox_set_shutdown_state_in {
u8 state;
} __packed;
/* Set Timestamp CXL 3.0 Spec 8.2.9.4.2 */
struct cxl_mbox_set_timestamp_in {
__le64 timestamp;
@ -857,6 +874,8 @@ void cxl_event_trace_record(const struct cxl_memdev *cxlmd,
enum cxl_event_log_type type,
enum cxl_event_type event_type,
const uuid_t *uuid, union cxl_event *evt);
int cxl_get_dirty_count(struct cxl_memdev_state *mds, u32 *count);
int cxl_arm_dirty_shutdown(struct cxl_memdev_state *mds);
int cxl_set_timestamp(struct cxl_memdev_state *mds);
int cxl_poison_state_init(struct cxl_memdev_state *mds);
int cxl_mem_get_poison(struct cxl_memdev *cxlmd, u64 offset, u64 len,

View File

@ -40,6 +40,12 @@
/* CXL 2.0 8.1.6: GPF DVSEC for CXL Port */
#define CXL_DVSEC_PORT_GPF 4
#define CXL_DVSEC_PORT_GPF_PHASE_1_CONTROL_OFFSET 0x0C
#define CXL_DVSEC_PORT_GPF_PHASE_1_TMO_BASE_MASK GENMASK(3, 0)
#define CXL_DVSEC_PORT_GPF_PHASE_1_TMO_SCALE_MASK GENMASK(11, 8)
#define CXL_DVSEC_PORT_GPF_PHASE_2_CONTROL_OFFSET 0xE
#define CXL_DVSEC_PORT_GPF_PHASE_2_TMO_BASE_MASK GENMASK(3, 0)
#define CXL_DVSEC_PORT_GPF_PHASE_2_TMO_SCALE_MASK GENMASK(11, 8)
/* CXL 2.0 8.1.7: GPF DVSEC for CXL Device */
#define CXL_DVSEC_DEVICE_GPF 5

View File

@ -42,15 +42,44 @@ static ssize_t id_show(struct device *dev, struct device_attribute *attr, char *
}
static DEVICE_ATTR_RO(id);
static ssize_t dirty_shutdown_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct nvdimm *nvdimm = to_nvdimm(dev);
struct cxl_nvdimm *cxl_nvd = nvdimm_provider_data(nvdimm);
return sysfs_emit(buf, "%llu\n", cxl_nvd->dirty_shutdowns);
}
static DEVICE_ATTR_RO(dirty_shutdown);
/*
 * NULL-terminated attribute list used by cxl_dimm_attribute_group.
 * Per-attribute visibility is decided by cxl_dimm_visible().
 */
static struct attribute *cxl_dimm_attributes[] = {
&dev_attr_id.attr,
&dev_attr_provider.attr,
&dev_attr_dirty_shutdown.attr,
NULL
};
/* Sentinel stored in cxl_nvdimm::dirty_shutdowns when no count is available */
#define CXL_INVALID_DIRTY_SHUTDOWN_COUNT ULLONG_MAX

/*
 * is_visible callback for the "cxl" attribute group. Every attribute keeps
 * its declared mode, except "dirty_shutdown", which is hidden (mode 0)
 * while the stored count is CXL_INVALID_DIRTY_SHUTDOWN_COUNT.
 */
static umode_t cxl_dimm_visible(struct kobject *kobj,
				struct attribute *a, int n)
{
	struct cxl_nvdimm *cxl_nvd;

	if (a != &dev_attr_dirty_shutdown.attr)
		return a->mode;

	cxl_nvd = nvdimm_provider_data(to_nvdimm(kobj_to_dev(kobj)));
	if (cxl_nvd->dirty_shutdowns == CXL_INVALID_DIRTY_SHUTDOWN_COUNT)
		return 0;

	return a->mode;
}
/*
 * Attribute group named "cxl": exposes cxl_dimm_attributes, filtered
 * through cxl_dimm_visible().
 */
static const struct attribute_group cxl_dimm_attribute_group = {
.name = "cxl",
.attrs = cxl_dimm_attributes,
.is_visible = cxl_dimm_visible
};
static const struct attribute_group *cxl_dimm_attribute_groups[] = {
@ -58,6 +87,38 @@ static const struct attribute_group *cxl_dimm_attribute_groups[] = {
NULL
};
/*
 * Arm dirty shutdown tracking for @cxl_nvd and record the current count.
 *
 * cxl_nvd->dirty_shutdowns starts out as CXL_INVALID_DIRTY_SHUTDOWN_COUNT
 * and is only replaced by the device's count when all of the following
 * succeed, in this order:
 *  1. the dirty shutdown state can be set on the device,
 *  2. the device has a Device GPF DVSEC (albeit unused),
 *  3. the Get Health Info command returns the dirty count.
 */
static void cxl_nvdimm_arm_dirty_shutdown_tracking(struct cxl_nvdimm *cxl_nvd)
{
	struct cxl_dev_state *cxlds = cxl_nvd->cxlmd->cxlds;
	struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds);
	u32 dirty_count;
	int rc;

	cxl_nvd->dirty_shutdowns = CXL_INVALID_DIRTY_SHUTDOWN_COUNT;

	rc = cxl_arm_dirty_shutdown(mds);
	if (rc) {
		dev_warn(&cxl_nvd->dev,
			 "GPF: could not set dirty shutdown state\n");
		return;
	}

	/* cxl_gpf_get_dvsec() emits its own warning when the DVSEC is absent. */
	if (cxl_gpf_get_dvsec(cxlds->dev, false) == 0)
		return;

	rc = cxl_get_dirty_count(mds, &dirty_count);
	if (rc) {
		dev_warn(&cxl_nvd->dev,
			 "GPF: could not retrieve dirty count\n");
		return;
	}

	cxl_nvd->dirty_shutdowns = dirty_count;
}
static int cxl_nvdimm_probe(struct device *dev)
{
struct cxl_nvdimm *cxl_nvd = to_cxl_nvdimm(dev);
@ -78,6 +139,14 @@ static int cxl_nvdimm_probe(struct device *dev)
set_bit(ND_CMD_GET_CONFIG_SIZE, &cmd_mask);
set_bit(ND_CMD_GET_CONFIG_DATA, &cmd_mask);
set_bit(ND_CMD_SET_CONFIG_DATA, &cmd_mask);
/*
* Set dirty shutdown now, with the expectation that the device
* clears it upon a successful GPF flow. The exception to this
* is upon Viral detection, per CXL 3.2 section 12.4.2.
*/
cxl_nvdimm_arm_dirty_shutdown_tracking(cxl_nvd);
nvdimm = __nvdimm_create(cxl_nvb->nvdimm_bus, cxl_nvd,
cxl_dimm_attribute_groups, flags,
cmd_mask, 0, NULL, cxl_nvd->dev_id,

View File

@ -65,6 +65,10 @@ static struct cxl_cel_entry mock_cel[] = {
.opcode = cpu_to_le16(CXL_MBOX_OP_GET_HEALTH_INFO),
.effect = CXL_CMD_EFFECT_NONE,
},
{
.opcode = cpu_to_le16(CXL_MBOX_OP_SET_SHUTDOWN_STATE),
.effect = POLICY_CHANGE_IMMEDIATE,
},
{
.opcode = cpu_to_le16(CXL_MBOX_OP_GET_POISON),
.effect = CXL_CMD_EFFECT_NONE,
@ -161,6 +165,7 @@ struct cxl_mockmem_data {
u8 event_buf[SZ_4K];
u64 timestamp;
unsigned long sanitize_timeout;
u8 shutdown_state;
};
static struct mock_event_log *event_find_log(struct device *dev, int log_type)
@ -1088,6 +1093,21 @@ static int mock_health_info(struct cxl_mbox_cmd *cmd)
return 0;
}
static int mock_set_shutdown_state(struct cxl_mockmem_data *mdata,
struct cxl_mbox_cmd *cmd)
{
struct cxl_mbox_set_shutdown_state_in *ss = cmd->payload_in;
if (cmd->size_in != sizeof(*ss))
return -EINVAL;
if (cmd->size_out != 0)
return -EINVAL;
mdata->shutdown_state = ss->state;
return 0;
}
static struct mock_poison {
struct cxl_dev_state *cxlds;
u64 dpa;
@ -1421,6 +1441,9 @@ static int cxl_mock_mbox_send(struct cxl_mailbox *cxl_mbox,
case CXL_MBOX_OP_PASSPHRASE_SECURE_ERASE:
rc = mock_passphrase_secure_erase(mdata, cmd);
break;
case CXL_MBOX_OP_SET_SHUTDOWN_STATE:
rc = mock_set_shutdown_state(mdata, cmd);
break;
case CXL_MBOX_OP_GET_POISON:
rc = mock_get_poison(cxlds, cmd);
break;