From fe5c9940dfd8ba0c73672dddb30acd1b7a11d4c7 Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Thu, 28 Sep 2023 21:58:23 +0200 Subject: [PATCH 1/5] can: dev: can_restart(): don't crash kernel if carrier is OK During testing, I triggered a can_restart() with the netif carrier being OK [1]. The BUG_ON, which checks if the carrier is OK, results in a fatal kernel crash. This is neither helpful for debugging nor for a production system. [1] The root cause is a race condition in can_restart() which will be fixed in the next patch. Do not crash the kernel, issue an error message instead, and continue restarting the CAN device anyway. Fixes: 39549eef3587 ("can: CAN Network device driver and Netlink interface") Link: https://lore.kernel.org/all/20231005-can-dev-fix-can-restart-v2-1-91b5c1fd922c@pengutronix.de Reviewed-by: Vincent Mailhol Signed-off-by: Marc Kleine-Budde --- drivers/net/can/dev/dev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/can/dev/dev.c b/drivers/net/can/dev/dev.c index 7f9334a8af50..a5bbdfa9a269 100644 --- a/drivers/net/can/dev/dev.c +++ b/drivers/net/can/dev/dev.c @@ -132,7 +132,8 @@ static void can_restart(struct net_device *dev) struct can_frame *cf; int err; - BUG_ON(netif_carrier_ok(dev)); + if (netif_carrier_ok(dev)) + netdev_err(dev, "Attempt to restart for bus-off recovery, but carrier is OK?\n"); /* No synchronization needed because the device is bus-off and * no messages can come in or go out. From 6841cab8c4504835e4011689cbdb3351dec693fd Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Fri, 29 Sep 2023 10:25:11 +0200 Subject: [PATCH 2/5] can: dev: can_restart(): fix race condition between controller restart and netif_carrier_on() This race condition was discovered while updating the at91_can driver to use can_bus_off(). The following scenario describes how the converted at91_can driver would behave. When a CAN device goes into BUS-OFF state, the driver usually stops/resets the CAN device and calls can_bus_off(). This function sets the netif carrier to off, and (if configured by user space) schedules a delayed work that calls can_restart() to restart the CAN device. The can_restart() function first checks if the carrier is off and triggers an error message if the carrier is OK. Then it calls the driver's do_set_mode() function to restart the device, then it sets the netif carrier to on. There is a race window between these two calls. The at91 CAN controller (observed on the sama5d3, a single core 32 bit ARM CPU) has a hardware limitation. If the device goes into bus-off while sending a CAN frame, there is no way to abort the sending of this frame. After the controller is enabled again, another attempt is made to send it. If the bus is still faulty, the device immediately goes back to the bus-off state. The driver calls can_bus_off(), the netif carrier is switched off and another can_restart is scheduled. This occurs within the race window before the original can_restart() handler marks the netif carrier as OK. This would cause the 2nd can_restart() to be called with an OK netif carrier, resulting in an error message. The flow of the 1st can_restart() looks like this: can_restart() // bail out if netif_carrier is OK netif_carrier_ok(dev) priv->do_set_mode(dev, CAN_MODE_START) // enable CAN controller // sama5d3 restarts sending old message // CAN devices goes into BUS_OFF, triggers IRQ // IRQ handler start at91_irq() at91_irq_err_line() can_bus_off() netif_carrier_off() schedule_delayed_work() // IRQ handler end netif_carrier_on() The 2nd can_restart() will be called with an OK netif carrier and the error message will be printed. To close the race window, first set the netif carrier to on, then restart the controller. In case the restart fails with an error code, roll back the netif carrier to off. Fixes: 39549eef3587 ("can: CAN Network device driver and Netlink interface") Link: https://lore.kernel.org/all/20231005-can-dev-fix-can-restart-v2-2-91b5c1fd922c@pengutronix.de Reviewed-by: Vincent Mailhol Signed-off-by: Marc Kleine-Budde --- drivers/net/can/dev/dev.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/can/dev/dev.c b/drivers/net/can/dev/dev.c index a5bbdfa9a269..735d5de3caa0 100644 --- a/drivers/net/can/dev/dev.c +++ b/drivers/net/can/dev/dev.c @@ -154,11 +154,12 @@ static void can_restart(struct net_device *dev) priv->can_stats.restarts++; /* Now restart the device */ - err = priv->do_set_mode(dev, CAN_MODE_START); - netif_carrier_on(dev); - if (err) + err = priv->do_set_mode(dev, CAN_MODE_START); + if (err) { netdev_err(dev, "Error %d during restart", err); + netif_carrier_off(dev); + } } static void can_restart_work(struct work_struct *work) From 8f3ec204d340af183fb2bb21b8e797ac2ed012b2 Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Fri, 29 Sep 2023 09:47:38 +0200 Subject: [PATCH 3/5] can: dev: can_restart(): reverse logic to remove need for goto Reverse the logic in the if statement and eliminate the need for a goto to simplify code readability. Link: https://lore.kernel.org/all/20231005-can-dev-fix-can-restart-v2-3-91b5c1fd922c@pengutronix.de Reviewed-by: Vincent Mailhol Signed-off-by: Marc Kleine-Budde --- drivers/net/can/dev/dev.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/drivers/net/can/dev/dev.c b/drivers/net/can/dev/dev.c index 735d5de3caa0..9014256c486a 100644 --- a/drivers/net/can/dev/dev.c +++ b/drivers/net/can/dev/dev.c @@ -142,14 +142,11 @@ static void can_restart(struct net_device *dev) /* send restart message upstream */ skb = alloc_can_err_skb(dev, &cf); - if (!skb) - goto restart; + if (skb) { + cf->can_id |= CAN_ERR_RESTARTED; + netif_rx(skb); + } - cf->can_id |= CAN_ERR_RESTARTED; - - netif_rx(skb); - -restart: netdev_dbg(dev, "restarted\n"); priv->can_stats.restarts++; From f0e0c809c0be05fe865b9ac128ef3ee35c276021 Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Fri, 29 Sep 2023 10:18:02 +0200 Subject: [PATCH 4/5] can: dev: can_restart(): move debug message and stats after successful restart Move the debug message "restarted" and the CAN restart stats_after_ the successful restart of the CAN device, because the restart may fail. While there update the error message from printing the error number to printing symbolic error names. Link: https://lore.kernel.org/all/20231005-can-dev-fix-can-restart-v2-4-91b5c1fd922c@pengutronix.de Reviewed-by: Vincent Mailhol [mkl: mention stats in subject and description, too] Signed-off-by: Marc Kleine-Budde --- drivers/net/can/dev/dev.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/can/dev/dev.c b/drivers/net/can/dev/dev.c index 9014256c486a..82b12902fc35 100644 --- a/drivers/net/can/dev/dev.c +++ b/drivers/net/can/dev/dev.c @@ -147,15 +147,15 @@ static void can_restart(struct net_device *dev) netif_rx(skb); } - netdev_dbg(dev, "restarted\n"); - priv->can_stats.restarts++; - /* Now restart the device */ netif_carrier_on(dev); err = priv->do_set_mode(dev, CAN_MODE_START); if (err) { - netdev_err(dev, "Error %d during restart", err); + netdev_err(dev, "Restart failed, error %pe\n", ERR_PTR(err)); netif_carrier_off(dev); + } else { + netdev_dbg(dev, "Restarted\n"); + priv->can_stats.restarts++; } } From 6411959c10fe917288cbb1038886999148560057 Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Fri, 29 Sep 2023 10:23:47 +0200 Subject: [PATCH 5/5] can: dev: can_put_echo_skb(): don't crash kernel if can_priv::echo_skb is accessed out of bounds If the "struct can_priv::echoo_skb" is accessed out of bounds, this would cause a kernel crash. Instead, issue a meaningful warning message and return with an error. Fixes: a6e4bc530403 ("can: make the number of echo skb's configurable") Link: https://lore.kernel.org/all/20231005-can-dev-fix-can-restart-v2-5-91b5c1fd922c@pengutronix.de Reviewed-by: Vincent Mailhol Signed-off-by: Marc Kleine-Budde --- drivers/net/can/dev/skb.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/can/dev/skb.c b/drivers/net/can/dev/skb.c index f6d05b3ef59a..3ebd4f779b9b 100644 --- a/drivers/net/can/dev/skb.c +++ b/drivers/net/can/dev/skb.c @@ -49,7 +49,11 @@ int can_put_echo_skb(struct sk_buff *skb, struct net_device *dev, { struct can_priv *priv = netdev_priv(dev); - BUG_ON(idx >= priv->echo_skb_max); + if (idx >= priv->echo_skb_max) { + netdev_err(dev, "%s: BUG! Trying to access can_priv::echo_skb out of bounds (%u/max %u)\n", + __func__, idx, priv->echo_skb_max); + return -EINVAL; + } /* check flag whether this packet has to be looped back */ if (!(dev->flags & IFF_ECHO) ||