From 82db77f6fb16d23ea60d0f96dcf2b502a322a28f Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Wed, 8 Apr 2026 16:05:50 -0700 Subject: [PATCH 01/10] net: tso: Introduce tso_dma_map and helpers Add struct tso_dma_map to tso.h for tracking DMA addresses of mapped GSO payload data and tso_dma_map_completion_state. The tso_dma_map combines DMA mapping storage with iterator state, allowing drivers to walk pre-mapped DMA regions linearly. Includes fields for the DMA IOVA path (iova_state, iova_offset, total_len) and a fallback per-region path (linear_dma, frags[], frag_idx, offset). The tso_dma_map_completion_state makes the IOVA completion state opaque for drivers. Drivers are expected to allocate this and use the added helpers to update the completion state. Adds skb_frag_phys() to skbuff.h, returning the physical address of a paged fragment's data, which is used by the tso_dma_map helpers introduced in this commit described below. The added TSO DMA map helpers are: tso_dma_map_init(): DMA-maps the linear payload region and all frags upfront. Prefers the DMA IOVA API for a single contiguous mapping with one IOTLB sync; falls back to per-region dma_map_phys() otherwise. Returns 0 on success, cleans up partial mappings on failure. tso_dma_map_cleanup(): Handles both IOVA and fallback teardown paths. tso_dma_map_count(): counts how many descriptors the next N bytes of payload will need. Returns 1 if IOVA is used since the mapping is contiguous. tso_dma_map_next(): yields the next (dma_addr, chunk_len) pair. On the IOVA path, each segment is a single contiguous chunk. On the fallback path, indicates when a chunk starts a new DMA mapping so the driver can set dma_unmap_len on that descriptor for completion-time unmapping. tso_dma_map_completion_save(): updates the completion state. Drivers will call this at xmit time. tso_dma_map_complete(): tears down the mapping at completion time and returns true if the IOVA path was used. If it was not used, this is a no-op and returns false. Suggested-by: Jakub Kicinski Signed-off-by: Joe Damato Link: https://patch.msgid.link/20260408230607.2019402-2-joe@dama.to Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 11 ++ include/net/tso.h | 100 +++++++++++++++ net/core/tso.c | 269 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 380 insertions(+) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 26fe18bcfad8..2bcf78a4de7b 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3763,6 +3763,17 @@ static inline void *skb_frag_address_safe(const skb_frag_t *frag) return ptr + skb_frag_off(frag); } +/** + * skb_frag_phys - gets the physical address of the data in a paged fragment + * @frag: the paged fragment buffer + * + * Returns: the physical address of the data within @frag. + */ +static inline phys_addr_t skb_frag_phys(const skb_frag_t *frag) +{ + return page_to_phys(skb_frag_page(frag)) + skb_frag_off(frag); +} + /** * skb_frag_page_copy() - sets the page in a fragment from another fragment * @fragto: skb fragment where page is set diff --git a/include/net/tso.h b/include/net/tso.h index e7e157ae0526..da82aabd1d48 100644 --- a/include/net/tso.h +++ b/include/net/tso.h @@ -3,6 +3,7 @@ #define _TSO_H #include +#include #include #define TSO_HEADER_SIZE 256 @@ -28,4 +29,103 @@ void tso_build_hdr(const struct sk_buff *skb, char *hdr, struct tso_t *tso, void tso_build_data(const struct sk_buff *skb, struct tso_t *tso, int size); int tso_start(struct sk_buff *skb, struct tso_t *tso); +/** + * struct tso_dma_map - DMA mapping state for GSO payload + * @dev: device used for DMA mapping + * @skb: the GSO skb being mapped + * @hdr_len: per-segment header length + * @iova_state: DMA IOVA state (when IOMMU available) + * @iova_offset: global byte offset into IOVA range (IOVA path only) + * @total_len: total payload length + * @frag_idx: current region (-1 = linear, 0..nr_frags-1 = frag) + * @offset: byte offset within current region + * @linear_dma: DMA address of the linear payload + * @linear_len: length of the linear payload + * @nr_frags: number of frags successfully DMA-mapped + * @frags: per-frag DMA address and length + * + * DMA-maps the payload regions of a GSO skb (linear data + frags). + * Prefers the DMA IOVA API for a single contiguous mapping with one + * IOTLB sync; falls back to per-region dma_map_phys() otherwise. + */ +struct tso_dma_map { + struct device *dev; + const struct sk_buff *skb; + unsigned int hdr_len; + /* IOVA path */ + struct dma_iova_state iova_state; + size_t iova_offset; + size_t total_len; + /* Fallback path if IOVA path fails */ + int frag_idx; + unsigned int offset; + dma_addr_t linear_dma; + unsigned int linear_len; + unsigned int nr_frags; + struct { + dma_addr_t dma; + unsigned int len; + } frags[MAX_SKB_FRAGS]; +}; + +/** + * struct tso_dma_map_completion_state - Completion-time cleanup state + * @iova_state: DMA IOVA state (when IOMMU available) + * @total_len: total payload length of the IOVA mapping + * + * Drivers store this on their SW ring at xmit time via + * tso_dma_map_completion_save(), then call tso_dma_map_complete() at + * completion time. + */ +struct tso_dma_map_completion_state { + struct dma_iova_state iova_state; + size_t total_len; +}; + +int tso_dma_map_init(struct tso_dma_map *map, struct device *dev, + const struct sk_buff *skb, unsigned int hdr_len); +void tso_dma_map_cleanup(struct tso_dma_map *map); +unsigned int tso_dma_map_count(struct tso_dma_map *map, unsigned int len); +bool tso_dma_map_next(struct tso_dma_map *map, dma_addr_t *addr, + unsigned int *chunk_len, unsigned int *mapping_len, + unsigned int seg_remaining); + +/** + * tso_dma_map_completion_save - save state needed for completion-time cleanup + * @map: the xmit-time DMA map + * @cstate: driver-owned storage that persists until completion + * + * Should be called at xmit time to update the completion state and later passed + * to tso_dma_map_complete(). + */ +static inline void +tso_dma_map_completion_save(const struct tso_dma_map *map, + struct tso_dma_map_completion_state *cstate) +{ + cstate->iova_state = map->iova_state; + cstate->total_len = map->total_len; +} + +/** + * tso_dma_map_complete - tear down mapping at completion time + * @dev: the device that owns the mapping + * @cstate: state saved by tso_dma_map_completion_save() + * + * Return: true if the IOVA path was used and the mapping has been + * destroyed; false if the fallback per-region path was used and the + * driver must unmap via its normal completion path. + */ +static inline bool +tso_dma_map_complete(struct device *dev, + struct tso_dma_map_completion_state *cstate) +{ + if (dma_use_iova(&cstate->iova_state)) { + dma_iova_destroy(dev, &cstate->iova_state, cstate->total_len, + DMA_TO_DEVICE, 0); + return true; + } + + return false; +} + #endif /* _TSO_H */ diff --git a/net/core/tso.c b/net/core/tso.c index 6df997b9076e..347b3856ddb9 100644 --- a/net/core/tso.c +++ b/net/core/tso.c @@ -3,6 +3,7 @@ #include #include #include +#include #include void tso_build_hdr(const struct sk_buff *skb, char *hdr, struct tso_t *tso, @@ -87,3 +88,271 @@ int tso_start(struct sk_buff *skb, struct tso_t *tso) return hdr_len; } EXPORT_SYMBOL(tso_start); + +static int tso_dma_iova_try(struct device *dev, struct tso_dma_map *map, + phys_addr_t phys, size_t linear_len, + size_t total_len, size_t *offset) +{ + const struct sk_buff *skb; + unsigned int nr_frags; + int i; + + if (!dma_iova_try_alloc(dev, &map->iova_state, phys, total_len)) + return 1; + + skb = map->skb; + nr_frags = skb_shinfo(skb)->nr_frags; + + if (linear_len) { + if (dma_iova_link(dev, &map->iova_state, + phys, *offset, linear_len, + DMA_TO_DEVICE, 0)) + goto iova_fail; + map->linear_len = linear_len; + *offset += linear_len; + } + + for (i = 0; i < nr_frags; i++) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + unsigned int frag_len = skb_frag_size(frag); + + if (dma_iova_link(dev, &map->iova_state, + skb_frag_phys(frag), *offset, + frag_len, DMA_TO_DEVICE, 0)) { + map->nr_frags = i; + goto iova_fail; + } + map->frags[i].len = frag_len; + *offset += frag_len; + map->nr_frags = i + 1; + } + + if (dma_iova_sync(dev, &map->iova_state, 0, total_len)) + goto iova_fail; + + return 0; + +iova_fail: + dma_iova_destroy(dev, &map->iova_state, *offset, + DMA_TO_DEVICE, 0); + memset(&map->iova_state, 0, sizeof(map->iova_state)); + + /* reset map state */ + map->frag_idx = -1; + map->offset = 0; + map->linear_len = 0; + map->nr_frags = 0; + + return 1; +} + +/** + * tso_dma_map_init - DMA-map GSO payload regions + * @map: map struct to initialize + * @dev: device for DMA mapping + * @skb: the GSO skb + * @hdr_len: per-segment header length in bytes + * + * DMA-maps the linear payload (after headers) and all frags. + * Prefers the DMA IOVA API (one contiguous mapping, one IOTLB sync); + * falls back to per-region dma_map_phys() when IOVA is not available. + * Positions the iterator at byte 0 of the payload. + * + * Return: 0 on success, -ENOMEM on DMA mapping failure (partial mappings + * are cleaned up internally). + */ +int tso_dma_map_init(struct tso_dma_map *map, struct device *dev, + const struct sk_buff *skb, unsigned int hdr_len) +{ + unsigned int linear_len = skb_headlen(skb) - hdr_len; + unsigned int nr_frags = skb_shinfo(skb)->nr_frags; + size_t total_len = skb->len - hdr_len; + size_t offset = 0; + phys_addr_t phys; + int i; + + map->dev = dev; + map->skb = skb; + map->hdr_len = hdr_len; + map->frag_idx = -1; + map->offset = 0; + map->iova_offset = 0; + map->total_len = total_len; + map->linear_len = 0; + map->nr_frags = 0; + memset(&map->iova_state, 0, sizeof(map->iova_state)); + + if (!total_len) + return 0; + + if (linear_len) + phys = virt_to_phys(skb->data + hdr_len); + else + phys = skb_frag_phys(&skb_shinfo(skb)->frags[0]); + + if (tso_dma_iova_try(dev, map, phys, linear_len, total_len, &offset)) { + /* IOVA path failed, map state was reset. Fallback to + * per-region dma_map_phys() + */ + if (linear_len) { + map->linear_dma = dma_map_phys(dev, phys, linear_len, + DMA_TO_DEVICE, 0); + if (dma_mapping_error(dev, map->linear_dma)) + return -ENOMEM; + map->linear_len = linear_len; + } + + for (i = 0; i < nr_frags; i++) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + unsigned int frag_len = skb_frag_size(frag); + + map->frags[i].len = frag_len; + map->frags[i].dma = dma_map_phys(dev, skb_frag_phys(frag), + frag_len, DMA_TO_DEVICE, 0); + if (dma_mapping_error(dev, map->frags[i].dma)) { + tso_dma_map_cleanup(map); + return -ENOMEM; + } + map->nr_frags = i + 1; + } + } + + if (linear_len == 0 && nr_frags > 0) + map->frag_idx = 0; + + return 0; +} +EXPORT_SYMBOL(tso_dma_map_init); + +/** + * tso_dma_map_cleanup - unmap all DMA regions in a tso_dma_map + * @map: the map to clean up + * + * Handles both IOVA and fallback paths. For IOVA, calls + * dma_iova_destroy(). For fallback, unmaps each region individually. + */ +void tso_dma_map_cleanup(struct tso_dma_map *map) +{ + int i; + + if (dma_use_iova(&map->iova_state)) { + dma_iova_destroy(map->dev, &map->iova_state, map->total_len, + DMA_TO_DEVICE, 0); + memset(&map->iova_state, 0, sizeof(map->iova_state)); + } else { + if (map->linear_len) + dma_unmap_phys(map->dev, map->linear_dma, + map->linear_len, DMA_TO_DEVICE, 0); + + for (i = 0; i < map->nr_frags; i++) + dma_unmap_phys(map->dev, map->frags[i].dma, + map->frags[i].len, DMA_TO_DEVICE, 0); + } + + map->linear_len = 0; + map->nr_frags = 0; +} +EXPORT_SYMBOL(tso_dma_map_cleanup); + +/** + * tso_dma_map_count - count descriptors for a payload range + * @map: the payload map + * @len: number of payload bytes in this segment + * + * Counts how many contiguous DMA region chunks the next @len bytes + * will span, without advancing the iterator. On the IOVA path this + * is always 1 (contiguous). On the fallback path, uses region sizes + * from the current position. + * + * Return: the number of descriptors needed for @len bytes of payload. + */ +unsigned int tso_dma_map_count(struct tso_dma_map *map, unsigned int len) +{ + unsigned int offset = map->offset; + int idx = map->frag_idx; + unsigned int count = 0; + + if (!len) + return 0; + + if (dma_use_iova(&map->iova_state)) + return 1; + + while (len > 0) { + unsigned int region_len, chunk; + + if (idx == -1) + region_len = map->linear_len; + else + region_len = map->frags[idx].len; + + chunk = min(len, region_len - offset); + len -= chunk; + count++; + offset = 0; + idx++; + } + + return count; +} +EXPORT_SYMBOL(tso_dma_map_count); + +/** + * tso_dma_map_next - yield the next DMA address range + * @map: the payload map + * @addr: output DMA address + * @chunk_len: output chunk length + * @mapping_len: full DMA mapping length when this chunk starts a new + * mapping region, or 0 when continuing a previous one. + * On the IOVA path this is always 0 (driver must not + * do per-region unmaps; use tso_dma_map_cleanup instead). + * @seg_remaining: bytes left in current segment + * + * Yields the next (dma_addr, chunk_len) pair and advances the iterator. + * On the IOVA path, the entire payload is contiguous so each segment + * is always a single chunk. + * + * Return: true if a chunk was yielded, false when @seg_remaining is 0. + */ +bool tso_dma_map_next(struct tso_dma_map *map, dma_addr_t *addr, + unsigned int *chunk_len, unsigned int *mapping_len, + unsigned int seg_remaining) +{ + unsigned int region_len, chunk; + + if (!seg_remaining) + return false; + + /* IOVA path: contiguous DMA range, no region boundaries */ + if (dma_use_iova(&map->iova_state)) { + *addr = map->iova_state.addr + map->iova_offset; + *chunk_len = seg_remaining; + *mapping_len = 0; + map->iova_offset += seg_remaining; + return true; + } + + /* Fallback path: per-region iteration */ + + if (map->frag_idx == -1) { + region_len = map->linear_len; + chunk = min(seg_remaining, region_len - map->offset); + *addr = map->linear_dma + map->offset; + } else { + region_len = map->frags[map->frag_idx].len; + chunk = min(seg_remaining, region_len - map->offset); + *addr = map->frags[map->frag_idx].dma + map->offset; + } + + *mapping_len = (map->offset == 0) ? region_len : 0; + *chunk_len = chunk; + map->offset += chunk; + + if (map->offset >= region_len) { + map->frag_idx++; + map->offset = 0; + } + + return true; +} +EXPORT_SYMBOL(tso_dma_map_next); From 268c63f2c6b23c80f3c903642f300a8e37ab3aa3 Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Wed, 8 Apr 2026 16:05:51 -0700 Subject: [PATCH 02/10] net: bnxt: Export bnxt_xmit_get_cfa_action Export bnxt_xmit_get_cfa_action so that it can be used in future commits which add software USO support to bnxt. Suggested-by: Jakub Kicinski Reviewed-by: Pavan Chebbi Signed-off-by: Joe Damato Link: https://patch.msgid.link/20260408230607.2019402-3-joe@dama.to Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 2 +- drivers/net/ethernet/broadcom/bnxt/bnxt.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index fe8b886ff82e..d4288c458576 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -447,7 +447,7 @@ const u16 bnxt_lhint_arr[] = { TX_BD_FLAGS_LHINT_2048_AND_LARGER, }; -static u16 bnxt_xmit_get_cfa_action(struct sk_buff *skb) +u16 bnxt_xmit_get_cfa_action(struct sk_buff *skb) { struct metadata_dst *md_dst = skb_metadata_dst(skb); diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index 3558a36ece12..2b40a5bd57af 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -2969,6 +2969,7 @@ unsigned int bnxt_get_avail_cp_rings_for_en(struct bnxt *bp); int bnxt_reserve_rings(struct bnxt *bp, bool irq_re_init); void bnxt_tx_disable(struct bnxt *bp); void bnxt_tx_enable(struct bnxt *bp); +u16 bnxt_xmit_get_cfa_action(struct sk_buff *skb); void bnxt_sched_reset_txr(struct bnxt *bp, struct bnxt_tx_ring_info *txr, u16 curr); void bnxt_report_link(struct bnxt *bp); From 637237d3d93cab7f183075f4fadbfbf62663c6f4 Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Wed, 8 Apr 2026 16:05:52 -0700 Subject: [PATCH 03/10] net: bnxt: Add a helper for tx_bd_ext Factor out some code to setup tx_bd_exts into a helper function. This helper will be used by SW USO implementation in the following commits. Suggested-by: Jakub Kicinski Reviewed-by: Pavan Chebbi Signed-off-by: Joe Damato Link: https://patch.msgid.link/20260408230607.2019402-4-joe@dama.to Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 9 ++------- drivers/net/ethernet/broadcom/bnxt/bnxt.h | 18 ++++++++++++++++++ 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index d4288c458576..d1f0969b781c 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -663,10 +663,9 @@ static netdev_tx_t bnxt_start_xmit(struct sk_buff *skb, struct net_device *dev) txbd->tx_bd_opaque = SET_TX_OPAQUE(bp, txr, prod, 2 + last_frag); prod = NEXT_TX(prod); - txbd1 = (struct tx_bd_ext *) - &txr->tx_desc_ring[TX_RING(bp, prod)][TX_IDX(prod)]; + txbd1 = bnxt_init_ext_bd(bp, txr, prod, lflags, vlan_tag_flags, + cfa_action); - txbd1->tx_bd_hsize_lflags = lflags; if (skb_is_gso(skb)) { bool udp_gso = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4); u32 hdr_len; @@ -693,7 +692,6 @@ static netdev_tx_t bnxt_start_xmit(struct sk_buff *skb, struct net_device *dev) } else if (skb->ip_summed == CHECKSUM_PARTIAL) { txbd1->tx_bd_hsize_lflags |= cpu_to_le32(TX_BD_FLAGS_TCP_UDP_CHKSUM); - txbd1->tx_bd_mss = 0; } length >>= 9; @@ -706,9 +704,6 @@ static netdev_tx_t bnxt_start_xmit(struct sk_buff *skb, struct net_device *dev) flags |= bnxt_lhint_arr[length]; txbd->tx_bd_len_flags_type = cpu_to_le32(flags); - txbd1->tx_bd_cfa_meta = cpu_to_le32(vlan_tag_flags); - txbd1->tx_bd_cfa_action = - cpu_to_le32(cfa_action << TX_BD_CFA_ACTION_SHIFT); txbd0 = txbd; for (i = 0; i < last_frag; i++) { frag = &skb_shinfo(skb)->frags[i]; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index 2b40a5bd57af..83b4136ccd31 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -2836,6 +2836,24 @@ static inline u32 bnxt_tx_avail(struct bnxt *bp, return bp->tx_ring_size - (used & bp->tx_ring_mask); } +static inline struct tx_bd_ext * +bnxt_init_ext_bd(struct bnxt *bp, struct bnxt_tx_ring_info *txr, + u16 prod, __le32 lflags, u32 vlan_tag_flags, + u32 cfa_action) +{ + struct tx_bd_ext *txbd1; + + txbd1 = (struct tx_bd_ext *) + &txr->tx_desc_ring[TX_RING(bp, prod)][TX_IDX(prod)]; + txbd1->tx_bd_hsize_lflags = lflags; + txbd1->tx_bd_mss = 0; + txbd1->tx_bd_cfa_meta = cpu_to_le32(vlan_tag_flags); + txbd1->tx_bd_cfa_action = + cpu_to_le32(cfa_action << TX_BD_CFA_ACTION_SHIFT); + + return txbd1; +} + static inline void bnxt_writeq(struct bnxt *bp, u64 val, volatile void __iomem *addr) { From 3cb430e62c83823b76fc7dc1aec8dc7bbf81a729 Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Wed, 8 Apr 2026 16:05:53 -0700 Subject: [PATCH 04/10] net: bnxt: Use dma_unmap_len for TX completion unmapping Store the DMA mapping length in each TX buffer descriptor via dma_unmap_len_set at submit time, and use dma_unmap_len at completion time. This is a no-op for normal packets but prepares for software USO, where header BDs set dma_unmap_len to 0 because the header buffer is unmapped collectively rather than per-segment. Suggested-by: Jakub Kicinski Reviewed-by: Pavan Chebbi Signed-off-by: Joe Damato Link: https://patch.msgid.link/20260408230607.2019402-5-joe@dama.to Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 63 +++++++++++++++-------- 1 file changed, 41 insertions(+), 22 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index d1f0969b781c..bc2dac2f137d 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -656,6 +656,7 @@ static netdev_tx_t bnxt_start_xmit(struct sk_buff *skb, struct net_device *dev) goto tx_free; dma_unmap_addr_set(tx_buf, mapping, mapping); + dma_unmap_len_set(tx_buf, len, len); flags = (len << TX_BD_LEN_SHIFT) | TX_BD_TYPE_LONG_TX_BD | TX_BD_CNT(last_frag + 2); @@ -720,6 +721,7 @@ static netdev_tx_t bnxt_start_xmit(struct sk_buff *skb, struct net_device *dev) tx_buf = &txr->tx_buf_ring[RING_TX(bp, prod)]; netmem_dma_unmap_addr_set(skb_frag_netmem(frag), tx_buf, mapping, mapping); + dma_unmap_len_set(tx_buf, len, len); txbd->tx_bd_haddr = cpu_to_le64(mapping); @@ -809,7 +811,8 @@ static bool __bnxt_tx_int(struct bnxt *bp, struct bnxt_tx_ring_info *txr, u16 hw_cons = txr->tx_hw_cons; unsigned int tx_bytes = 0; u16 cons = txr->tx_cons; - skb_frag_t *frag; + unsigned int dma_len; + dma_addr_t dma_addr; int tx_pkts = 0; bool rc = false; @@ -844,19 +847,27 @@ static bool __bnxt_tx_int(struct bnxt *bp, struct bnxt_tx_ring_info *txr, goto next_tx_int; } - dma_unmap_single(&pdev->dev, dma_unmap_addr(tx_buf, mapping), - skb_headlen(skb), DMA_TO_DEVICE); + if (dma_unmap_len(tx_buf, len)) { + dma_addr = dma_unmap_addr(tx_buf, mapping); + dma_len = dma_unmap_len(tx_buf, len); + + dma_unmap_single(&pdev->dev, dma_addr, dma_len, + DMA_TO_DEVICE); + } + last = tx_buf->nr_frags; for (j = 0; j < last; j++) { - frag = &skb_shinfo(skb)->frags[j]; cons = NEXT_TX(cons); tx_buf = &txr->tx_buf_ring[RING_TX(bp, cons)]; - netmem_dma_unmap_page_attrs(&pdev->dev, - dma_unmap_addr(tx_buf, - mapping), - skb_frag_size(frag), - DMA_TO_DEVICE, 0); + if (dma_unmap_len(tx_buf, len)) { + dma_addr = dma_unmap_addr(tx_buf, mapping); + dma_len = dma_unmap_len(tx_buf, len); + + netmem_dma_unmap_page_attrs(&pdev->dev, + dma_addr, dma_len, + DMA_TO_DEVICE, 0); + } } if (unlikely(is_ts_pkt)) { if (BNXT_CHIP_P5(bp)) { @@ -3394,6 +3405,8 @@ static void bnxt_free_one_tx_ring_skbs(struct bnxt *bp, { int i, max_idx; struct pci_dev *pdev = bp->pdev; + unsigned int dma_len; + dma_addr_t dma_addr; max_idx = bp->tx_nr_pages * TX_DESC_CNT; @@ -3404,9 +3417,10 @@ static void bnxt_free_one_tx_ring_skbs(struct bnxt *bp, if (idx < bp->tx_nr_rings_xdp && tx_buf->action == XDP_REDIRECT) { - dma_unmap_single(&pdev->dev, - dma_unmap_addr(tx_buf, mapping), - dma_unmap_len(tx_buf, len), + dma_addr = dma_unmap_addr(tx_buf, mapping); + dma_len = dma_unmap_len(tx_buf, len); + + dma_unmap_single(&pdev->dev, dma_addr, dma_len, DMA_TO_DEVICE); xdp_return_frame(tx_buf->xdpf); tx_buf->action = 0; @@ -3429,23 +3443,28 @@ static void bnxt_free_one_tx_ring_skbs(struct bnxt *bp, continue; } - dma_unmap_single(&pdev->dev, - dma_unmap_addr(tx_buf, mapping), - skb_headlen(skb), - DMA_TO_DEVICE); + if (dma_unmap_len(tx_buf, len)) { + dma_addr = dma_unmap_addr(tx_buf, mapping); + dma_len = dma_unmap_len(tx_buf, len); + + dma_unmap_single(&pdev->dev, dma_addr, dma_len, + DMA_TO_DEVICE); + } last = tx_buf->nr_frags; i += 2; for (j = 0; j < last; j++, i++) { int ring_idx = i & bp->tx_ring_mask; - skb_frag_t *frag = &skb_shinfo(skb)->frags[j]; tx_buf = &txr->tx_buf_ring[ring_idx]; - netmem_dma_unmap_page_attrs(&pdev->dev, - dma_unmap_addr(tx_buf, - mapping), - skb_frag_size(frag), - DMA_TO_DEVICE, 0); + if (dma_unmap_len(tx_buf, len)) { + dma_addr = dma_unmap_addr(tx_buf, mapping); + dma_len = dma_unmap_len(tx_buf, len); + + netmem_dma_unmap_page_attrs(&pdev->dev, + dma_addr, dma_len, + DMA_TO_DEVICE, 0); + } } dev_kfree_skb(skb); } From 0c26a0e765e70b7342a47dd3a29f5e0734081d7d Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Wed, 8 Apr 2026 16:05:54 -0700 Subject: [PATCH 05/10] net: bnxt: Add TX inline buffer infrastructure Add per-ring pre-allocated inline buffer fields (tx_inline_buf, tx_inline_dma, tx_inline_size) to bnxt_tx_ring_info and helpers to allocate and free them. A producer and consumer (tx_inline_prod, tx_inline_cons) are added to track which slot(s) of the inline buffer are in-use. The inline buffer will be used by the SW USO path for pre-allocated, pre-DMA-mapped per-segment header copies. In the future, this could be extended to support TX copybreak. Allocation helper is marked __maybe_unused in this commit because it will be wired in later. Suggested-by: Jakub Kicinski Reviewed-by: Pavan Chebbi Signed-off-by: Joe Damato Link: https://patch.msgid.link/20260408230607.2019402-6-joe@dama.to Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 35 +++++++++++++++++++++++ drivers/net/ethernet/broadcom/bnxt/bnxt.h | 6 ++++ 2 files changed, 41 insertions(+) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index bc2dac2f137d..bd93edb09ee0 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -3979,6 +3979,39 @@ static int bnxt_alloc_rx_rings(struct bnxt *bp) return rc; } +static void bnxt_free_tx_inline_buf(struct bnxt_tx_ring_info *txr, + struct pci_dev *pdev) +{ + if (!txr->tx_inline_buf) + return; + + dma_unmap_single(&pdev->dev, txr->tx_inline_dma, + txr->tx_inline_size, DMA_TO_DEVICE); + kfree(txr->tx_inline_buf); + txr->tx_inline_buf = NULL; + txr->tx_inline_size = 0; +} + +static int __maybe_unused bnxt_alloc_tx_inline_buf(struct bnxt_tx_ring_info *txr, + struct pci_dev *pdev, + unsigned int size) +{ + txr->tx_inline_buf = kmalloc(size, GFP_KERNEL); + if (!txr->tx_inline_buf) + return -ENOMEM; + + txr->tx_inline_dma = dma_map_single(&pdev->dev, txr->tx_inline_buf, + size, DMA_TO_DEVICE); + if (dma_mapping_error(&pdev->dev, txr->tx_inline_dma)) { + kfree(txr->tx_inline_buf); + txr->tx_inline_buf = NULL; + return -ENOMEM; + } + txr->tx_inline_size = size; + + return 0; +} + static void bnxt_free_tx_rings(struct bnxt *bp) { int i; @@ -3997,6 +4030,8 @@ static void bnxt_free_tx_rings(struct bnxt *bp) txr->tx_push = NULL; } + bnxt_free_tx_inline_buf(txr, pdev); + ring = &txr->tx_ring_struct; bnxt_free_ring(bp, &ring->ring_mem); diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index 83b4136ccd31..d98a58aa30f6 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -996,6 +996,12 @@ struct bnxt_tx_ring_info { dma_addr_t tx_push_mapping; __le64 data_mapping; + void *tx_inline_buf; + dma_addr_t tx_inline_dma; + unsigned int tx_inline_size; + u16 tx_inline_prod; + u16 tx_inline_cons; + #define BNXT_DEV_STATE_CLOSING 0x1 u32 dev_state; From 0440e27eedace1c96b46b67d5607348bb07281ec Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Wed, 8 Apr 2026 16:05:55 -0700 Subject: [PATCH 06/10] net: bnxt: Add boilerplate GSO code Add bnxt_gso.c and bnxt_gso.h with a stub bnxt_sw_udp_gso_xmit() function, SW USO constants (BNXT_SW_USO_MAX_SEGS, BNXT_SW_USO_MAX_DESCS), and the is_sw_gso field in bnxt_sw_tx_bd with BNXT_SW_GSO_MID/LAST markers. The full SW USO implementation will be added in a future commit. Suggested-by: Jakub Kicinski Reviewed-by: Pavan Chebbi Signed-off-by: Joe Damato Link: https://patch.msgid.link/20260408230607.2019402-7-joe@dama.to Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/Makefile | 2 +- drivers/net/ethernet/broadcom/bnxt/bnxt.h | 4 +++ drivers/net/ethernet/broadcom/bnxt/bnxt_gso.c | 30 ++++++++++++++++++ drivers/net/ethernet/broadcom/bnxt/bnxt_gso.h | 31 +++++++++++++++++++ 4 files changed, 66 insertions(+), 1 deletion(-) create mode 100644 drivers/net/ethernet/broadcom/bnxt/bnxt_gso.c create mode 100644 drivers/net/ethernet/broadcom/bnxt/bnxt_gso.h diff --git a/drivers/net/ethernet/broadcom/bnxt/Makefile b/drivers/net/ethernet/broadcom/bnxt/Makefile index ba6c239d52fa..debef78c8b6d 100644 --- a/drivers/net/ethernet/broadcom/bnxt/Makefile +++ b/drivers/net/ethernet/broadcom/bnxt/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_BNXT) += bnxt_en.o -bnxt_en-y := bnxt.o bnxt_hwrm.o bnxt_sriov.o bnxt_ethtool.o bnxt_dcb.o bnxt_ulp.o bnxt_xdp.o bnxt_ptp.o bnxt_vfr.o bnxt_devlink.o bnxt_dim.o bnxt_coredump.o +bnxt_en-y := bnxt.o bnxt_hwrm.o bnxt_sriov.o bnxt_ethtool.o bnxt_dcb.o bnxt_ulp.o bnxt_xdp.o bnxt_ptp.o bnxt_vfr.o bnxt_devlink.o bnxt_dim.o bnxt_coredump.o bnxt_gso.o bnxt_en-$(CONFIG_BNXT_FLOWER_OFFLOAD) += bnxt_tc.o bnxt_en-$(CONFIG_DEBUG_FS) += bnxt_debugfs.o bnxt_en-$(CONFIG_BNXT_HWMON) += bnxt_hwmon.o diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index d98a58aa30f6..6b38b84924e0 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -892,6 +892,7 @@ struct bnxt_sw_tx_bd { struct page *page; u8 is_ts_pkt; u8 is_push; + u8 is_sw_gso; u8 action; unsigned short nr_frags; union { @@ -900,6 +901,9 @@ struct bnxt_sw_tx_bd { }; }; +#define BNXT_SW_GSO_MID 1 +#define BNXT_SW_GSO_LAST 2 + struct bnxt_sw_rx_bd { void *data; u8 *data_ptr; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_gso.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_gso.c new file mode 100644 index 000000000000..b296769ee4fe --- /dev/null +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_gso.c @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Broadcom NetXtreme-C/E network driver. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bnxt.h" +#include "bnxt_gso.h" + +netdev_tx_t bnxt_sw_udp_gso_xmit(struct bnxt *bp, + struct bnxt_tx_ring_info *txr, + struct netdev_queue *txq, + struct sk_buff *skb) +{ + dev_kfree_skb_any(skb); + dev_core_stats_tx_dropped_inc(bp->dev); + return NETDEV_TX_OK; +} diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_gso.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_gso.h new file mode 100644 index 000000000000..f01e8102dcd7 --- /dev/null +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_gso.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Broadcom NetXtreme-C/E network driver. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. + */ + +#ifndef BNXT_GSO_H +#define BNXT_GSO_H + +/* Maximum segments the stack may send in a single SW USO skb. + * This caps gso_max_segs for NICs without HW USO support. + */ +#define BNXT_SW_USO_MAX_SEGS 64 + +/* Worst-case TX descriptors consumed by one SW USO packet: + * Each segment: 1 long BD + 1 ext BD + payload BDs. + * Total payload BDs across all segs <= num_segs + nr_frags (each frag + * boundary crossing adds at most 1 extra BD). + * So: 3 * max_segs + MAX_SKB_FRAGS + 1 = 3 * 64 + 17 + 1 = 210. + */ +#define BNXT_SW_USO_MAX_DESCS (3 * BNXT_SW_USO_MAX_SEGS + MAX_SKB_FRAGS + 1) + +netdev_tx_t bnxt_sw_udp_gso_xmit(struct bnxt *bp, + struct bnxt_tx_ring_info *txr, + struct netdev_queue *txq, + struct sk_buff *skb); + +#endif From cc5d90667db81474ed7a92a1b2fa3daec5559307 Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Wed, 8 Apr 2026 16:05:56 -0700 Subject: [PATCH 07/10] net: bnxt: Implement software USO Implement bnxt_sw_udp_gso_xmit() using the core tso_dma_map API and the pre-allocated TX inline buffer for per-segment headers. The xmit path: 1. Calls tso_start() to initialize TSO state 2. Stack-allocates a tso_dma_map and calls tso_dma_map_init() to DMA-map the linear payload and all frags upfront. 3. For each segment: - Copies and patches headers via tso_build_hdr() into the pre-allocated tx_inline_buf (DMA-synced per segment) - Counts payload BDs via tso_dma_map_count() - Emits long BD (header) + ext BD + payload BDs - Payload BDs use tso_dma_map_next() which yields (dma_addr, chunk_len, mapping_len) tuples. Header BDs set dma_unmap_len=0 since the inline buffer is pre-allocated and unmapped only at ring teardown. Completion state is updated by calling tso_dma_map_completion_save() for the last segment. Suggested-by: Jakub Kicinski Signed-off-by: Joe Damato Link: https://patch.msgid.link/20260408230607.2019402-8-joe@dama.to Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.h | 3 + drivers/net/ethernet/broadcom/bnxt/bnxt_gso.c | 210 ++++++++++++++++++ drivers/net/ethernet/broadcom/bnxt/bnxt_gso.h | 6 + 3 files changed, 219 insertions(+) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index 6b38b84924e0..fe50576ae525 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -11,6 +11,8 @@ #ifndef BNXT_H #define BNXT_H +#include + #define DRV_MODULE_NAME "bnxt_en" /* DO NOT CHANGE DRV_VER_* defines @@ -899,6 +901,7 @@ struct bnxt_sw_tx_bd { u16 rx_prod; u16 txts_prod; }; + struct tso_dma_map_completion_state sw_gso_cstate; }; #define BNXT_SW_GSO_MID 1 diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_gso.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_gso.c index b296769ee4fe..f317f60414e8 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_gso.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_gso.c @@ -19,11 +19,221 @@ #include "bnxt.h" #include "bnxt_gso.h" +static u32 bnxt_sw_gso_lhint(unsigned int len) +{ + if (len <= 512) + return TX_BD_FLAGS_LHINT_512_AND_SMALLER; + else if (len <= 1023) + return TX_BD_FLAGS_LHINT_512_TO_1023; + else if (len <= 2047) + return TX_BD_FLAGS_LHINT_1024_TO_2047; + else + return TX_BD_FLAGS_LHINT_2048_AND_LARGER; +} + netdev_tx_t bnxt_sw_udp_gso_xmit(struct bnxt *bp, struct bnxt_tx_ring_info *txr, struct netdev_queue *txq, struct sk_buff *skb) { + unsigned int last_unmap_len __maybe_unused = 0; + dma_addr_t last_unmap_addr __maybe_unused = 0; + struct bnxt_sw_tx_bd *last_unmap_buf = NULL; + unsigned int hdr_len, mss, num_segs; + struct pci_dev *pdev = bp->pdev; + unsigned int total_payload; + struct tso_dma_map map; + u32 vlan_tag_flags = 0; + int i, bds_needed; + struct tso_t tso; + u16 cfa_action; + __le32 csum; + u16 prod; + + hdr_len = tso_start(skb, &tso); + mss = skb_shinfo(skb)->gso_size; + total_payload = skb->len - hdr_len; + num_segs = DIV_ROUND_UP(total_payload, mss); + + if (unlikely(num_segs <= 1)) + goto drop; + + /* Upper bound on the number of descriptors needed. + * + * Each segment uses 1 long BD + 1 ext BD + payload BDs, which is + * at most num_segs + nr_frags (each frag boundary crossing adds at + * most 1 extra BD). + */ + bds_needed = 3 * num_segs + skb_shinfo(skb)->nr_frags + 1; + + if (unlikely(bnxt_tx_avail(bp, txr) < bds_needed)) { + netif_txq_try_stop(txq, bnxt_tx_avail(bp, txr), + bp->tx_wake_thresh); + return NETDEV_TX_BUSY; + } + + /* BD backpressure alone cannot prevent overwriting in-flight + * headers in the inline buffer. Check slot availability directly. + */ + if (!netif_txq_maybe_stop(txq, bnxt_inline_avail(txr), + num_segs, num_segs)) + return NETDEV_TX_BUSY; + + if (unlikely(tso_dma_map_init(&map, &pdev->dev, skb, hdr_len))) + goto drop; + + cfa_action = bnxt_xmit_get_cfa_action(skb); + if (skb_vlan_tag_present(skb)) { + vlan_tag_flags = TX_BD_CFA_META_KEY_VLAN | + skb_vlan_tag_get(skb); + if (skb->vlan_proto == htons(ETH_P_8021Q)) + vlan_tag_flags |= 1 << TX_BD_CFA_META_TPID_SHIFT; + } + + csum = cpu_to_le32(TX_BD_FLAGS_TCP_UDP_CHKSUM); + if (!tso.ipv6) + csum |= cpu_to_le32(TX_BD_FLAGS_IP_CKSUM); + + prod = txr->tx_prod; + + for (i = 0; i < num_segs; i++) { + unsigned int seg_payload = min_t(unsigned int, mss, + total_payload - i * mss); + u16 slot = (txr->tx_inline_prod + i) & + (BNXT_SW_USO_MAX_SEGS - 1); + struct bnxt_sw_tx_bd *tx_buf; + unsigned int mapping_len; + dma_addr_t this_hdr_dma; + unsigned int chunk_len; + unsigned int offset; + dma_addr_t dma_addr; + struct tx_bd *txbd; + struct udphdr *uh; + void *this_hdr; + int bd_count; + bool last; + u32 flags; + + last = (i == num_segs - 1); + offset = slot * TSO_HEADER_SIZE; + this_hdr = txr->tx_inline_buf + offset; + this_hdr_dma = txr->tx_inline_dma + offset; + + tso_build_hdr(skb, this_hdr, &tso, seg_payload, last); + + /* Zero stale csum fields copied from the original skb; + * HW offload recomputes from scratch. + */ + uh = this_hdr + skb_transport_offset(skb); + uh->check = 0; + if (!tso.ipv6) { + struct iphdr *iph = this_hdr + skb_network_offset(skb); + + iph->check = 0; + } + + dma_sync_single_for_device(&pdev->dev, this_hdr_dma, + hdr_len, DMA_TO_DEVICE); + + bd_count = tso_dma_map_count(&map, seg_payload); + + tx_buf = &txr->tx_buf_ring[RING_TX(bp, prod)]; + txbd = &txr->tx_desc_ring[TX_RING(bp, prod)][TX_IDX(prod)]; + + tx_buf->skb = skb; + tx_buf->nr_frags = bd_count; + tx_buf->is_push = 0; + tx_buf->is_ts_pkt = 0; + + dma_unmap_addr_set(tx_buf, mapping, this_hdr_dma); + dma_unmap_len_set(tx_buf, len, 0); + + if (last) { + tx_buf->is_sw_gso = BNXT_SW_GSO_LAST; + tso_dma_map_completion_save(&map, &tx_buf->sw_gso_cstate); + } else { + tx_buf->is_sw_gso = BNXT_SW_GSO_MID; + } + + flags = (hdr_len << TX_BD_LEN_SHIFT) | + TX_BD_TYPE_LONG_TX_BD | + TX_BD_CNT(2 + bd_count); + + flags |= bnxt_sw_gso_lhint(hdr_len + seg_payload); + + txbd->tx_bd_len_flags_type = cpu_to_le32(flags); + txbd->tx_bd_haddr = cpu_to_le64(this_hdr_dma); + txbd->tx_bd_opaque = SET_TX_OPAQUE(bp, txr, prod, + 2 + bd_count); + + prod = NEXT_TX(prod); + bnxt_init_ext_bd(bp, txr, prod, csum, + vlan_tag_flags, cfa_action); + + /* set dma_unmap_len on the LAST BD touching each + * region. Since completions are in-order, the last segment + * completes after all earlier ones, so the unmap is safe. + */ + while (tso_dma_map_next(&map, &dma_addr, &chunk_len, + &mapping_len, seg_payload)) { + prod = NEXT_TX(prod); + txbd = &txr->tx_desc_ring[TX_RING(bp, prod)][TX_IDX(prod)]; + tx_buf = &txr->tx_buf_ring[RING_TX(bp, prod)]; + + txbd->tx_bd_haddr = cpu_to_le64(dma_addr); + dma_unmap_addr_set(tx_buf, mapping, dma_addr); + dma_unmap_len_set(tx_buf, len, 0); + tx_buf->skb = NULL; + tx_buf->is_sw_gso = 0; + + if (mapping_len) { + if (last_unmap_buf) { + dma_unmap_addr_set(last_unmap_buf, + mapping, + last_unmap_addr); + dma_unmap_len_set(last_unmap_buf, + len, + last_unmap_len); + } + last_unmap_addr = dma_addr; + last_unmap_len = mapping_len; + } + last_unmap_buf = tx_buf; + + flags = chunk_len << TX_BD_LEN_SHIFT; + txbd->tx_bd_len_flags_type = cpu_to_le32(flags); + txbd->tx_bd_opaque = 0; + + seg_payload -= chunk_len; + } + + txbd->tx_bd_len_flags_type |= + cpu_to_le32(TX_BD_FLAGS_PACKET_END); + + prod = NEXT_TX(prod); + } + + if (last_unmap_buf) { + dma_unmap_addr_set(last_unmap_buf, mapping, last_unmap_addr); + dma_unmap_len_set(last_unmap_buf, len, last_unmap_len); + } + + txr->tx_inline_prod += num_segs; + + netdev_tx_sent_queue(txq, skb->len); + + WRITE_ONCE(txr->tx_prod, prod); + /* Sync BDs before doorbell */ + wmb(); + bnxt_db_write(bp, &txr->tx_db, prod); + + if (unlikely(bnxt_tx_avail(bp, txr) <= bp->tx_wake_thresh)) + netif_txq_try_stop(txq, bnxt_tx_avail(bp, txr), + bp->tx_wake_thresh); + + return NETDEV_TX_OK; + +drop: dev_kfree_skb_any(skb); dev_core_stats_tx_dropped_inc(bp->dev); return NETDEV_TX_OK; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_gso.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_gso.h index f01e8102dcd7..6ba8ccc451de 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_gso.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_gso.h @@ -23,6 +23,12 @@ */ #define BNXT_SW_USO_MAX_DESCS (3 * BNXT_SW_USO_MAX_SEGS + MAX_SKB_FRAGS + 1) +static inline u16 bnxt_inline_avail(struct bnxt_tx_ring_info *txr) +{ + return BNXT_SW_USO_MAX_SEGS - + (u16)(txr->tx_inline_prod - READ_ONCE(txr->tx_inline_cons)); +} + netdev_tx_t bnxt_sw_udp_gso_xmit(struct bnxt *bp, struct bnxt_tx_ring_info *txr, struct netdev_queue *txq, From 87550ba2dc39489be292ecb485118ea662800b82 Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Wed, 8 Apr 2026 16:05:57 -0700 Subject: [PATCH 08/10] net: bnxt: Add SW GSO completion and teardown support Update __bnxt_tx_int and bnxt_free_one_tx_ring_skbs to handle SW GSO segments: - MID segments: adjust tx_pkts/tx_bytes accounting and skip skb free (the skb is shared across all segments and freed only once) - LAST segments: call tso_dma_map_complete() to tear down the IOVA mapping if one was used. On the fallback path, payload DMA unmapping is handled by the existing per-BD dma_unmap_len walk. Both MID and LAST completions advance tx_inline_cons to release the segment's inline header slot back to the ring. is_sw_gso is initialized to zero, so the new code paths are not run. Add logic for feature advertisement and guardrails for ring sizing. Suggested-by: Jakub Kicinski Reviewed-by: Pavan Chebbi Signed-off-by: Joe Damato Link: https://patch.msgid.link/20260408230607.2019402-9-joe@dama.to Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 75 ++++++++++++++++--- .../net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 19 ++++- drivers/net/ethernet/broadcom/bnxt/bnxt_gso.h | 9 +++ 3 files changed, 92 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index bd93edb09ee0..26aae48a7d0e 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -74,6 +74,8 @@ #include "bnxt_debugfs.h" #include "bnxt_coredump.h" #include "bnxt_hwmon.h" +#include "bnxt_gso.h" +#include #define BNXT_TX_TIMEOUT (5 * HZ) #define BNXT_DEF_MSG_ENABLE (NETIF_MSG_DRV | NETIF_MSG_HW | \ @@ -817,12 +819,13 @@ static bool __bnxt_tx_int(struct bnxt *bp, struct bnxt_tx_ring_info *txr, bool rc = false; while (RING_TX(bp, cons) != hw_cons) { - struct bnxt_sw_tx_bd *tx_buf; + struct bnxt_sw_tx_bd *tx_buf, *head_buf; struct sk_buff *skb; bool is_ts_pkt; int j, last; tx_buf = &txr->tx_buf_ring[RING_TX(bp, cons)]; + head_buf = tx_buf; skb = tx_buf->skb; if (unlikely(!skb)) { @@ -869,6 +872,22 @@ static bool __bnxt_tx_int(struct bnxt *bp, struct bnxt_tx_ring_info *txr, DMA_TO_DEVICE, 0); } } + + if (unlikely(head_buf->is_sw_gso)) { + u16 inline_cons = txr->tx_inline_cons + 1; + + WRITE_ONCE(txr->tx_inline_cons, inline_cons); + if (head_buf->is_sw_gso == BNXT_SW_GSO_LAST) { + tso_dma_map_complete(&pdev->dev, + &head_buf->sw_gso_cstate); + } else { + tx_pkts--; + tx_bytes -= skb->len; + skb = NULL; + } + head_buf->is_sw_gso = 0; + } + if (unlikely(is_ts_pkt)) { if (BNXT_CHIP_P5(bp)) { /* PTP worker takes ownership of the skb */ @@ -3412,6 +3431,7 @@ static void bnxt_free_one_tx_ring_skbs(struct bnxt *bp, for (i = 0; i < max_idx;) { struct bnxt_sw_tx_bd *tx_buf = &txr->tx_buf_ring[i]; + struct bnxt_sw_tx_bd *head_buf = tx_buf; struct sk_buff *skb; int j, last; @@ -3466,7 +3486,20 @@ static void bnxt_free_one_tx_ring_skbs(struct bnxt *bp, DMA_TO_DEVICE, 0); } } - dev_kfree_skb(skb); + if (head_buf->is_sw_gso) { + u16 inline_cons = txr->tx_inline_cons + 1; + + WRITE_ONCE(txr->tx_inline_cons, inline_cons); + if (head_buf->is_sw_gso == BNXT_SW_GSO_LAST) { + tso_dma_map_complete(&pdev->dev, + &head_buf->sw_gso_cstate); + } else { + skb = NULL; + } + head_buf->is_sw_gso = 0; + } + if (skb) + dev_kfree_skb(skb); } netdev_tx_reset_queue(netdev_get_tx_queue(bp->dev, idx)); } @@ -3992,9 +4025,9 @@ static void bnxt_free_tx_inline_buf(struct bnxt_tx_ring_info *txr, txr->tx_inline_size = 0; } -static int __maybe_unused bnxt_alloc_tx_inline_buf(struct bnxt_tx_ring_info *txr, - struct pci_dev *pdev, - unsigned int size) +static int bnxt_alloc_tx_inline_buf(struct bnxt_tx_ring_info *txr, + struct pci_dev *pdev, + unsigned int size) { txr->tx_inline_buf = kmalloc(size, GFP_KERNEL); if (!txr->tx_inline_buf) @@ -4097,6 +4130,13 @@ static int bnxt_alloc_tx_rings(struct bnxt *bp) sizeof(struct tx_push_bd); txr->data_mapping = cpu_to_le64(mapping); } + if (!(bp->flags & BNXT_FLAG_UDP_GSO_CAP)) { + rc = bnxt_alloc_tx_inline_buf(txr, pdev, + BNXT_SW_USO_MAX_SEGS * + TSO_HEADER_SIZE); + if (rc) + return rc; + } qidx = bp->tc_to_qidx[j]; ring->queue_id = bp->q_info[qidx].queue_id; spin_lock_init(&txr->xdp_tx_lock); @@ -4635,10 +4675,13 @@ static int bnxt_init_rx_rings(struct bnxt *bp) static int bnxt_init_tx_rings(struct bnxt *bp) { + netdev_features_t features; u16 i; + features = bp->dev->features; + bp->tx_wake_thresh = max_t(int, bp->tx_ring_size / 2, - BNXT_MIN_TX_DESC_CNT); + bnxt_min_tx_desc_cnt(bp, features)); for (i = 0; i < bp->tx_nr_rings; i++) { struct bnxt_tx_ring_info *txr = &bp->tx_ring[i]; @@ -13837,6 +13880,11 @@ static netdev_features_t bnxt_fix_features(struct net_device *dev, if ((features & NETIF_F_NTUPLE) && !bnxt_rfs_capable(bp, false)) features &= ~NETIF_F_NTUPLE; + if ((features & NETIF_F_GSO_UDP_L4) && + !(bp->flags & BNXT_FLAG_UDP_GSO_CAP) && + bp->tx_ring_size < 2 * BNXT_SW_USO_MAX_DESCS) + features &= ~NETIF_F_GSO_UDP_L4; + if ((bp->flags & BNXT_FLAG_NO_AGG_RINGS) || bp->xdp_prog) features &= ~(NETIF_F_LRO | NETIF_F_GRO_HW); @@ -13882,6 +13930,9 @@ static int bnxt_set_features(struct net_device *dev, netdev_features_t features) int rc = 0; bool re_init = false; + bp->tx_wake_thresh = max_t(int, bp->tx_ring_size / 2, + bnxt_min_tx_desc_cnt(bp, features)); + flags &= ~BNXT_FLAG_ALL_CONFIG_FEATS; if (features & NETIF_F_GRO_HW) flags |= BNXT_FLAG_GRO; @@ -16907,8 +16958,7 @@ static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) NETIF_F_GSO_UDP_TUNNEL_CSUM | NETIF_F_GSO_GRE_CSUM | NETIF_F_GSO_PARTIAL | NETIF_F_RXHASH | NETIF_F_RXCSUM | NETIF_F_GRO; - if (bp->flags & BNXT_FLAG_UDP_GSO_CAP) - dev->hw_features |= NETIF_F_GSO_UDP_L4; + dev->hw_features |= NETIF_F_GSO_UDP_L4; if (BNXT_SUPPORTS_TPA(bp)) dev->hw_features |= NETIF_F_LRO; @@ -16941,8 +16991,15 @@ static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) dev->priv_flags |= IFF_UNICAST_FLT; netif_set_tso_max_size(dev, GSO_MAX_SIZE); - if (bp->tso_max_segs) + if (!(bp->flags & BNXT_FLAG_UDP_GSO_CAP)) { + u16 max_segs = BNXT_SW_USO_MAX_SEGS; + + if (bp->tso_max_segs) + max_segs = min_t(u16, max_segs, bp->tso_max_segs); + netif_set_tso_max_segs(dev, max_segs); + } else if (bp->tso_max_segs) { netif_set_tso_max_segs(dev, bp->tso_max_segs); + } dev->xdp_features = NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_REDIRECT | NETDEV_XDP_ACT_RX_SG; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c index 6826bf762d26..9ded88196bb4 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c @@ -33,6 +33,7 @@ #include "bnxt_xdp.h" #include "bnxt_ptp.h" #include "bnxt_ethtool.h" +#include "bnxt_gso.h" #include "bnxt_nvm_defs.h" /* NVRAM content constant and structure defs */ #include "bnxt_fw_hdr.h" /* Firmware hdr constant and structure defs */ #include "bnxt_coredump.h" @@ -852,12 +853,18 @@ static int bnxt_set_ringparam(struct net_device *dev, u8 tcp_data_split = kernel_ering->tcp_data_split; struct bnxt *bp = netdev_priv(dev); u8 hds_config_mod; + int rc; if ((ering->rx_pending > BNXT_MAX_RX_DESC_CNT) || (ering->tx_pending > BNXT_MAX_TX_DESC_CNT) || (ering->tx_pending < BNXT_MIN_TX_DESC_CNT)) return -EINVAL; + if ((dev->features & NETIF_F_GSO_UDP_L4) && + !(bp->flags & BNXT_FLAG_UDP_GSO_CAP) && + ering->tx_pending < 2 * BNXT_SW_USO_MAX_DESCS) + return -EINVAL; + hds_config_mod = tcp_data_split != dev->cfg->hds_config; if (tcp_data_split == ETHTOOL_TCP_DATA_SPLIT_DISABLED && hds_config_mod) return -EINVAL; @@ -882,9 +889,17 @@ static int bnxt_set_ringparam(struct net_device *dev, bp->tx_ring_size = ering->tx_pending; bnxt_set_ring_params(bp); - if (netif_running(dev)) - return bnxt_open_nic(bp, false, false); + if (netif_running(dev)) { + rc = bnxt_open_nic(bp, false, false); + if (rc) + return rc; + } + /* ring size changes may affect features (SW USO requires a minimum + * ring size), so recalculate features to ensure the correct features + * are blocked/available. + */ + netdev_update_features(dev); return 0; } diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_gso.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_gso.h index 6ba8ccc451de..47528c20f311 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_gso.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_gso.h @@ -29,6 +29,15 @@ static inline u16 bnxt_inline_avail(struct bnxt_tx_ring_info *txr) (u16)(txr->tx_inline_prod - READ_ONCE(txr->tx_inline_cons)); } +static inline int bnxt_min_tx_desc_cnt(struct bnxt *bp, + netdev_features_t features) +{ + if (!(bp->flags & BNXT_FLAG_UDP_GSO_CAP) && + (features & NETIF_F_GSO_UDP_L4)) + return BNXT_SW_USO_MAX_DESCS; + return BNXT_MIN_TX_DESC_CNT; +} + netdev_tx_t bnxt_sw_udp_gso_xmit(struct bnxt *bp, struct bnxt_tx_ring_info *txr, struct netdev_queue *txq, From 28f2c22398fbaaad9f16ac84647a3ed9e1a1e284 Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Wed, 8 Apr 2026 16:05:58 -0700 Subject: [PATCH 09/10] net: bnxt: Dispatch to SW USO Wire in the SW USO path added in preceding commits when hardware USO is not possible. When a GSO skb with SKB_GSO_UDP_L4 arrives and the NIC lacks HW USO capability, redirect to bnxt_sw_udp_gso_xmit() which handles software segmentation into individual UDP frames submitted directly to the TX ring. Suggested-by: Jakub Kicinski Reviewed-by: Pavan Chebbi Signed-off-by: Joe Damato Link: https://patch.msgid.link/20260408230607.2019402-10-joe@dama.to Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 26aae48a7d0e..2715632115a5 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -508,6 +508,11 @@ static netdev_tx_t bnxt_start_xmit(struct sk_buff *skb, struct net_device *dev) } } #endif + if (skb_is_gso(skb) && + (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) && + !(bp->flags & BNXT_FLAG_UDP_GSO_CAP)) + return bnxt_sw_udp_gso_xmit(bp, txr, txq, skb); + free_size = bnxt_tx_avail(bp, txr); if (unlikely(free_size < skb_shinfo(skb)->nr_frags + 2)) { /* We must have raced with NAPI cleanup */ From 5d3b12d1a24b72e147fbb585158f51585593f640 Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Wed, 8 Apr 2026 16:05:59 -0700 Subject: [PATCH 10/10] selftests: drv-net: Add USO test Add a simple test for USO. Tests both ipv4 and ipv6 with several full segments and a partial segment. Suggested-by: Jakub Kicinski Signed-off-by: Joe Damato Link: https://patch.msgid.link/20260408230607.2019402-11-joe@dama.to Signed-off-by: Jakub Kicinski --- .../testing/selftests/drivers/net/hw/Makefile | 1 + tools/testing/selftests/drivers/net/hw/uso.py | 103 ++++++++++++++++++ 2 files changed, 104 insertions(+) create mode 100755 tools/testing/selftests/drivers/net/hw/uso.py diff --git a/tools/testing/selftests/drivers/net/hw/Makefile b/tools/testing/selftests/drivers/net/hw/Makefile index d84e955ac87b..85ca4d1ecf9e 100644 --- a/tools/testing/selftests/drivers/net/hw/Makefile +++ b/tools/testing/selftests/drivers/net/hw/Makefile @@ -45,6 +45,7 @@ TEST_PROGS = \ rss_input_xfrm.py \ toeplitz.py \ tso.py \ + uso.py \ xdp_metadata.py \ xsk_reconfig.py \ # diff --git a/tools/testing/selftests/drivers/net/hw/uso.py b/tools/testing/selftests/drivers/net/hw/uso.py new file mode 100755 index 000000000000..6d61e56cab3c --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/uso.py @@ -0,0 +1,103 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +"""Test USO + +Sends large UDP datagrams with UDP_SEGMENT and verifies that the peer +receives the expected total payload and that the NIC transmitted at least +the expected number of segments. +""" +import random +import socket +import string + +from lib.py import ksft_run, ksft_exit, KsftSkipEx +from lib.py import ksft_eq, ksft_ge, ksft_variants, KsftNamedVariant +from lib.py import NetDrvEpEnv +from lib.py import bkg, defer, ethtool, ip, rand_port, wait_port_listen + +# python doesn't expose this constant, so we need to hardcode it to enable UDP +# segmentation for large payloads +UDP_SEGMENT = 103 + + +def _send_uso(cfg, ipver, mss, total_payload, port): + if ipver == "4": + sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) + dst = (cfg.remote_addr_v["4"], port) + else: + sock = socket.socket(socket.AF_INET6, socket.SOCK_DGRAM) + dst = (cfg.remote_addr_v["6"], port) + + sock.setsockopt(socket.IPPROTO_UDP, UDP_SEGMENT, mss) + payload = ''.join(random.choice(string.ascii_lowercase) + for _ in range(total_payload)) + sock.sendto(payload.encode(), dst) + sock.close() + + +def _get_tx_packets(cfg): + stats = ip(f"-s link show dev {cfg.ifname}", json=True)[0] + return stats['stats64']['tx']['packets'] + + +def _test_uso(cfg, ipver, mss, total_payload): + cfg.require_ipver(ipver) + cfg.require_cmd("socat", remote=True) + + features = ethtool(f"-k {cfg.ifname}", json=True) + uso_was_on = features[0]["tx-udp-segmentation"]["active"] + + try: + ethtool(f"-K {cfg.ifname} tx-udp-segmentation on") + except Exception as exc: + raise KsftSkipEx( + "Device does not support tx-udp-segmentation") from exc + if not uso_was_on: + defer(ethtool, f"-K {cfg.ifname} tx-udp-segmentation off") + + expected_segs = (total_payload + mss - 1) // mss + + port = rand_port(stype=socket.SOCK_DGRAM) + rx_cmd = f"socat -{ipver} -T 2 -u UDP-LISTEN:{port},reuseport STDOUT" + + tx_before = _get_tx_packets(cfg) + + with bkg(rx_cmd, host=cfg.remote, exit_wait=True) as rx: + wait_port_listen(port, proto="udp", host=cfg.remote) + _send_uso(cfg, ipver, mss, total_payload, port) + + ksft_eq(len(rx.stdout), total_payload, + comment=f"Received {len(rx.stdout)}B, expected {total_payload}B") + + cfg.wait_hw_stats_settle() + + tx_after = _get_tx_packets(cfg) + tx_delta = tx_after - tx_before + + ksft_ge(tx_delta, expected_segs, + comment=f"Expected >= {expected_segs} tx packets, got {tx_delta}") + + +def _uso_variants(): + for ipver in ["4", "6"]: + yield KsftNamedVariant(f"v{ipver}_partial", ipver, 1400, 1400 * 10 + 500) + yield KsftNamedVariant(f"v{ipver}_exact", ipver, 1400, 1400 * 5) + + +@ksft_variants(_uso_variants()) +def test_uso(cfg, ipver, mss, total_payload): + """Send a USO datagram and verify the peer receives the expected segments.""" + _test_uso(cfg, ipver, mss, total_payload) + + +def main() -> None: + """Run USO tests.""" + with NetDrvEpEnv(__file__) as cfg: + ksft_run([test_uso], + args=(cfg, )) + ksft_exit() + + +if __name__ == "__main__": + main()