diff --git a/include/linux/swap.h b/include/linux/swap.h index 62fc7499b408..0effe3cc50f5 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -208,7 +208,6 @@ enum { SWP_DISCARDABLE = (1 << 2), /* blkdev support discard */ SWP_DISCARDING = (1 << 3), /* now discarding a free cluster */ SWP_SOLIDSTATE = (1 << 4), /* blkdev seeks are cheap */ - SWP_CONTINUED = (1 << 5), /* swap_map has count continuation */ SWP_BLKDEV = (1 << 6), /* its a block device */ SWP_ACTIVATED = (1 << 7), /* set after swap_activate success */ SWP_FS_OPS = (1 << 8), /* swapfile operations go through fs */ @@ -223,16 +222,6 @@ enum { #define SWAP_CLUSTER_MAX_SKIPPED (SWAP_CLUSTER_MAX << 10) #define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX -/* Bit flag in swap_map */ -#define COUNT_CONTINUED 0x80 /* Flag swap_map continuation for full count */ - -/* Special value in first swap_map */ -#define SWAP_MAP_MAX 0x3e /* Max count */ -#define SWAP_MAP_BAD 0x3f /* Note page is bad */ - -/* Special value in each swap_map continuation */ -#define SWAP_CONT_MAX 0x7f /* Max count */ - /* * The first page in the swap file is the swap header, which is always marked * bad to prevent it from being allocated as an entry. This also prevents the @@ -264,8 +253,7 @@ struct swap_info_struct { signed short prio; /* swap priority of this type */ struct plist_node list; /* entry in swap_active_head */ signed char type; /* strange name for an index */ - unsigned int max; /* extent of the swap_map */ - unsigned char *swap_map; /* vmalloc'ed array of usage counts */ + unsigned int max; /* size of this swap device */ unsigned long *zeromap; /* kvmalloc'ed bitmap to track zero pages */ struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */ struct list_head free_clusters; /* free clusters list */ @@ -284,18 +272,14 @@ struct swap_info_struct { struct completion comp; /* seldom referenced */ spinlock_t lock; /* * protect map scan related fields like - * swap_map, inuse_pages and all cluster - * lists. other fields are only changed + * inuse_pages and all cluster lists. + * Other fields are only changed * at swapon/swapoff, so are protected * by swap_lock. changing flags need * hold this lock and swap_lock. If * both locks need hold, hold swap_lock * first. */ - spinlock_t cont_lock; /* - * protect swap count continuation page - * list. 
- */ struct work_struct discard_work; /* discard worker */ struct work_struct reclaim_work; /* reclaim worker */ struct list_head discard_clusters; /* discard clusters list */ @@ -451,7 +435,6 @@ static inline long get_nr_swap_pages(void) } extern void si_swapinfo(struct sysinfo *); -extern int add_swap_count_continuation(swp_entry_t, gfp_t); int swap_type_of(dev_t device, sector_t offset); int find_first_swap(dev_t *device); extern unsigned int count_swap_pages(int, int); @@ -517,11 +500,6 @@ static inline void free_swap_cache(struct folio *folio) { } -static inline int add_swap_count_continuation(swp_entry_t swp, gfp_t gfp_mask) -{ - return 0; -} - static inline int swap_dup_entry_direct(swp_entry_t ent) { return 0; diff --git a/mm/memory.c b/mm/memory.c index 2f815a34d924..7084c426f933 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1346,7 +1346,7 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, if (ret == -EIO) { VM_WARN_ON_ONCE(!entry.val); - if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) { + if (swap_retry_table_alloc(entry, GFP_KERNEL) < 0) { ret = -ENOMEM; goto out; } diff --git a/mm/swap.h b/mm/swap.h index bfafa637c458..0a91e21e92b1 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -37,6 +37,7 @@ struct swap_cluster_info { u8 flags; u8 order; atomic_long_t __rcu *table; /* Swap table entries, see mm/swap_table.h */ + unsigned int *extend_table; /* For large swap count, protected by ci->lock */ struct list_head list; }; @@ -183,6 +184,8 @@ static inline void swap_cluster_unlock_irq(struct swap_cluster_info *ci) spin_unlock_irq(&ci->lock); } +extern int swap_retry_table_alloc(swp_entry_t entry, gfp_t gfp); + /* * Below are the core routines for doing swap for a folio. * All helpers requires the folio to be locked, and a locked folio @@ -206,9 +209,9 @@ int folio_dup_swap(struct folio *folio, struct page *subpage); void folio_put_swap(struct folio *folio, struct page *subpage); /* For internal use */ -extern void swap_entries_free(struct swap_info_struct *si, - struct swap_cluster_info *ci, - unsigned long offset, unsigned int nr_pages); +extern void __swap_cluster_free_entries(struct swap_info_struct *si, + struct swap_cluster_info *ci, + unsigned int ci_off, unsigned int nr_pages); /* linux/mm/page_io.c */ int sio_pool_init(void); @@ -446,6 +449,11 @@ static inline int swap_writeout(struct folio *folio, return 0; } +static inline int swap_retry_table_alloc(swp_entry_t entry, gfp_t gfp) +{ + return -EINVAL; +} + static inline bool swap_cache_has_folio(swp_entry_t entry) { return false; diff --git a/mm/swap_state.c b/mm/swap_state.c index e213ee35c1d2..e7618ffe6d70 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -140,21 +140,20 @@ void *swap_cache_get_shadow(swp_entry_t entry) void __swap_cache_add_folio(struct swap_cluster_info *ci, struct folio *folio, swp_entry_t entry) { - unsigned long new_tb; - unsigned int ci_start, ci_off, ci_end; + unsigned int ci_off = swp_cluster_offset(entry), ci_end; unsigned long nr_pages = folio_nr_pages(folio); + unsigned long pfn = folio_pfn(folio); + unsigned long old_tb; VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); VM_WARN_ON_ONCE_FOLIO(folio_test_swapcache(folio), folio); VM_WARN_ON_ONCE_FOLIO(!folio_test_swapbacked(folio), folio); - new_tb = folio_to_swp_tb(folio, 0); - ci_start = swp_cluster_offset(entry); - ci_off = ci_start; - ci_end = ci_start + nr_pages; + ci_end = ci_off + nr_pages; do { - VM_WARN_ON_ONCE(swp_tb_is_folio(__swap_table_get(ci, ci_off))); - __swap_table_set(ci, ci_off, 
 new_tb); + old_tb = __swap_table_get(ci, ci_off); + VM_WARN_ON_ONCE(swp_tb_is_folio(old_tb)); + __swap_table_set(ci, ci_off, pfn_to_swp_tb(pfn, __swp_tb_get_count(old_tb))); } while (++ci_off < ci_end); folio_ref_add(folio, nr_pages); @@ -183,14 +182,13 @@ static int swap_cache_add_folio(struct folio *folio, swp_entry_t entry, unsigned long old_tb; struct swap_info_struct *si; struct swap_cluster_info *ci; - unsigned int ci_start, ci_off, ci_end, offset; + unsigned int ci_start, ci_off, ci_end; unsigned long nr_pages = folio_nr_pages(folio); si = __swap_entry_to_info(entry); ci_start = swp_cluster_offset(entry); ci_end = ci_start + nr_pages; ci_off = ci_start; - offset = swp_offset(entry); ci = swap_cluster_lock(si, swp_offset(entry)); if (unlikely(!ci->table)) { err = -ENOENT; @@ -202,13 +200,12 @@ static int swap_cache_add_folio(struct folio *folio, swp_entry_t entry, err = -EEXIST; goto failed; } - if (unlikely(!__swap_count(swp_entry(swp_type(entry), offset)))) { + if (unlikely(!__swp_tb_get_count(old_tb))) { err = -ENOENT; goto failed; } if (swp_tb_is_shadow(old_tb)) shadow = swp_tb_to_shadow(old_tb); - offset++; } while (++ci_off < ci_end); __swap_cache_add_folio(ci, folio, entry); swap_cluster_unlock(ci); @@ -237,8 +234,9 @@ static int swap_cache_add_folio(struct folio *folio, swp_entry_t entry, void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio, swp_entry_t entry, void *shadow) { + int count; + unsigned long old_tb; struct swap_info_struct *si; - unsigned long old_tb, new_tb; unsigned int ci_start, ci_off, ci_end; bool folio_swapped = false, need_free = false; unsigned long nr_pages = folio_nr_pages(folio); @@ -249,20 +247,20 @@ void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio, VM_WARN_ON_ONCE_FOLIO(folio_test_writeback(folio), folio); si = __swap_entry_to_info(entry); - new_tb = shadow_to_swp_tb(shadow, 0); ci_start = swp_cluster_offset(entry); ci_end = ci_start + nr_pages; ci_off = ci_start; do { - /* If shadow is NULL, we sets an empty shadow */ - old_tb = __swap_table_xchg(ci, ci_off, new_tb); + old_tb = __swap_table_get(ci, ci_off); WARN_ON_ONCE(!swp_tb_is_folio(old_tb) || swp_tb_to_folio(old_tb) != folio); - if (__swap_count(swp_entry(si->type, - swp_offset(entry) + ci_off - ci_start))) + count = __swp_tb_get_count(old_tb); + if (count) folio_swapped = true; else need_free = true; + /* If shadow is NULL, we set an empty shadow. 
*/ + __swap_table_set(ci, ci_off, shadow_to_swp_tb(shadow, count)); } while (++ci_off < ci_end); folio->swap.val = 0; @@ -271,13 +269,13 @@ void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio, lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr_pages); if (!folio_swapped) { - swap_entries_free(si, ci, swp_offset(entry), nr_pages); + __swap_cluster_free_entries(si, ci, ci_start, nr_pages); } else if (need_free) { + ci_off = ci_start; do { - if (!__swap_count(entry)) - swap_entries_free(si, ci, swp_offset(entry), 1); - entry.val++; - } while (--nr_pages); + if (!__swp_tb_get_count(__swap_table_get(ci, ci_off))) + __swap_cluster_free_entries(si, ci, ci_off, 1); + } while (++ci_off < ci_end); } } @@ -324,17 +322,18 @@ void __swap_cache_replace_folio(struct swap_cluster_info *ci, unsigned long nr_pages = folio_nr_pages(new); unsigned int ci_off = swp_cluster_offset(entry); unsigned int ci_end = ci_off + nr_pages; - unsigned long old_tb, new_tb; + unsigned long pfn = folio_pfn(new); + unsigned long old_tb; VM_WARN_ON_ONCE(!folio_test_swapcache(old) || !folio_test_swapcache(new)); VM_WARN_ON_ONCE(!folio_test_locked(old) || !folio_test_locked(new)); VM_WARN_ON_ONCE(!entry.val); /* Swap cache still stores N entries instead of a high-order entry */ - new_tb = folio_to_swp_tb(new, 0); do { - old_tb = __swap_table_xchg(ci, ci_off, new_tb); + old_tb = __swap_table_get(ci, ci_off); WARN_ON_ONCE(!swp_tb_is_folio(old_tb) || swp_tb_to_folio(old_tb) != old); + __swap_table_set(ci, ci_off, pfn_to_swp_tb(pfn, __swp_tb_get_count(old_tb))); } while (++ci_off < ci_end); /* @@ -368,7 +367,7 @@ void __swap_cache_clear_shadow(swp_entry_t entry, int nr_ents) ci_end = ci_off + nr_ents; do { old = __swap_table_xchg(ci, ci_off, null_to_swp_tb()); - WARN_ON_ONCE(swp_tb_is_folio(old)); + WARN_ON_ONCE(swp_tb_is_folio(old) || swp_tb_get_count(old)); } while (++ci_off < ci_end); } diff --git a/mm/swap_table.h b/mm/swap_table.h index 10762ac5f4f5..8415ffbe2b9c 100644 --- a/mm/swap_table.h +++ b/mm/swap_table.h @@ -191,6 +191,11 @@ static inline int swp_tb_get_count(unsigned long swp_tb) return -EINVAL; } +static inline unsigned long __swp_tb_mk_count(unsigned long swp_tb, int count) +{ + return ((swp_tb & ~SWP_TB_COUNT_MASK) | __count_to_swp_tb(count)); +} + /* * Helpers for accessing or modifying the swap table of a cluster, * the swap cluster must be locked. 
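Note (illustrative, not part of the patch): the mm/swap_table.h hunk above adds __swp_tb_mk_count(), which rebuilds a swap table entry with a new count while keeping its folio/shadow payload. As a minimal sketch of how the in-table count of one slot can be bumped under ci->lock, assuming the helpers this series already provides (__swap_table_get(), __swap_table_set(), __swp_tb_get_count(), SWP_TB_COUNT_MAX) and a hypothetical function name:

static void example_bump_count(struct swap_cluster_info *ci, unsigned int ci_off)
{
	/* Illustration only; caller must hold ci->lock */
	unsigned long swp_tb = __swap_table_get(ci, ci_off);
	int count = __swp_tb_get_count(swp_tb);

	if (count < SWP_TB_COUNT_MAX - 1)
		/* Rewrite the same entry with count + 1, keeping the folio/shadow bits */
		__swap_table_set(ci, ci_off, __swp_tb_mk_count(swp_tb, count + 1));
	/* Counts at or beyond SWP_TB_COUNT_MAX - 1 spill into ci->extend_table, see mm/swapfile.c */
}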
diff --git a/mm/swapfile.c b/mm/swapfile.c index 54a19ebce540..cf976ecae8a8 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -51,15 +51,8 @@ #include "swap_table.h" #include "swap.h" -static bool swap_count_continued(struct swap_info_struct *, pgoff_t, - unsigned char); -static void free_swap_count_continuations(struct swap_info_struct *); static void swap_range_alloc(struct swap_info_struct *si, unsigned int nr_entries); -static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr); -static void swap_put_entry_locked(struct swap_info_struct *si, - struct swap_cluster_info *ci, - unsigned long offset); static bool folio_swapcache_freeable(struct folio *folio); static void move_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci, struct list_head *list, @@ -182,22 +175,19 @@ static long swap_usage_in_pages(struct swap_info_struct *si) /* Reclaim the swap entry if swap is getting full */ #define TTRS_FULL 0x4 -static bool swap_only_has_cache(struct swap_info_struct *si, - struct swap_cluster_info *ci, +static bool swap_only_has_cache(struct swap_cluster_info *ci, unsigned long offset, int nr_pages) { unsigned int ci_off = offset % SWAPFILE_CLUSTER; - unsigned char *map = si->swap_map + offset; - unsigned char *map_end = map + nr_pages; + unsigned int ci_end = ci_off + nr_pages; unsigned long swp_tb; do { swp_tb = __swap_table_get(ci, ci_off); VM_WARN_ON_ONCE(!swp_tb_is_folio(swp_tb)); - if (*map) + if (swp_tb_get_count(swp_tb)) return false; - ++ci_off; - } while (++map < map_end); + } while (++ci_off < ci_end); return true; } @@ -256,7 +246,7 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, * reference or pending writeback, and can't be allocated to others. */ ci = swap_cluster_lock(si, offset); - need_reclaim = swap_only_has_cache(si, ci, offset, nr_pages); + need_reclaim = swap_only_has_cache(ci, offset, nr_pages); swap_cluster_unlock(ci); if (!need_reclaim) goto out_unlock; @@ -479,6 +469,7 @@ static void swap_cluster_assert_empty(struct swap_cluster_info *ci, } while (++ci_off < ci_end); WARN_ON_ONCE(bad_slots != (swapoff ? 
ci->count : 0)); + WARN_ON_ONCE(nr == SWAPFILE_CLUSTER && ci->extend_table); } static void swap_cluster_free_table(struct swap_cluster_info *ci) @@ -807,7 +798,6 @@ static int swap_cluster_setup_bad_slot(struct swap_info_struct *si, pr_warn("Duplicated bad slot offset %d\n", offset); ret = -EINVAL; } else { - si->swap_map[offset] = SWAP_MAP_BAD; ci->count++; } spin_unlock(&ci->lock); @@ -829,18 +819,16 @@ static bool cluster_reclaim_range(struct swap_info_struct *si, { unsigned int nr_pages = 1 << order; unsigned long offset = start, end = start + nr_pages; - unsigned char *map = si->swap_map; unsigned long swp_tb; spin_unlock(&ci->lock); do { - if (READ_ONCE(map[offset])) - break; swp_tb = swap_table_get(ci, offset % SWAPFILE_CLUSTER); - if (swp_tb_is_folio(swp_tb)) { + if (swp_tb_get_count(swp_tb)) + break; + if (swp_tb_is_folio(swp_tb)) if (__try_to_reclaim_swap(si, offset, TTRS_ANYWAY) < 0) break; - } } while (++offset < end); spin_lock(&ci->lock); @@ -864,7 +852,7 @@ static bool cluster_reclaim_range(struct swap_info_struct *si, */ for (offset = start; offset < end; offset++) { swp_tb = __swap_table_get(ci, offset % SWAPFILE_CLUSTER); - if (map[offset] || !swp_tb_is_null(swp_tb)) + if (!swp_tb_is_null(swp_tb)) return false; } @@ -876,37 +864,35 @@ static bool cluster_scan_range(struct swap_info_struct *si, unsigned long offset, unsigned int nr_pages, bool *need_reclaim) { - unsigned long end = offset + nr_pages; - unsigned char *map = si->swap_map; + unsigned int ci_off = offset % SWAPFILE_CLUSTER; + unsigned int ci_end = ci_off + nr_pages; unsigned long swp_tb; - if (cluster_is_empty(ci)) - return true; - do { - if (map[offset]) - return false; - swp_tb = __swap_table_get(ci, offset % SWAPFILE_CLUSTER); - if (swp_tb_is_folio(swp_tb)) { + swp_tb = __swap_table_get(ci, ci_off); + if (swp_tb_is_null(swp_tb)) + continue; + if (swp_tb_is_folio(swp_tb) && !__swp_tb_get_count(swp_tb)) { if (!vm_swap_full()) return false; *need_reclaim = true; - } else { - /* A entry with no count and no cache must be null */ - VM_WARN_ON_ONCE(!swp_tb_is_null(swp_tb)); + continue; } - } while (++offset < end); + /* Slot with zero count can only be NULL or folio */ + VM_WARN_ON(!swp_tb_get_count(swp_tb)); + return false; + } while (++ci_off < ci_end); return true; } -static bool cluster_alloc_range(struct swap_info_struct *si, - struct swap_cluster_info *ci, - struct folio *folio, - unsigned int offset) +static bool __swap_cluster_alloc_entries(struct swap_info_struct *si, + struct swap_cluster_info *ci, + struct folio *folio, + unsigned int ci_off) { - unsigned long nr_pages; unsigned int order; + unsigned long nr_pages; lockdep_assert_held(&ci->lock); @@ -925,14 +911,15 @@ static bool cluster_alloc_range(struct swap_info_struct *si, if (likely(folio)) { order = folio_order(folio); nr_pages = 1 << order; - swap_cluster_assert_empty(ci, offset % SWAPFILE_CLUSTER, nr_pages, false); - __swap_cache_add_folio(ci, folio, swp_entry(si->type, offset)); + swap_cluster_assert_empty(ci, ci_off, nr_pages, false); + __swap_cache_add_folio(ci, folio, swp_entry(si->type, + ci_off + cluster_offset(si, ci))); } else if (IS_ENABLED(CONFIG_HIBERNATION)) { order = 0; nr_pages = 1; - WARN_ON_ONCE(si->swap_map[offset]); - si->swap_map[offset] = 1; - swap_cluster_assert_empty(ci, offset % SWAPFILE_CLUSTER, 1, false); + swap_cluster_assert_empty(ci, ci_off, 1, false); + /* Sets a fake shadow as placeholder */ + __swap_table_set(ci, ci_off, shadow_to_swp_tb(NULL, 1)); } else { /* Allocation without folio is only possible with 
hibernation */ WARN_ON_ONCE(1); @@ -983,7 +970,7 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, if (!ret) continue; } - if (!cluster_alloc_range(si, ci, folio, offset)) + if (!__swap_cluster_alloc_entries(si, ci, folio, offset % SWAPFILE_CLUSTER)) break; found = offset; offset += nr_pages; @@ -1030,7 +1017,7 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force) long to_scan = 1; unsigned long offset, end; struct swap_cluster_info *ci; - unsigned char *map = si->swap_map; + unsigned long swp_tb; int nr_reclaim; if (force) @@ -1042,8 +1029,8 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force) to_scan--; while (offset < end) { - if (!READ_ONCE(map[offset]) && - swp_tb_is_folio(swap_table_get(ci, offset % SWAPFILE_CLUSTER))) { + swp_tb = swap_table_get(ci, offset % SWAPFILE_CLUSTER); + if (swp_tb_is_folio(swp_tb) && !__swp_tb_get_count(swp_tb)) { spin_unlock(&ci->lock); nr_reclaim = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY); @@ -1452,40 +1439,127 @@ static bool swap_sync_discard(void) return false; } +static int swap_extend_table_alloc(struct swap_info_struct *si, + struct swap_cluster_info *ci, gfp_t gfp) +{ + void *table; + + table = kzalloc(sizeof(ci->extend_table[0]) * SWAPFILE_CLUSTER, gfp); + if (!table) + return -ENOMEM; + + spin_lock(&ci->lock); + if (!ci->extend_table) + ci->extend_table = table; + else + kfree(table); + spin_unlock(&ci->lock); + return 0; +} + +int swap_retry_table_alloc(swp_entry_t entry, gfp_t gfp) +{ + int ret; + struct swap_info_struct *si; + struct swap_cluster_info *ci; + unsigned long offset = swp_offset(entry); + + si = get_swap_device(entry); + if (!si) + return 0; + + ci = __swap_offset_to_cluster(si, offset); + ret = swap_extend_table_alloc(si, ci, gfp); + + put_swap_device(si); + return ret; +} + +static void swap_extend_table_try_free(struct swap_cluster_info *ci) +{ + unsigned long i; + bool can_free = true; + + if (!ci->extend_table) + return; + + for (i = 0; i < SWAPFILE_CLUSTER; i++) { + if (ci->extend_table[i]) + can_free = false; + } + + if (can_free) { + kfree(ci->extend_table); + ci->extend_table = NULL; + } +} + +/* Decrease the swap count of one slot, without freeing it */ +static void __swap_cluster_put_entry(struct swap_cluster_info *ci, + unsigned int ci_off) +{ + int count; + unsigned long swp_tb; + + lockdep_assert_held(&ci->lock); + swp_tb = __swap_table_get(ci, ci_off); + count = __swp_tb_get_count(swp_tb); + + VM_WARN_ON_ONCE(count <= 0); + VM_WARN_ON_ONCE(count > SWP_TB_COUNT_MAX); + + if (count == SWP_TB_COUNT_MAX) { + count = ci->extend_table[ci_off]; + /* Overflow starts with SWP_TB_COUNT_MAX */ + VM_WARN_ON_ONCE(count < SWP_TB_COUNT_MAX); + count--; + if (count == (SWP_TB_COUNT_MAX - 1)) { + ci->extend_table[ci_off] = 0; + __swap_table_set(ci, ci_off, __swp_tb_mk_count(swp_tb, count)); + swap_extend_table_try_free(ci); + } else { + ci->extend_table[ci_off] = count; + } + } else { + __swap_table_set(ci, ci_off, __swp_tb_mk_count(swp_tb, --count)); + } +} + /** - * swap_put_entries_cluster - Decrease the swap count of a set of slots. + * swap_put_entries_cluster - Decrease the swap count of slots within one cluster * @si: The swap device. - * @start: start offset of slots. + * @offset: start offset of slots. * @nr: number of slots. - * @reclaim_cache: if true, also reclaim the swap cache. + * @reclaim_cache: if true, also reclaim the swap cache if slots are freed. 
* * This helper decreases the swap count of a set of slots and tries to * batch free them. Also reclaims the swap cache if @reclaim_cache is true. - * Context: The caller must ensure that all slots belong to the same - * cluster and their swap count doesn't go underflow. + * + * Context: The specified slots must be pinned by existing swap count or swap + * cache reference, so they won't be released until this helper returns. */ static void swap_put_entries_cluster(struct swap_info_struct *si, - unsigned long start, int nr, + pgoff_t offset, int nr, bool reclaim_cache) { - unsigned long offset = start, end = start + nr; - unsigned long batch_start = SWAP_ENTRY_INVALID; struct swap_cluster_info *ci; + unsigned int ci_off, ci_end; + pgoff_t end = offset + nr; bool need_reclaim = false; unsigned int nr_reclaimed; unsigned long swp_tb; - unsigned int count; + int ci_batch = -1; ci = swap_cluster_lock(si, offset); + ci_off = offset % SWAPFILE_CLUSTER; + ci_end = ci_off + nr; do { - swp_tb = __swap_table_get(ci, offset % SWAPFILE_CLUSTER); - count = si->swap_map[offset]; - VM_WARN_ON(count < 1 || count == SWAP_MAP_BAD); - if (count == 1) { + swp_tb = __swap_table_get(ci, ci_off); + if (swp_tb_get_count(swp_tb) == 1) { /* count == 1 and non-cached slots will be batch freed. */ if (!swp_tb_is_folio(swp_tb)) { - if (!batch_start) - batch_start = offset; + if (ci_batch == -1) + ci_batch = ci_off; continue; } /* count will be 0 after put, slot can be reclaimed */ @@ -1497,21 +1571,20 @@ static void swap_put_entries_cluster(struct swap_info_struct *si, * slots will be freed when folio is removed from swap cache * (__swap_cache_del_folio). */ - swap_put_entry_locked(si, ci, offset); - if (batch_start) { - swap_entries_free(si, ci, batch_start, offset - batch_start); - batch_start = SWAP_ENTRY_INVALID; + __swap_cluster_put_entry(ci, ci_off); + if (ci_batch != -1) { + __swap_cluster_free_entries(si, ci, ci_batch, ci_off - ci_batch); + ci_batch = -1; } - } while (++offset < end); + } while (++ci_off < ci_end); - if (batch_start) - swap_entries_free(si, ci, batch_start, offset - batch_start); + if (ci_batch != -1) + __swap_cluster_free_entries(si, ci, ci_batch, ci_off - ci_batch); swap_cluster_unlock(ci); if (!need_reclaim || !reclaim_cache) return; - offset = start; do { nr_reclaimed = __try_to_reclaim_swap(si, offset, TTRS_UNMAPPED | TTRS_FULL); @@ -1521,6 +1594,92 @@ static void swap_put_entries_cluster(struct swap_info_struct *si, } while (offset < end); } +/* Increase the swap count of one slot. 
*/ +static int __swap_cluster_dup_entry(struct swap_cluster_info *ci, + unsigned int ci_off) +{ + int count; + unsigned long swp_tb; + + lockdep_assert_held(&ci->lock); + swp_tb = __swap_table_get(ci, ci_off); + /* Bad or special slots can't be handled */ + if (WARN_ON_ONCE(swp_tb_is_bad(swp_tb))) + return -EINVAL; + count = __swp_tb_get_count(swp_tb); + /* Must be either cached or have a count already */ + if (WARN_ON_ONCE(!count && !swp_tb_is_folio(swp_tb))) + return -ENOENT; + + if (likely(count < (SWP_TB_COUNT_MAX - 1))) { + __swap_table_set(ci, ci_off, __swp_tb_mk_count(swp_tb, count + 1)); + VM_WARN_ON_ONCE(ci->extend_table && ci->extend_table[ci_off]); + } else if (count == (SWP_TB_COUNT_MAX - 1)) { + if (ci->extend_table) { + VM_WARN_ON_ONCE(ci->extend_table[ci_off]); + ci->extend_table[ci_off] = SWP_TB_COUNT_MAX; + __swap_table_set(ci, ci_off, __swp_tb_mk_count(swp_tb, SWP_TB_COUNT_MAX)); + } else { + return -ENOMEM; + } + } else if (count == SWP_TB_COUNT_MAX) { + VM_WARN_ON_ONCE(ci->extend_table[ci_off] >= + type_max(typeof(ci->extend_table[0]))); + ++ci->extend_table[ci_off]; + } else { + /* Never happens unless counting went wrong */ + WARN_ON_ONCE(1); + } + + return 0; +} + +/** + * swap_dup_entries_cluster - Increase the swap count of slots within one cluster. + * @si: The swap device. + * @offset: start offset of slots. + * @nr: number of slots. + * + * Context: The specified slots must be pinned by existing swap count or swap + * cache reference, so they won't be released until this helper returns. + * Return: 0 on success. -ENOMEM if the swap count has maxed out (SWP_TB_COUNT_MAX) + * and an extended table could not be allocated, -EINVAL if any entry is a bad entry. + */ +static int swap_dup_entries_cluster(struct swap_info_struct *si, + pgoff_t offset, int nr) +{ + int err; + struct swap_cluster_info *ci; + unsigned int ci_start, ci_off, ci_end; + + ci_start = offset % SWAPFILE_CLUSTER; + ci_end = ci_start + nr; + ci_off = ci_start; + ci = swap_cluster_lock(si, offset); +restart: + do { + err = __swap_cluster_dup_entry(ci, ci_off); + if (unlikely(err)) { + if (err == -ENOMEM) { + spin_unlock(&ci->lock); + err = swap_extend_table_alloc(si, ci, GFP_ATOMIC); + spin_lock(&ci->lock); + if (!err) + goto restart; + } + goto failed; + } + } while (++ci_off < ci_end); + swap_cluster_unlock(ci); + return 0; +failed: + while (ci_off-- > ci_start) + __swap_cluster_put_entry(ci, ci_off); + swap_extend_table_try_free(ci); + swap_cluster_unlock(ci); + return err; +} + /** * folio_alloc_swap - allocate swap space for a folio * @folio: folio we want to move to swap @@ -1589,13 +1748,10 @@ int folio_alloc_swap(struct folio *folio) * Context: Caller must ensure the folio is locked and in the swap cache. * NOTE: The caller also has to ensure there is no raced call to * swap_put_entries_direct on its swap entry before this helper returns, or - * the swap map may underflow. Currently, we only accept @subpage == NULL - * for shmem due to the limitation of swap continuation: shmem always - * duplicates the swap entry only once, so there is no such issue for it. + * the swap count may underflow. 
*/ int folio_dup_swap(struct folio *folio, struct page *subpage) { - int err = 0; swp_entry_t entry = folio->swap; unsigned long nr_pages = folio_nr_pages(folio); @@ -1607,10 +1763,8 @@ int folio_dup_swap(struct folio *folio, struct page *subpage) nr_pages = 1; } - while (!err && __swap_duplicate(entry, 1, nr_pages) == -ENOMEM) - err = add_swap_count_continuation(entry, GFP_ATOMIC); - - return err; + return swap_dup_entries_cluster(swap_entry_to_info(entry), + swp_offset(entry), nr_pages); } /** @@ -1639,28 +1793,6 @@ void folio_put_swap(struct folio *folio, struct page *subpage) swap_put_entries_cluster(si, swp_offset(entry), nr_pages, false); } -static void swap_put_entry_locked(struct swap_info_struct *si, - struct swap_cluster_info *ci, - unsigned long offset) -{ - unsigned char count; - - count = si->swap_map[offset]; - if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) { - if (count == COUNT_CONTINUED) { - if (swap_count_continued(si, offset, count)) - count = SWAP_MAP_MAX | COUNT_CONTINUED; - else - count = SWAP_MAP_MAX; - } else - count--; - } - - WRITE_ONCE(si->swap_map[offset], count); - if (!count && !swp_tb_is_folio(__swap_table_get(ci, offset % SWAPFILE_CLUSTER))) - swap_entries_free(si, ci, offset, 1); -} - /* * When we get a swap entry, if there aren't some other ways to * prevent swapoff, such as the folio in swap cache is locked, RCU @@ -1727,31 +1859,30 @@ struct swap_info_struct *get_swap_device(swp_entry_t entry) } /* - * Drop the last ref of swap entries, caller have to ensure all entries - * belong to the same cgroup and cluster. + * Free a set of swap slots after their swap count dropped to zero, or will be + * zero after putting the last ref (saves one __swap_cluster_put_entry call). */ -void swap_entries_free(struct swap_info_struct *si, - struct swap_cluster_info *ci, - unsigned long offset, unsigned int nr_pages) +void __swap_cluster_free_entries(struct swap_info_struct *si, + struct swap_cluster_info *ci, + unsigned int ci_start, unsigned int nr_pages) { - swp_entry_t entry = swp_entry(si->type, offset); - unsigned char *map = si->swap_map + offset; - unsigned char *map_end = map + nr_pages; + unsigned long old_tb; + unsigned int ci_off = ci_start, ci_end = ci_start + nr_pages; + unsigned long offset = cluster_offset(si, ci) + ci_start; - /* It should never free entries across different clusters */ - VM_BUG_ON(ci != __swap_offset_to_cluster(si, offset + nr_pages - 1)); - VM_BUG_ON(cluster_is_empty(ci)); - VM_BUG_ON(ci->count < nr_pages); + VM_WARN_ON(ci->count < nr_pages); ci->count -= nr_pages; do { - VM_WARN_ON(*map > 1); - *map = 0; - } while (++map < map_end); + old_tb = __swap_table_get(ci, ci_off); + /* Release the last ref, or after swap cache is dropped */ + VM_WARN_ON(!swp_tb_is_shadow(old_tb) || __swp_tb_get_count(old_tb) > 1); + __swap_table_set(ci, ci_off, null_to_swp_tb()); + } while (++ci_off < ci_end); - mem_cgroup_uncharge_swap(entry, nr_pages); + mem_cgroup_uncharge_swap(swp_entry(si->type, offset), nr_pages); swap_range_free(si, offset, nr_pages); - swap_cluster_assert_empty(ci, offset % SWAPFILE_CLUSTER, nr_pages, false); + swap_cluster_assert_empty(ci, ci_start, nr_pages, false); if (!ci->count) free_cluster(si, ci); @@ -1761,10 +1892,10 @@ void swap_entries_free(struct swap_info_struct *si, int __swap_count(swp_entry_t entry) { - struct swap_info_struct *si = __swap_entry_to_info(entry); - pgoff_t offset = swp_offset(entry); + struct swap_cluster_info *ci = __swap_entry_to_cluster(entry); + unsigned int ci_off = swp_cluster_offset(entry); - 
return si->swap_map[offset]; + return swp_tb_get_count(__swap_table_get(ci, ci_off)); } /** @@ -1776,81 +1907,62 @@ bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry) { pgoff_t offset = swp_offset(entry); struct swap_cluster_info *ci; - int count; + unsigned long swp_tb; ci = swap_cluster_lock(si, offset); - count = si->swap_map[offset]; + swp_tb = swap_table_get(ci, offset % SWAPFILE_CLUSTER); swap_cluster_unlock(ci); - return count && count != SWAP_MAP_BAD; + return swp_tb_get_count(swp_tb) > 0; } /* * How many references to @entry are currently swapped out? - * This considers COUNT_CONTINUED so it returns exact answer. + * This returns exact answer. */ int swp_swapcount(swp_entry_t entry) { - int count, tmp_count, n; struct swap_info_struct *si; struct swap_cluster_info *ci; - struct page *page; - pgoff_t offset; - unsigned char *map; + unsigned long swp_tb; + int count; si = get_swap_device(entry); if (!si) return 0; - offset = swp_offset(entry); - - ci = swap_cluster_lock(si, offset); - - count = si->swap_map[offset]; - if (!(count & COUNT_CONTINUED)) - goto out; - - count &= ~COUNT_CONTINUED; - n = SWAP_MAP_MAX + 1; - - page = vmalloc_to_page(si->swap_map + offset); - offset &= ~PAGE_MASK; - VM_BUG_ON(page_private(page) != SWP_CONTINUED); - - do { - page = list_next_entry(page, lru); - map = kmap_local_page(page); - tmp_count = map[offset]; - kunmap_local(map); - - count += (tmp_count & ~COUNT_CONTINUED) * n; - n *= (SWAP_CONT_MAX + 1); - } while (tmp_count & COUNT_CONTINUED); -out: + ci = swap_cluster_lock(si, swp_offset(entry)); + swp_tb = __swap_table_get(ci, swp_cluster_offset(entry)); + count = swp_tb_get_count(swp_tb); + if (count == SWP_TB_COUNT_MAX) + count = ci->extend_table[swp_cluster_offset(entry)]; swap_cluster_unlock(ci); put_swap_device(si); - return count; + + return count < 0 ? 
0 : count; } static bool swap_page_trans_huge_swapped(struct swap_info_struct *si, swp_entry_t entry, int order) { struct swap_cluster_info *ci; - unsigned char *map = si->swap_map; unsigned int nr_pages = 1 << order; unsigned long roffset = swp_offset(entry); unsigned long offset = round_down(roffset, nr_pages); + unsigned int ci_off; int i; bool ret = false; ci = swap_cluster_lock(si, offset); if (nr_pages == 1) { - if (map[roffset]) + ci_off = roffset % SWAPFILE_CLUSTER; + if (swp_tb_get_count(__swap_table_get(ci, ci_off))) ret = true; goto unlock_out; } for (i = 0; i < nr_pages; i++) { - if (map[offset + i]) { + ci_off = (offset + i) % SWAPFILE_CLUSTER; + if (swp_tb_get_count(__swap_table_get(ci, ci_off))) { ret = true; break; } @@ -2016,7 +2128,8 @@ void swap_free_hibernation_slot(swp_entry_t entry) return; ci = swap_cluster_lock(si, offset); - swap_put_entry_locked(si, ci, offset); + __swap_cluster_put_entry(ci, offset % SWAPFILE_CLUSTER); + __swap_cluster_free_entries(si, ci, offset % SWAPFILE_CLUSTER, 1); swap_cluster_unlock(ci); /* In theory readahead might add it to the swap cache by accident */ @@ -2242,13 +2355,10 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned int type) { pte_t *pte = NULL; - struct swap_info_struct *si; - si = swap_info[type]; do { struct folio *folio; - unsigned long offset; - unsigned char swp_count; + unsigned long swp_tb; softleaf_t entry; int ret; pte_t ptent; @@ -2267,7 +2377,6 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, if (swp_type(entry) != type) continue; - offset = swp_offset(entry); pte_unmap(pte); pte = NULL; @@ -2284,8 +2393,9 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, &vmf); } if (!folio) { - swp_count = READ_ONCE(si->swap_map[offset]); - if (swp_count == 0 || swp_count == SWAP_MAP_BAD) + swp_tb = swap_table_get(__swap_entry_to_cluster(entry), + swp_cluster_offset(entry)); + if (swp_tb_get_count(swp_tb) <= 0) continue; return -ENOMEM; } @@ -2413,7 +2523,7 @@ static int unuse_mm(struct mm_struct *mm, unsigned int type) } /* - * Scan swap_map from current position to next entry still in use. + * Scan swap table from current position to next entry still in use. * Return 0 if there are no inuse entries after prev till end of * the map. */ @@ -2422,7 +2532,6 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, { unsigned int i; unsigned long swp_tb; - unsigned char count; /* * No need for swap_lock here: we're just looking @@ -2431,12 +2540,9 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, * allocations from this area (while holding swap_lock). 
*/ for (i = prev + 1; i < si->max; i++) { - count = READ_ONCE(si->swap_map[i]); swp_tb = swap_table_get(__swap_offset_to_cluster(si, i), i % SWAPFILE_CLUSTER); - if (count == SWAP_MAP_BAD) - continue; - if (count || swp_tb_is_folio(swp_tb)) + if (!swp_tb_is_null(swp_tb) && !swp_tb_is_bad(swp_tb)) break; if ((i % LATENCY_LIMIT) == 0) cond_resched(); @@ -2796,7 +2902,6 @@ static void flush_percpu_swap_cluster(struct swap_info_struct *si) SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) { struct swap_info_struct *p = NULL; - unsigned char *swap_map; unsigned long *zeromap; struct swap_cluster_info *cluster_info; struct file *swap_file, *victim; @@ -2874,8 +2979,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) flush_percpu_swap_cluster(p); destroy_swap_extents(p, p->swap_file); - if (p->flags & SWP_CONTINUED) - free_swap_count_continuations(p); if (!(p->flags & SWP_SOLIDSTATE)) atomic_dec(&nr_rotate_swap); @@ -2887,8 +2990,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) swap_file = p->swap_file; p->swap_file = NULL; - swap_map = p->swap_map; - p->swap_map = NULL; zeromap = p->zeromap; p->zeromap = NULL; maxpages = p->max; @@ -2902,7 +3003,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) mutex_unlock(&swapon_mutex); kfree(p->global_cluster); p->global_cluster = NULL; - vfree(swap_map); kvfree(zeromap); free_swap_cluster_info(cluster_info, maxpages); /* Destroy swap account information */ @@ -3122,7 +3222,6 @@ static struct swap_info_struct *alloc_swap_info(void) kvfree(defer); } spin_lock_init(&p->lock); - spin_lock_init(&p->cont_lock); atomic_long_set(&p->inuse_pages, SWAP_USAGE_OFFLIST_BIT); init_completion(&p->comp); @@ -3249,19 +3348,6 @@ static unsigned long read_swap_header(struct swap_info_struct *si, return maxpages; } -static int setup_swap_map(struct swap_info_struct *si, - union swap_header *swap_header, - unsigned long maxpages) -{ - unsigned char *swap_map; - - swap_map = vzalloc(maxpages); - si->swap_map = swap_map; - if (!swap_map) - return -ENOMEM; - return 0; -} - static int setup_swap_clusters_info(struct swap_info_struct *si, union swap_header *swap_header, unsigned long maxpages) @@ -3446,11 +3532,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) maxpages = si->max; - /* Setup the swap map and apply bad block */ - error = setup_swap_map(si, swap_header, maxpages); - if (error) - goto bad_swap_unlock_inode; - /* Set up the swap cluster info */ error = setup_swap_clusters_info(si, swap_header, maxpages); if (error) @@ -3571,8 +3652,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) inode = NULL; destroy_swap_extents(si, swap_file); swap_cgroup_swapoff(si->type); - vfree(si->swap_map); - si->swap_map = NULL; free_swap_cluster_info(si->cluster_info, si->max); si->cluster_info = NULL; kvfree(si->zeromap); @@ -3614,67 +3693,20 @@ void si_swapinfo(struct sysinfo *val) } /* - * Verify that nr swap entries are valid and increment their swap map counts. + * swap_dup_entry_direct() - Increase reference count of a swap entry by one. + * @entry: first swap entry from which we want to increase the refcount. * - * Returns error code in following case. - * - success -> 0 - * - swp_entry is invalid -> EINVAL - * - swap-mapped reference is requested but the entry is not used. -> ENOENT - * - swap-mapped reference requested but needs continued swap count. -> ENOMEM + * Returns 0 for success, or -ENOMEM if the extend table is required + * but could not be atomically allocated. 
Returns -EINVAL if the swap + * entry is invalid, which might occur if a page table entry has got + * corrupted. + * + * Context: Caller must ensure there is no race condition on the reference + * owner. e.g., locking the PTL of a PTE containing the entry being increased. */ -static int swap_dup_entries(struct swap_info_struct *si, - struct swap_cluster_info *ci, - unsigned long offset, - unsigned char usage, int nr) +int swap_dup_entry_direct(swp_entry_t entry) { - int i; - unsigned char count; - - for (i = 0; i < nr; i++) { - count = si->swap_map[offset + i]; - /* - * For swapin out, allocator never allocates bad slots. for - * swapin, readahead is guarded by swap_entry_swapped. - */ - if (WARN_ON(count == SWAP_MAP_BAD)) - return -ENOENT; - /* - * Swap count duplication must be guarded by either swap cache folio (from - * folio_dup_swap) or external lock of existing entry (from swap_dup_entry_direct). - */ - if (WARN_ON(!count && - !swp_tb_is_folio(__swap_table_get(ci, offset % SWAPFILE_CLUSTER)))) - return -ENOENT; - if (WARN_ON((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX)) - return -EINVAL; - } - - for (i = 0; i < nr; i++) { - count = si->swap_map[offset + i]; - if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX) - count += usage; - else if (swap_count_continued(si, offset + i, count)) - count = COUNT_CONTINUED; - else { - /* - * Don't need to rollback changes, because if - * usage == 1, there must be nr == 1. - */ - return -ENOMEM; - } - - WRITE_ONCE(si->swap_map[offset + i], count); - } - - return 0; -} - -static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr) -{ - int err; struct swap_info_struct *si; - struct swap_cluster_info *ci; - unsigned long offset = swp_offset(entry); si = swap_entry_to_info(entry); if (WARN_ON_ONCE(!si)) { @@ -3682,253 +3714,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr) return -EINVAL; } - VM_WARN_ON(nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER); - ci = swap_cluster_lock(si, offset); - err = swap_dup_entries(si, ci, offset, usage, nr); - swap_cluster_unlock(ci); - return err; -} - -/* - * swap_dup_entry_direct() - Increase reference count of a swap entry by one. - * @entry: first swap entry from which we want to increase the refcount. - * - * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required - * but could not be atomically allocated. Returns 0, just as if it succeeded, - * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which - * might occur if a page table entry has got corrupted. - * - * Context: Caller must ensure there is no race condition on the reference - * owner. e.g., locking the PTL of a PTE containing the entry being increased. - */ -int swap_dup_entry_direct(swp_entry_t entry) -{ - int err = 0; - while (!err && __swap_duplicate(entry, 1, 1) == -ENOMEM) - err = add_swap_count_continuation(entry, GFP_ATOMIC); - return err; -} - -/* - * add_swap_count_continuation - called when a swap count is duplicated - * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's - * page of the original vmalloc'ed swap_map, to hold the continuation count - * (for that entry and for its neighbouring PAGE_SIZE swap entries). Called - * again when count is duplicated beyond SWAP_MAP_MAX * SWAP_CONT_MAX, etc. - * - * These continuation pages are seldom referenced: the common paths all work - * on the original swap_map, only referring to a continuation page when the - * low "digit" of a count is incremented or decremented through SWAP_MAP_MAX. 
- * - * add_swap_count_continuation(, GFP_ATOMIC) can be called while holding - * page table locks; if it fails, add_swap_count_continuation(, GFP_KERNEL) - * can be called after dropping locks. - */ -int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) -{ - struct swap_info_struct *si; - struct swap_cluster_info *ci; - struct page *head; - struct page *page; - struct page *list_page; - pgoff_t offset; - unsigned char count; - int ret = 0; - - /* - * When debugging, it's easier to use __GFP_ZERO here; but it's better - * for latency not to zero a page while GFP_ATOMIC and holding locks. - */ - page = alloc_page(gfp_mask | __GFP_HIGHMEM); - - si = get_swap_device(entry); - if (!si) { - /* - * An acceptable race has occurred since the failing - * __swap_duplicate(): the swap device may be swapoff - */ - goto outer; - } - - offset = swp_offset(entry); - - ci = swap_cluster_lock(si, offset); - - count = si->swap_map[offset]; - - if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) { - /* - * The higher the swap count, the more likely it is that tasks - * will race to add swap count continuation: we need to avoid - * over-provisioning. - */ - goto out; - } - - if (!page) { - ret = -ENOMEM; - goto out; - } - - head = vmalloc_to_page(si->swap_map + offset); - offset &= ~PAGE_MASK; - - spin_lock(&si->cont_lock); - /* - * Page allocation does not initialize the page's lru field, - * but it does always reset its private field. - */ - if (!page_private(head)) { - BUG_ON(count & COUNT_CONTINUED); - INIT_LIST_HEAD(&head->lru); - set_page_private(head, SWP_CONTINUED); - si->flags |= SWP_CONTINUED; - } - - list_for_each_entry(list_page, &head->lru, lru) { - unsigned char *map; - - /* - * If the previous map said no continuation, but we've found - * a continuation page, free our allocation and use this one. - */ - if (!(count & COUNT_CONTINUED)) - goto out_unlock_cont; - - map = kmap_local_page(list_page) + offset; - count = *map; - kunmap_local(map); - - /* - * If this continuation count now has some space in it, - * free our allocation and use this one. - */ - if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX) - goto out_unlock_cont; - } - - list_add_tail(&page->lru, &head->lru); - page = NULL; /* now it's attached, don't free it */ -out_unlock_cont: - spin_unlock(&si->cont_lock); -out: - swap_cluster_unlock(ci); - put_swap_device(si); -outer: - if (page) - __free_page(page); - return ret; -} - -/* - * swap_count_continued - when the original swap_map count is incremented - * from SWAP_MAP_MAX, check if there is already a continuation page to carry - * into, carry if so, or else fail until a new continuation page is allocated; - * when the original swap_map count is decremented from 0 with continuation, - * borrow from the continuation and report whether it still holds more. - * Called while __swap_duplicate() or caller of swap_put_entry_locked() - * holds cluster lock. 
- */ -static bool swap_count_continued(struct swap_info_struct *si, - pgoff_t offset, unsigned char count) -{ - struct page *head; - struct page *page; - unsigned char *map; - bool ret; - - head = vmalloc_to_page(si->swap_map + offset); - if (page_private(head) != SWP_CONTINUED) { - BUG_ON(count & COUNT_CONTINUED); - return false; /* need to add count continuation */ - } - - spin_lock(&si->cont_lock); - offset &= ~PAGE_MASK; - page = list_next_entry(head, lru); - map = kmap_local_page(page) + offset; - - if (count == SWAP_MAP_MAX) /* initial increment from swap_map */ - goto init_map; /* jump over SWAP_CONT_MAX checks */ - - if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */ - /* - * Think of how you add 1 to 999 - */ - while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) { - kunmap_local(map); - page = list_next_entry(page, lru); - BUG_ON(page == head); - map = kmap_local_page(page) + offset; - } - if (*map == SWAP_CONT_MAX) { - kunmap_local(map); - page = list_next_entry(page, lru); - if (page == head) { - ret = false; /* add count continuation */ - goto out; - } - map = kmap_local_page(page) + offset; -init_map: *map = 0; /* we didn't zero the page */ - } - *map += 1; - kunmap_local(map); - while ((page = list_prev_entry(page, lru)) != head) { - map = kmap_local_page(page) + offset; - *map = COUNT_CONTINUED; - kunmap_local(map); - } - ret = true; /* incremented */ - - } else { /* decrementing */ - /* - * Think of how you subtract 1 from 1000 - */ - BUG_ON(count != COUNT_CONTINUED); - while (*map == COUNT_CONTINUED) { - kunmap_local(map); - page = list_next_entry(page, lru); - BUG_ON(page == head); - map = kmap_local_page(page) + offset; - } - BUG_ON(*map == 0); - *map -= 1; - if (*map == 0) - count = 0; - kunmap_local(map); - while ((page = list_prev_entry(page, lru)) != head) { - map = kmap_local_page(page) + offset; - *map = SWAP_CONT_MAX | count; - count = COUNT_CONTINUED; - kunmap_local(map); - } - ret = count == COUNT_CONTINUED; - } -out: - spin_unlock(&si->cont_lock); - return ret; -} - -/* - * free_swap_count_continuations - swapoff free all the continuation pages - * appended to the swap_map, after swap_map is quiesced, before vfree'ing it. - */ -static void free_swap_count_continuations(struct swap_info_struct *si) -{ - pgoff_t offset; - - for (offset = 0; offset < si->max; offset += PAGE_SIZE) { - struct page *head; - head = vmalloc_to_page(si->swap_map + offset); - if (page_private(head)) { - struct page *page, *next; - - list_for_each_entry_safe(page, next, &head->lru, lru) { - list_del(&page->lru); - __free_page(page); - } - } - } + return swap_dup_entries_cluster(si, swp_offset(entry), 1); } #if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
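Note (illustrative, not part of the patch): overall, this change moves the per-slot swap count out of the old swap_map byte and into the count bits of each swap table entry; counts that would overflow SWP_TB_COUNT_MAX are redirected into the per-cluster ci->extend_table array, replacing the old swap count continuation pages. A minimal sketch of how a reader resolves the full count for one slot, mirroring the new swp_swapcount() and assuming the cluster locking helpers and count accessors introduced by this series (the function name is hypothetical):

static int example_full_swap_count(struct swap_info_struct *si, swp_entry_t entry)
{
	struct swap_cluster_info *ci;
	unsigned int ci_off = swp_cluster_offset(entry);
	unsigned long swp_tb;
	int count;

	ci = swap_cluster_lock(si, swp_offset(entry));
	swp_tb = __swap_table_get(ci, ci_off);
	count = __swp_tb_get_count(swp_tb);
	if (count == SWP_TB_COUNT_MAX)
		/* Table entry is pinned at the max; the real count lives in extend_table */
		count = ci->extend_table[ci_off];
	swap_cluster_unlock(ci);
	return count;
}

Counts below SWP_TB_COUNT_MAX stay directly in the table entry, so the common put/dup paths never touch the extended table at all.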