From 7ac3eafc7692a85e46388680ff06a90af49cbc0b Mon Sep 17 00:00:00 2001 From: Marlies Ruck Date: Thu, 16 May 2013 14:30:39 -0400 Subject: [PATCH 01/82] Staging: Fixes string split across lines in zram Fixes the following checkpatch warning in zram_drv.c: WARNING: quoted string split across lines Signed-off-by: Marlies Ruck Signed-off-by: Greg Kroah-Hartman (cherry picked from commit 596b3dd4c8e172db7806372c9d0347a4e7d28bc5) Signed-off-by: Alex Shi --- drivers/staging/zram/zram_drv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/staging/zram/zram_drv.c b/drivers/staging/zram/zram_drv.c index a333d44d0cff..2652dfac0b32 100644 --- a/drivers/staging/zram/zram_drv.c +++ b/drivers/staging/zram/zram_drv.c @@ -302,8 +302,8 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, handle = zs_malloc(meta->mem_pool, clen); if (!handle) { - pr_info("Error allocating memory for compressed " - "page: %u, size=%zu\n", index, clen); + pr_info("Error allocating memory for compressed page: %u, size=%zu\n", + index, clen); ret = -ENOMEM; goto out; } From f02a1549c748f57f7dadce5d10b4a23796584a72 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Fri, 7 Jun 2013 00:07:28 +0800 Subject: [PATCH 02/82] zram: simplify and optimize dev_to_zram() Simplify and optimize dev_to_zram() without walking the zram_devices array. 
Signed-off-by: Jiang Liu Signed-off-by: Greg Kroah-Hartman (cherry picked from commit 80de574dca050b734d8413a98a983fba3d06240b) Signed-off-by: Alex Shi --- drivers/staging/zram/zram_sysfs.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/drivers/staging/zram/zram_sysfs.c b/drivers/staging/zram/zram_sysfs.c index dc76a3dba1b8..e239d9452726 100644 --- a/drivers/staging/zram/zram_sysfs.c +++ b/drivers/staging/zram/zram_sysfs.c @@ -30,18 +30,9 @@ static u64 zram_stat64_read(struct zram *zram, u64 *v) return val; } -static struct zram *dev_to_zram(struct device *dev) +static inline struct zram *dev_to_zram(struct device *dev) { - int i; - struct zram *zram = NULL; - - for (i = 0; i < zram_get_num_devices(); i++) { - zram = &zram_devices[i]; - if (disk_to_dev(zram->disk) == dev) - break; - } - - return zram; + return (struct zram *)dev_to_disk(dev)->private_data; } static ssize_t disksize_show(struct device *dev, From 72f7aaa7ded9c9be6d5ec8886ced1238909a1c7a Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Fri, 7 Jun 2013 00:07:29 +0800 Subject: [PATCH 03/82] zram: kill unused zram_get_num_devices() Now there's no caller of zram_get_num_devices(), so kill it. And change zram_devices to static because it's only used in zram_drv.c. 
Signed-off-by: Jiang Liu Signed-off-by: Greg Kroah-Hartman (cherry picked from commit 0f0e3ba346c8d8d2cb409b157df79805931a1c2c) Signed-off-by: Alex Shi --- drivers/staging/zram/zram_drv.c | 7 +------ drivers/staging/zram/zram_drv.h | 2 -- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/drivers/staging/zram/zram_drv.c b/drivers/staging/zram/zram_drv.c index 2652dfac0b32..49f34b065181 100644 --- a/drivers/staging/zram/zram_drv.c +++ b/drivers/staging/zram/zram_drv.c @@ -37,7 +37,7 @@ /* Globals */ static int zram_major; -struct zram *zram_devices; +static struct zram *zram_devices; /* Module params (documentation at end) */ static unsigned int num_devices = 1; @@ -679,11 +679,6 @@ static void destroy_device(struct zram *zram) blk_cleanup_queue(zram->queue); } -unsigned int zram_get_num_devices(void) -{ - return num_devices; -} - static int __init zram_init(void) { int ret, dev_id; diff --git a/drivers/staging/zram/zram_drv.h b/drivers/staging/zram/zram_drv.h index d542eee81357..b3a315d1eb23 100644 --- a/drivers/staging/zram/zram_drv.h +++ b/drivers/staging/zram/zram_drv.h @@ -110,8 +110,6 @@ struct zram { struct zram_stats stats; }; -extern struct zram *zram_devices; -unsigned int zram_get_num_devices(void); #ifdef CONFIG_SYSFS extern struct attribute_group zram_disk_attr_group; #endif From 728260fce381aa3a2b10a35d7ab727f9761f58bd Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Fri, 7 Jun 2013 00:07:30 +0800 Subject: [PATCH 04/82] zram: optimize memory operations with clear_page()/copy_page() Some architectures provide architecture-specific, optimized versions of clear_page()/copy_page(), which may have better performance than memset()/memcpy(). So use clear_page()/copy_page() to optimize zram performance if possible. 
Signed-off-by: Jiang Liu Signed-off-by: Greg Kroah-Hartman (cherry picked from commit 42e99bd975fdd24d2bf1a24ebb8b0b42bab8ba65) Signed-off-by: Alex Shi --- drivers/staging/zram/zram_drv.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/drivers/staging/zram/zram_drv.c b/drivers/staging/zram/zram_drv.c index 49f34b065181..18a89863ae53 100644 --- a/drivers/staging/zram/zram_drv.c +++ b/drivers/staging/zram/zram_drv.c @@ -128,23 +128,26 @@ static void zram_free_page(struct zram *zram, size_t index) meta->table[index].size = 0; } +static inline int is_partial_io(struct bio_vec *bvec) +{ + return bvec->bv_len != PAGE_SIZE; +} + static void handle_zero_page(struct bio_vec *bvec) { struct page *page = bvec->bv_page; void *user_mem; user_mem = kmap_atomic(page); - memset(user_mem + bvec->bv_offset, 0, bvec->bv_len); + if (is_partial_io(bvec)) + memset(user_mem + bvec->bv_offset, 0, bvec->bv_len); + else + clear_page(user_mem); kunmap_atomic(user_mem); flush_dcache_page(page); } -static inline int is_partial_io(struct bio_vec *bvec) -{ - return bvec->bv_len != PAGE_SIZE; -} - static int zram_decompress_page(struct zram *zram, char *mem, u32 index) { int ret = LZO_E_OK; @@ -154,13 +157,13 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index) unsigned long handle = meta->table[index].handle; if (!handle || zram_test_flag(meta, index, ZRAM_ZERO)) { - memset(mem, 0, PAGE_SIZE); + clear_page(mem); return 0; } cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_RO); if (meta->table[index].size == PAGE_SIZE) - memcpy(mem, cmem, PAGE_SIZE); + copy_page(mem, cmem); else ret = lzo1x_decompress_safe(cmem, meta->table[index].size, mem, &clen); @@ -309,11 +312,13 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, } cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_WO); - if ((clen == PAGE_SIZE) && !is_partial_io(bvec)) + if ((clen == PAGE_SIZE) && !is_partial_io(bvec)) { src = 
kmap_atomic(page); - memcpy(cmem, src, clen); - if ((clen == PAGE_SIZE) && !is_partial_io(bvec)) + copy_page(cmem, src); kunmap_atomic(src); + } else { + memcpy(cmem, src, clen); + } zs_unmap_object(meta->mem_pool, handle); From dc18dd5cb1f638af5fc0328677b5d96328769ca3 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Fri, 7 Jun 2013 00:07:31 +0800 Subject: [PATCH 05/82] zram: use atomic64_xxx() to replace zram_stat64_xxx() Use atomic64_xxx() to replace open-coded zram_stat64_xxx(). Some architectures have native support for atomic64 operations, so we can get rid of the spin_lock() in zram_stat64_xxx(). On the other hand, on platforms that use the generic atomic64 implementation, it may cause an extra save/restore of the interrupt flags. So it's a tradeoff. Signed-off-by: Jiang Liu Signed-off-by: Greg Kroah-Hartman (cherry picked from commit da5cc7d338f97886ebf35be92995460289379b73) Signed-off-by: Alex Shi --- drivers/staging/zram/zram_drv.c | 37 +++++++------------------------ drivers/staging/zram/zram_drv.h | 19 +++++++++------- drivers/staging/zram/zram_sysfs.c | 21 +++++------------- 3 files changed, 24 insertions(+), 53 deletions(-) diff --git a/drivers/staging/zram/zram_drv.c b/drivers/staging/zram/zram_drv.c index 18a89863ae53..97899eac15df 100644 --- a/drivers/staging/zram/zram_drv.c +++ b/drivers/staging/zram/zram_drv.c @@ -42,25 +42,6 @@ static struct zram *zram_devices; /* Module params (documentation at end) */ static unsigned int num_devices = 1; -static void zram_stat64_add(struct zram *zram, u64 *v, u64 inc) -{ - spin_lock(&zram->stat64_lock); - *v = *v + inc; - spin_unlock(&zram->stat64_lock); -} - -static void zram_stat64_sub(struct zram *zram, u64 *v, u64 dec) -{ - spin_lock(&zram->stat64_lock); - *v = *v - dec; - spin_unlock(&zram->stat64_lock); -} - -static void zram_stat64_inc(struct zram *zram, u64 *v) -{ - zram_stat64_add(zram, v, 1); -} - static int zram_test_flag(struct zram_meta *meta, u32 index, enum zram_pageflags flag) { @@ -120,8 +101,7 @@ 
static void zram_free_page(struct zram *zram, size_t index) if (size <= PAGE_SIZE / 2) zram->stats.good_compress--; - zram_stat64_sub(zram, &zram->stats.compr_size, - meta->table[index].size); + atomic64_sub(meta->table[index].size, &zram->stats.compr_size); zram->stats.pages_stored--; meta->table[index].handle = 0; @@ -172,7 +152,7 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index) /* Should NEVER happen. Return bio error if it does. */ if (unlikely(ret != LZO_E_OK)) { pr_err("Decompression failed! err=%d, page=%u\n", ret, index); - zram_stat64_inc(zram, &zram->stats.failed_reads); + atomic64_inc(&zram->stats.failed_reads); return ret; } @@ -326,7 +306,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, meta->table[index].size = clen; /* Update stats */ - zram_stat64_add(zram, &zram->stats.compr_size, clen); + atomic64_add(clen, &zram->stats.compr_size); zram->stats.pages_stored++; if (clen <= PAGE_SIZE / 2) zram->stats.good_compress++; @@ -336,7 +316,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, kfree(uncmem); if (ret) - zram_stat64_inc(zram, &zram->stats.failed_writes); + atomic64_inc(&zram->stats.failed_writes); return ret; } @@ -373,10 +353,10 @@ static void __zram_make_request(struct zram *zram, struct bio *bio, int rw) switch (rw) { case READ: - zram_stat64_inc(zram, &zram->stats.num_reads); + atomic64_inc(&zram->stats.num_reads); break; case WRITE: - zram_stat64_inc(zram, &zram->stats.num_writes); + atomic64_inc(&zram->stats.num_writes); break; } @@ -456,7 +436,7 @@ static void zram_make_request(struct request_queue *queue, struct bio *bio) goto error; if (!valid_io_request(zram, bio)) { - zram_stat64_inc(zram, &zram->stats.invalid_io); + atomic64_inc(&zram->stats.invalid_io); goto error; } @@ -595,7 +575,7 @@ static void zram_slot_free_notify(struct block_device *bdev, down_write(&zram->lock); zram_free_page(zram, index); up_write(&zram->lock); - 
zram_stat64_inc(zram, &zram->stats.notify_free); + atomic64_inc(&zram->stats.notify_free); } static const struct block_device_operations zram_devops = { @@ -609,7 +589,6 @@ static int create_device(struct zram *zram, int device_id) init_rwsem(&zram->lock); init_rwsem(&zram->init_lock); - spin_lock_init(&zram->stat64_lock); zram->queue = blk_alloc_queue(GFP_KERNEL); if (!zram->queue) { diff --git a/drivers/staging/zram/zram_drv.h b/drivers/staging/zram/zram_drv.h index b3a315d1eb23..11b09fc25953 100644 --- a/drivers/staging/zram/zram_drv.h +++ b/drivers/staging/zram/zram_drv.h @@ -69,14 +69,18 @@ struct table { u8 flags; } __aligned(4); +/* + * All 64bit fields should only be manipulated by 64bit atomic accessors. + * All modifications to 32bit counter should be protected by zram->lock. + */ struct zram_stats { - u64 compr_size; /* compressed size of pages stored */ - u64 num_reads; /* failed + successful */ - u64 num_writes; /* --do-- */ - u64 failed_reads; /* should NEVER! happen */ - u64 failed_writes; /* can happen when memory is too low */ - u64 invalid_io; /* non-page-aligned I/O requests */ - u64 notify_free; /* no. of swap slot free notifications */ + atomic64_t compr_size; /* compressed size of pages stored */ + atomic64_t num_reads; /* failed + successful */ + atomic64_t num_writes; /* --do-- */ + atomic64_t failed_reads; /* should NEVER! happen */ + atomic64_t failed_writes; /* can happen when memory is too low */ + atomic64_t invalid_io; /* non-page-aligned I/O requests */ + atomic64_t notify_free; /* no. of swap slot free notifications */ u32 pages_zero; /* no. of zero filled pages */ u32 pages_stored; /* no. 
of pages currently stored */ u32 good_compress; /* % of pages with compression ratio<=50% */ @@ -92,7 +96,6 @@ struct zram_meta { struct zram { struct zram_meta *meta; - spinlock_t stat64_lock; /* protect 64-bit stats */ struct rw_semaphore lock; /* protect compression buffers, table, * 32bit stat counters against concurrent * notifications, reads and writes */ diff --git a/drivers/staging/zram/zram_sysfs.c b/drivers/staging/zram/zram_sysfs.c index e239d9452726..93a2f9cafd7c 100644 --- a/drivers/staging/zram/zram_sysfs.c +++ b/drivers/staging/zram/zram_sysfs.c @@ -19,17 +19,6 @@ #include "zram_drv.h" -static u64 zram_stat64_read(struct zram *zram, u64 *v) -{ - u64 val; - - spin_lock(&zram->stat64_lock); - val = *v; - spin_unlock(&zram->stat64_lock); - - return val; -} - static inline struct zram *dev_to_zram(struct device *dev) { return (struct zram *)dev_to_disk(dev)->private_data; @@ -116,7 +105,7 @@ static ssize_t num_reads_show(struct device *dev, struct zram *zram = dev_to_zram(dev); return sprintf(buf, "%llu\n", - zram_stat64_read(zram, &zram->stats.num_reads)); + (u64)atomic64_read(&zram->stats.num_reads)); } static ssize_t num_writes_show(struct device *dev, @@ -125,7 +114,7 @@ static ssize_t num_writes_show(struct device *dev, struct zram *zram = dev_to_zram(dev); return sprintf(buf, "%llu\n", - zram_stat64_read(zram, &zram->stats.num_writes)); + (u64)atomic64_read(&zram->stats.num_writes)); } static ssize_t invalid_io_show(struct device *dev, @@ -134,7 +123,7 @@ static ssize_t invalid_io_show(struct device *dev, struct zram *zram = dev_to_zram(dev); return sprintf(buf, "%llu\n", - zram_stat64_read(zram, &zram->stats.invalid_io)); + (u64)atomic64_read(&zram->stats.invalid_io)); } static ssize_t notify_free_show(struct device *dev, @@ -143,7 +132,7 @@ static ssize_t notify_free_show(struct device *dev, struct zram *zram = dev_to_zram(dev); return sprintf(buf, "%llu\n", - zram_stat64_read(zram, &zram->stats.notify_free)); + 
(u64)atomic64_read(&zram->stats.notify_free)); } static ssize_t zero_pages_show(struct device *dev, @@ -169,7 +158,7 @@ static ssize_t compr_data_size_show(struct device *dev, struct zram *zram = dev_to_zram(dev); return sprintf(buf, "%llu\n", - zram_stat64_read(zram, &zram->stats.compr_size)); + (u64)atomic64_read(&zram->stats.compr_size)); } static ssize_t mem_used_total_show(struct device *dev, From c5544682efcda470167318ac56559f023e578b09 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Sat, 22 Jun 2013 03:21:18 +0300 Subject: [PATCH 06/82] zram: remove zram_sysfs file (v2) Move zram sysfs code to zram drv and remove zram_sysfs.c file. This gives ability to make static a number of previously exported zram functions, used from zram sysfs, e.g. internal zram zram_meta_alloc/free(). We also can drop zram_drv wrapper functions, used from zram sysfs: e.g. zram_reset_device()/__zram_reset_device() pair. v2: as suggested by Greg K-H, move MODULE description to the bottom of the file. 
Signed-off-by: Sergey Senozhatsky Signed-off-by: Greg Kroah-Hartman (cherry picked from commit 9b3bb7abcdf2df0f1b2657e6cbc9d06bc2b3b36f) Signed-off-by: Alex Shi --- drivers/staging/zram/Makefile | 2 +- drivers/staging/zram/zram_drv.c | 516 ++++++++++++++++++++---------- drivers/staging/zram/zram_drv.h | 10 - drivers/staging/zram/zram_sysfs.c | 209 ------------ 4 files changed, 350 insertions(+), 387 deletions(-) delete mode 100644 drivers/staging/zram/zram_sysfs.c diff --git a/drivers/staging/zram/Makefile b/drivers/staging/zram/Makefile index 7f4a3019e9c4..cb0f9ced6a93 100644 --- a/drivers/staging/zram/Makefile +++ b/drivers/staging/zram/Makefile @@ -1,3 +1,3 @@ -zram-y := zram_drv.o zram_sysfs.o +zram-y := zram_drv.o obj-$(CONFIG_ZRAM) += zram.o diff --git a/drivers/staging/zram/zram_drv.c b/drivers/staging/zram/zram_drv.c index 97899eac15df..753877431b5f 100644 --- a/drivers/staging/zram/zram_drv.c +++ b/drivers/staging/zram/zram_drv.c @@ -42,6 +42,104 @@ static struct zram *zram_devices; /* Module params (documentation at end) */ static unsigned int num_devices = 1; +static inline struct zram *dev_to_zram(struct device *dev) +{ + return (struct zram *)dev_to_disk(dev)->private_data; +} + +static ssize_t disksize_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct zram *zram = dev_to_zram(dev); + + return sprintf(buf, "%llu\n", zram->disksize); +} + +static ssize_t initstate_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct zram *zram = dev_to_zram(dev); + + return sprintf(buf, "%u\n", zram->init_done); +} + +static ssize_t num_reads_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct zram *zram = dev_to_zram(dev); + + return sprintf(buf, "%llu\n", + (u64)atomic64_read(&zram->stats.num_reads)); +} + +static ssize_t num_writes_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct zram *zram = dev_to_zram(dev); + + return sprintf(buf, "%llu\n", + 
(u64)atomic64_read(&zram->stats.num_writes)); +} + +static ssize_t invalid_io_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct zram *zram = dev_to_zram(dev); + + return sprintf(buf, "%llu\n", + (u64)atomic64_read(&zram->stats.invalid_io)); +} + +static ssize_t notify_free_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct zram *zram = dev_to_zram(dev); + + return sprintf(buf, "%llu\n", + (u64)atomic64_read(&zram->stats.notify_free)); +} + +static ssize_t zero_pages_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct zram *zram = dev_to_zram(dev); + + return sprintf(buf, "%u\n", zram->stats.pages_zero); +} + +static ssize_t orig_data_size_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct zram *zram = dev_to_zram(dev); + + return sprintf(buf, "%llu\n", + (u64)(zram->stats.pages_stored) << PAGE_SHIFT); +} + +static ssize_t compr_data_size_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct zram *zram = dev_to_zram(dev); + + return sprintf(buf, "%llu\n", + (u64)atomic64_read(&zram->stats.compr_size)); +} + +static ssize_t mem_used_total_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + u64 val = 0; + struct zram *zram = dev_to_zram(dev); + struct zram_meta *meta = zram->meta; + + down_read(&zram->init_lock); + if (zram->init_done) + val = zs_get_total_size_bytes(meta->mem_pool); + up_read(&zram->init_lock); + + return sprintf(buf, "%llu\n", val); +} + static int zram_test_flag(struct zram_meta *meta, u32 index, enum zram_pageflags flag) { @@ -60,6 +158,97 @@ static void zram_clear_flag(struct zram_meta *meta, u32 index, meta->table[index].flags &= ~BIT(flag); } +static inline int is_partial_io(struct bio_vec *bvec) +{ + return bvec->bv_len != PAGE_SIZE; +} + +/* + * Check if request is within bounds and aligned on zram logical blocks. 
+ */ +static inline int valid_io_request(struct zram *zram, struct bio *bio) +{ + u64 start, end, bound; + + /* unaligned request */ + if (unlikely(bio->bi_sector & (ZRAM_SECTOR_PER_LOGICAL_BLOCK - 1))) + return 0; + if (unlikely(bio->bi_size & (ZRAM_LOGICAL_BLOCK_SIZE - 1))) + return 0; + + start = bio->bi_sector; + end = start + (bio->bi_size >> SECTOR_SHIFT); + bound = zram->disksize >> SECTOR_SHIFT; + /* out of range range */ + if (unlikely(start >= bound || end >= bound || start > end)) + return 0; + + /* I/O request is valid */ + return 1; +} + +static void zram_meta_free(struct zram_meta *meta) +{ + zs_destroy_pool(meta->mem_pool); + kfree(meta->compress_workmem); + free_pages((unsigned long)meta->compress_buffer, 1); + vfree(meta->table); + kfree(meta); +} + +static struct zram_meta *zram_meta_alloc(u64 disksize) +{ + size_t num_pages; + struct zram_meta *meta = kmalloc(sizeof(*meta), GFP_KERNEL); + if (!meta) + goto out; + + meta->compress_workmem = kzalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL); + if (!meta->compress_workmem) + goto free_meta; + + meta->compress_buffer = + (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, 1); + if (!meta->compress_buffer) { + pr_err("Error allocating compressor buffer space\n"); + goto free_workmem; + } + + num_pages = disksize >> PAGE_SHIFT; + meta->table = vzalloc(num_pages * sizeof(*meta->table)); + if (!meta->table) { + pr_err("Error allocating zram address table\n"); + goto free_buffer; + } + + meta->mem_pool = zs_create_pool(GFP_NOIO | __GFP_HIGHMEM); + if (!meta->mem_pool) { + pr_err("Error creating memory pool\n"); + goto free_table; + } + + return meta; + +free_table: + vfree(meta->table); +free_buffer: + free_pages((unsigned long)meta->compress_buffer, 1); +free_workmem: + kfree(meta->compress_workmem); +free_meta: + kfree(meta); + meta = NULL; +out: + return meta; +} + +static void update_position(u32 *index, int *offset, struct bio_vec *bvec) +{ + if (*offset + bvec->bv_len >= PAGE_SIZE) + (*index)++; + *offset = 
(*offset + bvec->bv_len) % PAGE_SIZE; +} + static int page_zero_filled(void *ptr) { unsigned int pos; @@ -75,6 +264,21 @@ static int page_zero_filled(void *ptr) return 1; } +static void handle_zero_page(struct bio_vec *bvec) +{ + struct page *page = bvec->bv_page; + void *user_mem; + + user_mem = kmap_atomic(page); + if (is_partial_io(bvec)) + memset(user_mem + bvec->bv_offset, 0, bvec->bv_len); + else + clear_page(user_mem); + kunmap_atomic(user_mem); + + flush_dcache_page(page); +} + static void zram_free_page(struct zram *zram, size_t index) { struct zram_meta *meta = zram->meta; @@ -108,26 +312,6 @@ static void zram_free_page(struct zram *zram, size_t index) meta->table[index].size = 0; } -static inline int is_partial_io(struct bio_vec *bvec) -{ - return bvec->bv_len != PAGE_SIZE; -} - -static void handle_zero_page(struct bio_vec *bvec) -{ - struct page *page = bvec->bv_page; - void *user_mem; - - user_mem = kmap_atomic(page); - if (is_partial_io(bvec)) - memset(user_mem + bvec->bv_offset, 0, bvec->bv_len); - else - clear_page(user_mem); - kunmap_atomic(user_mem); - - flush_dcache_page(page); -} - static int zram_decompress_page(struct zram *zram, char *mem, u32 index) { int ret = LZO_E_OK; @@ -338,11 +522,117 @@ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, return ret; } -static void update_position(u32 *index, int *offset, struct bio_vec *bvec) +static void zram_reset_device(struct zram *zram) { - if (*offset + bvec->bv_len >= PAGE_SIZE) - (*index)++; - *offset = (*offset + bvec->bv_len) % PAGE_SIZE; + size_t index; + struct zram_meta *meta; + + if (!zram->init_done) + return; + + meta = zram->meta; + zram->init_done = 0; + + /* Free all pages that are still in this zram device */ + for (index = 0; index < zram->disksize >> PAGE_SHIFT; index++) { + unsigned long handle = meta->table[index].handle; + if (!handle) + continue; + + zs_free(meta->mem_pool, handle); + } + + zram_meta_free(zram->meta); + zram->meta = NULL; + /* Reset 
stats */ + memset(&zram->stats, 0, sizeof(zram->stats)); + + zram->disksize = 0; + set_capacity(zram->disk, 0); +} + +static void zram_init_device(struct zram *zram, struct zram_meta *meta) +{ + if (zram->disksize > 2 * (totalram_pages << PAGE_SHIFT)) { + pr_info( + "There is little point creating a zram of greater than " + "twice the size of memory since we expect a 2:1 compression " + "ratio. Note that zram uses about 0.1%% of the size of " + "the disk when not in use so a huge zram is " + "wasteful.\n" + "\tMemory Size: %lu kB\n" + "\tSize you selected: %llu kB\n" + "Continuing anyway ...\n", + (totalram_pages << PAGE_SHIFT) >> 10, zram->disksize >> 10 + ); + } + + /* zram devices sort of resembles non-rotational disks */ + queue_flag_set_unlocked(QUEUE_FLAG_NONROT, zram->disk->queue); + + zram->meta = meta; + zram->init_done = 1; + + pr_debug("Initialization done!\n"); +} + +static ssize_t disksize_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + u64 disksize; + struct zram_meta *meta; + struct zram *zram = dev_to_zram(dev); + + disksize = memparse(buf, NULL); + if (!disksize) + return -EINVAL; + + disksize = PAGE_ALIGN(disksize); + meta = zram_meta_alloc(disksize); + down_write(&zram->init_lock); + if (zram->init_done) { + up_write(&zram->init_lock); + zram_meta_free(meta); + pr_info("Cannot change disksize for initialized device\n"); + return -EBUSY; + } + + zram->disksize = disksize; + set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT); + zram_init_device(zram, meta); + up_write(&zram->init_lock); + + return len; +} + +static ssize_t reset_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + int ret; + unsigned short do_reset; + struct zram *zram; + struct block_device *bdev; + + zram = dev_to_zram(dev); + bdev = bdget_disk(zram->disk, 0); + + /* Do not reset an active device! 
*/ + if (bdev->bd_holders) + return -EBUSY; + + ret = kstrtou16(buf, 10, &do_reset); + if (ret) + return ret; + + if (!do_reset) + return -EINVAL; + + /* Make sure all pending I/O is finished */ + if (bdev) + fsync_bdev(bdev); + + zram_reset_device(zram); + return len; } static void __zram_make_request(struct zram *zram, struct bio *bio, int rw) @@ -400,30 +690,6 @@ static void __zram_make_request(struct zram *zram, struct bio *bio, int rw) bio_io_error(bio); } -/* - * Check if request is within bounds and aligned on zram logical blocks. - */ -static inline int valid_io_request(struct zram *zram, struct bio *bio) -{ - u64 start, end, bound; - - /* unaligned request */ - if (unlikely(bio->bi_sector & (ZRAM_SECTOR_PER_LOGICAL_BLOCK - 1))) - return 0; - if (unlikely(bio->bi_size & (ZRAM_LOGICAL_BLOCK_SIZE - 1))) - return 0; - - start = bio->bi_sector; - end = start + (bio->bi_size >> SECTOR_SHIFT); - bound = zram->disksize >> SECTOR_SHIFT; - /* out of range range */ - if (unlikely(start >= bound || end > bound || start > end)) - return 0; - - /* I/O request is valid */ - return 1; -} - /* * Handler function for all zram I/O requests. 
*/ @@ -450,122 +716,6 @@ static void zram_make_request(struct request_queue *queue, struct bio *bio) bio_io_error(bio); } -static void __zram_reset_device(struct zram *zram) -{ - size_t index; - struct zram_meta *meta; - - if (!zram->init_done) - return; - - meta = zram->meta; - zram->init_done = 0; - - /* Free all pages that are still in this zram device */ - for (index = 0; index < zram->disksize >> PAGE_SHIFT; index++) { - unsigned long handle = meta->table[index].handle; - if (!handle) - continue; - - zs_free(meta->mem_pool, handle); - } - - zram_meta_free(zram->meta); - zram->meta = NULL; - /* Reset stats */ - memset(&zram->stats, 0, sizeof(zram->stats)); - - zram->disksize = 0; - set_capacity(zram->disk, 0); -} - -void zram_reset_device(struct zram *zram) -{ - down_write(&zram->init_lock); - __zram_reset_device(zram); - up_write(&zram->init_lock); -} - -void zram_meta_free(struct zram_meta *meta) -{ - zs_destroy_pool(meta->mem_pool); - kfree(meta->compress_workmem); - free_pages((unsigned long)meta->compress_buffer, 1); - vfree(meta->table); - kfree(meta); -} - -struct zram_meta *zram_meta_alloc(u64 disksize) -{ - size_t num_pages; - struct zram_meta *meta = kmalloc(sizeof(*meta), GFP_KERNEL); - if (!meta) - goto out; - - meta->compress_workmem = kzalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL); - if (!meta->compress_workmem) - goto free_meta; - - meta->compress_buffer = - (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, 1); - if (!meta->compress_buffer) { - pr_err("Error allocating compressor buffer space\n"); - goto free_workmem; - } - - num_pages = disksize >> PAGE_SHIFT; - meta->table = vzalloc(num_pages * sizeof(*meta->table)); - if (!meta->table) { - pr_err("Error allocating zram address table\n"); - goto free_buffer; - } - - meta->mem_pool = zs_create_pool(GFP_NOIO | __GFP_HIGHMEM); - if (!meta->mem_pool) { - pr_err("Error creating memory pool\n"); - goto free_table; - } - - return meta; - -free_table: - vfree(meta->table); -free_buffer: - free_pages((unsigned 
long)meta->compress_buffer, 1); -free_workmem: - kfree(meta->compress_workmem); -free_meta: - kfree(meta); - meta = NULL; -out: - return meta; -} - -void zram_init_device(struct zram *zram, struct zram_meta *meta) -{ - if (zram->disksize > 2 * (totalram_pages << PAGE_SHIFT)) { - pr_info( - "There is little point creating a zram of greater than " - "twice the size of memory since we expect a 2:1 compression " - "ratio. Note that zram uses about 0.1%% of the size of " - "the disk when not in use so a huge zram is " - "wasteful.\n" - "\tMemory Size: %lu kB\n" - "\tSize you selected: %llu kB\n" - "Continuing anyway ...\n", - (totalram_pages << PAGE_SHIFT) >> 10, zram->disksize >> 10 - ); - } - - /* zram devices sort of resembles non-rotational disks */ - queue_flag_set_unlocked(QUEUE_FLAG_NONROT, zram->disk->queue); - - zram->meta = meta; - zram->init_done = 1; - - pr_debug("Initialization done!\n"); -} - static void zram_slot_free_notify(struct block_device *bdev, unsigned long index) { @@ -583,6 +733,38 @@ static const struct block_device_operations zram_devops = { .owner = THIS_MODULE }; +static DEVICE_ATTR(disksize, S_IRUGO | S_IWUSR, + disksize_show, disksize_store); +static DEVICE_ATTR(initstate, S_IRUGO, initstate_show, NULL); +static DEVICE_ATTR(reset, S_IWUSR, NULL, reset_store); +static DEVICE_ATTR(num_reads, S_IRUGO, num_reads_show, NULL); +static DEVICE_ATTR(num_writes, S_IRUGO, num_writes_show, NULL); +static DEVICE_ATTR(invalid_io, S_IRUGO, invalid_io_show, NULL); +static DEVICE_ATTR(notify_free, S_IRUGO, notify_free_show, NULL); +static DEVICE_ATTR(zero_pages, S_IRUGO, zero_pages_show, NULL); +static DEVICE_ATTR(orig_data_size, S_IRUGO, orig_data_size_show, NULL); +static DEVICE_ATTR(compr_data_size, S_IRUGO, compr_data_size_show, NULL); +static DEVICE_ATTR(mem_used_total, S_IRUGO, mem_used_total_show, NULL); + +static struct attribute *zram_disk_attrs[] = { + &dev_attr_disksize.attr, + &dev_attr_initstate.attr, + &dev_attr_reset.attr, + 
&dev_attr_num_reads.attr, + &dev_attr_num_writes.attr, + &dev_attr_invalid_io.attr, + &dev_attr_notify_free.attr, + &dev_attr_zero_pages.attr, + &dev_attr_orig_data_size.attr, + &dev_attr_compr_data_size.attr, + &dev_attr_mem_used_total.attr, + NULL, +}; + +static struct attribute_group zram_disk_attr_group = { + .attrs = zram_disk_attrs, +}; + static int create_device(struct zram *zram, int device_id) { int ret = -ENOMEM; @@ -728,12 +910,12 @@ static void __exit zram_exit(void) pr_debug("Cleanup done!\n"); } -module_param(num_devices, uint, 0); -MODULE_PARM_DESC(num_devices, "Number of zram devices"); - module_init(zram_init); module_exit(zram_exit); +module_param(num_devices, uint, 0); +MODULE_PARM_DESC(num_devices, "Number of zram devices"); + MODULE_LICENSE("Dual BSD/GPL"); MODULE_AUTHOR("Nitin Gupta "); MODULE_DESCRIPTION("Compressed RAM Block Device"); diff --git a/drivers/staging/zram/zram_drv.h b/drivers/staging/zram/zram_drv.h index 11b09fc25953..9e57bfb29b4f 100644 --- a/drivers/staging/zram/zram_drv.h +++ b/drivers/staging/zram/zram_drv.h @@ -112,14 +112,4 @@ struct zram { struct zram_stats stats; }; - -#ifdef CONFIG_SYSFS -extern struct attribute_group zram_disk_attr_group; -#endif - -extern void zram_reset_device(struct zram *zram); -extern struct zram_meta *zram_meta_alloc(u64 disksize); -extern void zram_meta_free(struct zram_meta *meta); -extern void zram_init_device(struct zram *zram, struct zram_meta *meta); - #endif diff --git a/drivers/staging/zram/zram_sysfs.c b/drivers/staging/zram/zram_sysfs.c deleted file mode 100644 index 93a2f9cafd7c..000000000000 --- a/drivers/staging/zram/zram_sysfs.c +++ /dev/null @@ -1,209 +0,0 @@ -/* - * Compressed RAM block device - * - * Copyright (C) 2008, 2009, 2010 Nitin Gupta - * - * This code is released using a dual license strategy: BSD/GPL - * You can choose the licence that better fits your requirements. 
- * - * Released under the terms of 3-clause BSD License - * Released under the terms of GNU General Public License Version 2.0 - * - * Project home: http://compcache.googlecode.com/ - */ - -#include -#include -#include -#include - -#include "zram_drv.h" - -static inline struct zram *dev_to_zram(struct device *dev) -{ - return (struct zram *)dev_to_disk(dev)->private_data; -} - -static ssize_t disksize_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); - - return sprintf(buf, "%llu\n", zram->disksize); -} - -static ssize_t disksize_store(struct device *dev, - struct device_attribute *attr, const char *buf, size_t len) -{ - u64 disksize; - struct zram_meta *meta; - struct zram *zram = dev_to_zram(dev); - - disksize = memparse(buf, NULL); - if (!disksize) - return -EINVAL; - - disksize = PAGE_ALIGN(disksize); - meta = zram_meta_alloc(disksize); - down_write(&zram->init_lock); - if (zram->init_done) { - up_write(&zram->init_lock); - zram_meta_free(meta); - pr_info("Cannot change disksize for initialized device\n"); - return -EBUSY; - } - - zram->disksize = disksize; - set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT); - zram_init_device(zram, meta); - up_write(&zram->init_lock); - - return len; -} - -static ssize_t initstate_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); - - return sprintf(buf, "%u\n", zram->init_done); -} - -static ssize_t reset_store(struct device *dev, - struct device_attribute *attr, const char *buf, size_t len) -{ - int ret; - unsigned short do_reset; - struct zram *zram; - struct block_device *bdev; - - zram = dev_to_zram(dev); - bdev = bdget_disk(zram->disk, 0); - - /* Do not reset an active device! 
*/ - if (bdev->bd_holders) - return -EBUSY; - - ret = kstrtou16(buf, 10, &do_reset); - if (ret) - return ret; - - if (!do_reset) - return -EINVAL; - - /* Make sure all pending I/O is finished */ - if (bdev) - fsync_bdev(bdev); - - zram_reset_device(zram); - return len; -} - -static ssize_t num_reads_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); - - return sprintf(buf, "%llu\n", - (u64)atomic64_read(&zram->stats.num_reads)); -} - -static ssize_t num_writes_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); - - return sprintf(buf, "%llu\n", - (u64)atomic64_read(&zram->stats.num_writes)); -} - -static ssize_t invalid_io_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); - - return sprintf(buf, "%llu\n", - (u64)atomic64_read(&zram->stats.invalid_io)); -} - -static ssize_t notify_free_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); - - return sprintf(buf, "%llu\n", - (u64)atomic64_read(&zram->stats.notify_free)); -} - -static ssize_t zero_pages_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); - - return sprintf(buf, "%u\n", zram->stats.pages_zero); -} - -static ssize_t orig_data_size_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); - - return sprintf(buf, "%llu\n", - (u64)(zram->stats.pages_stored) << PAGE_SHIFT); -} - -static ssize_t compr_data_size_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); - - return sprintf(buf, "%llu\n", - (u64)atomic64_read(&zram->stats.compr_size)); -} - -static ssize_t mem_used_total_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - u64 val = 0; - struct zram *zram = 
dev_to_zram(dev); - struct zram_meta *meta = zram->meta; - - down_read(&zram->init_lock); - if (zram->init_done) - val = zs_get_total_size_bytes(meta->mem_pool); - up_read(&zram->init_lock); - - return sprintf(buf, "%llu\n", val); -} - -static DEVICE_ATTR(disksize, S_IRUGO | S_IWUSR, - disksize_show, disksize_store); -static DEVICE_ATTR(initstate, S_IRUGO, initstate_show, NULL); -static DEVICE_ATTR(reset, S_IWUSR, NULL, reset_store); -static DEVICE_ATTR(num_reads, S_IRUGO, num_reads_show, NULL); -static DEVICE_ATTR(num_writes, S_IRUGO, num_writes_show, NULL); -static DEVICE_ATTR(invalid_io, S_IRUGO, invalid_io_show, NULL); -static DEVICE_ATTR(notify_free, S_IRUGO, notify_free_show, NULL); -static DEVICE_ATTR(zero_pages, S_IRUGO, zero_pages_show, NULL); -static DEVICE_ATTR(orig_data_size, S_IRUGO, orig_data_size_show, NULL); -static DEVICE_ATTR(compr_data_size, S_IRUGO, compr_data_size_show, NULL); -static DEVICE_ATTR(mem_used_total, S_IRUGO, mem_used_total_show, NULL); - -static struct attribute *zram_disk_attrs[] = { - &dev_attr_disksize.attr, - &dev_attr_initstate.attr, - &dev_attr_reset.attr, - &dev_attr_num_reads.attr, - &dev_attr_num_writes.attr, - &dev_attr_invalid_io.attr, - &dev_attr_notify_free.attr, - &dev_attr_zero_pages.attr, - &dev_attr_orig_data_size.attr, - &dev_attr_compr_data_size.attr, - &dev_attr_mem_used_total.attr, - NULL, -}; - -struct attribute_group zram_disk_attr_group = { - .attrs = zram_disk_attrs, -}; From 687f091b39ef238b975b6ad4e4e282ee9aade598 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 26 Jun 2013 15:28:39 +0300 Subject: [PATCH 07/82] staging: zram: protect zram_reset_device() call Commit 9b3bb7abcdf2df0f1b2657e6cbc9d06bc2b3b36f (remove zram_sysfs file (v2)) accidentally made zram_reset_device() racy. Protect zram_reset_device() call with zram->lock. 
Signed-off-by: Sergey Senozhatsky Acked-by: Jerome Marchand Signed-off-by: Greg Kroah-Hartman (cherry picked from commit 644d478793c6594277f8ae76954da4ace7ac6f96) Signed-off-by: Alex Shi --- drivers/staging/zram/zram_drv.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/staging/zram/zram_drv.c b/drivers/staging/zram/zram_drv.c index 753877431b5f..c549e3940bcf 100644 --- a/drivers/staging/zram/zram_drv.c +++ b/drivers/staging/zram/zram_drv.c @@ -527,8 +527,11 @@ static void zram_reset_device(struct zram *zram) size_t index; struct zram_meta *meta; - if (!zram->init_done) + down_write(&zram->init_lock); + if (!zram->init_done) { + up_write(&zram->init_lock); return; + } meta = zram->meta; zram->init_done = 0; @@ -549,6 +552,7 @@ static void zram_reset_device(struct zram *zram) zram->disksize = 0; set_capacity(zram->disk, 0); + up_write(&zram->init_lock); } static void zram_init_device(struct zram *zram, struct zram_meta *meta) From 068e927e51265b16110951101506d112ef38fb36 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 12 Jul 2013 14:20:52 -0400 Subject: [PATCH 08/82] staging: zram: Add auto loading of module if user opens /dev/zram. Greg spotted that said driver is not subscribing to the automagic mechanism of auto-loading if a user tries to open /dev/zram. This fixes it. 
CC: Minchan Kim Signed-off-by: Konrad Rzeszutek Wilk Signed-off-by: Greg Kroah-Hartman (cherry picked from commit c70bda992c12e593e411c02a52e4bd6985407539) Signed-off-by: Alex Shi --- drivers/staging/zram/zram_drv.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/staging/zram/zram_drv.c b/drivers/staging/zram/zram_drv.c index c549e3940bcf..77f40a7a6726 100644 --- a/drivers/staging/zram/zram_drv.c +++ b/drivers/staging/zram/zram_drv.c @@ -923,3 +923,4 @@ MODULE_PARM_DESC(num_devices, "Number of zram devices"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_AUTHOR("Nitin Gupta "); MODULE_DESCRIPTION("Compressed RAM Block Device"); +MODULE_ALIAS("devname:zram"); From 1da6589d327fdfbb2c91551ea9047e598b4115f5 Mon Sep 17 00:00:00 2001 From: Sunghan Suh Date: Wed, 3 Jul 2013 20:10:05 +0900 Subject: [PATCH 09/82] zram: prevent data loss in error cases of function zram_bvec_write() In function zram_bvec_write(), previous data at the index is already freed by function zram_free_page(). When failed to compress or zs_malloc, there is no way to restore old data. Therefore, free previous data when it's about to update. Also, no need to check whether table is not empty outside of function zram_free_page(), because the function properly checks inside. Signed-off-by: Sunghan Suh Signed-off-by: Greg Kroah-Hartman (cherry picked from commit f40ac2ae1b506484dd9261a24bbf3e86b2206ff8) Signed-off-by: Alex Shi --- drivers/staging/zram/zram_drv.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/staging/zram/zram_drv.c b/drivers/staging/zram/zram_drv.c index 77f40a7a6726..84df3999d6af 100644 --- a/drivers/staging/zram/zram_drv.c +++ b/drivers/staging/zram/zram_drv.c @@ -418,14 +418,6 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, goto out; } - /* - * System overwrites unused sectors. Free memory associated - * with this sector now. 
- */ - if (meta->table[index].handle || - zram_test_flag(meta, index, ZRAM_ZERO)) - zram_free_page(zram, index); - user_mem = kmap_atomic(page); if (is_partial_io(bvec)) { @@ -439,6 +431,9 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, if (page_zero_filled(uncmem)) { kunmap_atomic(user_mem); + /* Free memory associated with this sector now. */ + zram_free_page(zram, index); + zram->stats.pages_zero++; zram_set_flag(meta, index, ZRAM_ZERO); ret = 0; @@ -486,6 +481,12 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, zs_unmap_object(meta->mem_pool, handle); + /* + * Free memory associated with this sector + * before overwriting unused sectors. + */ + zram_free_page(zram, index); + meta->table[index].handle = handle; meta->table[index].size = clen; From b237ffc2780c1c6d082930835e383143a51ea405 Mon Sep 17 00:00:00 2001 From: Kumar Gaurav Date: Thu, 8 Aug 2013 23:53:24 +0530 Subject: [PATCH 10/82] Staging: zram: zram_drv.c: Fixed Error of trailing whitespace Fixed by removing trailing whitespace Signed-off-by: Kumar Gaurav Signed-off-by: Greg Kroah-Hartman (cherry picked from commit a539c72a195c081d950475c2945cb82d80be9b66) Signed-off-by: Alex Shi --- drivers/staging/zram/zram_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/staging/zram/zram_drv.c b/drivers/staging/zram/zram_drv.c index 84df3999d6af..35d536a11395 100644 --- a/drivers/staging/zram/zram_drv.c +++ b/drivers/staging/zram/zram_drv.c @@ -169,7 +169,7 @@ static inline int is_partial_io(struct bio_vec *bvec) static inline int valid_io_request(struct zram *zram, struct bio *bio) { u64 start, end, bound; - + /* unaligned request */ if (unlikely(bio->bi_sector & (ZRAM_SECTOR_PER_LOGICAL_BLOCK - 1))) return 0; From 5fc58bd448ae1b2bafd83fcb5e2c0d65fcee2c37 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 12 Aug 2013 15:13:55 +0900 Subject: [PATCH 11/82] zram: fix invalid memory access [1] tried to fix invalid 
memory access on zram->disk but it didn't fix it properly because get_disk failed during the module exit path. Actually, we don't need to reset zram->disk's capacity to zero in the module exit path, so this patch introduces a new argument "reset_capacity" to zram_reset_device, and the capacity is only reset when reset_store is called. [1] 6030ea9b, zram: avoid invalid memory access in zram_exit() Cc: Nitin Gupta Cc: Jiang Liu Cc: stable@vger.kernel.org Signed-off-by: Minchan Kim Signed-off-by: Greg Kroah-Hartman (cherry picked from commit 2b86ab9cc29fcd435cde9378c3b9ffe8b5c76128) Signed-off-by: Alex Shi --- drivers/staging/zram/zram_drv.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/staging/zram/zram_drv.c b/drivers/staging/zram/zram_drv.c index 35d536a11395..255d512763f2 100644 --- a/drivers/staging/zram/zram_drv.c +++ b/drivers/staging/zram/zram_drv.c @@ -523,7 +523,7 @@ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, return ret; } -static void zram_reset_device(struct zram *zram) +static void zram_reset_device(struct zram *zram, bool reset_capacity) { size_t index; struct zram_meta *meta; @@ -552,7 +552,8 @@ static void zram_reset_device(struct zram *zram) memset(&zram->stats, 0, sizeof(zram->stats)); zram->disksize = 0; - set_capacity(zram->disk, 0); + if (reset_capacity) + set_capacity(zram->disk, 0); up_write(&zram->init_lock); } @@ -636,7 +637,7 @@ static ssize_t reset_store(struct device *dev, if (bdev) fsync_bdev(bdev); - zram_reset_device(zram); + zram_reset_device(zram, true); return len; } @@ -903,10 +904,12 @@ static void __exit zram_exit(void) for (i = 0; i < num_devices; i++) { zram = &zram_devices[i]; - get_disk(zram->disk); destroy_device(zram); - zram_reset_device(zram); - put_disk(zram->disk); + /* + * Shouldn't access zram->disk after destroy_device + * because destroy_device already released zram->disk.
+ */ + zram_reset_device(zram, false); } unregister_blkdev(zram_major, "zram"); From 92fe27ba3d29df5971bec2f2b55ff350194772a4 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 12 Aug 2013 15:13:56 +0900 Subject: [PATCH 12/82] zram: don't grab mutex in zram_slot_free_noity [1] introduced down_write in zram_slot_free_notify to prevent a race between zram_slot_free_notify and zram_bvec_[read|write]. The race could happen if somebody who has permission to open the swap device is reading it while it is being used by swap in parallel. However, zram_slot_free_notify is called while holding a spin_lock of the swap layer, so we should avoid taking a mutex there. Otherwise, lockdep warns about it. This patch adds a new list of slots to be freed and a work item, so zram_slot_free_notify just registers the slot index to be freed and schedules the request on the workqueue. When the work runs, it takes the lock, so there is no problem any more. If any I/O is issued, zram handles the pending slot-free requests caused by zram_slot_free_notify right before handling the issued request, because the work might not have run yet and the zram I/O request handling function could otherwise miss them. Lastly, when zram is reset, flush_work handles all pending free requests, so we shouldn't have a memory leak. NOTE: If zram_slot_free_notify's kmalloc with GFP_ATOMIC fails, the slot will be freed when the next write I/O writes the slot.
[1] [57ab0485, zram: use zram->lock to protect zram_free_page() in swap free notify path] * from v2 * refactoring * from v1 * totally redesign Cc: Nitin Gupta Cc: Jiang Liu Cc: stable@vger.kernel.org Signed-off-by: Minchan Kim Signed-off-by: Greg Kroah-Hartman (cherry picked from commit a0c516cbfc7452c8cbd564525fef66d9f20b46d1) Signed-off-by: Alex Shi --- drivers/staging/zram/zram_drv.c | 60 +++++++++++++++++++++++++++++++-- drivers/staging/zram/zram_drv.h | 10 ++++++ 2 files changed, 67 insertions(+), 3 deletions(-) diff --git a/drivers/staging/zram/zram_drv.c b/drivers/staging/zram/zram_drv.c index 255d512763f2..3d08ff11e700 100644 --- a/drivers/staging/zram/zram_drv.c +++ b/drivers/staging/zram/zram_drv.c @@ -440,6 +440,14 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, goto out; } + /* + * zram_slot_free_notify could miss free so that let's + * double check. + */ + if (unlikely(meta->table[index].handle || + zram_test_flag(meta, index, ZRAM_ZERO))) + zram_free_page(zram, index); + ret = lzo1x_1_compress(uncmem, PAGE_SIZE, src, &clen, meta->compress_workmem); @@ -505,6 +513,20 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, return ret; } +static void handle_pending_slot_free(struct zram *zram) +{ + struct zram_slot_free *free_rq; + + spin_lock(&zram->slot_free_lock); + while (zram->slot_free_rq) { + free_rq = zram->slot_free_rq; + zram->slot_free_rq = free_rq->next; + zram_free_page(zram, free_rq->index); + kfree(free_rq); + } + spin_unlock(&zram->slot_free_lock); +} + static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, int offset, struct bio *bio, int rw) { @@ -512,10 +534,12 @@ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, if (rw == READ) { down_read(&zram->lock); + handle_pending_slot_free(zram); ret = zram_bvec_read(zram, bvec, index, offset, bio); up_read(&zram->lock); } else { down_write(&zram->lock); + handle_pending_slot_free(zram); 
ret = zram_bvec_write(zram, bvec, index, offset); up_write(&zram->lock); } @@ -528,6 +552,8 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity) size_t index; struct zram_meta *meta; + flush_work(&zram->free_work); + down_write(&zram->init_lock); if (!zram->init_done) { up_write(&zram->init_lock); @@ -722,16 +748,40 @@ static void zram_make_request(struct request_queue *queue, struct bio *bio) bio_io_error(bio); } +static void zram_slot_free(struct work_struct *work) +{ + struct zram *zram; + + zram = container_of(work, struct zram, free_work); + down_write(&zram->lock); + handle_pending_slot_free(zram); + up_write(&zram->lock); +} + +static void add_slot_free(struct zram *zram, struct zram_slot_free *free_rq) +{ + spin_lock(&zram->slot_free_lock); + free_rq->next = zram->slot_free_rq; + zram->slot_free_rq = free_rq; + spin_unlock(&zram->slot_free_lock); +} + static void zram_slot_free_notify(struct block_device *bdev, unsigned long index) { struct zram *zram; + struct zram_slot_free *free_rq; zram = bdev->bd_disk->private_data; - down_write(&zram->lock); - zram_free_page(zram, index); - up_write(&zram->lock); atomic64_inc(&zram->stats.notify_free); + + free_rq = kmalloc(sizeof(struct zram_slot_free), GFP_ATOMIC); + if (!free_rq) + return; + + free_rq->index = index; + add_slot_free(zram, free_rq); + schedule_work(&zram->free_work); } static const struct block_device_operations zram_devops = { @@ -778,6 +828,10 @@ static int create_device(struct zram *zram, int device_id) init_rwsem(&zram->lock); init_rwsem(&zram->init_lock); + INIT_WORK(&zram->free_work, zram_slot_free); + spin_lock_init(&zram->slot_free_lock); + zram->slot_free_rq = NULL; + zram->queue = blk_alloc_queue(GFP_KERNEL); if (!zram->queue) { pr_err("Error allocating disk queue for device %d\n", diff --git a/drivers/staging/zram/zram_drv.h b/drivers/staging/zram/zram_drv.h index 9e57bfb29b4f..97a3acf6ab76 100644 --- a/drivers/staging/zram/zram_drv.h +++ 
b/drivers/staging/zram/zram_drv.h @@ -94,11 +94,20 @@ struct zram_meta { struct zs_pool *mem_pool; }; +struct zram_slot_free { + unsigned long index; + struct zram_slot_free *next; +}; + struct zram { struct zram_meta *meta; struct rw_semaphore lock; /* protect compression buffers, table, * 32bit stat counters against concurrent * notifications, reads and writes */ + + struct work_struct free_work; /* handle pending free request */ + struct zram_slot_free *slot_free_rq; /* list head of free request */ + struct request_queue *queue; struct gendisk *disk; int init_done; @@ -109,6 +118,7 @@ struct zram { * we can store in a disk. */ u64 disksize; /* bytes */ + spinlock_t slot_free_lock; struct zram_stats stats; }; From e2671233fa7df020b7ea720db33f1237c418f3ab Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 12 Sep 2013 15:41:31 -0700 Subject: [PATCH 13/82] Revert "staging: zram: Add auto loading of module if user opens /dev/zram." This reverts commit c70bda992c12e593e411c02a52e4bd6985407539. It's incorrect, Kay writes: Please just remove it. "devname" is meant to be used for single-instance devices with a static dev_t, never for things like zramX. It will not do anything useful here, it does nothing really without a statically assigned dev_t, and it should not be used for devices of this kind anyway. 
Reported-by: Tom Gundersen Reported-by: Kay Sievers Cc: Minchan Kim Cc: Konrad Rzeszutek Wilk Signed-off-by: Greg Kroah-Hartman (cherry picked from commit f0f65a95de2840db3fa61c953dca267e7b773168) Signed-off-by: Alex Shi --- drivers/staging/zram/zram_drv.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/staging/zram/zram_drv.c b/drivers/staging/zram/zram_drv.c index 3d08ff11e700..7d8ff31f67f2 100644 --- a/drivers/staging/zram/zram_drv.c +++ b/drivers/staging/zram/zram_drv.c @@ -981,4 +981,3 @@ MODULE_PARM_DESC(num_devices, "Number of zram devices"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_AUTHOR("Nitin Gupta "); MODULE_DESCRIPTION("Compressed RAM Block Device"); -MODULE_ALIAS("devname:zram"); From 922959916229ecc5f143b0fe66a9a8bbf4b55169 Mon Sep 17 00:00:00 2001 From: Rashika Kheria Date: Wed, 30 Oct 2013 18:43:32 +0530 Subject: [PATCH 14/82] Staging: zram: Fix variable dereferenced before check This patch fixes the following Smatch warning in zram_drv.c- drivers/staging/zram/zram_drv.c:899 destroy_device() warn: variable dereferenced before check 'zram->disk' (see line 896) Acked-by: Minchan Kim Acked-by: Jerome Marchand Signed-off-by: Rashika Kheria Signed-off-by: Greg Kroah-Hartman (cherry picked from commit 59d3fe540454dd8fc48d4eda44e200f9c98bef10) Signed-off-by: Alex Shi --- drivers/staging/zram/zram_drv.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/staging/zram/zram_drv.c b/drivers/staging/zram/zram_drv.c index 7d8ff31f67f2..a125cfae6942 100644 --- a/drivers/staging/zram/zram_drv.c +++ b/drivers/staging/zram/zram_drv.c @@ -896,13 +896,10 @@ static void destroy_device(struct zram *zram) sysfs_remove_group(&disk_to_dev(zram->disk)->kobj, &zram_disk_attr_group); - if (zram->disk) { - del_gendisk(zram->disk); - put_disk(zram->disk); - } + del_gendisk(zram->disk); + put_disk(zram->disk); - if (zram->queue) - blk_cleanup_queue(zram->queue); + blk_cleanup_queue(zram->queue); } static int __init zram_init(void) From 
3c073fe1e7f533e087a8e2faaff31f8b02e6aed9 Mon Sep 17 00:00:00 2001 From: Rashika Kheria Date: Wed, 30 Oct 2013 18:36:32 +0530 Subject: [PATCH 15/82] Staging: zram: Fix access of NULL pointer This patch fixes the bug in reset_store caused by accessing NULL pointer. The bdev gets its value from bdget_disk() which could fail when memory pressure is severe and hence can return NULL because allocation of inode in bdget could fail. Hence, this patch introduces a check for bdev to prevent reference to a NULL pointer in the later part of the code. It also removes unnecessary check of bdev for fsync_bdev(). Cc: stable Acked-by: Jerome Marchand Signed-off-by: Rashika Kheria Acked-by: Minchan Kim Signed-off-by: Greg Kroah-Hartman (cherry picked from commit 46a51c80216cb891f271ad021f59009f34677499) Signed-off-by: Alex Shi --- drivers/staging/zram/zram_drv.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/staging/zram/zram_drv.c b/drivers/staging/zram/zram_drv.c index a125cfae6942..206f59d9a7a8 100644 --- a/drivers/staging/zram/zram_drv.c +++ b/drivers/staging/zram/zram_drv.c @@ -648,6 +648,9 @@ static ssize_t reset_store(struct device *dev, zram = dev_to_zram(dev); bdev = bdget_disk(zram->disk, 0); + if (!bdev) + return -ENOMEM; + /* Do not reset an active device! */ if (bdev->bd_holders) return -EBUSY; @@ -660,8 +663,7 @@ static ssize_t reset_store(struct device *dev, return -EINVAL; /* Make sure all pending I/O is finished */ - if (bdev) - fsync_bdev(bdev); + fsync_bdev(bdev); zram_reset_device(zram, true); return len; From 2b299eb831324558d68be874fa1c9d65a8cfe5f3 Mon Sep 17 00:00:00 2001 From: Rashika Kheria Date: Sun, 10 Nov 2013 22:13:53 +0530 Subject: [PATCH 16/82] Staging: zram: Fix memory leak by refcount mismatch As suggested by Minchan Kim and Jerome Marchand "The code in reset_store get the block device (bdget_disk()) but it does not put it (bdput()) when it's done using it. 
The usage count is therefore incremented but never decremented." This patch also puts bdput() for all error cases. Acked-by: Minchan Kim Acked-by: Jerome Marchand Cc: stable@vger.kernel.org Signed-off-by: Rashika Kheria Signed-off-by: Greg Kroah-Hartman (cherry picked from commit 1b672224d128ec2570eb37572ff803cfe452b4f7) Signed-off-by: Alex Shi --- drivers/staging/zram/zram_drv.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/drivers/staging/zram/zram_drv.c b/drivers/staging/zram/zram_drv.c index 206f59d9a7a8..689ebf105acd 100644 --- a/drivers/staging/zram/zram_drv.c +++ b/drivers/staging/zram/zram_drv.c @@ -652,21 +652,30 @@ static ssize_t reset_store(struct device *dev, return -ENOMEM; /* Do not reset an active device! */ - if (bdev->bd_holders) - return -EBUSY; + if (bdev->bd_holders) { + ret = -EBUSY; + goto out; + } ret = kstrtou16(buf, 10, &do_reset); if (ret) - return ret; + goto out; - if (!do_reset) - return -EINVAL; + if (!do_reset) { + ret = -EINVAL; + goto out; + } /* Make sure all pending I/O is finished */ fsync_bdev(bdev); + bdput(bdev); zram_reset_device(zram, true); return len; + +out: + bdput(bdev); + return ret; } static void __zram_make_request(struct zram *zram, struct bio *bio, int rw) From 03dc2ac5b10ee9ab68090a486d54a7d53492c86d Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 30 Jan 2014 15:45:50 -0800 Subject: [PATCH 17/82] zsmalloc: move it under mm This patch moves zsmalloc under mm directory. Before that, description will explain why we have needed custom allocator. Zsmalloc is a new slab-based memory allocator for storing compressed pages. It is designed for low fragmentation and high allocation success rate on large object, but <= PAGE_SIZE allocations. zsmalloc differs from the kernel slab allocator in two primary ways to achieve these design goals. zsmalloc never requires high order page allocations to back slabs, or "size classes" in zsmalloc terms. 
Instead it allows multiple single-order pages to be stitched together into a "zspage" which backs the slab. This allows for a higher allocation success rate under memory pressure. Also, zsmalloc allows objects to span page boundaries within the zspage. This allows for lower fragmentation than could be had with the kernel slab allocator for objects between PAGE_SIZE/2 and PAGE_SIZE. With the kernel slab allocator, if a page compresses to 60% of its original size, the memory savings gained through compression are lost to fragmentation because another object of the same size can't be stored in the leftover space. This ability to span pages results in zsmalloc allocations not being directly addressable by the user. The user is given a non-dereferenceable handle in response to an allocation request. That handle must be mapped, using zs_map_object(), which returns a pointer to the mapped region that can be used. The mapping is necessary since the object data may reside in two different noncontiguous pages.
The zsmalloc fulfills the allocation needs for zram perfectly [sjenning@linux.vnet.ibm.com: borrow Seth's quote] Signed-off-by: Minchan Kim Acked-by: Nitin Gupta Reviewed-by: Konrad Rzeszutek Wilk Cc: Bob Liu Cc: Greg Kroah-Hartman Cc: Hugh Dickins Cc: Jens Axboe Cc: Luigi Semenzato Cc: Mel Gorman Cc: Pekka Enberg Cc: Rik van Riel Cc: Seth Jennings Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit bcf1647d0899666f0fb90d176abf63bae22abb7c) Signed-off-by: Alex Shi Conflicts: drivers/staging/zsmalloc/Kconfig mm/Kconfig mm/Makefile Conflicts solutions: only move zsmalloc to mm/, skip unrelated cma/zbud/zswap --- drivers/staging/Kconfig | 2 -- drivers/staging/Makefile | 1 - drivers/staging/zram/zram_drv.h | 3 +-- drivers/staging/zsmalloc/Makefile | 3 --- .../zsmalloc => include/linux}/zsmalloc.h | 0 mm/Kconfig | 25 +++++++++++++++++++ mm/Makefile | 1 + .../zsmalloc/zsmalloc-main.c => mm/zsmalloc.c | 3 +-- 8 files changed, 28 insertions(+), 10 deletions(-) delete mode 100644 drivers/staging/zsmalloc/Makefile rename {drivers/staging/zsmalloc => include/linux}/zsmalloc.h (100%) rename drivers/staging/zsmalloc/zsmalloc-main.c => mm/zsmalloc.c (99%) diff --git a/drivers/staging/Kconfig b/drivers/staging/Kconfig index aefe820a8005..60585217481f 100644 --- a/drivers/staging/Kconfig +++ b/drivers/staging/Kconfig @@ -72,8 +72,6 @@ source "drivers/staging/sep/Kconfig" source "drivers/staging/iio/Kconfig" -source "drivers/staging/zsmalloc/Kconfig" - source "drivers/staging/zram/Kconfig" source "drivers/staging/wlags49_h2/Kconfig" diff --git a/drivers/staging/Makefile b/drivers/staging/Makefile index 415772ea306d..29aaeaa283eb 100644 --- a/drivers/staging/Makefile +++ b/drivers/staging/Makefile @@ -31,7 +31,6 @@ obj-$(CONFIG_VME_BUS) += vme/ obj-$(CONFIG_DX_SEP) += sep/ obj-$(CONFIG_IIO) += iio/ obj-$(CONFIG_ZRAM) += zram/ -obj-$(CONFIG_ZSMALLOC) += zsmalloc/ obj-$(CONFIG_WLAGS49_H2) += wlags49_h2/ obj-$(CONFIG_WLAGS49_H25) += wlags49_h25/ 
obj-$(CONFIG_FB_SM7XX) += sm7xxfb/ diff --git a/drivers/staging/zram/zram_drv.h b/drivers/staging/zram/zram_drv.h index 97a3acf6ab76..d8f6596513c3 100644 --- a/drivers/staging/zram/zram_drv.h +++ b/drivers/staging/zram/zram_drv.h @@ -17,8 +17,7 @@ #include #include - -#include "../zsmalloc/zsmalloc.h" +#include /* * Some arbitrary value. This is just to catch diff --git a/drivers/staging/zsmalloc/Makefile b/drivers/staging/zsmalloc/Makefile deleted file mode 100644 index b134848a590d..000000000000 --- a/drivers/staging/zsmalloc/Makefile +++ /dev/null @@ -1,3 +0,0 @@ -zsmalloc-y := zsmalloc-main.o - -obj-$(CONFIG_ZSMALLOC) += zsmalloc.o diff --git a/drivers/staging/zsmalloc/zsmalloc.h b/include/linux/zsmalloc.h similarity index 100% rename from drivers/staging/zsmalloc/zsmalloc.h rename to include/linux/zsmalloc.h diff --git a/mm/Kconfig b/mm/Kconfig index e742d06285b7..86919079b64c 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -477,3 +477,28 @@ config FRONTSWAP and swap data is stored as normal on the matching swap device. If unsure, say Y to enable frontswap. + +config ZSMALLOC + bool "Memory allocator for compressed pages" + depends on MMU + default n + help + zsmalloc is a slab-based memory allocator designed to store + compressed RAM pages. zsmalloc uses virtual memory mapping + in order to reduce fragmentation. However, this results in a + non-standard allocator interface where a handle, not a pointer, is + returned by an alloc(). This handle must be mapped in order to + access the allocated space. + +config PGTABLE_MAPPING + bool "Use page table mapping to access object in zsmalloc" + depends on ZSMALLOC + help + By default, zsmalloc uses a copy-based object mapping method to + access allocations that span two pages. However, if a particular + architecture (ex, ARM) performs VM mapping faster than copying, + then you should select this. This causes zsmalloc to use page table + mapping rather than copying for object mapping. 
+ + You can check speed with zsmalloc benchmark[1]. + [1] https://github.com/spartacus06/zsmalloc diff --git a/mm/Makefile b/mm/Makefile index 72c5acb9345f..b5ae0b0cc26c 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -58,3 +58,4 @@ obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o obj-$(CONFIG_CLEANCACHE) += cleancache.o obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o +obj-$(CONFIG_ZSMALLOC) += zsmalloc.o diff --git a/drivers/staging/zsmalloc/zsmalloc-main.c b/mm/zsmalloc.c similarity index 99% rename from drivers/staging/zsmalloc/zsmalloc-main.c rename to mm/zsmalloc.c index 288f58252a18..6ad98bb06411 100644 --- a/drivers/staging/zsmalloc/zsmalloc-main.c +++ b/mm/zsmalloc.c @@ -78,8 +78,7 @@ #include #include #include - -#include "zsmalloc.h" +#include /* * This must be power of 2 and greater than of equal to sizeof(link_free). From 68955a0e9bb5f971229d3cabed259b31b39bda89 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 30 Jan 2014 15:45:52 -0800 Subject: [PATCH 18/82] zram: promote zram from staging Zram has lived in staging for a LONG LONG time and has been fixed/improved by many contributors, so the code is clean and stable now. Of course, there are lots of products using zram in real practice. The major TV companies have used zram as swap for two years, and recently our production team released an Android smartphone with zram used as swap, too, and recently Android KitKat started to use zram for small-memory smartphones. There was also a report that Google released their ChromeOS with zram, and CyanogenMod has been using zram for a long time. And I heard some distros have used a zram block device for tmpfs. In addition, I saw many reports from many other people. For example, Lubuntu started to use it. The benefit of zram is very clear. In my experience, one of the benefits was removing jitter from video applications under background memory pressure.
It would be an effect of efficient memory usage through compression, but the bigger issue is whether swap is present in the system at all. Recent mobile platforms use Java, so there are many anonymous pages. But embedded systems are normally reluctant to use eMMC or SD cards as swap because of wear-leveling and latency issues, so if we do not use swap, we can't reclaim anonymous pages and, in the end, we could encounter an OOM kill. :( Even with real storage as swap, there was a problem, too, because it sometimes ends up making the system very unresponsive due to slow swap storage performance. Quote from Luigi at Google: "Since Chrome OS was mentioned: the main reason why we don't use swap to a disk (rotating or SSD) is because it doesn't degrade gracefully and leads to a bad interactive experience. Generally we prefer to manage RAM at a higher level, by transparently killing and restarting processes. But we noticed that zram is fast enough to be competitive with the latter, and it lets us make more efficient use of the available RAM. " and he announced it: http://www.spinics.net/lists/linux-mm/msg57717.html Another use case is to use zram as a block device. Zram is a block device, so anyone can format it and mount it, and some people on the internet have started using zram as /var/tmp. http://forums.gentoo.org/viewtopic-t-838198-start-0.html Let's promote zram and enhance/maintain it instead of removing it.
Signed-off-by: Minchan Kim Reviewed-by: Konrad Rzeszutek Wilk Acked-by: Nitin Gupta Acked-by: Pekka Enberg Cc: Bob Liu Cc: Greg Kroah-Hartman Cc: Hugh Dickins Cc: Jens Axboe Cc: Luigi Semenzato Cc: Mel Gorman Cc: Rik van Riel Cc: Seth Jennings Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit cd67e10ac6997c6d1e1504e3c111b693bfdbc148) Signed-off-by: Alex Shi --- {drivers/staging/zram => Documentation/blockdev}/zram.txt | 0 drivers/block/Kconfig | 2 ++ drivers/block/Makefile | 1 + drivers/{staging => block}/zram/Kconfig | 0 drivers/{staging => block}/zram/Makefile | 0 drivers/{staging => block}/zram/zram_drv.c | 0 drivers/{staging => block}/zram/zram_drv.h | 0 drivers/staging/Kconfig | 2 -- drivers/staging/Makefile | 1 - 9 files changed, 3 insertions(+), 3 deletions(-) rename {drivers/staging/zram => Documentation/blockdev}/zram.txt (100%) rename drivers/{staging => block}/zram/Kconfig (100%) rename drivers/{staging => block}/zram/Makefile (100%) rename drivers/{staging => block}/zram/zram_drv.c (100%) rename drivers/{staging => block}/zram/zram_drv.h (100%) diff --git a/drivers/staging/zram/zram.txt b/Documentation/blockdev/zram.txt similarity index 100% rename from drivers/staging/zram/zram.txt rename to Documentation/blockdev/zram.txt diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index b81ddfea1da0..9da952c9af91 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -105,6 +105,8 @@ source "drivers/block/paride/Kconfig" source "drivers/block/mtip32xx/Kconfig" +source "drivers/block/zram/Kconfig" + config BLK_CPQ_DA tristate "Compaq SMART2 support" depends on PCI && VIRT_TO_BUS diff --git a/drivers/block/Makefile b/drivers/block/Makefile index ca07399a8d99..3675937ab651 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -41,6 +41,7 @@ obj-$(CONFIG_BLK_DEV_RBD) += rbd.o obj-$(CONFIG_BLK_DEV_PCIESSD_MTIP32XX) += mtip32xx/ obj-$(CONFIG_BLK_DEV_RSXX) += rsxx/ +obj-$(CONFIG_ZRAM) += zram/ 
nvme-y := nvme-core.o nvme-scsi.o swim_mod-y := swim.o swim_asm.o diff --git a/drivers/staging/zram/Kconfig b/drivers/block/zram/Kconfig similarity index 100% rename from drivers/staging/zram/Kconfig rename to drivers/block/zram/Kconfig diff --git a/drivers/staging/zram/Makefile b/drivers/block/zram/Makefile similarity index 100% rename from drivers/staging/zram/Makefile rename to drivers/block/zram/Makefile diff --git a/drivers/staging/zram/zram_drv.c b/drivers/block/zram/zram_drv.c similarity index 100% rename from drivers/staging/zram/zram_drv.c rename to drivers/block/zram/zram_drv.c diff --git a/drivers/staging/zram/zram_drv.h b/drivers/block/zram/zram_drv.h similarity index 100% rename from drivers/staging/zram/zram_drv.h rename to drivers/block/zram/zram_drv.h diff --git a/drivers/staging/Kconfig b/drivers/staging/Kconfig index 60585217481f..25c8bffdd248 100644 --- a/drivers/staging/Kconfig +++ b/drivers/staging/Kconfig @@ -72,8 +72,6 @@ source "drivers/staging/sep/Kconfig" source "drivers/staging/iio/Kconfig" -source "drivers/staging/zram/Kconfig" - source "drivers/staging/wlags49_h2/Kconfig" source "drivers/staging/wlags49_h25/Kconfig" diff --git a/drivers/staging/Makefile b/drivers/staging/Makefile index 29aaeaa283eb..f9d86a4b48e9 100644 --- a/drivers/staging/Makefile +++ b/drivers/staging/Makefile @@ -30,7 +30,6 @@ obj-$(CONFIG_VT6656) += vt6656/ obj-$(CONFIG_VME_BUS) += vme/ obj-$(CONFIG_DX_SEP) += sep/ obj-$(CONFIG_IIO) += iio/ -obj-$(CONFIG_ZRAM) += zram/ obj-$(CONFIG_WLAGS49_H2) += wlags49_h2/ obj-$(CONFIG_WLAGS49_H25) += wlags49_h25/ obj-$(CONFIG_FB_SM7XX) += sm7xxfb/ From 15db1d2f8df0721faf81c4e23119a716e537b6cc Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 30 Jan 2014 15:45:54 -0800 Subject: [PATCH 19/82] zram: remove old private project comment Remove the old private compcache project address so upcoming patches should be sent to LKML because we Linux kernel community will take care. 
Signed-off-by: Minchan Kim Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 49061236a9c2e18b31617cef10d27ba136068bac) Signed-off-by: Alex Shi --- Documentation/blockdev/zram.txt | 6 ------ drivers/block/zram/Kconfig | 1 - drivers/block/zram/zram_drv.c | 1 - drivers/block/zram/zram_drv.h | 1 - 4 files changed, 9 deletions(-) diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt index 765d790ae831..2eccddffa6c8 100644 --- a/Documentation/blockdev/zram.txt +++ b/Documentation/blockdev/zram.txt @@ -1,8 +1,6 @@ zram: Compressed RAM based block devices ---------------------------------------- -Project home: http://compcache.googlecode.com/ - * Introduction The zram module creates RAM based block devices named /dev/zram @@ -69,9 +67,5 @@ Following shows a typical sequence of steps for using zram. resets the disksize to zero. You must set the disksize again before reusing the device. -Please report any problems at: - - Mailing list: linux-mm-cc at laptop dot org - - Issue tracker: http://code.google.com/p/compcache/issues/list - Nitin Gupta ngupta@vflare.org diff --git a/drivers/block/zram/Kconfig b/drivers/block/zram/Kconfig index 983314c41349..3450be850399 100644 --- a/drivers/block/zram/Kconfig +++ b/drivers/block/zram/Kconfig @@ -14,7 +14,6 @@ config ZRAM disks and maybe many more. See zram.txt for more information. 
- Project home: config ZRAM_DEBUG bool "Compressed RAM block device debug support" diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 689ebf105acd..4c492eade671 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -9,7 +9,6 @@ * Released under the terms of 3-clause BSD License * Released under the terms of GNU General Public License Version 2.0 * - * Project home: http://compcache.googlecode.com */ #define KMSG_COMPONENT "zram" diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index d8f6596513c3..92f70e8f457c 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -9,7 +9,6 @@ * Released under the terms of 3-clause BSD License * Released under the terms of GNU General Public License Version 2.0 * - * Project home: http://compcache.googlecode.com */ #ifndef _ZRAM_DRV_H_ From 851a07391ee1cea24cc2a2b099e12f5ce55edca9 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 30 Jan 2014 15:45:55 -0800 Subject: [PATCH 20/82] zram: add copyright Add my copyright to the zram source code which I maintain. Signed-off-by: Minchan Kim Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 7bfb3de8a1b3bebc2dc68d381efe27448c0584c5) Signed-off-by: Alex Shi --- drivers/block/zram/zram_drv.c | 1 + drivers/block/zram/zram_drv.h | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 4c492eade671..d66b404fc535 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -2,6 +2,7 @@ * Compressed RAM block device * * Copyright (C) 2008, 2009, 2010 Nitin Gupta + * 2012, 2013 Minchan Kim * * This code is released using a dual license strategy: BSD/GPL * You can choose the licence that better fits your requirements. 
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 92f70e8f457c..0e46953c08e9 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -2,6 +2,7 @@ * Compressed RAM block device * * Copyright (C) 2008, 2009, 2010 Nitin Gupta + * 2012, 2013 Minchan Kim * * This code is released using a dual license strategy: BSD/GPL * You can choose the licence that better fits your requirements. From 00cfab35e838986cf72222f981444dab704db687 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 30 Jan 2014 15:45:55 -0800 Subject: [PATCH 21/82] zsmalloc: add copyright Add my copyright to the zsmalloc source code which I maintain. Signed-off-by: Minchan Kim Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 31fc00bb788ffde7d8d861d8b2bba798ab445992) Signed-off-by: Alex Shi --- include/linux/zsmalloc.h | 1 + mm/zsmalloc.c | 1 + 2 files changed, 2 insertions(+) diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h index 46dbd0558d86..d3f48686bceb 100644 --- a/include/linux/zsmalloc.h +++ b/include/linux/zsmalloc.h @@ -2,6 +2,7 @@ * zsmalloc memory allocator * * Copyright (C) 2011 Nitin Gupta + * Copyright (C) 2012, 2013 Minchan Kim * * This code is released using a dual license strategy: BSD/GPL * You can choose the license that better fits your requirements. diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 6ad98bb06411..7a8d161b4cd7 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -2,6 +2,7 @@ * zsmalloc memory allocator * * Copyright (C) 2011 Nitin Gupta + * Copyright (C) 2012, 2013 Minchan Kim * * This code is released using a dual license strategy: BSD/GPL * You can choose the license that better fits your requirements. 
From fbd4d659587f3be893860c57e7b338fe3f45284d Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 30 Jan 2014 15:45:58 -0800 Subject: [PATCH 22/82] zram: fix race between reset and flushing pending work Dan and Sergey reported that there is a race between reset and flushing of pending work, so it could cause an oops by freeing zram->meta in reset while zram_slot_free can still access zram->meta if a new request is added during the race window. This patch moves the flush after taking init_lock, which prevents new requests and thus closes the race. Signed-off-by: Minchan Kim Reported-by: Dan Carpenter Cc: Nitin Gupta Cc: Jerome Marchand Tested-by: Sergey Senozhatsky Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit da4a04126baa3be03bc566d4a2ee0944c5e783d0) Signed-off-by: Alex Shi --- drivers/block/zram/zram_drv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index d66b404fc535..1a377872729d 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -552,14 +552,14 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity) size_t index; struct zram_meta *meta; - flush_work(&zram->free_work); - down_write(&zram->init_lock); if (!zram->init_done) { up_write(&zram->init_lock); return; } + flush_work(&zram->free_work); + meta = zram->meta; zram->init_done = 0; From ee76411779c7fb286de437bd15367ff39ae88e6a Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 30 Jan 2014 15:46:00 -0800 Subject: [PATCH 23/82] zram: delay pending free request in read path Sergey reported that we don't need to handle pending free requests on every I/O, so this patch removes that handling from the read path while keeping it in the write path. Let's consider the example below. The swap subsystem asks zram to free block "A" via swap_slot_free_notify, but zram leaves the request pending without actually freeing the block. 
The swap subsystem then allocates block "A" for new data, but the free request that had been pending for a long time is only now handled, so zram blindly frees the new data in block "A". :( That's why we can't remove the handling of pending free requests right before zram-write. Signed-off-by: Minchan Kim Reported-by: Sergey Senozhatsky Tested-by: Sergey Senozhatsky Cc: Nitin Gupta Cc: Jerome Marchand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 9b353db16d18f87242337e3e61a948c023505a65) Signed-off-by: Alex Shi --- drivers/block/zram/zram_drv.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 1a377872729d..8b88a8d064af 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -534,7 +534,6 @@ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, if (rw == READ) { down_read(&zram->lock); - handle_pending_slot_free(zram); ret = zram_bvec_read(zram, bvec, index, offset, bio); up_read(&zram->lock); } else { From 6eb4a8a4531c2411703a9c43428ad9de7e762a2f Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 30 Jan 2014 15:46:01 -0800 Subject: [PATCH 24/82] zram: remove unnecessary free Commit a0c516cbfc74 ("zram: don't grab mutex in zram_slot_free_noity") introduced handling of pending zram slot frees in zram's write path, in case a slot free was missed due to a memory allocation failure in zram_slot_free_notify, but it is not necessary because we already free the slot right before overwriting it. 
Signed-off-by: Minchan Kim Cc: Nitin Gupta Cc: Jerome Marchand Tested-by: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 874e3cddc33f0c0f9cc08ad2b73fa0cbe7dfaa63) Signed-off-by: Alex Shi --- drivers/block/zram/zram_drv.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 8b88a8d064af..a9c236de4676 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -440,14 +440,6 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, goto out; } - /* - * zram_slot_free_notify could miss free so that let's - * double check. - */ - if (unlikely(meta->table[index].handle || - zram_test_flag(meta, index, ZRAM_ZERO))) - zram_free_page(zram, index); - ret = lzo1x_1_compress(uncmem, PAGE_SIZE, src, &clen, meta->compress_workmem); From 671b5561a6c05a36da64a2b798b1b52dada37c2c Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 30 Jan 2014 15:46:02 -0800 Subject: [PATCH 25/82] zram: use atomic operation for stat Some of the fields in zram->stats are protected by zram->lock, which is rather coarse-grained, so let's use atomic operations without explicit locking. This patch prepares for removing the dependency on zram->lock in the read path, which is a very coarse-grained rw_semaphore. Of course, this patch adds new atomic operations so it might slow things down, but my 12CPU test couldn't spot any regression. All gains/losses are marginal, within stddev. 
iozone -t -T -l 12 -u 12 -r 16K -s 60M -I +Z -V 0 ==Initial write ==Initial write records: 50 records: 50 avg: 412875.17 avg: 415638.23 std: 38543.12 (9.34%) std: 36601.11 (8.81%) max: 521262.03 max: 502976.72 min: 343263.13 min: 351389.12 ==Rewrite ==Rewrite records: 50 records: 50 avg: 416640.34 avg: 397914.33 std: 60798.92 (14.59%) std: 46150.42 (11.60%) max: 543057.07 max: 522669.17 min: 304071.67 min: 316588.77 ==Read ==Read records: 50 records: 50 avg: 4147338.63 avg: 4070736.51 std: 179333.25 (4.32%) std: 223499.89 (5.49%) max: 4459295.28 max: 4539514.44 min: 3753057.53 min: 3444686.31 ==Re-read ==Re-read records: 50 records: 50 avg: 4096706.71 avg: 4117218.57 std: 229735.04 (5.61%) std: 171676.25 (4.17%) max: 4430012.09 max: 4459263.94 min: 2987217.80 min: 3666904.28 ==Reverse Read ==Reverse Read records: 50 records: 50 avg: 4062763.83 avg: 4078508.32 std: 186208.46 (4.58%) std: 172684.34 (4.23%) max: 4401358.78 max: 4424757.22 min: 3381625.00 min: 3679359.94 ==Stride read ==Stride read records: 50 records: 50 avg: 4094933.49 avg: 4082170.22 std: 185710.52 (4.54%) std: 196346.68 (4.81%) max: 4478241.25 max: 4460060.97 min: 3732593.23 min: 3584125.78 ==Random read ==Random read records: 50 records: 50 avg: 4031070.04 avg: 4074847.49 std: 192065.51 (4.76%) std: 206911.33 (5.08%) max: 4356931.16 max: 4399442.56 min: 3481619.62 min: 3548372.44 ==Mixed workload ==Mixed workload records: 50 records: 50 avg: 149925.73 avg: 149675.54 std: 7701.26 (5.14%) std: 6902.09 (4.61%) max: 191301.56 max: 175162.05 min: 133566.28 min: 137762.87 ==Random write ==Random write records: 50 records: 50 avg: 404050.11 avg: 393021.47 std: 58887.57 (14.57%) std: 42813.70 (10.89%) max: 601798.09 max: 524533.43 min: 325176.99 min: 313255.34 ==Pwrite ==Pwrite records: 50 records: 50 avg: 411217.70 avg: 411237.96 std: 43114.99 (10.48%) std: 33136.29 (8.06%) max: 530766.79 max: 471899.76 min: 320786.84 min: 317906.94 ==Pread ==Pread records: 50 records: 50 avg: 4154908.65 avg: 4087121.92 
std: 151272.08 (3.64%) std: 219505.04 (5.37%) max: 4459478.12 max: 4435857.38 min: 3730512.41 min: 3101101.67 Signed-off-by: Minchan Kim Cc: Nitin Gupta Tested-by: Sergey Senozhatsky Cc: Jerome Marchand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit deb0bdeb2f3d6b81d37fc778316dae46b6daab56) Signed-off-by: Alex Shi --- drivers/block/zram/zram_drv.c | 20 ++++++++++---------- drivers/block/zram/zram_drv.h | 16 ++++++---------- 2 files changed, 16 insertions(+), 20 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index a9c236de4676..0b53d1db7eaf 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -104,7 +104,7 @@ static ssize_t zero_pages_show(struct device *dev, { struct zram *zram = dev_to_zram(dev); - return sprintf(buf, "%u\n", zram->stats.pages_zero); + return sprintf(buf, "%u\n", atomic_read(&zram->stats.pages_zero)); } static ssize_t orig_data_size_show(struct device *dev, @@ -113,7 +113,7 @@ static ssize_t orig_data_size_show(struct device *dev, struct zram *zram = dev_to_zram(dev); return sprintf(buf, "%llu\n", - (u64)(zram->stats.pages_stored) << PAGE_SHIFT); + (u64)(atomic_read(&zram->stats.pages_stored)) << PAGE_SHIFT); } static ssize_t compr_data_size_show(struct device *dev, @@ -292,21 +292,21 @@ static void zram_free_page(struct zram *zram, size_t index) */ if (zram_test_flag(meta, index, ZRAM_ZERO)) { zram_clear_flag(meta, index, ZRAM_ZERO); - zram->stats.pages_zero--; + atomic_dec(&zram->stats.pages_zero); } return; } if (unlikely(size > max_zpage_size)) - zram->stats.bad_compress--; + atomic_dec(&zram->stats.bad_compress); zs_free(meta->mem_pool, handle); if (size <= PAGE_SIZE / 2) - zram->stats.good_compress--; + atomic_dec(&zram->stats.good_compress); atomic64_sub(meta->table[index].size, &zram->stats.compr_size); - zram->stats.pages_stored--; + atomic_dec(&zram->stats.pages_stored); meta->table[index].handle = 0; meta->table[index].size = 0; 
@@ -434,7 +434,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, /* Free memory associated with this sector now. */ zram_free_page(zram, index); - zram->stats.pages_zero++; + atomic_inc(&zram->stats.pages_zero); zram_set_flag(meta, index, ZRAM_ZERO); ret = 0; goto out; @@ -455,7 +455,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, } if (unlikely(clen > max_zpage_size)) { - zram->stats.bad_compress++; + atomic_inc(&zram->stats.bad_compress); clen = PAGE_SIZE; src = NULL; if (is_partial_io(bvec)) @@ -492,9 +492,9 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, /* Update stats */ atomic64_add(clen, &zram->stats.compr_size); - zram->stats.pages_stored++; + atomic_inc(&zram->stats.pages_stored); if (clen <= PAGE_SIZE / 2) - zram->stats.good_compress++; + atomic_inc(&zram->stats.good_compress); out: if (is_partial_io(bvec)) diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 0e46953c08e9..81b0170de369 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -68,10 +68,6 @@ struct table { u8 flags; } __aligned(4); -/* - * All 64bit fields should only be manipulated by 64bit atomic accessors. - * All modifications to 32bit counter should be protected by zram->lock. - */ struct zram_stats { atomic64_t compr_size; /* compressed size of pages stored */ atomic64_t num_reads; /* failed + successful */ @@ -80,10 +76,10 @@ struct zram_stats { atomic64_t failed_writes; /* can happen when memory is too low */ atomic64_t invalid_io; /* non-page-aligned I/O requests */ atomic64_t notify_free; /* no. of swap slot free notifications */ - u32 pages_zero; /* no. of zero filled pages */ - u32 pages_stored; /* no. of pages currently stored */ - u32 good_compress; /* % of pages with compression ratio<=50% */ - u32 bad_compress; /* % of pages with compression ratio>=75% */ + atomic_t pages_zero; /* no. 
of zero filled pages */ + atomic_t pages_stored; /* no. of pages currently stored */ + atomic_t good_compress; /* % of pages with compression ratio<=50% */ + atomic_t bad_compress; /* % of pages with compression ratio>=75% */ }; struct zram_meta { @@ -101,8 +97,8 @@ struct zram_slot_free { struct zram { struct zram_meta *meta; struct rw_semaphore lock; /* protect compression buffers, table, - * 32bit stat counters against concurrent - * notifications, reads and writes */ + * reads and writes + */ struct work_struct free_work; /* handle pending free request */ struct zram_slot_free *slot_free_rq; /* list head of free request */ From 76b3c1eb150766da298e3c68074dbd1401f22a7c Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 30 Jan 2014 15:46:03 -0800 Subject: [PATCH 26/82] zram: introduce zram->tb_lock Currently, the zram table is protected by zram->lock, but it's a rather coarse-grained lock and it hurts scalability. Let's use a dedicated rwlock instead of depending on zram->lock. This patch adds new locking so, obviously, it would slow things down, but this patch is just preparation for removing the coarse-grained rw_semaphore (i.e., zram->lock), which is a hurdle for zram scalability. The final patch in this patchset series will remove the lock from the read path and replace the rw_semaphore with a mutex in the write path. As a bonus, we can drop the pending slot free mess in the next patch. 
Signed-off-by: Minchan Kim Cc: Nitin Gupta Tested-by: Sergey Senozhatsky Cc: Jerome Marchand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 92967471b67163bb1654e9b7fe99449ab70a4aaa) Signed-off-by: Alex Shi --- drivers/block/zram/zram_drv.c | 26 +++++++++++++++++++++----- drivers/block/zram/zram_drv.h | 3 ++- 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 0b53d1db7eaf..42358b61e360 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -140,6 +140,7 @@ static ssize_t mem_used_total_show(struct device *dev, return sprintf(buf, "%llu\n", val); } +/* flag operations needs meta->tb_lock */ static int zram_test_flag(struct zram_meta *meta, u32 index, enum zram_pageflags flag) { @@ -227,6 +228,7 @@ static struct zram_meta *zram_meta_alloc(u64 disksize) goto free_table; } + rwlock_init(&meta->tb_lock); return meta; free_table: @@ -279,6 +281,7 @@ static void handle_zero_page(struct bio_vec *bvec) flush_dcache_page(page); } +/* NOTE: caller should hold meta->tb_lock with write-side */ static void zram_free_page(struct zram *zram, size_t index) { struct zram_meta *meta = zram->meta; @@ -318,20 +321,26 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index) size_t clen = PAGE_SIZE; unsigned char *cmem; struct zram_meta *meta = zram->meta; - unsigned long handle = meta->table[index].handle; + unsigned long handle; + u16 size; + + read_lock(&meta->tb_lock); + handle = meta->table[index].handle; + size = meta->table[index].size; if (!handle || zram_test_flag(meta, index, ZRAM_ZERO)) { + read_unlock(&meta->tb_lock); clear_page(mem); return 0; } cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_RO); - if (meta->table[index].size == PAGE_SIZE) + if (size == PAGE_SIZE) copy_page(mem, cmem); else - ret = lzo1x_decompress_safe(cmem, meta->table[index].size, - mem, &clen); + ret = lzo1x_decompress_safe(cmem, size, 
mem, &clen); zs_unmap_object(meta->mem_pool, handle); + read_unlock(&meta->tb_lock); /* Should NEVER happen. Return bio error if it does. */ if (unlikely(ret != LZO_E_OK)) { @@ -352,11 +361,14 @@ static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, struct zram_meta *meta = zram->meta; page = bvec->bv_page; + read_lock(&meta->tb_lock); if (unlikely(!meta->table[index].handle) || zram_test_flag(meta, index, ZRAM_ZERO)) { + read_unlock(&meta->tb_lock); handle_zero_page(bvec); return 0; } + read_unlock(&meta->tb_lock); if (is_partial_io(bvec)) /* Use a temporary buffer to decompress the page */ @@ -432,10 +444,12 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, if (page_zero_filled(uncmem)) { kunmap_atomic(user_mem); /* Free memory associated with this sector now. */ + write_lock(&zram->meta->tb_lock); zram_free_page(zram, index); + zram_set_flag(meta, index, ZRAM_ZERO); + write_unlock(&zram->meta->tb_lock); atomic_inc(&zram->stats.pages_zero); - zram_set_flag(meta, index, ZRAM_ZERO); ret = 0; goto out; } @@ -485,10 +499,12 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, * Free memory associated with this sector * before overwriting unused sectors. 
*/ + write_lock(&zram->meta->tb_lock); zram_free_page(zram, index); meta->table[index].handle = handle; meta->table[index].size = clen; + write_unlock(&zram->meta->tb_lock); /* Update stats */ atomic64_add(clen, &zram->stats.compr_size); diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 81b0170de369..c3f453f04974 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -83,6 +83,7 @@ struct zram_stats { }; struct zram_meta { + rwlock_t tb_lock; /* protect table */ void *compress_workmem; void *compress_buffer; struct table *table; @@ -96,7 +97,7 @@ struct zram_slot_free { struct zram { struct zram_meta *meta; - struct rw_semaphore lock; /* protect compression buffers, table, + struct rw_semaphore lock; /* protect compression buffers, * reads and writes */ From fa5b73b76279cd7e88743b4b8ec2fa61478d6776 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 30 Jan 2014 15:46:04 -0800 Subject: [PATCH 27/82] zram: remove workqueue for freeing removed pending slot Commit a0c516cbfc74 ("zram: don't grab mutex in zram_slot_free_noity") introduced free-request pending code to avoid scheduling on a mutex under a spinlock, and it was a mess which made the code lengthy and increased overhead. Now we don't need zram->lock any more to free a slot, so this patch reverts it and lets tb_lock protect the slot free instead. 
Signed-off-by: Minchan Kim Cc: Nitin Gupta Tested-by: Sergey Senozhatsky Cc: Jerome Marchand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit f614a9f48dedd2b80d1dc8bae8094842fcdb39dd) Signed-off-by: Alex Shi --- drivers/block/zram/zram_drv.c | 54 ++++------------------------------- drivers/block/zram/zram_drv.h | 10 ------- 2 files changed, 6 insertions(+), 58 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 42358b61e360..3e797b844377 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -521,20 +521,6 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, return ret; } -static void handle_pending_slot_free(struct zram *zram) -{ - struct zram_slot_free *free_rq; - - spin_lock(&zram->slot_free_lock); - while (zram->slot_free_rq) { - free_rq = zram->slot_free_rq; - zram->slot_free_rq = free_rq->next; - zram_free_page(zram, free_rq->index); - kfree(free_rq); - } - spin_unlock(&zram->slot_free_lock); -} - static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, int offset, struct bio *bio, int rw) { @@ -546,7 +532,6 @@ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, up_read(&zram->lock); } else { down_write(&zram->lock); - handle_pending_slot_free(zram); ret = zram_bvec_write(zram, bvec, index, offset); up_write(&zram->lock); } @@ -565,8 +550,6 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity) return; } - flush_work(&zram->free_work); - meta = zram->meta; zram->init_done = 0; @@ -766,40 +749,19 @@ static void zram_make_request(struct request_queue *queue, struct bio *bio) bio_io_error(bio); } -static void zram_slot_free(struct work_struct *work) -{ - struct zram *zram; - - zram = container_of(work, struct zram, free_work); - down_write(&zram->lock); - handle_pending_slot_free(zram); - up_write(&zram->lock); -} - -static void add_slot_free(struct zram *zram, struct 
zram_slot_free *free_rq) -{ - spin_lock(&zram->slot_free_lock); - free_rq->next = zram->slot_free_rq; - zram->slot_free_rq = free_rq; - spin_unlock(&zram->slot_free_lock); -} - static void zram_slot_free_notify(struct block_device *bdev, unsigned long index) { struct zram *zram; - struct zram_slot_free *free_rq; + struct zram_meta *meta; zram = bdev->bd_disk->private_data; + meta = zram->meta; + + write_lock(&meta->tb_lock); + zram_free_page(zram, index); + write_unlock(&meta->tb_lock); atomic64_inc(&zram->stats.notify_free); - - free_rq = kmalloc(sizeof(struct zram_slot_free), GFP_ATOMIC); - if (!free_rq) - return; - - free_rq->index = index; - add_slot_free(zram, free_rq); - schedule_work(&zram->free_work); } static const struct block_device_operations zram_devops = { @@ -846,10 +808,6 @@ static int create_device(struct zram *zram, int device_id) init_rwsem(&zram->lock); init_rwsem(&zram->init_lock); - INIT_WORK(&zram->free_work, zram_slot_free); - spin_lock_init(&zram->slot_free_lock); - zram->slot_free_rq = NULL; - zram->queue = blk_alloc_queue(GFP_KERNEL); if (!zram->queue) { pr_err("Error allocating disk queue for device %d\n", diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index c3f453f04974..d876300da6c9 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -90,20 +90,11 @@ struct zram_meta { struct zs_pool *mem_pool; }; -struct zram_slot_free { - unsigned long index; - struct zram_slot_free *next; -}; - struct zram { struct zram_meta *meta; struct rw_semaphore lock; /* protect compression buffers, * reads and writes */ - - struct work_struct free_work; /* handle pending free request */ - struct zram_slot_free *slot_free_rq; /* list head of free request */ - struct request_queue *queue; struct gendisk *disk; int init_done; @@ -114,7 +105,6 @@ struct zram { * we can store in a disk. 
*/ u64 disksize; /* bytes */ - spinlock_t slot_free_lock; struct zram_stats stats; }; From 3853d83925871721c6ee7b2bc40126183129a3ac Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 30 Jan 2014 15:46:06 -0800 Subject: [PATCH 28/82] zram: remove zram->lock in read path and change it with mutex Finally, we have separated the zram->lock dependency from the 32bit stat/table handling, so there is no reason to use an rw_semaphore between the read and write paths; this patch removes the lock from the read path entirely and replaces the rw_semaphore with a mutex. So the allowed concurrency changes as follows. Old: read-read: OK read-write: NO write-write: NO Now: read-read: OK read-write: OK write-write: NO The data below shows that the mixed workload performs about 11 times better, and there is also an improvement on the write-write path because the current rw-semaphore doesn't support SPIN_ON_OWNER. It's a side effect, but a good thing for us anyway. Write-related tests perform better (from 61% to 1058%), while the read path shows both gains and losses (from -2.22% to 1.45%), but they are all marginal, within stddev. 
CPU 12 iozone -t -T -l 12 -u 12 -r 16K -s 60M -I +Z -V 0 ==Initial write ==Initial write records: 10 records: 10 avg: 516189.16 avg: 839907.96 std: 22486.53 (4.36%) std: 47902.17 (5.70%) max: 546970.60 max: 909910.35 min: 481131.54 min: 751148.38 ==Rewrite ==Rewrite records: 10 records: 10 avg: 509527.98 avg: 1050156.37 std: 45799.94 (8.99%) std: 40695.44 (3.88%) max: 611574.27 max: 1111929.26 min: 443679.95 min: 980409.62 ==Read ==Read records: 10 records: 10 avg: 4408624.17 avg: 4472546.76 std: 281152.61 (6.38%) std: 163662.78 (3.66%) max: 4867888.66 max: 4727351.03 min: 4058347.69 min: 4126520.88 ==Re-read ==Re-read records: 10 records: 10 avg: 4462147.53 avg: 4363257.75 std: 283546.11 (6.35%) std: 247292.63 (5.67%) max: 4912894.44 max: 4677241.75 min: 4131386.50 min: 4035235.84 ==Reverse Read ==Reverse Read records: 10 records: 10 avg: 4565865.97 avg: 4485818.08 std: 313395.63 (6.86%) std: 248470.10 (5.54%) max: 5232749.16 max: 4789749.94 min: 4185809.62 min: 3963081.34 ==Stride read ==Stride read records: 10 records: 10 avg: 4515981.80 avg: 4418806.01 std: 211192.32 (4.68%) std: 212837.97 (4.82%) max: 4889287.28 max: 4686967.22 min: 4210362.00 min: 4083041.84 ==Random read ==Random read records: 10 records: 10 avg: 4410525.23 avg: 4387093.18 std: 236693.22 (5.37%) std: 235285.23 (5.36%) max: 4713698.47 max: 4669760.62 min: 4057163.62 min: 3952002.16 ==Mixed workload ==Mixed workload records: 10 records: 10 avg: 243234.25 avg: 2818677.27 std: 28505.07 (11.72%) std: 195569.70 (6.94%) max: 288905.23 max: 3126478.11 min: 212473.16 min: 2484150.69 ==Random write ==Random write records: 10 records: 10 avg: 555887.07 avg: 1053057.79 std: 70841.98 (12.74%) std: 35195.36 (3.34%) max: 683188.28 max: 1096125.73 min: 437299.57 min: 992481.93 ==Pwrite ==Pwrite records: 10 records: 10 avg: 501745.93 avg: 810363.09 std: 16373.54 (3.26%) std: 19245.01 (2.37%) max: 518724.52 max: 833359.70 min: 464208.73 min: 765501.87 ==Pread ==Pread records: 10 records: 10 avg: 4539894.60 
avg: 4457680.58 std: 197094.66 (4.34%) std: 188965.60 (4.24%) max: 4877170.38 max: 4689905.53 min: 4226326.03 min: 4095739.72 Signed-off-by: Minchan Kim Cc: Nitin Gupta Tested-by: Sergey Senozhatsky Cc: Jerome Marchand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit e46e33152eb82b8e2db7ffb3790a2a2653c34513) Signed-off-by: Alex Shi --- drivers/block/zram/zram_drv.c | 17 ++++++++--------- drivers/block/zram/zram_drv.h | 4 +--- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 3e797b844377..c450dd9390ab 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -229,6 +229,7 @@ static struct zram_meta *zram_meta_alloc(u64 disksize) } rwlock_init(&meta->tb_lock); + mutex_init(&meta->buffer_lock); return meta; free_table: @@ -411,6 +412,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, struct page *page; unsigned char *user_mem, *cmem, *src, *uncmem = NULL; struct zram_meta *meta = zram->meta; + bool locked = false; page = bvec->bv_page; src = meta->compress_buffer; @@ -430,6 +432,8 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, goto out; } + mutex_lock(&meta->buffer_lock); + locked = true; user_mem = kmap_atomic(page); if (is_partial_io(bvec)) { @@ -456,7 +460,6 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, ret = lzo1x_1_compress(uncmem, PAGE_SIZE, src, &clen, meta->compress_workmem); - if (!is_partial_io(bvec)) { kunmap_atomic(user_mem); user_mem = NULL; @@ -513,6 +516,8 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, atomic_inc(&zram->stats.good_compress); out: + if (locked) + mutex_unlock(&meta->buffer_lock); if (is_partial_io(bvec)) kfree(uncmem); @@ -526,15 +531,10 @@ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, { int ret; - if (rw == READ) { - 
down_read(&zram->lock); + if (rw == READ) ret = zram_bvec_read(zram, bvec, index, offset, bio); - up_read(&zram->lock); - } else { - down_write(&zram->lock); + else ret = zram_bvec_write(zram, bvec, index, offset); - up_write(&zram->lock); - } return ret; } @@ -805,7 +805,6 @@ static int create_device(struct zram *zram, int device_id) { int ret = -ENOMEM; - init_rwsem(&zram->lock); init_rwsem(&zram->init_lock); zram->queue = blk_alloc_queue(GFP_KERNEL); diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index d876300da6c9..ad8aa35bae00 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -88,13 +88,11 @@ struct zram_meta { void *compress_buffer; struct table *table; struct zs_pool *mem_pool; + struct mutex buffer_lock; /* protect compress buffers */ }; struct zram { struct zram_meta *meta; - struct rw_semaphore lock; /* protect compression buffers, - * reads and writes - */ struct request_queue *queue; struct gendisk *disk; int init_done; From b048aa137fe66b7f19d842a4b145296bbc26caff Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 3 Mar 2014 15:38:34 -0800 Subject: [PATCH 29/82] zram: avoid null access when fail to alloc meta zram_meta_alloc could fail so caller should check it. Otherwise, your system will hang. 
Signed-off-by: Minchan Kim Acked-by: Jerome Marchand Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit db5d711e2db776f18219b033e5dc4fb7e4264dd7) Signed-off-by: Alex Shi --- drivers/block/zram/zram_drv.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index c450dd9390ab..cf77a8a1ae97 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -611,6 +611,8 @@ static ssize_t disksize_store(struct device *dev, disksize = PAGE_ALIGN(disksize); meta = zram_meta_alloc(disksize); + if (!meta) + return -ENOMEM; down_write(&zram->init_lock); if (zram->init_done) { up_write(&zram->init_lock); From 30e87174bf64cdce17005dc7fe031ea1aab98161 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Tue, 11 Mar 2014 02:04:14 +0530 Subject: [PATCH 30/82] CPU hotplug: Provide lockless versions of callback registration functions The following method of CPU hotplug callback registration is not safe due to the possibility of an ABBA deadlock involving the cpu_add_remove_lock and the cpu_hotplug.lock. get_online_cpus(); for_each_online_cpu(cpu) init_cpu(cpu); register_cpu_notifier(&foobar_cpu_notifier); put_online_cpus(); The deadlock is shown below: CPU 0 CPU 1 ----- ----- Acquire cpu_hotplug.lock [via get_online_cpus()] CPU online/offline operation takes cpu_add_remove_lock [via cpu_maps_update_begin()] Try to acquire cpu_add_remove_lock [via register_cpu_notifier()] CPU online/offline operation tries to acquire cpu_hotplug.lock [via cpu_hotplug_begin()] *** DEADLOCK! *** The problem here is that callback registration takes the locks in one order whereas the CPU hotplug operations take the same locks in the opposite order. 
To avoid this issue and to provide a race-free method to register CPU hotplug callbacks (along with initialization of already online CPUs), introduce new variants of the callback registration APIs that simply register the callbacks without holding the cpu_add_remove_lock during the registration. That way, we can avoid the ABBA scenario. However, we will need to hold the cpu_add_remove_lock throughout the entire critical section, to protect updates to the callback/notifier chain. This can be achieved by writing the callback registration code as follows: cpu_maps_update_begin(); [ or cpu_notifier_register_begin(); see below ] for_each_online_cpu(cpu) init_cpu(cpu); /* This doesn't take the cpu_add_remove_lock */ __register_cpu_notifier(&foobar_cpu_notifier); cpu_maps_update_done(); [ or cpu_notifier_register_done(); see below ] Note that we can't use get_online_cpus() here instead of cpu_maps_update_begin() because the cpu_hotplug.lock is dropped during the invocation of CPU_POST_DEAD notifiers, and hence get_online_cpus() cannot provide the necessary synchronization to protect the callback/notifier chains against concurrent reads and writes. On the other hand, since the cpu_add_remove_lock protects the entire hotplug operation (including CPU_POST_DEAD), we can use cpu_maps_update_begin/done() to guarantee proper synchronization. Also, since cpu_maps_update_begin/done() is like a super-set of get/put_online_cpus(), the former naturally protects the critical sections from concurrent hotplug operations. Since the names cpu_maps_update_begin/done() don't make much sense in CPU hotplug callback registration scenarios, we'll introduce new APIs named cpu_notifier_register_begin/done() and map them to cpu_maps_update_begin/done(). In summary, introduce the lockless variants of un/register_cpu_notifier() and also export the cpu_notifier_register_begin/done() APIs for use by modules. 
This way, we provide a race-free way to register hotplug callbacks as well as perform initialization for the CPUs that are already online. Cc: Thomas Gleixner Cc: Andrew Morton Cc: Peter Zijlstra Cc: Ingo Molnar Acked-by: Oleg Nesterov Acked-by: Toshi Kani Reviewed-by: Gautham R. Shenoy Signed-off-by: Srivatsa S. Bhat Signed-off-by: Rafael J. Wysocki (cherry picked from commit 93ae4f978ca7f26d17df915ac7afc919c1dd0353) Signed-off-by: Alex Shi --- include/linux/cpu.h | 47 +++++++++++++++++++++++++++++++++++++++++++++ kernel/cpu.c | 21 ++++++++++++++++++-- 2 files changed, 66 insertions(+), 2 deletions(-) diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 9f3c7e81270a..096af4570d69 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -119,26 +119,46 @@ enum { { .notifier_call = fn, .priority = pri }; \ register_cpu_notifier(&fn##_nb); \ } + +#define __cpu_notifier(fn, pri) { \ + static struct notifier_block fn##_nb = \ + { .notifier_call = fn, .priority = pri }; \ + __register_cpu_notifier(&fn##_nb); \ +} #else /* #if defined(CONFIG_HOTPLUG_CPU) || !defined(MODULE) */ #define cpu_notifier(fn, pri) do { (void)(fn); } while (0) +#define __cpu_notifier(fn, pri) do { (void)(fn); } while (0) #endif /* #else #if defined(CONFIG_HOTPLUG_CPU) || !defined(MODULE) */ + #ifdef CONFIG_HOTPLUG_CPU extern int register_cpu_notifier(struct notifier_block *nb); +extern int __register_cpu_notifier(struct notifier_block *nb); extern void unregister_cpu_notifier(struct notifier_block *nb); +extern void __unregister_cpu_notifier(struct notifier_block *nb); #else #ifndef MODULE extern int register_cpu_notifier(struct notifier_block *nb); +extern int __register_cpu_notifier(struct notifier_block *nb); #else static inline int register_cpu_notifier(struct notifier_block *nb) { return 0; } + +static inline int __register_cpu_notifier(struct notifier_block *nb) +{ + return 0; +} #endif static inline void unregister_cpu_notifier(struct notifier_block *nb) { } + +static inline 
void __unregister_cpu_notifier(struct notifier_block *nb) +{ +} #endif int cpu_up(unsigned int cpu); @@ -146,19 +166,32 @@ void notify_cpu_starting(unsigned int cpu); extern void cpu_maps_update_begin(void); extern void cpu_maps_update_done(void); +#define cpu_notifier_register_begin cpu_maps_update_begin +#define cpu_notifier_register_done cpu_maps_update_done + #else /* CONFIG_SMP */ #define cpu_notifier(fn, pri) do { (void)(fn); } while (0) +#define __cpu_notifier(fn, pri) do { (void)(fn); } while (0) static inline int register_cpu_notifier(struct notifier_block *nb) { return 0; } +static inline int __register_cpu_notifier(struct notifier_block *nb) +{ + return 0; +} + static inline void unregister_cpu_notifier(struct notifier_block *nb) { } +static inline void __unregister_cpu_notifier(struct notifier_block *nb) +{ +} + static inline void cpu_maps_update_begin(void) { } @@ -167,6 +200,14 @@ static inline void cpu_maps_update_done(void) { } +static inline void cpu_notifier_register_begin(void) +{ +} + +static inline void cpu_notifier_register_done(void) +{ +} + #endif /* CONFIG_SMP */ extern struct bus_type cpu_subsys; @@ -178,8 +219,11 @@ extern void put_online_cpus(void); extern void cpu_hotplug_disable(void); extern void cpu_hotplug_enable(void); #define hotcpu_notifier(fn, pri) cpu_notifier(fn, pri) +#define __hotcpu_notifier(fn, pri) __cpu_notifier(fn, pri) #define register_hotcpu_notifier(nb) register_cpu_notifier(nb) +#define __register_hotcpu_notifier(nb) __register_cpu_notifier(nb) #define unregister_hotcpu_notifier(nb) unregister_cpu_notifier(nb) +#define __unregister_hotcpu_notifier(nb) __unregister_cpu_notifier(nb) void clear_tasks_mm_cpumask(int cpu); int cpu_down(unsigned int cpu); @@ -203,9 +247,12 @@ static inline void cpu_hotplug_driver_unlock(void) #define cpu_hotplug_disable() do { } while (0) #define cpu_hotplug_enable() do { } while (0) #define hotcpu_notifier(fn, pri) do { (void)(fn); } while (0) +#define __hotcpu_notifier(fn, pri) do { 
(void)(fn); } while (0) /* These aren't inline functions due to a GCC bug. */ #define register_hotcpu_notifier(nb) ({ (void)(nb); 0; }) +#define __register_hotcpu_notifier(nb) ({ (void)(nb); 0; }) #define unregister_hotcpu_notifier(nb) ({ (void)(nb); }) +#define __unregister_hotcpu_notifier(nb) ({ (void)(nb); }) #endif /* CONFIG_HOTPLUG_CPU */ #ifdef CONFIG_PM_SLEEP_SMP diff --git a/kernel/cpu.c b/kernel/cpu.c index bc255e25d5dd..5fbcbdd31fb3 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -27,18 +27,23 @@ static DEFINE_MUTEX(cpu_add_remove_lock); /* - * The following two API's must be used when attempting - * to serialize the updates to cpu_online_mask, cpu_present_mask. + * The following two APIs (cpu_maps_update_begin/done) must be used when + * attempting to serialize the updates to cpu_online_mask & cpu_present_mask. + * The APIs cpu_notifier_register_begin/done() must be used to protect CPU + * hotplug callback (un)registration performed using __register_cpu_notifier() + * or __unregister_cpu_notifier(). 
*/ void cpu_maps_update_begin(void) { mutex_lock(&cpu_add_remove_lock); } +EXPORT_SYMBOL(cpu_notifier_register_begin); void cpu_maps_update_done(void) { mutex_unlock(&cpu_add_remove_lock); } +EXPORT_SYMBOL(cpu_notifier_register_done); static RAW_NOTIFIER_HEAD(cpu_chain); @@ -169,6 +174,11 @@ int __ref register_cpu_notifier(struct notifier_block *nb) return ret; } +int __ref __register_cpu_notifier(struct notifier_block *nb) +{ + return raw_notifier_chain_register(&cpu_chain, nb); +} + static int __cpu_notify(unsigned long val, void *v, int nr_to_call, int *nr_calls) { @@ -192,6 +202,7 @@ static void cpu_notify_nofail(unsigned long val, void *v) BUG_ON(cpu_notify(val, v)); } EXPORT_SYMBOL(register_cpu_notifier); +EXPORT_SYMBOL(__register_cpu_notifier); void __ref unregister_cpu_notifier(struct notifier_block *nb) { @@ -201,6 +212,12 @@ void __ref unregister_cpu_notifier(struct notifier_block *nb) } EXPORT_SYMBOL(unregister_cpu_notifier); +void __ref __unregister_cpu_notifier(struct notifier_block *nb) +{ + raw_notifier_chain_unregister(&cpu_chain, nb); +} +EXPORT_SYMBOL(__unregister_cpu_notifier); + /** * clear_tasks_mm_cpumask - Safely clear tasks' mm_cpumask for a CPU * @cpu: a CPU id From cda631acb964bb521add32cd60db08f43a23835c Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Tue, 11 Mar 2014 02:09:59 +0530 Subject: [PATCH 31/82] zsmalloc: Fix CPU hotplug callback registration Subsystems that want to register CPU hotplug callbacks, as well as perform initialization for the CPUs that are already online, often do it as shown below: get_online_cpus(); for_each_online_cpu(cpu) init_cpu(cpu); register_cpu_notifier(&foobar_cpu_notifier); put_online_cpus(); This is wrong, since it is prone to ABBA deadlocks involving the cpu_add_remove_lock and the cpu_hotplug.lock (when running concurrently with CPU hotplug operations). 
Instead, the correct and race-free way of performing the callback registration is: cpu_notifier_register_begin(); for_each_online_cpu(cpu) init_cpu(cpu); /* Note the use of the double underscored version of the API */ __register_cpu_notifier(&foobar_cpu_notifier); cpu_notifier_register_done(); Fix the zsmalloc code by using this latter form of callback registration. Cc: Nitin Gupta Cc: Ingo Molnar Signed-off-by: Srivatsa S. Bhat Acked-by: Minchan Kim Signed-off-by: Rafael J. Wysocki (cherry picked from commit f0e71fcd0fa6f3f5495cd9ad3f1e4acd94446a55) Signed-off-by: Alex Shi --- mm/zsmalloc.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 7a8d161b4cd7..b728f10e353b 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -780,21 +780,32 @@ static void zs_exit(void) { int cpu; + cpu_notifier_register_begin(); + for_each_online_cpu(cpu) zs_cpu_notifier(NULL, CPU_DEAD, (void *)(long)cpu); - unregister_cpu_notifier(&zs_cpu_nb); + __unregister_cpu_notifier(&zs_cpu_nb); + + cpu_notifier_register_done(); } static int zs_init(void) { int cpu, ret; - register_cpu_notifier(&zs_cpu_nb); + cpu_notifier_register_begin(); + + __register_cpu_notifier(&zs_cpu_nb); for_each_online_cpu(cpu) { ret = zs_cpu_notifier(NULL, CPU_UP_PREPARE, (void *)(long)cpu); - if (notifier_to_errno(ret)) + if (notifier_to_errno(ret)) { + cpu_notifier_register_done(); goto fail; + } } + + cpu_notifier_register_done(); + return 0; fail: zs_exit(); From fafe199bf14378d4a0a309d15bba16ce6742bd19 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:00 -0700 Subject: [PATCH 32/82] zram: drop `init_done' struct zram member Introduce init_done() helper function which allows us to drop `init_done' struct zram member. init_done() uses the fact that ->init_done == 1 equals to ->meta != NULL. 
Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Acked-by: Jerome Marchand Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit be2d1d56c82d8cf20e6c77515eb499f8e86eb5be) Signed-off-by: Alex Shi --- drivers/block/zram/zram_drv.c | 21 +++++++++++---------- drivers/block/zram/zram_drv.h | 1 - 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index cf77a8a1ae97..e75ccdf194a6 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -42,6 +42,11 @@ static struct zram *zram_devices; /* Module params (documentation at end) */ static unsigned int num_devices = 1; +static inline int init_done(struct zram *zram) +{ + return zram->meta != NULL; +} + static inline struct zram *dev_to_zram(struct device *dev) { return (struct zram *)dev_to_disk(dev)->private_data; @@ -60,7 +65,7 @@ static ssize_t initstate_show(struct device *dev, { struct zram *zram = dev_to_zram(dev); - return sprintf(buf, "%u\n", zram->init_done); + return sprintf(buf, "%u\n", init_done(zram)); } static ssize_t num_reads_show(struct device *dev, @@ -133,7 +138,7 @@ static ssize_t mem_used_total_show(struct device *dev, struct zram_meta *meta = zram->meta; down_read(&zram->init_lock); - if (zram->init_done) + if (init_done(zram)) val = zs_get_total_size_bytes(meta->mem_pool); up_read(&zram->init_lock); @@ -545,14 +550,12 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity) struct zram_meta *meta; down_write(&zram->init_lock); - if (!zram->init_done) { + if (!init_done(zram)) { up_write(&zram->init_lock); return; } meta = zram->meta; - zram->init_done = 0; - /* Free all pages that are still in this zram device */ for (index = 0; index < zram->disksize >> PAGE_SHIFT; index++) { unsigned long handle = meta->table[index].handle; @@ -593,8 +596,6 @@ static void zram_init_device(struct zram *zram, struct zram_meta *meta) 
queue_flag_set_unlocked(QUEUE_FLAG_NONROT, zram->disk->queue); zram->meta = meta; - zram->init_done = 1; - pr_debug("Initialization done!\n"); } @@ -614,7 +615,7 @@ static ssize_t disksize_store(struct device *dev, if (!meta) return -ENOMEM; down_write(&zram->init_lock); - if (zram->init_done) { + if (init_done(zram)) { up_write(&zram->init_lock); zram_meta_free(meta); pr_info("Cannot change disksize for initialized device\n"); @@ -733,7 +734,7 @@ static void zram_make_request(struct request_queue *queue, struct bio *bio) struct zram *zram = queue->queuedata; down_read(&zram->init_lock); - if (unlikely(!zram->init_done)) + if (unlikely(!init_done(zram))) goto error; if (!valid_io_request(zram, bio)) { @@ -856,7 +857,7 @@ static int create_device(struct zram *zram, int device_id) goto out_free_disk; } - zram->init_done = 0; + zram->meta = NULL; return 0; out_free_disk: diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index ad8aa35bae00..e81e9cdf4147 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -95,7 +95,6 @@ struct zram { struct zram_meta *meta; struct request_queue *queue; struct gendisk *disk; - int init_done; /* Prevent concurrent execution of device init, reset and R/W request */ struct rw_semaphore init_lock; /* From 18bebabb7ecc3c03e3339d825141908814a4cf59 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:01 -0700 Subject: [PATCH 33/82] zram: do not pass rw argument to __zram_make_request() Do not pass rw argument down the __zram_make_request() -> zram_bvec_rw() chain, decode it in zram_bvec_rw() instead. Besides, this is the place where we distinguish READ and WRITE bio data directions, so account zram RW stats here, instead of __zram_make_request(). This also allows to account a real number of zram READ/WRITE operations, not just requests (single RW request may cause a number of zram RW ops with separate locking, compression/decompression, etc). 
Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Acked-by: Jerome Marchand Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit be257c61306750d11c20d2ac567bf63304c696a3) Signed-off-by: Alex Shi Conflicts: drivers/block/zram/zram_drv.c Conflicts solution: keep bio struct as old before commit 4f024f3797 'block: Abstract out bvec iterator' --- drivers/block/zram/zram_drv.c | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index e75ccdf194a6..3f98cf21c7f0 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -532,14 +532,18 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, } static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, - int offset, struct bio *bio, int rw) + int offset, struct bio *bio) { int ret; + int rw = bio_data_dir(bio); - if (rw == READ) + if (rw == READ) { + atomic64_inc(&zram->stats.num_reads); ret = zram_bvec_read(zram, bvec, index, offset, bio); - else + } else { + atomic64_inc(&zram->stats.num_writes); ret = zram_bvec_write(zram, bvec, index, offset); + } return ret; } @@ -671,20 +675,12 @@ static ssize_t reset_store(struct device *dev, return ret; } -static void __zram_make_request(struct zram *zram, struct bio *bio, int rw) +static void __zram_make_request(struct zram *zram, struct bio *bio) { int i, offset; u32 index; struct bio_vec *bvec; - switch (rw) { - case READ: - atomic64_inc(&zram->stats.num_reads); - break; - case WRITE: - atomic64_inc(&zram->stats.num_writes); - break; - } index = bio->bi_sector >> SECTORS_PER_PAGE_SHIFT; offset = (bio->bi_sector & (SECTORS_PER_PAGE - 1)) << SECTOR_SHIFT; @@ -703,16 +699,15 @@ static void __zram_make_request(struct zram *zram, struct bio *bio, int rw) bv.bv_len = max_transfer_size; bv.bv_offset = bvec->bv_offset; - if (zram_bvec_rw(zram, &bv, index, offset, 
bio, rw) < 0) + if (zram_bvec_rw(zram, &bv, index, offset, bio) < 0) goto out; bv.bv_len = bvec->bv_len - max_transfer_size; bv.bv_offset += max_transfer_size; - if (zram_bvec_rw(zram, &bv, index+1, 0, bio, rw) < 0) + if (zram_bvec_rw(zram, &bv, index + 1, 0, bio) < 0) goto out; } else - if (zram_bvec_rw(zram, bvec, index, offset, bio, rw) - < 0) + if (zram_bvec_rw(zram, bvec, index, offset, bio) < 0) goto out; update_position(&index, &offset, bvec); @@ -742,7 +737,7 @@ static void zram_make_request(struct request_queue *queue, struct bio *bio) goto error; } - __zram_make_request(zram, bio, bio_data_dir(bio)); + __zram_make_request(zram, bio); up_read(&zram->init_lock); return; From e700d5f0d65058709b17623728a356e2c16a3a3a Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:02 -0700 Subject: [PATCH 34/82] zram: remove good and bad compress stats Remove `good' and `bad' compressed sub-requests stats. RW request may cause a number of RW sub-requests. zram used to account `good' compressed sub-queries (with compressed size less than 50% of original size), `bad' compressed sub-queries (with compressed size greater that 75% of original size), leaving sub-requests with compression size between 50% and 75% of original size not accounted and not reported. zram already accounts each sub-request's compression size so we can calculate real device compression ratio. 
Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Acked-by: Jerome Marchand Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit b7cccf8b4009bf74df61f3c9d86b95fabd807c11) Signed-off-by: Alex Shi --- drivers/block/zram/zram_drv.c | 11 ----------- drivers/block/zram/zram_drv.h | 2 -- 2 files changed, 13 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 3f98cf21c7f0..a09618cc9e68 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -292,7 +292,6 @@ static void zram_free_page(struct zram *zram, size_t index) { struct zram_meta *meta = zram->meta; unsigned long handle = meta->table[index].handle; - u16 size = meta->table[index].size; if (unlikely(!handle)) { /* @@ -306,14 +305,8 @@ static void zram_free_page(struct zram *zram, size_t index) return; } - if (unlikely(size > max_zpage_size)) - atomic_dec(&zram->stats.bad_compress); - zs_free(meta->mem_pool, handle); - if (size <= PAGE_SIZE / 2) - atomic_dec(&zram->stats.good_compress); - atomic64_sub(meta->table[index].size, &zram->stats.compr_size); atomic_dec(&zram->stats.pages_stored); @@ -477,7 +470,6 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, } if (unlikely(clen > max_zpage_size)) { - atomic_inc(&zram->stats.bad_compress); clen = PAGE_SIZE; src = NULL; if (is_partial_io(bvec)) @@ -517,9 +509,6 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, /* Update stats */ atomic64_add(clen, &zram->stats.compr_size); atomic_inc(&zram->stats.pages_stored); - if (clen <= PAGE_SIZE / 2) - atomic_inc(&zram->stats.good_compress); - out: if (locked) mutex_unlock(&meta->buffer_lock); diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index e81e9cdf4147..2f173cb1fd0a 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -78,8 +78,6 @@ struct zram_stats { atomic64_t notify_free; /* no. 
of swap slot free notifications */ atomic_t pages_zero; /* no. of zero filled pages */ atomic_t pages_stored; /* no. of pages currently stored */ - atomic_t good_compress; /* % of pages with compression ratio<=50% */ - atomic_t bad_compress; /* % of pages with compression ratio>=75% */ }; struct zram_meta { From 2385e8be54460cae2cf803a482a0b8b76cf8d001 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:03 -0700 Subject: [PATCH 35/82] zram: use atomic64_t for all zram stats This is a preparation patch for stats code duplication removal. 1) use atomic64_t for `pages_zero' and `pages_stored' zram stats. 2) `compr_size' and `pages_zero' struct zram_stats members did not follow the existing device attr naming scheme: zram_stats.ATTR has ATTR_show() function. rename them: -- compr_size -> compr_data_size -- pages_zero -> zero_pages Minchan Kim's note: If we really have trouble with atomic stat operation, we could change it with percpu_counter so that it could solve atomic overhead and unnecessary memory space by introducing unsigned long instead of 64bit atomic_t. 
Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Acked-by: Jerome Marchand Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 90a7806ea9b9f7cb4751859cc2506e2d80e36ef1) Signed-off-by: Alex Shi --- drivers/block/zram/zram_drv.c | 18 +++++++++--------- drivers/block/zram/zram_drv.h | 6 +++--- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index a09618cc9e68..4da4210b3454 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -109,7 +109,7 @@ static ssize_t zero_pages_show(struct device *dev, { struct zram *zram = dev_to_zram(dev); - return sprintf(buf, "%u\n", atomic_read(&zram->stats.pages_zero)); + return sprintf(buf, "%llu\n", (u64)atomic64_read(&zram->stats.zero_pages)); } static ssize_t orig_data_size_show(struct device *dev, @@ -118,7 +118,7 @@ static ssize_t orig_data_size_show(struct device *dev, struct zram *zram = dev_to_zram(dev); return sprintf(buf, "%llu\n", - (u64)(atomic_read(&zram->stats.pages_stored)) << PAGE_SHIFT); + (u64)(atomic64_read(&zram->stats.pages_stored)) << PAGE_SHIFT); } static ssize_t compr_data_size_show(struct device *dev, @@ -127,7 +127,7 @@ static ssize_t compr_data_size_show(struct device *dev, struct zram *zram = dev_to_zram(dev); return sprintf(buf, "%llu\n", - (u64)atomic64_read(&zram->stats.compr_size)); + (u64)atomic64_read(&zram->stats.compr_data_size)); } static ssize_t mem_used_total_show(struct device *dev, @@ -300,15 +300,15 @@ static void zram_free_page(struct zram *zram, size_t index) */ if (zram_test_flag(meta, index, ZRAM_ZERO)) { zram_clear_flag(meta, index, ZRAM_ZERO); - atomic_dec(&zram->stats.pages_zero); + atomic64_dec(&zram->stats.zero_pages); } return; } zs_free(meta->mem_pool, handle); - atomic64_sub(meta->table[index].size, &zram->stats.compr_size); - atomic_dec(&zram->stats.pages_stored); + atomic64_sub(meta->table[index].size, 
&zram->stats.compr_data_size); + atomic64_dec(&zram->stats.pages_stored); meta->table[index].handle = 0; meta->table[index].size = 0; @@ -451,7 +451,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, zram_set_flag(meta, index, ZRAM_ZERO); write_unlock(&zram->meta->tb_lock); - atomic_inc(&zram->stats.pages_zero); + atomic64_inc(&zram->stats.zero_pages); ret = 0; goto out; } @@ -507,8 +507,8 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, write_unlock(&zram->meta->tb_lock); /* Update stats */ - atomic64_add(clen, &zram->stats.compr_size); - atomic_inc(&zram->stats.pages_stored); + atomic64_add(clen, &zram->stats.compr_data_size); + atomic64_inc(&zram->stats.pages_stored); out: if (locked) mutex_unlock(&meta->buffer_lock); diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 2f173cb1fd0a..58d4ac537f65 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -69,15 +69,15 @@ struct table { } __aligned(4); struct zram_stats { - atomic64_t compr_size; /* compressed size of pages stored */ + atomic64_t compr_data_size; /* compressed size of pages stored */ atomic64_t num_reads; /* failed + successful */ atomic64_t num_writes; /* --do-- */ atomic64_t failed_reads; /* should NEVER! happen */ atomic64_t failed_writes; /* can happen when memory is too low */ atomic64_t invalid_io; /* non-page-aligned I/O requests */ atomic64_t notify_free; /* no. of swap slot free notifications */ - atomic_t pages_zero; /* no. of zero filled pages */ - atomic_t pages_stored; /* no. of pages currently stored */ + atomic64_t zero_pages; /* no. of zero filled pages */ + atomic64_t pages_stored; /* no. 
of pages currently stored */ }; struct zram_meta { From e72baa0d78767e5eeaba4f3a3f82be4b584f57fd Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:04 -0700 Subject: [PATCH 36/82] zram: remove zram stats code duplication Introduce ZRAM_ATTR_RO macro that generates device_attribute and default ATTR show() function for existing atomic64_t zram stats. Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Jerome Marchand Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit a68eb3b65e658406d386bebef02277f4007b2f45) Signed-off-by: Alex Shi --- drivers/block/zram/zram_drv.c | 82 ++++++++++------------------------- 1 file changed, 23 insertions(+), 59 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 4da4210b3454..c4179fb54fde 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -42,6 +42,17 @@ static struct zram *zram_devices; /* Module params (documentation at end) */ static unsigned int num_devices = 1; +#define ZRAM_ATTR_RO(name) \ +static ssize_t zram_attr_##name##_show(struct device *d, \ + struct device_attribute *attr, char *b) \ +{ \ + struct zram *zram = dev_to_zram(d); \ + return sprintf(b, "%llu\n", \ + (u64)atomic64_read(&zram->stats.name)); \ +} \ +static struct device_attribute dev_attr_##name = \ + __ATTR(name, S_IRUGO, zram_attr_##name##_show, NULL); + static inline int init_done(struct zram *zram) { return zram->meta != NULL; @@ -63,53 +74,14 @@ static ssize_t disksize_show(struct device *dev, static ssize_t initstate_show(struct device *dev, struct device_attribute *attr, char *buf) { + u32 val; struct zram *zram = dev_to_zram(dev); - return sprintf(buf, "%u\n", init_done(zram)); -} + down_read(&zram->init_lock); + val = init_done(zram); + up_read(&zram->init_lock); -static ssize_t num_reads_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); 
- - return sprintf(buf, "%llu\n", - (u64)atomic64_read(&zram->stats.num_reads)); -} - -static ssize_t num_writes_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); - - return sprintf(buf, "%llu\n", - (u64)atomic64_read(&zram->stats.num_writes)); -} - -static ssize_t invalid_io_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); - - return sprintf(buf, "%llu\n", - (u64)atomic64_read(&zram->stats.invalid_io)); -} - -static ssize_t notify_free_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); - - return sprintf(buf, "%llu\n", - (u64)atomic64_read(&zram->stats.notify_free)); -} - -static ssize_t zero_pages_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); - - return sprintf(buf, "%llu\n", (u64)atomic64_read(&zram->stats.zero_pages)); + return sprintf(buf, "%u\n", val); } static ssize_t orig_data_size_show(struct device *dev, @@ -121,15 +93,6 @@ static ssize_t orig_data_size_show(struct device *dev, (u64)(atomic64_read(&zram->stats.pages_stored)) << PAGE_SHIFT); } -static ssize_t compr_data_size_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct zram *zram = dev_to_zram(dev); - - return sprintf(buf, "%llu\n", - (u64)atomic64_read(&zram->stats.compr_data_size)); -} - static ssize_t mem_used_total_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -760,15 +723,16 @@ static DEVICE_ATTR(disksize, S_IRUGO | S_IWUSR, disksize_show, disksize_store); static DEVICE_ATTR(initstate, S_IRUGO, initstate_show, NULL); static DEVICE_ATTR(reset, S_IWUSR, NULL, reset_store); -static DEVICE_ATTR(num_reads, S_IRUGO, num_reads_show, NULL); -static DEVICE_ATTR(num_writes, S_IRUGO, num_writes_show, NULL); -static DEVICE_ATTR(invalid_io, S_IRUGO, invalid_io_show, NULL); -static DEVICE_ATTR(notify_free, 
S_IRUGO, notify_free_show, NULL); -static DEVICE_ATTR(zero_pages, S_IRUGO, zero_pages_show, NULL); static DEVICE_ATTR(orig_data_size, S_IRUGO, orig_data_size_show, NULL); -static DEVICE_ATTR(compr_data_size, S_IRUGO, compr_data_size_show, NULL); static DEVICE_ATTR(mem_used_total, S_IRUGO, mem_used_total_show, NULL); +ZRAM_ATTR_RO(num_reads); +ZRAM_ATTR_RO(num_writes); +ZRAM_ATTR_RO(invalid_io); +ZRAM_ATTR_RO(notify_free); +ZRAM_ATTR_RO(zero_pages); +ZRAM_ATTR_RO(compr_data_size); + static struct attribute *zram_disk_attrs[] = { &dev_attr_disksize.attr, &dev_attr_initstate.attr, From 0a0d055ef99c20fe0e6b1cced9d358ab2e56eafd Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:05 -0700 Subject: [PATCH 37/82] zram: report failed read and write stats zram accounted but did not report numbers of failed read and write queries. make these stats available as failed_reads and failed_writes attrs. Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Acked-by: Jerome Marchand Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 6444724939db5de7390c90f7b4a657159b3b4465) Signed-off-by: Alex Shi --- drivers/block/zram/zram_drv.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index c4179fb54fde..4043e783e50a 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -728,6 +728,8 @@ static DEVICE_ATTR(mem_used_total, S_IRUGO, mem_used_total_show, NULL); ZRAM_ATTR_RO(num_reads); ZRAM_ATTR_RO(num_writes); +ZRAM_ATTR_RO(failed_reads); +ZRAM_ATTR_RO(failed_writes); ZRAM_ATTR_RO(invalid_io); ZRAM_ATTR_RO(notify_free); ZRAM_ATTR_RO(zero_pages); @@ -739,6 +741,8 @@ static struct attribute *zram_disk_attrs[] = { &dev_attr_reset.attr, &dev_attr_num_reads.attr, &dev_attr_num_writes.attr, + &dev_attr_failed_reads.attr, + &dev_attr_failed_writes.attr, &dev_attr_invalid_io.attr, &dev_attr_notify_free.attr, 
&dev_attr_zero_pages.attr, From 7d9e350a92b55768e644d5f7b1aacab9acd032f6 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:06 -0700 Subject: [PATCH 38/82] zram: drop not used table `count' member struct table `count' member is not used. Signed-off-by: Sergey Senozhatsky Cc: Minchan Kim Acked-by: Jerome Marchand Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 59fc86a4922f1a1c0f69eac758a7e2b2b138aab4) Signed-off-by: Alex Shi --- drivers/block/zram/zram_drv.h | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 58d4ac537f65..1d5b1f5786a8 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -64,7 +64,6 @@ enum zram_pageflags { struct table { unsigned long handle; u16 size; /* object size (excluding header) */ - u8 count; /* object ref count (not yet used) */ u8 flags; } __aligned(4); From 65281bd78e42b4137f9cf87c4b734ff2325bf483 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:07 -0700 Subject: [PATCH 39/82] zram: move zram size warning to documentation Move zram warning about disksize and size of memory correlation to zram documentation. Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Jerome Marchand Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit e64cd51d2fa87733176246101df871a8ac5c7c20) Signed-off-by: Alex Shi --- Documentation/blockdev/zram.txt | 5 +++++ drivers/block/zram/zram_drv.c | 15 --------------- 2 files changed, 5 insertions(+), 15 deletions(-) diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt index 2eccddffa6c8..393541be1ec0 100644 --- a/Documentation/blockdev/zram.txt +++ b/Documentation/blockdev/zram.txt @@ -33,6 +33,11 @@ Following shows a typical sequence of steps for using zram. 
echo 512M > /sys/block/zram0/disksize echo 1G > /sys/block/zram0/disksize +Note: +There is little point creating a zram of greater than twice the size of memory +since we expect a 2:1 compression ratio. Note that zram uses about 0.1% of the +size of the disk when not in use so a huge zram is wasteful. + 3) Activate: mkswap /dev/zram0 swapon /dev/zram0 diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 4043e783e50a..a083dffb699e 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -534,23 +534,8 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity) static void zram_init_device(struct zram *zram, struct zram_meta *meta) { - if (zram->disksize > 2 * (totalram_pages << PAGE_SHIFT)) { - pr_info( - "There is little point creating a zram of greater than " - "twice the size of memory since we expect a 2:1 compression " - "ratio. Note that zram uses about 0.1%% of the size of " - "the disk when not in use so a huge zram is " - "wasteful.\n" - "\tMemory Size: %lu kB\n" - "\tSize you selected: %llu kB\n" - "Continuing anyway ...\n", - (totalram_pages << PAGE_SHIFT) >> 10, zram->disksize >> 10 - ); - } - /* zram devices sort of resembles non-rotational disks */ queue_flag_set_unlocked(QUEUE_FLAG_NONROT, zram->disk->queue); - zram->meta = meta; pr_debug("Initialization done!\n"); } From 6ab557fb5b5851a3848743ad6df597323b9d39ee Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:09 -0700 Subject: [PATCH 40/82] zram: delete zram_init_device() allocate new `zram_meta' in disksize_store() only for uninitialised zram device, saving a number of allocations and deallocations in case if disksize_store() was called on currently used device. at the same time zram_meta stack variable is not necessary, because we can set ->meta directly. there is also no need in setting QUEUE_FLAG_NONROT queue on every disksize_store(), set it once during device creation. 
[minchan@kernel.org: handle zram->meta alloc fail case] [minchan@kernel.org: prevent lockdep spew of init_lock] Signed-off-by: Sergey Senozhatsky Signed-off-by: Minchan Kim Acked-by: Jerome Marchand Cc: Sasha Levin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit b67d1ec189ffb92cdad9b2bd29475fb1e0166983) Signed-off-by: Alex Shi --- drivers/block/zram/zram_drv.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index a083dffb699e..9c71757aa1c6 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -532,14 +532,6 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity) up_write(&zram->init_lock); } -static void zram_init_device(struct zram *zram, struct zram_meta *meta) -{ - /* zram devices sort of resembles non-rotational disks */ - queue_flag_set_unlocked(QUEUE_FLAG_NONROT, zram->disk->queue); - zram->meta = meta; - pr_debug("Initialization done!\n"); -} - static ssize_t disksize_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { @@ -555,17 +547,18 @@ static ssize_t disksize_store(struct device *dev, meta = zram_meta_alloc(disksize); if (!meta) return -ENOMEM; + down_write(&zram->init_lock); if (init_done(zram)) { - up_write(&zram->init_lock); zram_meta_free(meta); + up_write(&zram->init_lock); pr_info("Cannot change disksize for initialized device\n"); return -EBUSY; } + zram->meta = meta; zram->disksize = disksize; set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT); - zram_init_device(zram, meta); up_write(&zram->init_lock); return len; @@ -774,7 +767,8 @@ static int create_device(struct zram *zram, int device_id) /* Actual capacity set using syfs (/sys/block/zram/disksize */ set_capacity(zram->disk, 0); - + /* zram devices sort of resembles non-rotational disks */ + queue_flag_set_unlocked(QUEUE_FLAG_NONROT, zram->disk->queue); /* * To ensure that 
we always get PAGE_SIZE aligned * and n*PAGE_SIZED sized I/O requests. From 4c48f57e379f75106cde867e4561375ed37ef69f Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:11 -0700 Subject: [PATCH 41/82] zram: introduce compressing backend abstraction ZRAM performs direct LZO compression algorithm calls, making it the one and only option. While LZO generally performs well, LZ4 algorithm tends to have a faster decompression (see http://code.google.com/p/lz4/ for full report) Name Ratio C.speed D.speed MB/s MB/s LZ4 (r101) 2.084 422 1820 LZO 2.06 2.106 414 600 Thus, users who have mostly read (decompress) usage scenarios or mixed workflow (writes with relatively high read ops number) will benefit from using LZ4 compression backend. Introduce compressing backend abstraction zcomp in order to support multiple compression algorithms with the following set of operations: .create .destroy .compress .decompress Schematically zram write() usually contains the following steps: 0) preparation (decompression of partial IO, etc.) 1) lock buffer_lock mutex (protects meta compress buffers) 2) compress (using meta compress buffers) 3) alloc and map zs_pool object 4) copy compressed data (from meta compress buffers) to object allocated by 3) 5) free previous pool page, assign a new one 6) unlock buffer_lock mutex As we can see, compressing buffers must remain untouched from 1) to 4), because, otherwise, concurrent write() can overwrite data. At the same time, zram_meta must be aware of a) specific compression algorithm memory requirements and b) necessary locking to protect compression buffers. To remove requirement a) new struct zcomp_strm introduced, which contains a compress/decompress `buffer' and compression algorithm `private' part. While struct zcomp implements zcomp_strm stream handling and locking and removes requirement b) from zram meta.
zcomp ->create() and ->destroy(), respectively, allocate and deallocate algorithm specific zcomp_strm `private' part. Every zcomp has zcomp stream and mutex to protect its compression stream. Stream usage semantics remains the same -- only one write can hold stream lock and use its buffers. zcomp_strm_find() turns caller into exclusive user of a stream (holding stream mutex until zram release stream), and zcomp_strm_release() makes zcomp stream available (unlock the stream mutex). Hence no concurrent write (compression) operations possible at the moment. iozone -t 3 -R -r 16K -s 60M -I +Z test base patched -------------------------------------------------- Initial write 597992.91 591660.58 Rewrite 609674.34 616054.97 Read 2404771.75 2452909.12 Re-read 2459216.81 2470074.44 Reverse Read 1652769.66 1589128.66 Stride read 2202441.81 2202173.31 Random read 2236311.47 2276565.31 Mixed workload 1423760.41 1709760.06 Random write 579584.08 615933.86 Pwrite 597550.02 594933.70 Pread 1703672.53 1718126.72 Fwrite 1330497.06 1461054.00 Fread 3922851.00 3957242.62 Usage examples: comp = zcomp_create(NAME) /* NAME e.g. "lzo" */ which initialises compressing backend if requested algorithm is supported. Compress: zstrm = zcomp_strm_find(comp) zcomp_compress(comp, zstrm, src, &dst_len) [..] 
/* copy compressed data */ zcomp_strm_release(comp, zstrm) Decompress: zcomp_decompress(comp, src, src_len, dst); Free compessing backend and its zcomp stream: zcomp_destroy(comp) Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Jerome Marchand Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit e7e1ef439d18f9a21521116ea9f2b976d7230e54) Signed-off-by: Alex Shi --- drivers/block/zram/zcomp.c | 115 +++++++++++++++++++++++++++++++++ drivers/block/zram/zcomp.h | 58 +++++++++++++++++ drivers/block/zram/zcomp_lzo.c | 47 ++++++++++++++ drivers/block/zram/zcomp_lzo.h | 17 +++++ 4 files changed, 237 insertions(+) create mode 100644 drivers/block/zram/zcomp.c create mode 100644 drivers/block/zram/zcomp.h create mode 100644 drivers/block/zram/zcomp_lzo.c create mode 100644 drivers/block/zram/zcomp_lzo.h diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c new file mode 100644 index 000000000000..22f4ae235660 --- /dev/null +++ b/drivers/block/zram/zcomp.c @@ -0,0 +1,115 @@ +/* + * Copyright (C) 2014 Sergey Senozhatsky. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include + +#include "zcomp.h" +#include "zcomp_lzo.h" + +static struct zcomp_backend *find_backend(const char *compress) +{ + if (strncmp(compress, "lzo", 3) == 0) + return &zcomp_lzo; + return NULL; +} + +static void zcomp_strm_free(struct zcomp *comp, struct zcomp_strm *zstrm) +{ + if (zstrm->private) + comp->backend->destroy(zstrm->private); + free_pages((unsigned long)zstrm->buffer, 1); + kfree(zstrm); +} + +/* + * allocate new zcomp_strm structure with ->private initialized by + * backend, return NULL on error + */ +static struct zcomp_strm *zcomp_strm_alloc(struct zcomp *comp) +{ + struct zcomp_strm *zstrm = kmalloc(sizeof(*zstrm), GFP_KERNEL); + if (!zstrm) + return NULL; + + zstrm->private = comp->backend->create(); + /* + * allocate 2 pages. 1 for compressed data, plus 1 extra for the + * case when compressed size is larger than the original one + */ + zstrm->buffer = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, 1); + if (!zstrm->private || !zstrm->buffer) { + zcomp_strm_free(comp, zstrm); + zstrm = NULL; + } + return zstrm; +} + +struct zcomp_strm *zcomp_strm_find(struct zcomp *comp) +{ + mutex_lock(&comp->strm_lock); + return comp->zstrm; +} + +void zcomp_strm_release(struct zcomp *comp, struct zcomp_strm *zstrm) +{ + mutex_unlock(&comp->strm_lock); +} + +int zcomp_compress(struct zcomp *comp, struct zcomp_strm *zstrm, + const unsigned char *src, size_t *dst_len) +{ + return comp->backend->compress(src, zstrm->buffer, dst_len, + zstrm->private); +} + +int zcomp_decompress(struct zcomp *comp, const unsigned char *src, + size_t src_len, unsigned char *dst) +{ + return comp->backend->decompress(src, src_len, dst); +} + +void zcomp_destroy(struct zcomp *comp) +{ + zcomp_strm_free(comp, comp->zstrm); + kfree(comp); +} + +/* + * search available compressors for requested algorithm. + * allocate new zcomp and initialize it. 
return NULL + * if requested algorithm is not supported or in case + * of init error + */ +struct zcomp *zcomp_create(const char *compress) +{ + struct zcomp *comp; + struct zcomp_backend *backend; + + backend = find_backend(compress); + if (!backend) + return NULL; + + comp = kzalloc(sizeof(struct zcomp), GFP_KERNEL); + if (!comp) + return NULL; + + comp->backend = backend; + mutex_init(&comp->strm_lock); + + comp->zstrm = zcomp_strm_alloc(comp); + if (!comp->zstrm) { + kfree(comp); + return NULL; + } + return comp; +} diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h new file mode 100644 index 000000000000..c9a98e1317fe --- /dev/null +++ b/drivers/block/zram/zcomp.h @@ -0,0 +1,58 @@ +/* + * Copyright (C) 2014 Sergey Senozhatsky. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef _ZCOMP_H_ +#define _ZCOMP_H_ + +#include + +struct zcomp_strm { + /* compression/decompression buffer */ + void *buffer; + /* + * The private data of the compression stream, only compression + * stream backend can touch this (e.g. 
compression algorithm + * working memory) + */ + void *private; +}; + +/* static compression backend */ +struct zcomp_backend { + int (*compress)(const unsigned char *src, unsigned char *dst, + size_t *dst_len, void *private); + + int (*decompress)(const unsigned char *src, size_t src_len, + unsigned char *dst); + + void *(*create)(void); + void (*destroy)(void *private); + + const char *name; +}; + +/* dynamic per-device compression frontend */ +struct zcomp { + struct mutex strm_lock; + struct zcomp_strm *zstrm; + struct zcomp_backend *backend; +}; + +struct zcomp *zcomp_create(const char *comp); +void zcomp_destroy(struct zcomp *comp); + +struct zcomp_strm *zcomp_strm_find(struct zcomp *comp); +void zcomp_strm_release(struct zcomp *comp, struct zcomp_strm *zstrm); + +int zcomp_compress(struct zcomp *comp, struct zcomp_strm *zstrm, + const unsigned char *src, size_t *dst_len); + +int zcomp_decompress(struct zcomp *comp, const unsigned char *src, + size_t src_len, unsigned char *dst); +#endif /* _ZCOMP_H_ */ diff --git a/drivers/block/zram/zcomp_lzo.c b/drivers/block/zram/zcomp_lzo.c new file mode 100644 index 000000000000..da1bc47d588e --- /dev/null +++ b/drivers/block/zram/zcomp_lzo.c @@ -0,0 +1,47 @@ +/* + * Copyright (C) 2014 Sergey Senozhatsky. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include + +#include "zcomp_lzo.h" + +static void *lzo_create(void) +{ + return kzalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL); +} + +static void lzo_destroy(void *private) +{ + kfree(private); +} + +static int lzo_compress(const unsigned char *src, unsigned char *dst, + size_t *dst_len, void *private) +{ + int ret = lzo1x_1_compress(src, PAGE_SIZE, dst, dst_len, private); + return ret == LZO_E_OK ? 
0 : ret; +} + +static int lzo_decompress(const unsigned char *src, size_t src_len, + unsigned char *dst) +{ + size_t dst_len = PAGE_SIZE; + int ret = lzo1x_decompress_safe(src, src_len, dst, &dst_len); + return ret == LZO_E_OK ? 0 : ret; +} + +struct zcomp_backend zcomp_lzo = { + .compress = lzo_compress, + .decompress = lzo_decompress, + .create = lzo_create, + .destroy = lzo_destroy, + .name = "lzo", +}; diff --git a/drivers/block/zram/zcomp_lzo.h b/drivers/block/zram/zcomp_lzo.h new file mode 100644 index 000000000000..128c5807fa14 --- /dev/null +++ b/drivers/block/zram/zcomp_lzo.h @@ -0,0 +1,17 @@ +/* + * Copyright (C) 2014 Sergey Senozhatsky. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef _ZCOMP_LZO_H_ +#define _ZCOMP_LZO_H_ + +#include "zcomp.h" + +extern struct zcomp_backend zcomp_lzo; + +#endif /* _ZCOMP_LZO_H_ */ From 92e9d71d940158389d3553a382045c3c9a7a7e98 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:12 -0700 Subject: [PATCH 42/82] zram: use zcomp compressing backends Do not perform direct LZO compress/decompress calls, initialise and use zcomp LZO backend (single compression stream) instead. 
[akpm@linux-foundation.org: resolve conflicts with zram-delete-zram_init_device-fix.patch] Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Jerome Marchand Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit b7ca232ee7e85ed3b18e39eb20a7f458ee1d6047) Signed-off-by: Alex Shi --- drivers/block/zram/Makefile | 2 +- drivers/block/zram/zram_drv.c | 69 ++++++++++++++++------------------- drivers/block/zram/zram_drv.h | 8 ++-- 3 files changed, 36 insertions(+), 43 deletions(-) diff --git a/drivers/block/zram/Makefile b/drivers/block/zram/Makefile index cb0f9ced6a93..757c6a5cadff 100644 --- a/drivers/block/zram/Makefile +++ b/drivers/block/zram/Makefile @@ -1,3 +1,3 @@ -zram-y := zram_drv.o +zram-y := zcomp_lzo.o zcomp.o zram_drv.o obj-$(CONFIG_ZRAM) += zram.o diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 9c71757aa1c6..9f5d2c2f9ea7 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -29,7 +29,6 @@ #include #include #include -#include #include #include @@ -38,6 +37,7 @@ /* Globals */ static int zram_major; static struct zram *zram_devices; +static const char *default_compressor = "lzo"; /* Module params (documentation at end) */ static unsigned int num_devices = 1; @@ -159,8 +159,6 @@ static inline int valid_io_request(struct zram *zram, struct bio *bio) static void zram_meta_free(struct zram_meta *meta) { zs_destroy_pool(meta->mem_pool); - kfree(meta->compress_workmem); - free_pages((unsigned long)meta->compress_buffer, 1); vfree(meta->table); kfree(meta); } @@ -172,22 +170,11 @@ static struct zram_meta *zram_meta_alloc(u64 disksize) if (!meta) goto out; - meta->compress_workmem = kzalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL); - if (!meta->compress_workmem) - goto free_meta; - - meta->compress_buffer = - (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, 1); - if (!meta->compress_buffer) { - pr_err("Error allocating compressor buffer space\n"); - goto 
free_workmem; - } - num_pages = disksize >> PAGE_SHIFT; meta->table = vzalloc(num_pages * sizeof(*meta->table)); if (!meta->table) { pr_err("Error allocating zram address table\n"); - goto free_buffer; + goto free_meta; } meta->mem_pool = zs_create_pool(GFP_NOIO | __GFP_HIGHMEM); @@ -197,15 +184,10 @@ static struct zram_meta *zram_meta_alloc(u64 disksize) } rwlock_init(&meta->tb_lock); - mutex_init(&meta->buffer_lock); return meta; free_table: vfree(meta->table); -free_buffer: - free_pages((unsigned long)meta->compress_buffer, 1); -free_workmem: - kfree(meta->compress_workmem); free_meta: kfree(meta); meta = NULL; @@ -279,8 +261,7 @@ static void zram_free_page(struct zram *zram, size_t index) static int zram_decompress_page(struct zram *zram, char *mem, u32 index) { - int ret = LZO_E_OK; - size_t clen = PAGE_SIZE; + int ret = 0; unsigned char *cmem; struct zram_meta *meta = zram->meta; unsigned long handle; @@ -300,12 +281,12 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index) if (size == PAGE_SIZE) copy_page(mem, cmem); else - ret = lzo1x_decompress_safe(cmem, size, mem, &clen); + ret = zcomp_decompress(zram->comp, cmem, size, mem); zs_unmap_object(meta->mem_pool, handle); read_unlock(&meta->tb_lock); /* Should NEVER happen. Return bio error if it does. */ - if (unlikely(ret != LZO_E_OK)) { + if (unlikely(ret)) { pr_err("Decompression failed! err=%d, page=%u\n", ret, index); atomic64_inc(&zram->stats.failed_reads); return ret; @@ -348,7 +329,7 @@ static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, ret = zram_decompress_page(zram, uncmem, index); /* Should NEVER happen. Return bio error if it does. 
*/ - if (unlikely(ret != LZO_E_OK)) + if (unlikely(ret)) goto out_cleanup; if (is_partial_io(bvec)) @@ -373,11 +354,10 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, struct page *page; unsigned char *user_mem, *cmem, *src, *uncmem = NULL; struct zram_meta *meta = zram->meta; + struct zcomp_strm *zstrm; bool locked = false; page = bvec->bv_page; - src = meta->compress_buffer; - if (is_partial_io(bvec)) { /* * This is a partial IO. We need to read the full page @@ -393,7 +373,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, goto out; } - mutex_lock(&meta->buffer_lock); + zstrm = zcomp_strm_find(zram->comp); locked = true; user_mem = kmap_atomic(page); @@ -419,22 +399,20 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, goto out; } - ret = lzo1x_1_compress(uncmem, PAGE_SIZE, src, &clen, - meta->compress_workmem); + ret = zcomp_compress(zram->comp, zstrm, uncmem, &clen); if (!is_partial_io(bvec)) { kunmap_atomic(user_mem); user_mem = NULL; uncmem = NULL; } - if (unlikely(ret != LZO_E_OK)) { + if (unlikely(ret)) { pr_err("Compression failed! 
err=%d\n", ret); goto out; } - + src = zstrm->buffer; if (unlikely(clen > max_zpage_size)) { clen = PAGE_SIZE; - src = NULL; if (is_partial_io(bvec)) src = uncmem; } @@ -456,6 +434,8 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, memcpy(cmem, src, clen); } + zcomp_strm_release(zram->comp, zstrm); + locked = false; zs_unmap_object(meta->mem_pool, handle); /* @@ -474,10 +454,9 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, atomic64_inc(&zram->stats.pages_stored); out: if (locked) - mutex_unlock(&meta->buffer_lock); + zcomp_strm_release(zram->comp, zstrm); if (is_partial_io(bvec)) kfree(uncmem); - if (ret) atomic64_inc(&zram->stats.failed_writes); return ret; @@ -521,6 +500,7 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity) zs_free(meta->mem_pool, handle); } + zcomp_destroy(zram->comp); zram_meta_free(zram->meta); zram->meta = NULL; /* Reset stats */ @@ -538,6 +518,7 @@ static ssize_t disksize_store(struct device *dev, u64 disksize; struct zram_meta *meta; struct zram *zram = dev_to_zram(dev); + int err; disksize = memparse(buf, NULL); if (!disksize) @@ -550,10 +531,17 @@ static ssize_t disksize_store(struct device *dev, down_write(&zram->init_lock); if (init_done(zram)) { - zram_meta_free(meta); - up_write(&zram->init_lock); pr_info("Cannot change disksize for initialized device\n"); - return -EBUSY; + err = -EBUSY; + goto out_free_meta; + } + + zram->comp = zcomp_create(default_compressor); + if (!zram->comp) { + pr_info("Cannot initialise %s compressing backend\n", + default_compressor); + err = -EINVAL; + goto out_free_meta; } zram->meta = meta; @@ -562,6 +550,11 @@ static ssize_t disksize_store(struct device *dev, up_write(&zram->init_lock); return len; + +out_free_meta: + up_write(&zram->init_lock); + zram_meta_free(meta); + return err; } static ssize_t reset_store(struct device *dev, diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 
1d5b1f5786a8..45e04f7b713f 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -16,9 +16,10 @@ #define _ZRAM_DRV_H_ #include -#include #include +#include "zcomp.h" + /* * Some arbitrary value. This is just to catch * invalid value for num_devices module parameter. @@ -81,17 +82,16 @@ struct zram_stats { struct zram_meta { rwlock_t tb_lock; /* protect table */ - void *compress_workmem; - void *compress_buffer; struct table *table; struct zs_pool *mem_pool; - struct mutex buffer_lock; /* protect compress buffers */ }; struct zram { struct zram_meta *meta; struct request_queue *queue; struct gendisk *disk; + struct zcomp *comp; + /* Prevent concurrent execution of device init, reset and R/W request */ struct rw_semaphore init_lock; /* From 9381faeb8bc1d1fdfb6931b69ee70a594d04c4fe Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:13 -0700 Subject: [PATCH 43/82] zram: factor out single stream compression This is preparation patch to add multi stream support to zcomp. Introduce struct zcomp_strm_single and a set of functions to manage zcomp_strm stream access. zcomp_strm_single implements single compression stream, same way as current zcomp implementation. This moves zcomp_strm stream control and locking from zcomp, so compressing backend zcomp is not aware of required locking. Single and multi streams require different locking schemes. Minchan Kim reported that spinlock-based locking scheme (which is used in multi stream implementation) has demonstrated a severe performance regression for single compression stream case, compared to mutex-based.
see https://lkml.org/lkml/2014/2/18/16 The following set of functions added: - zcomp_strm_single_find()/zcomp_strm_single_release() find and release a compression stream, implement required locking - zcomp_strm_single_create()/zcomp_strm_single_destroy() create and destroy zcomp_strm_single New ->strm_find() and ->strm_release() callbacks added to zcomp, which are set to zcomp_strm_single_find() and zcomp_strm_single_release() during initialisation. Instead of direct locking and zcomp_strm access from zcomp_strm_find() and zcomp_strm_release(), zcomp now calls ->strm_find() and ->strm_release() correspondingly. Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Jerome Marchand Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 9cc97529a180b369fcb7e5265771b6ba7e01f05b) Signed-off-by: Alex Shi --- drivers/block/zram/zcomp.c | 62 +++++++++++++++++++++++++++++++++----- drivers/block/zram/zcomp.h | 7 +++-- 2 files changed, 59 insertions(+), 10 deletions(-) diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c index 22f4ae235660..72e8071f9d73 100644 --- a/drivers/block/zram/zcomp.c +++ b/drivers/block/zram/zcomp.c @@ -16,6 +16,14 @@ #include "zcomp.h" #include "zcomp_lzo.h" +/* + * single zcomp_strm backend + */ +struct zcomp_strm_single { + struct mutex strm_lock; + struct zcomp_strm *zstrm; +}; + static struct zcomp_backend *find_backend(const char *compress) { if (strncmp(compress, "lzo", 3) == 0) @@ -54,15 +62,56 @@ static struct zcomp_strm *zcomp_strm_alloc(struct zcomp *comp) return zstrm; } +static struct zcomp_strm *zcomp_strm_single_find(struct zcomp *comp) +{ + struct zcomp_strm_single *zs = comp->stream; + mutex_lock(&zs->strm_lock); + return zs->zstrm; +} + +static void zcomp_strm_single_release(struct zcomp *comp, + struct zcomp_strm *zstrm) +{ + struct zcomp_strm_single *zs = comp->stream; + mutex_unlock(&zs->strm_lock); +} + +static void zcomp_strm_single_destroy(struct zcomp *comp) 
+{ + struct zcomp_strm_single *zs = comp->stream; + zcomp_strm_free(comp, zs->zstrm); + kfree(zs); +} + +static int zcomp_strm_single_create(struct zcomp *comp) +{ + struct zcomp_strm_single *zs; + + comp->destroy = zcomp_strm_single_destroy; + comp->strm_find = zcomp_strm_single_find; + comp->strm_release = zcomp_strm_single_release; + zs = kmalloc(sizeof(struct zcomp_strm_single), GFP_KERNEL); + if (!zs) + return -ENOMEM; + + comp->stream = zs; + mutex_init(&zs->strm_lock); + zs->zstrm = zcomp_strm_alloc(comp); + if (!zs->zstrm) { + kfree(zs); + return -ENOMEM; + } + return 0; +} + struct zcomp_strm *zcomp_strm_find(struct zcomp *comp) { - mutex_lock(&comp->strm_lock); - return comp->zstrm; + return comp->strm_find(comp); } void zcomp_strm_release(struct zcomp *comp, struct zcomp_strm *zstrm) { - mutex_unlock(&comp->strm_lock); + comp->strm_release(comp, zstrm); } int zcomp_compress(struct zcomp *comp, struct zcomp_strm *zstrm, @@ -80,7 +129,7 @@ int zcomp_decompress(struct zcomp *comp, const unsigned char *src, void zcomp_destroy(struct zcomp *comp) { - zcomp_strm_free(comp, comp->zstrm); + comp->destroy(comp); kfree(comp); } @@ -104,10 +153,7 @@ struct zcomp *zcomp_create(const char *compress) return NULL; comp->backend = backend; - mutex_init(&comp->strm_lock); - - comp->zstrm = zcomp_strm_alloc(comp); - if (!comp->zstrm) { + if (zcomp_strm_single_create(comp) != 0) { kfree(comp); return NULL; } diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h index c9a98e1317fe..dc3500d842a3 100644 --- a/drivers/block/zram/zcomp.h +++ b/drivers/block/zram/zcomp.h @@ -39,9 +39,12 @@ struct zcomp_backend { /* dynamic per-device compression frontend */ struct zcomp { - struct mutex strm_lock; - struct zcomp_strm *zstrm; + void *stream; struct zcomp_backend *backend; + + struct zcomp_strm *(*strm_find)(struct zcomp *comp); + void (*strm_release)(struct zcomp *comp, struct zcomp_strm *zstrm); + void (*destroy)(struct zcomp *comp); }; struct zcomp 
*zcomp_create(const char *comp); From de980e9147bd30182894d0bb6778fbd0c8a7f293 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:14 -0700 Subject: [PATCH 44/82] zram: add multi stream functionality Existing zram (zcomp) implementation has only one compression stream (buffer and algorithm private part), so in order to prevent data corruption only one write (compress operation) can use this compression stream, forcing all concurrent write operations to wait for stream lock to be released. This patch changes zcomp to keep a compression streams list of user-defined size (via sysfs device attr). Each write operation still exclusively holds compression stream, the difference is that we can have N write operations (depending on size of streams list) executing in parallel. See TEST section later in commit message for performance data. Introduce struct zcomp_strm_multi and a set of functions to manage zcomp_strm stream access. zcomp_strm_multi has a list of idle zcomp_strm structs, spinlock to protect idle list and wait queue, making it possible to perform parallel compressions. The following set of functions added: - zcomp_strm_multi_find()/zcomp_strm_multi_release() find and release a compression stream, implement required locking - zcomp_strm_multi_create()/zcomp_strm_multi_destroy() create and destroy zcomp_strm_multi zcomp ->strm_find() and ->strm_release() callbacks are set during initialisation to zcomp_strm_multi_find()/zcomp_strm_multi_release() correspondingly. Each time zcomp issues a zcomp_strm_multi_find() call, the following set of operations performed: - spin lock strm_lock - if idle list is not empty, remove zcomp_strm from idle list, spin unlock and return zcomp stream pointer to caller - if idle list is empty, current adds itself to wait queue. it will be awaken by zcomp_strm_multi_release() caller. 
zcomp_strm_multi_release(): - spin lock strm_lock - add zcomp stream to idle list - spin unlock, wake up sleeper Minchan Kim reported that spinlock-based locking scheme has demonstrated a severe performance regression for single compression stream case, compared to mutex-based (see https://lkml.org/lkml/2014/2/18/16) base spinlock mutex ==Initial write ==Initial write ==Initial write records: 5 records: 5 records: 5 avg: 1642424.35 avg: 699610.40 avg: 1655583.71 std: 39890.95(2.43%) std: 232014.19(33.16%) std: 52293.96 max: 1690170.94 max: 1163473.45 max: 1697164.75 min: 1568669.52 min: 573429.88 min: 1553410.23 ==Rewrite ==Rewrite ==Rewrite records: 5 records: 5 records: 5 avg: 1611775.39 avg: 501406.64 avg: 1684419.11 std: 17144.58(1.06%) std: 15354.41(3.06%) std: 18367.42 max: 1641800.95 max: 531356.78 max: 1706445.84 min: 1593515.27 min: 488817.78 min: 1655335.73 When only one compression stream is available, mutex with spin on owner tends to perform much better than frequent wait_event()/wake_up(). This is why single stream is implemented as a special case with mutex locking. Introduce and document zram device attribute max_comp_streams. This attr shows and stores current zcomp's max number of zcomp streams (max_strm). Extend zcomp's zcomp_create() with `max_strm' parameter. `max_strm' limits the number of zcomp_strm structs in compression backend's idle list (max_comp_streams). max_comp_streams used during initialisation as follows: -- passing to zcomp_create() max_strm equals to 1 will initialise zcomp using single compression stream zcomp_strm_single (mutex-based locking). -- passing to zcomp_create() max_strm greater than 1 will initialise zcomp using multi compression stream zcomp_strm_multi (spinlock-based locking). default max_comp_streams value is 1, meaning that zram with single stream will be initialised. Later patch will introduce configuration knob to change max_comp_streams on already initialised and used zcomp.
TEST iozone -t 3 -R -r 16K -s 60M -I +Z test base 1 strm (mutex) 3 strm (spinlock) ----------------------------------------------------------------------- Initial write 589286.78 583518.39 718011.05 Rewrite 604837.97 596776.38 1515125.72 Random write 584120.11 595714.58 1388850.25 Pwrite 535731.17 541117.38 739295.27 Fwrite 1418083.88 1478612.72 1484927.06 Usage example: set max_comp_streams to 4 echo 4 > /sys/block/zram0/max_comp_streams show current max_comp_streams (default value is 1). cat /sys/block/zram0/max_comp_streams Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Jerome Marchand Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit beca3ec71fe5490ee9237dc42400f50402baf83e) Signed-off-by: Alex Shi --- Documentation/ABI/testing/sysfs-block-zram | 22 ++++ Documentation/blockdev/zram.txt | 31 +++++- drivers/block/zram/zcomp.c | 124 ++++++++++++++++++++- drivers/block/zram/zcomp.h | 4 +- drivers/block/zram/zram_drv.c | 42 ++++++- drivers/block/zram/zram_drv.h | 2 +- 6 files changed, 215 insertions(+), 10 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-block-zram b/Documentation/ABI/testing/sysfs-block-zram index ec93fe33baa6..d67f0bb1c726 100644 --- a/Documentation/ABI/testing/sysfs-block-zram +++ b/Documentation/ABI/testing/sysfs-block-zram @@ -42,6 +42,28 @@ Description: The invalid_io file is read-only and specifies the number of non-page-size-aligned I/O requests issued to this device. +What: /sys/block/zram/failed_reads +Date: February 2014 +Contact: Sergey Senozhatsky +Description: + The failed_reads file is read-only and specifies the number of + failed reads happened on this device. + +What: /sys/block/zram/failed_writes +Date: February 2014 +Contact: Sergey Senozhatsky +Description: + The failed_writes file is read-only and specifies the number of + failed writes happened on this device. 
+ +What: /sys/block/zram/max_comp_streams +Date: February 2014 +Contact: Sergey Senozhatsky +Description: + The max_comp_streams file is read-write and specifies the + number of backend's zcomp_strm compression streams (number of + concurrent compress operations). + What: /sys/block/zram/notify_free Date: August 2010 Contact: Nitin Gupta diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt index 393541be1ec0..e82c03f26f31 100644 --- a/Documentation/blockdev/zram.txt +++ b/Documentation/blockdev/zram.txt @@ -21,7 +21,28 @@ Following shows a typical sequence of steps for using zram. This creates 4 devices: /dev/zram{0,1,2,3} (num_devices parameter is optional. Default: 1) -2) Set Disksize +2) Set max number of compression streams + Compression backend may use up to max_comp_streams compression streams, + thus allowing up to max_comp_streams concurrent compression operations. + By default, compression backend uses single compression stream. + + Examples: + #show max compression streams number + cat /sys/block/zram0/max_comp_streams + + #set max compression streams number to 3 + echo 3 > /sys/block/zram0/max_comp_streams + +Note: +In order to enable compression backend's multi stream support max_comp_streams +must be initially set to desired concurrency level before ZRAM device +initialisation. Once the device initialised as a single stream compression +backend (max_comp_streams equals to 1) changing the value of max_comp_streams +will not take any effect, because single stream compression backend implemented +as a special case and does not support dynamic max_comp_streams. Only multi +stream backend supports dynamic max_comp_streams adjustment. + +3) Set Disksize Set disk size by writing the value to sysfs node 'disksize'. The value can be either in bytes or you can use mem suffixes. Examples: @@ -38,14 +59,14 @@ There is little point creating a zram of greater than twice the size of memory since we expect a 2:1 compression ratio.
Note that zram uses about 0.1% of the size of the disk when not in use so a huge zram is wasteful. -3) Activate: +4) Activate: mkswap /dev/zram0 swapon /dev/zram0 mkfs.ext4 /dev/zram1 mount /dev/zram1 /tmp -4) Stats: +5) Stats: Per-device statistics are exported as various nodes under /sys/block/zram/ disksize @@ -59,11 +80,11 @@ size of the disk when not in use so a huge zram is wasteful. compr_data_size mem_used_total -5) Deactivate: +6) Deactivate: swapoff /dev/zram0 umount /dev/zram1 -6) Reset: +7) Reset: Write any positive value to 'reset' sysfs node echo 1 > /sys/block/zram0/reset echo 1 > /sys/block/zram1/reset diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c index 72e8071f9d73..c06f75f54718 100644 --- a/drivers/block/zram/zcomp.c +++ b/drivers/block/zram/zcomp.c @@ -24,6 +24,21 @@ struct zcomp_strm_single { struct zcomp_strm *zstrm; }; +/* + * multi zcomp_strm backend + */ +struct zcomp_strm_multi { + /* protect strm list */ + spinlock_t strm_lock; + /* max possible number of zstrm streams */ + int max_strm; + /* number of available zstrm streams */ + int avail_strm; + /* list of available strms */ + struct list_head idle_strm; + wait_queue_head_t strm_wait; +}; + static struct zcomp_backend *find_backend(const char *compress) { if (strncmp(compress, "lzo", 3) == 0) @@ -62,6 +77,107 @@ static struct zcomp_strm *zcomp_strm_alloc(struct zcomp *comp) return zstrm; } +/* + * get idle zcomp_strm or wait until other process release + * (zcomp_strm_release()) one for us + */ +static struct zcomp_strm *zcomp_strm_multi_find(struct zcomp *comp) +{ + struct zcomp_strm_multi *zs = comp->stream; + struct zcomp_strm *zstrm; + + while (1) { + spin_lock(&zs->strm_lock); + if (!list_empty(&zs->idle_strm)) { + zstrm = list_entry(zs->idle_strm.next, + struct zcomp_strm, list); + list_del(&zstrm->list); + spin_unlock(&zs->strm_lock); + return zstrm; + } + /* zstrm streams limit reached, wait for idle stream */ + if (zs->avail_strm >= zs->max_strm) { + 
spin_unlock(&zs->strm_lock); + wait_event(zs->strm_wait, !list_empty(&zs->idle_strm)); + continue; + } + /* allocate new zstrm stream */ + zs->avail_strm++; + spin_unlock(&zs->strm_lock); + + zstrm = zcomp_strm_alloc(comp); + if (!zstrm) { + spin_lock(&zs->strm_lock); + zs->avail_strm--; + spin_unlock(&zs->strm_lock); + wait_event(zs->strm_wait, !list_empty(&zs->idle_strm)); + continue; + } + break; + } + return zstrm; +} + +/* add stream back to idle list and wake up waiter or free the stream */ +static void zcomp_strm_multi_release(struct zcomp *comp, struct zcomp_strm *zstrm) +{ + struct zcomp_strm_multi *zs = comp->stream; + + spin_lock(&zs->strm_lock); + if (zs->avail_strm <= zs->max_strm) { + list_add(&zstrm->list, &zs->idle_strm); + spin_unlock(&zs->strm_lock); + wake_up(&zs->strm_wait); + return; + } + + zs->avail_strm--; + spin_unlock(&zs->strm_lock); + zcomp_strm_free(comp, zstrm); +} + +static void zcomp_strm_multi_destroy(struct zcomp *comp) +{ + struct zcomp_strm_multi *zs = comp->stream; + struct zcomp_strm *zstrm; + + while (!list_empty(&zs->idle_strm)) { + zstrm = list_entry(zs->idle_strm.next, + struct zcomp_strm, list); + list_del(&zstrm->list); + zcomp_strm_free(comp, zstrm); + } + kfree(zs); +} + +static int zcomp_strm_multi_create(struct zcomp *comp, int max_strm) +{ + struct zcomp_strm *zstrm; + struct zcomp_strm_multi *zs; + + comp->destroy = zcomp_strm_multi_destroy; + comp->strm_find = zcomp_strm_multi_find; + comp->strm_release = zcomp_strm_multi_release; + zs = kmalloc(sizeof(struct zcomp_strm_multi), GFP_KERNEL); + if (!zs) + return -ENOMEM; + + comp->stream = zs; + spin_lock_init(&zs->strm_lock); + INIT_LIST_HEAD(&zs->idle_strm); + init_waitqueue_head(&zs->strm_wait); + zs->max_strm = max_strm; + zs->avail_strm = 1; + + zstrm = zcomp_strm_alloc(comp); + if (!zstrm) { + kfree(zs); + return -ENOMEM; + } + list_add(&zstrm->list, &zs->idle_strm); + return 0; +} + static struct zcomp_strm *zcomp_strm_single_find(struct zcomp *comp) { struct 
zcomp_strm_single *zs = comp->stream; @@ -139,7 +255,7 @@ void zcomp_destroy(struct zcomp *comp) * if requested algorithm is not supported or in case * of init error */ -struct zcomp *zcomp_create(const char *compress) +struct zcomp *zcomp_create(const char *compress, int max_strm) { struct zcomp *comp; struct zcomp_backend *backend; @@ -153,7 +269,11 @@ struct zcomp *zcomp_create(const char *compress) return NULL; comp->backend = backend; - if (zcomp_strm_single_create(comp) != 0) { + if (max_strm > 1) + zcomp_strm_multi_create(comp, max_strm); + else + zcomp_strm_single_create(comp); + if (!comp->stream) { kfree(comp); return NULL; } diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h index dc3500d842a3..2a3684446160 100644 --- a/drivers/block/zram/zcomp.h +++ b/drivers/block/zram/zcomp.h @@ -21,6 +21,8 @@ struct zcomp_strm { * working memory) */ void *private; + /* used in multi stream backend, protected by backend strm_lock */ + struct list_head list; }; /* static compression backend */ @@ -47,7 +49,7 @@ struct zcomp { void (*destroy)(struct zcomp *comp); }; -struct zcomp *zcomp_create(const char *comp); +struct zcomp *zcomp_create(const char *comp, int max_strm); void zcomp_destroy(struct zcomp *comp); struct zcomp_strm *zcomp_strm_find(struct zcomp *comp); diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 9f5d2c2f9ea7..88ee317ce6d2 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -108,6 +108,40 @@ static ssize_t mem_used_total_show(struct device *dev, return sprintf(buf, "%llu\n", val); } +static ssize_t max_comp_streams_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + int val; + struct zram *zram = dev_to_zram(dev); + + down_read(&zram->init_lock); + val = zram->max_comp_streams; + up_read(&zram->init_lock); + + return sprintf(buf, "%d\n", val); +} + +static ssize_t max_comp_streams_store(struct device *dev, + struct device_attribute *attr, const char 
*buf, size_t len) +{ + int num; + struct zram *zram = dev_to_zram(dev); + + if (kstrtoint(buf, 0, &num)) + return -EINVAL; + if (num < 1) + return -EINVAL; + down_write(&zram->init_lock); + if (init_done(zram)) { + up_write(&zram->init_lock); + pr_info("Can't set max_comp_streams for initialized device\n"); + return -EBUSY; + } + zram->max_comp_streams = num; + up_write(&zram->init_lock); + return len; +} + /* flag operations needs meta->tb_lock */ static int zram_test_flag(struct zram_meta *meta, u32 index, enum zram_pageflags flag) @@ -501,6 +535,8 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity) } zcomp_destroy(zram->comp); + zram->max_comp_streams = 1; + zram_meta_free(zram->meta); zram->meta = NULL; /* Reset stats */ @@ -536,7 +572,7 @@ static ssize_t disksize_store(struct device *dev, goto out_free_meta; } - zram->comp = zcomp_create(default_compressor); + zram->comp = zcomp_create(default_compressor, zram->max_comp_streams); if (!zram->comp) { pr_info("Cannot initialise %s compressing backend\n", default_compressor); @@ -696,6 +732,8 @@ static DEVICE_ATTR(initstate, S_IRUGO, initstate_show, NULL); static DEVICE_ATTR(reset, S_IWUSR, NULL, reset_store); static DEVICE_ATTR(orig_data_size, S_IRUGO, orig_data_size_show, NULL); static DEVICE_ATTR(mem_used_total, S_IRUGO, mem_used_total_show, NULL); +static DEVICE_ATTR(max_comp_streams, S_IRUGO | S_IWUSR, + max_comp_streams_show, max_comp_streams_store); ZRAM_ATTR_RO(num_reads); ZRAM_ATTR_RO(num_writes); @@ -720,6 +758,7 @@ static struct attribute *zram_disk_attrs[] = { &dev_attr_orig_data_size.attr, &dev_attr_compr_data_size.attr, &dev_attr_mem_used_total.attr, + &dev_attr_max_comp_streams.attr, NULL, }; @@ -782,6 +821,7 @@ static int create_device(struct zram *zram, int device_id) } zram->meta = NULL; + zram->max_comp_streams = 1; return 0; out_free_disk: diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 45e04f7b713f..ccf36d11755a 100644 --- 
a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -99,7 +99,7 @@ struct zram { * we can store in a disk. */ u64 disksize; /* bytes */ - + int max_comp_streams; struct zram_stats stats; }; #endif From 9634c9a147869de6d78f53a49823980aaea7233c Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:15 -0700 Subject: [PATCH 45/82] zram: add set_max_streams knob This patch allows to change max_comp_streams on initialised zcomp. Introduce zcomp set_max_streams() knob, zcomp_strm_multi_set_max_streams() and zcomp_strm_single_set_max_streams() callbacks to change streams limit for zcomp_strm_multi and zcomp_strm_single, accordingly. set_max_streams for single stream zcomp does nothing. If user has lowered the limit, then zcomp_strm_multi_set_max_streams() attempts to immediately free extra streams (as much as it can, depending on idle streams availability). Note, this patch does not allow to change stream 'policy' from single to multi stream (or vice versa) on already initialised compression backend.
Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Jerome Marchand Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit fe8eb122c82b2049c460fc6df6e8583a2f935cff) Signed-off-by: Alex Shi --- drivers/block/zram/zcomp.c | 36 +++++++++++++++++++++++++++++++++++ drivers/block/zram/zcomp.h | 3 +++ drivers/block/zram/zram_drv.c | 5 ++--- 3 files changed, 41 insertions(+), 3 deletions(-) diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c index c06f75f54718..ac276f79f21c 100644 --- a/drivers/block/zram/zcomp.c +++ b/drivers/block/zram/zcomp.c @@ -136,6 +136,29 @@ static void zcomp_strm_multi_release(struct zcomp *comp, struct zcomp_strm *zstr zcomp_strm_free(comp, zstrm); } +/* change max_strm limit */ +static int zcomp_strm_multi_set_max_streams(struct zcomp *comp, int num_strm) +{ + struct zcomp_strm_multi *zs = comp->stream; + struct zcomp_strm *zstrm; + + spin_lock(&zs->strm_lock); + zs->max_strm = num_strm; + /* + * if user has lowered the limit and there are idle streams, + * immediately free as much streams (and memory) as we can. 
+ */ + while (zs->avail_strm > num_strm && !list_empty(&zs->idle_strm)) { + zstrm = list_entry(zs->idle_strm.next, + struct zcomp_strm, list); + list_del(&zstrm->list); + zcomp_strm_free(comp, zstrm); + zs->avail_strm--; + } + spin_unlock(&zs->strm_lock); + return 0; +} + static void zcomp_strm_multi_destroy(struct zcomp *comp) { struct zcomp_strm_multi *zs = comp->stream; @@ -158,6 +181,7 @@ static int zcomp_strm_multi_create(struct zcomp *comp, int max_strm) comp->destroy = zcomp_strm_multi_destroy; comp->strm_find = zcomp_strm_multi_find; comp->strm_release = zcomp_strm_multi_release; + comp->set_max_streams = zcomp_strm_multi_set_max_streams; zs = kmalloc(sizeof(struct zcomp_strm_multi), GFP_KERNEL); if (!zs) return -ENOMEM; @@ -192,6 +216,12 @@ static void zcomp_strm_single_release(struct zcomp *comp, mutex_unlock(&zs->strm_lock); } +static int zcomp_strm_single_set_max_streams(struct zcomp *comp, int num_strm) +{ + /* zcomp_strm_single support only max_comp_streams == 1 */ + return -ENOTSUPP; +} + static void zcomp_strm_single_destroy(struct zcomp *comp) { struct zcomp_strm_single *zs = comp->stream; @@ -206,6 +236,7 @@ static int zcomp_strm_single_create(struct zcomp *comp) comp->destroy = zcomp_strm_single_destroy; comp->strm_find = zcomp_strm_single_find; comp->strm_release = zcomp_strm_single_release; + comp->set_max_streams = zcomp_strm_single_set_max_streams; zs = kmalloc(sizeof(struct zcomp_strm_single), GFP_KERNEL); if (!zs) return -ENOMEM; @@ -220,6 +251,11 @@ static int zcomp_strm_single_create(struct zcomp *comp) return 0; } +int zcomp_set_max_streams(struct zcomp *comp, int num_strm) +{ + return comp->set_max_streams(comp, num_strm); +} + struct zcomp_strm *zcomp_strm_find(struct zcomp *comp) { return comp->strm_find(comp); diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h index 2a3684446160..bd11d59c5dd1 100644 --- a/drivers/block/zram/zcomp.h +++ b/drivers/block/zram/zcomp.h @@ -46,6 +46,7 @@ struct zcomp { struct zcomp_strm 
*(*strm_find)(struct zcomp *comp); void (*strm_release)(struct zcomp *comp, struct zcomp_strm *zstrm); + int (*set_max_streams)(struct zcomp *comp, int num_strm); void (*destroy)(struct zcomp *comp); }; @@ -60,4 +61,6 @@ int zcomp_compress(struct zcomp *comp, struct zcomp_strm *zstrm, int zcomp_decompress(struct zcomp *comp, const unsigned char *src, size_t src_len, unsigned char *dst); + +int zcomp_set_max_streams(struct zcomp *comp, int num_strm); #endif /* _ZCOMP_H_ */ diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 88ee317ce6d2..c03d0053309b 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -133,9 +133,8 @@ static ssize_t max_comp_streams_store(struct device *dev, return -EINVAL; down_write(&zram->init_lock); if (init_done(zram)) { - up_write(&zram->init_lock); - pr_info("Can't set max_comp_streams for initialized device\n"); - return -EBUSY; + if (zcomp_set_max_streams(zram->comp, num)) + pr_info("Cannot change max compression streams\n"); } zram->max_comp_streams = num; up_write(&zram->init_lock); From 0626b80d185372c3b9f28f5c6c5e11647f381472 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:17 -0700 Subject: [PATCH 46/82] zram: make compression algorithm selection possible Add and document `comp_algorithm' device attribute. 
This attribute allows to show supported compression and currently selected compression algorithms: cat /sys/block/zram0/comp_algorithm [lzo] lz4 and change selected compression algorithm: echo lzo > /sys/block/zram0/comp_algorithm Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Jerome Marchand Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit e46b8a030d76d3c94156c545c3f4c3676d813435) Signed-off-by: Alex Shi --- Documentation/ABI/testing/sysfs-block-zram | 8 +++++ Documentation/blockdev/zram.txt | 24 +++++++++++--- drivers/block/zram/zcomp.c | 32 +++++++++++++++++-- drivers/block/zram/zcomp.h | 2 ++ drivers/block/zram/zram_drv.c | 37 ++++++++++++++++++++-- drivers/block/zram/zram_drv.h | 1 + 6 files changed, 93 insertions(+), 11 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-block-zram b/Documentation/ABI/testing/sysfs-block-zram index d67f0bb1c726..2775966c2d12 100644 --- a/Documentation/ABI/testing/sysfs-block-zram +++ b/Documentation/ABI/testing/sysfs-block-zram @@ -64,6 +64,14 @@ Description: number of backend's zcomp_strm compression streams (number of concurrent compress operations). +What: /sys/block/zram/comp_algorithm +Date: February 2014 +Contact: Sergey Senozhatsky +Description: + The comp_algorithm file is read-write and lets to show + available and selected compression algorithms, change + compression algorithm selection. + What: /sys/block/zram/notify_free Date: August 2010 Contact: Nitin Gupta diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt index e82c03f26f31..4ab2ce98f63c 100644 --- a/Documentation/blockdev/zram.txt +++ b/Documentation/blockdev/zram.txt @@ -42,7 +42,21 @@ will not take any effect, because single stream compression backend implemented as a special case and does not support dynamic max_comp_streams. Only multi stream backend supports dynamic max_comp_streams adjustment. 
-3) Set Disksize +3) Select compression algorithm + Using comp_algorithm device attribute one can see available and + currently selected (shown in square brackets) compression algorithms, + change selected compression algorithm (once the device is initialised + there is no way to change compression algorithm). + + Examples: + #show supported compression algorithms + cat /sys/block/zram0/comp_algorithm + lzo [lz4] + + #select lzo compression algorithm + echo lzo > /sys/block/zram0/comp_algorithm + +4) Set Disksize Set disk size by writing the value to sysfs node 'disksize'. The value can be either in bytes or you can use mem suffixes. Examples: @@ -59,14 +73,14 @@ There is little point creating a zram of greater than twice the size of memory since we expect a 2:1 compression ratio. Note that zram uses about 0.1% of the size of the disk when not in use so a huge zram is wasteful. -4) Activate: +5) Activate: mkswap /dev/zram0 swapon /dev/zram0 mkfs.ext4 /dev/zram1 mount /dev/zram1 /tmp -5) Stats: +6) Stats: Per-device statistics are exported as various nodes under /sys/block/zram/ disksize @@ -80,11 +94,11 @@ size of the disk when not in use so a huge zram is wasteful.
compr_data_size mem_used_total -6) Deactivate: +7) Deactivate: swapoff /dev/zram0 umount /dev/zram1 -7) Reset: +8) Reset: Write any positive value to 'reset' sysfs node echo 1 > /sys/block/zram0/reset echo 1 > /sys/block/zram1/reset diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c index ac276f79f21c..aad533a8bc55 100644 --- a/drivers/block/zram/zcomp.c +++ b/drivers/block/zram/zcomp.c @@ -39,11 +39,20 @@ struct zcomp_strm_multi { wait_queue_head_t strm_wait; }; +static struct zcomp_backend *backends[] = { + &zcomp_lzo, + NULL +}; + static struct zcomp_backend *find_backend(const char *compress) { - if (strncmp(compress, "lzo", 3) == 0) - return &zcomp_lzo; - return NULL; + int i = 0; + while (backends[i]) { + if (sysfs_streq(compress, backends[i]->name)) + break; + i++; + } + return backends[i]; } static void zcomp_strm_free(struct zcomp *comp, struct zcomp_strm *zstrm) @@ -251,6 +260,23 @@ static int zcomp_strm_single_create(struct zcomp *comp) return 0; } +/* show available compressors */ +ssize_t zcomp_available_show(const char *comp, char *buf) +{ + ssize_t sz = 0; + int i = 0; + + while (backends[i]) { + if (sysfs_streq(comp, backends[i]->name)) + sz += sprintf(buf + sz, "[%s] ", backends[i]->name); + else + sz += sprintf(buf + sz, "%s ", backends[i]->name); + i++; + } + sz += sprintf(buf + sz, "\n"); + return sz; +} + int zcomp_set_max_streams(struct zcomp *comp, int num_strm) { return comp->set_max_streams(comp, num_strm); diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h index bd11d59c5dd1..8b8997f8613b 100644 --- a/drivers/block/zram/zcomp.h +++ b/drivers/block/zram/zcomp.h @@ -50,6 +50,8 @@ struct zcomp { void (*destroy)(struct zcomp *comp); }; +ssize_t zcomp_available_show(const char *comp, char *buf); + struct zcomp *zcomp_create(const char *comp, int max_strm); void zcomp_destroy(struct zcomp *comp); diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index c03d0053309b..f15564e65d87 100644 
--- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -141,6 +141,34 @@ static ssize_t max_comp_streams_store(struct device *dev, return len; } +static ssize_t comp_algorithm_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + size_t sz; + struct zram *zram = dev_to_zram(dev); + + down_read(&zram->init_lock); + sz = zcomp_available_show(zram->compressor, buf); + up_read(&zram->init_lock); + + return sz; +} + +static ssize_t comp_algorithm_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + struct zram *zram = dev_to_zram(dev); + down_write(&zram->init_lock); + if (init_done(zram)) { + up_write(&zram->init_lock); + pr_info("Can't change algorithm for initialized device\n"); + return -EBUSY; + } + strlcpy(zram->compressor, buf, sizeof(zram->compressor)); + up_write(&zram->init_lock); + return len; +} + /* flag operations needs meta->tb_lock */ static int zram_test_flag(struct zram_meta *meta, u32 index, enum zram_pageflags flag) @@ -571,10 +599,10 @@ static ssize_t disksize_store(struct device *dev, goto out_free_meta; } - zram->comp = zcomp_create(default_compressor, zram->max_comp_streams); + zram->comp = zcomp_create(zram->compressor, zram->max_comp_streams); if (!zram->comp) { pr_info("Cannot initialise %s compressing backend\n", - default_compressor); + zram->compressor); err = -EINVAL; goto out_free_meta; } @@ -733,6 +761,8 @@ static DEVICE_ATTR(orig_data_size, S_IRUGO, orig_data_size_show, NULL); static DEVICE_ATTR(mem_used_total, S_IRUGO, mem_used_total_show, NULL); static DEVICE_ATTR(max_comp_streams, S_IRUGO | S_IWUSR, max_comp_streams_show, max_comp_streams_store); +static DEVICE_ATTR(comp_algorithm, S_IRUGO | S_IWUSR, + comp_algorithm_show, comp_algorithm_store); ZRAM_ATTR_RO(num_reads); ZRAM_ATTR_RO(num_writes); @@ -758,6 +788,7 @@ static struct attribute *zram_disk_attrs[] = { &dev_attr_compr_data_size.attr, &dev_attr_mem_used_total.attr, &dev_attr_max_comp_streams.attr, 
+ &dev_attr_comp_algorithm.attr, NULL, }; @@ -818,7 +849,7 @@ static int create_device(struct zram *zram, int device_id) pr_warn("Error creating sysfs group"); goto out_free_disk; } - + strlcpy(zram->compressor, default_compressor, sizeof(zram->compressor)); zram->meta = NULL; zram->max_comp_streams = 1; return 0; diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index ccf36d11755a..7f21c145e317 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -101,5 +101,6 @@ struct zram { u64 disksize; /* bytes */ int max_comp_streams; struct zram_stats stats; + char compressor[10]; }; #endif From f84e7a4599807591a02c524314c1a78cc01ba41d Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:18 -0700 Subject: [PATCH 47/82] zram: add lz4 algorithm backend Introduce LZ4 compression backend and make it available for selection. LZ4 support is optional and requires user to set ZRAM_LZ4_COMPRESS config option. The default compression backend is LZO. TEST (x86_64, core i5, 2 cores + 2 hyperthreading, zram disk size 1G, ext4 file system, 3 compression streams) iozone -t 3 -R -r 16K -s 60M -I +Z Test LZO LZ4 ---------------------------------------------- Initial write 1642744.62 1317005.09 Rewrite 2498980.88 1800645.16 Read 3957026.38 5877043.75 Re-read 3950997.38 5861847.00 Reverse Read 2937114.56 5047384.00 Stride read 2948163.19 4929587.38 Random read 3292692.69 4880793.62 Mixed workload 1545602.62 3502940.38 Random write 2448039.75 1758786.25 Pwrite 1670051.03 1338329.69 Pread 2530682.00 5097177.62 Fwrite 3232085.62 3275942.56 Fread 6306880.25 6645271.12 So on my system LZ4 is slower in write-only tests, while it performs better in read-only and mixed (reads + writes) tests. Official LZ4 benchmarks available here http://code.google.com/p/lz4/ (linux kernel uses revision r90). 
Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Jerome Marchand Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 6e76668e415adf799839f0ab205142ad7002d260) Signed-off-by: Alex Shi --- drivers/block/zram/Kconfig | 10 ++++++++ drivers/block/zram/Makefile | 2 ++ drivers/block/zram/zcomp.c | 6 +++++ drivers/block/zram/zcomp_lz4.c | 47 ++++++++++++++++++++++++++++++++++ drivers/block/zram/zcomp_lz4.h | 17 ++++++++++++ 5 files changed, 82 insertions(+) create mode 100644 drivers/block/zram/zcomp_lz4.c create mode 100644 drivers/block/zram/zcomp_lz4.h diff --git a/drivers/block/zram/Kconfig b/drivers/block/zram/Kconfig index 3450be850399..6489c0fd0ea6 100644 --- a/drivers/block/zram/Kconfig +++ b/drivers/block/zram/Kconfig @@ -15,6 +15,16 @@ config ZRAM See zram.txt for more information. +config ZRAM_LZ4_COMPRESS + bool "Enable LZ4 algorithm support" + depends on ZRAM + select LZ4_COMPRESS + select LZ4_DECOMPRESS + default n + help + This option enables LZ4 compression algorithm support. Compression + algorithm can be changed using `comp_algorithm' device attribute. 
+ config ZRAM_DEBUG bool "Compressed RAM block device debug support" depends on ZRAM diff --git a/drivers/block/zram/Makefile b/drivers/block/zram/Makefile index 757c6a5cadff..be0763ff57a2 100644 --- a/drivers/block/zram/Makefile +++ b/drivers/block/zram/Makefile @@ -1,3 +1,5 @@ zram-y := zcomp_lzo.o zcomp.o zram_drv.o +zram-$(CONFIG_ZRAM_LZ4_COMPRESS) += zcomp_lz4.o + obj-$(CONFIG_ZRAM) += zram.o diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c index aad533a8bc55..d5919031ca8b 100644 --- a/drivers/block/zram/zcomp.c +++ b/drivers/block/zram/zcomp.c @@ -15,6 +15,9 @@ #include "zcomp.h" #include "zcomp_lzo.h" +#ifdef CONFIG_ZRAM_LZ4_COMPRESS +#include "zcomp_lz4.h" +#endif /* * single zcomp_strm backend @@ -41,6 +44,9 @@ struct zcomp_strm_multi { static struct zcomp_backend *backends[] = { &zcomp_lzo, +#ifdef CONFIG_ZRAM_LZ4_COMPRESS + &zcomp_lz4, +#endif NULL }; diff --git a/drivers/block/zram/zcomp_lz4.c b/drivers/block/zram/zcomp_lz4.c new file mode 100644 index 000000000000..f2afb7e988c3 --- /dev/null +++ b/drivers/block/zram/zcomp_lz4.c @@ -0,0 +1,47 @@ +/* + * Copyright (C) 2014 Sergey Senozhatsky. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include + +#include "zcomp_lz4.h" + +static void *zcomp_lz4_create(void) +{ + return kzalloc(LZ4_MEM_COMPRESS, GFP_KERNEL); +} + +static void zcomp_lz4_destroy(void *private) +{ + kfree(private); +} + +static int zcomp_lz4_compress(const unsigned char *src, unsigned char *dst, + size_t *dst_len, void *private) +{ + /* return : Success if return 0 */ + return lz4_compress(src, PAGE_SIZE, dst, dst_len, private); +} + +static int zcomp_lz4_decompress(const unsigned char *src, size_t src_len, + unsigned char *dst) +{ + size_t dst_len = PAGE_SIZE; + /* return : Success if return 0 */ + return lz4_decompress_unknownoutputsize(src, src_len, dst, &dst_len); +} + +struct zcomp_backend zcomp_lz4 = { + .compress = zcomp_lz4_compress, + .decompress = zcomp_lz4_decompress, + .create = zcomp_lz4_create, + .destroy = zcomp_lz4_destroy, + .name = "lz4", +}; diff --git a/drivers/block/zram/zcomp_lz4.h b/drivers/block/zram/zcomp_lz4.h new file mode 100644 index 000000000000..60613fb29dd8 --- /dev/null +++ b/drivers/block/zram/zcomp_lz4.h @@ -0,0 +1,17 @@ +/* + * Copyright (C) 2014 Sergey Senozhatsky. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#ifndef _ZCOMP_LZ4_H_ +#define _ZCOMP_LZ4_H_ + +#include "zcomp.h" + +extern struct zcomp_backend zcomp_lz4; + +#endif /* _ZCOMP_LZ4_H_ */ From c7bb5623e03444502f37821e1a391ef61828d4c3 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:19 -0700 Subject: [PATCH 48/82] zram: move comp allocation out of init_lock While fixing lockdep spew of ->init_lock reported by Sasha Levin [1], Minchan Kim noted [2] that it's better to move compression backend allocation (using GFP_KERNEL) out of the ->init_lock lock, same way as with zram_meta_alloc(), in order to prevent the same lockdep spew. [1] https://lkml.org/lkml/2014/2/27/337 [2] https://lkml.org/lkml/2014/3/3/32 Signed-off-by: Sergey Senozhatsky Reported-by: Minchan Kim Acked-by: Minchan Kim Cc: Sasha Levin Acked-by: Jerome Marchand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit d61f98c70e8b0d324e8e83be2ed546d6295e63f3) Signed-off-by: Alex Shi --- drivers/block/zram/zram_drv.c | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index f15564e65d87..fe0daa9fe59e 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -579,9 +579,10 @@ static ssize_t disksize_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { u64 disksize; + struct zcomp *comp; struct zram_meta *meta; struct zram *zram = dev_to_zram(dev); - int err; + int err = -EINVAL; disksize = memparse(buf, NULL); if (!disksize) @@ -592,30 +593,32 @@ static ssize_t disksize_store(struct device *dev, if (!meta) return -ENOMEM; - down_write(&zram->init_lock); - if (init_done(zram)) { - pr_info("Cannot change disksize for initialized device\n"); - err = -EBUSY; - goto out_free_meta; - } - - zram->comp = zcomp_create(zram->compressor, zram->max_comp_streams); - if (!zram->comp) { + comp = zcomp_create(zram->compressor,
zram->max_comp_streams); + if (!comp) { pr_info("Cannot initialise %s compressing backend\n", zram->compressor); - err = -EINVAL; - goto out_free_meta; + goto out_cleanup; + } + + down_write(&zram->init_lock); + if (init_done(zram)) { + up_write(&zram->init_lock); + pr_info("Cannot change disksize for initialized device\n"); + err = -EBUSY; + goto out_cleanup; } zram->meta = meta; + zram->comp = comp; zram->disksize = disksize; set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT); up_write(&zram->init_lock); return len; -out_free_meta: - up_write(&zram->init_lock); +out_cleanup: + if (comp) + zcomp_destroy(comp); zram_meta_free(meta); return err; } From 6fef1131ae139299f9a45924ef866ede77c5c500 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:20 -0700 Subject: [PATCH 49/82] zram: return error-valued pointer from zcomp_create() Instead of returning just NULL, return ERR_PTR from zcomp_create() if compressing backend creation has failed. ERR_PTR(-EINVAL) for unsupported compression algorithm request, ERR_PTR(-ENOMEM) for allocation (zcomp or compression stream) error. Perform IS_ERR() check of returned from zcomp_create() value in disksize_store() and set return code to PTR_ERR(). Change suggested by Jerome Marchand. 
[akpm@linux-foundation.org: clean up error recovery flow] Signed-off-by: Sergey Senozhatsky Reported-by: Jerome Marchand Cc: Minchan Kim Cc: Nitin Gupta Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit fcfa8d95cacf5cbbe6dee6b8d229fe86142266e0) Signed-off-by: Alex Shi --- drivers/block/zram/zcomp.c | 14 ++++++++------ drivers/block/zram/zram_drv.c | 19 ++++++++++--------- 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c index d5919031ca8b..5647d8fe1dc1 100644 --- a/drivers/block/zram/zcomp.c +++ b/drivers/block/zram/zcomp.c @@ -9,6 +9,7 @@ #include #include +#include #include #include #include @@ -319,9 +320,10 @@ void zcomp_destroy(struct zcomp *comp) /* * search available compressors for requested algorithm. - * allocate new zcomp and initialize it. return NULL - * if requested algorithm is not supported or in case - * of init error + * allocate new zcomp and initialize it. return compressing + * backend pointer or ERR_PTR if things went bad. ERR_PTR(-EINVAL) + * if requested algorithm is not supported, ERR_PTR(-ENOMEM) in + * case of allocation error. 
*/ struct zcomp *zcomp_create(const char *compress, int max_strm) { @@ -330,11 +332,11 @@ struct zcomp *zcomp_create(const char *compress, int max_strm) backend = find_backend(compress); if (!backend) - return NULL; + return ERR_PTR(-EINVAL); comp = kzalloc(sizeof(struct zcomp), GFP_KERNEL); if (!comp) - return NULL; + return ERR_PTR(-ENOMEM); comp->backend = backend; if (max_strm > 1) @@ -343,7 +345,7 @@ struct zcomp *zcomp_create(const char *compress, int max_strm) zcomp_strm_single_create(comp); if (!comp->stream) { kfree(comp); - return NULL; + return ERR_PTR(-ENOMEM); } return comp; } diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index fe0daa9fe59e..407f541b26a1 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -31,6 +31,7 @@ #include #include #include +#include #include "zram_drv.h" @@ -582,7 +583,7 @@ static ssize_t disksize_store(struct device *dev, struct zcomp *comp; struct zram_meta *meta; struct zram *zram = dev_to_zram(dev); - int err = -EINVAL; + int err; disksize = memparse(buf, NULL); if (!disksize) @@ -594,18 +595,18 @@ static ssize_t disksize_store(struct device *dev, return -ENOMEM; comp = zcomp_create(zram->compressor, zram->max_comp_streams); - if (!comp) { + if (IS_ERR(comp)) { pr_info("Cannot initialise %s compressing backend\n", zram->compressor); - goto out_cleanup; + err = PTR_ERR(comp); + goto out_free_meta; } down_write(&zram->init_lock); if (init_done(zram)) { - up_write(&zram->init_lock); pr_info("Cannot change disksize for initialized device\n"); err = -EBUSY; - goto out_cleanup; + goto out_destroy_comp; } zram->meta = meta; @@ -613,12 +614,12 @@ static ssize_t disksize_store(struct device *dev, zram->disksize = disksize; set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT); up_write(&zram->init_lock); - return len; -out_cleanup: - if (comp) - zcomp_destroy(comp); +out_destroy_comp: + up_write(&zram->init_lock); + zcomp_destroy(comp); +out_free_meta: 
zram_meta_free(meta); return err; } From 1b2dc1d89b1a5bff45782c9ddb6e0d384c7c6420 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 7 Apr 2014 15:38:21 -0700 Subject: [PATCH 50/82] zram: propagate error to user When we initialized zcomp with single, we couldn't change max_comp_streams without zram reset but current interface doesn't show any error to user and even it changes max_comp_streams's value without any effect so it would make user very confusing. This patch prevents max_comp_streams's change when zcomp was initialized as single zcomp and emit the error to user(ex, echo). [akpm@linux-foundation.org: don't return with the lock held, per Sergey] [fengguang.wu@intel.com: fix coccinelle warnings] Signed-off-by: Minchan Kim Cc: Nitin Gupta Cc: Jerome Marchand Acked-by: Sergey Senozhatsky Signed-off-by: Fengguang Wu Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 60a726e33375a1096e85399cfa1327081b4c38be) Signed-off-by: Alex Shi --- Documentation/blockdev/zram.txt | 9 +++++---- drivers/block/zram/zcomp.c | 10 +++++----- drivers/block/zram/zcomp.h | 4 ++-- drivers/block/zram/zram_drv.c | 17 +++++++++++++---- 4 files changed, 25 insertions(+), 15 deletions(-) diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt index 4ab2ce98f63c..2db1687a4b10 100644 --- a/Documentation/blockdev/zram.txt +++ b/Documentation/blockdev/zram.txt @@ -37,10 +37,11 @@ Note: In order to enable compression backend's multi stream support max_comp_streams must be initially set to desired concurrency level before ZRAM device initialisation. Once the device initialised as a single stream compression -backend (max_comp_streams equals to 0) changing the value of max_comp_streams -will not take any effect, because single stream compression backend implemented -as a special case and does not support dynamic max_comp_streams. Only multi -stream backend supports dynamic max_comp_streams adjustment. 
+backend (max_comp_streams equals to 1), you will see error if you try to change +the value of max_comp_streams because single stream compression backend +implemented as a special case by lock overhead issue and does not support +dynamic max_comp_streams. Only multi stream backend supports dynamic +max_comp_streams adjustment. 3) Select compression algorithm Using comp_algorithm device attribute one can see available and diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c index 5647d8fe1dc1..b0e7592c44d8 100644 --- a/drivers/block/zram/zcomp.c +++ b/drivers/block/zram/zcomp.c @@ -153,7 +153,7 @@ static void zcomp_strm_multi_release(struct zcomp *comp, struct zcomp_strm *zstr } /* change max_strm limit */ -static int zcomp_strm_multi_set_max_streams(struct zcomp *comp, int num_strm) +static bool zcomp_strm_multi_set_max_streams(struct zcomp *comp, int num_strm) { struct zcomp_strm_multi *zs = comp->stream; struct zcomp_strm *zstrm; @@ -172,7 +172,7 @@ static int zcomp_strm_multi_set_max_streams(struct zcomp *comp, int num_strm) zs->avail_strm--; } spin_unlock(&zs->strm_lock); - return 0; + return true; } static void zcomp_strm_multi_destroy(struct zcomp *comp) @@ -232,10 +232,10 @@ static void zcomp_strm_single_release(struct zcomp *comp, mutex_unlock(&zs->strm_lock); } -static int zcomp_strm_single_set_max_streams(struct zcomp *comp, int num_strm) +static bool zcomp_strm_single_set_max_streams(struct zcomp *comp, int num_strm) { /* zcomp_strm_single support only max_comp_streams == 1 */ - return -ENOTSUPP; + return false; } static void zcomp_strm_single_destroy(struct zcomp *comp) @@ -284,7 +284,7 @@ ssize_t zcomp_available_show(const char *comp, char *buf) return sz; } -int zcomp_set_max_streams(struct zcomp *comp, int num_strm) +bool zcomp_set_max_streams(struct zcomp *comp, int num_strm) { return comp->set_max_streams(comp, num_strm); } diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h index 8b8997f8613b..c59d1fca72c0 100644 
--- a/drivers/block/zram/zcomp.h +++ b/drivers/block/zram/zcomp.h @@ -46,7 +46,7 @@ struct zcomp { struct zcomp_strm *(*strm_find)(struct zcomp *comp); void (*strm_release)(struct zcomp *comp, struct zcomp_strm *zstrm); - int (*set_max_streams)(struct zcomp *comp, int num_strm); + bool (*set_max_streams)(struct zcomp *comp, int num_strm); void (*destroy)(struct zcomp *comp); }; @@ -64,5 +64,5 @@ int zcomp_compress(struct zcomp *comp, struct zcomp_strm *zstrm, int zcomp_decompress(struct zcomp *comp, const unsigned char *src, size_t src_len, unsigned char *dst); -int zcomp_set_max_streams(struct zcomp *comp, int num_strm); +bool zcomp_set_max_streams(struct zcomp *comp, int num_strm); #endif /* _ZCOMP_H_ */ diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 407f541b26a1..27da5967b57b 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -127,19 +127,28 @@ static ssize_t max_comp_streams_store(struct device *dev, { int num; struct zram *zram = dev_to_zram(dev); + int ret; - if (kstrtoint(buf, 0, &num)) - return -EINVAL; + ret = kstrtoint(buf, 0, &num); + if (ret < 0) + return ret; if (num < 1) return -EINVAL; + down_write(&zram->init_lock); if (init_done(zram)) { - if (zcomp_set_max_streams(zram->comp, num)) + if (!zcomp_set_max_streams(zram->comp, num)) { pr_info("Cannot change max compression streams\n"); + ret = -EINVAL; + goto out; + } } + zram->max_comp_streams = num; + ret = len; +out: up_write(&zram->init_lock); - return len; + return ret; } static ssize_t comp_algorithm_show(struct device *dev, From 575ef7bc322df6e0384ef661f510928bfa1dab81 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 7 Apr 2014 15:38:22 -0700 Subject: [PATCH 51/82] zram: use scnprintf() in attrs show() methods sysfs.txt documentation lists the following requirements: - The buffer will always be PAGE_SIZE bytes in length. On i386, this is 4096. - show() methods should return the number of bytes printed into the buffer. 
This is the return value of scnprintf(). - show() should always use scnprintf(). Use scnprintf() in show() functions. Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Jerome Marchand Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 56b4e8cb85827a2ccc4752a2a7148e56b62b7e96) Signed-off-by: Alex Shi --- drivers/block/zram/zcomp.c | 8 +++++--- drivers/block/zram/zram_drv.c | 12 ++++++------ 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c index b0e7592c44d8..f1ff39a3d1c1 100644 --- a/drivers/block/zram/zcomp.c +++ b/drivers/block/zram/zcomp.c @@ -275,12 +275,14 @@ ssize_t zcomp_available_show(const char *comp, char *buf) while (backends[i]) { if (sysfs_streq(comp, backends[i]->name)) - sz += sprintf(buf + sz, "[%s] ", backends[i]->name); + sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2, + "[%s] ", backends[i]->name); else - sz += sprintf(buf + sz, "%s ", backends[i]->name); + sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2, + "%s ", backends[i]->name); i++; } - sz += sprintf(buf + sz, "\n"); + sz += scnprintf(buf + sz, PAGE_SIZE - sz, "\n"); return sz; } diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 27da5967b57b..031598bc14b4 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -48,7 +48,7 @@ static ssize_t zram_attr_##name##_show(struct device *d, \ struct device_attribute *attr, char *b) \ { \ struct zram *zram = dev_to_zram(d); \ - return sprintf(b, "%llu\n", \ + return scnprintf(b, PAGE_SIZE, "%llu\n", \ (u64)atomic64_read(&zram->stats.name)); \ } \ static struct device_attribute dev_attr_##name = \ @@ -69,7 +69,7 @@ static ssize_t disksize_show(struct device *dev, { struct zram *zram = dev_to_zram(dev); - return sprintf(buf, "%llu\n", zram->disksize); + return scnprintf(buf, PAGE_SIZE, "%llu\n", zram->disksize); } static ssize_t initstate_show(struct device *dev, @@ -82,7 
+82,7 @@ static ssize_t initstate_show(struct device *dev, val = init_done(zram); up_read(&zram->init_lock); - return sprintf(buf, "%u\n", val); + return scnprintf(buf, PAGE_SIZE, "%u\n", val); } static ssize_t orig_data_size_show(struct device *dev, @@ -90,7 +90,7 @@ static ssize_t orig_data_size_show(struct device *dev, { struct zram *zram = dev_to_zram(dev); - return sprintf(buf, "%llu\n", + return scnprintf(buf, PAGE_SIZE, "%llu\n", (u64)(atomic64_read(&zram->stats.pages_stored)) << PAGE_SHIFT); } @@ -106,7 +106,7 @@ static ssize_t mem_used_total_show(struct device *dev, val = zs_get_total_size_bytes(meta->mem_pool); up_read(&zram->init_lock); - return sprintf(buf, "%llu\n", val); + return scnprintf(buf, PAGE_SIZE, "%llu\n", val); } static ssize_t max_comp_streams_show(struct device *dev, @@ -119,7 +119,7 @@ static ssize_t max_comp_streams_show(struct device *dev, val = zram->max_comp_streams; up_read(&zram->init_lock); - return sprintf(buf, "%d\n", val); + return scnprintf(buf, PAGE_SIZE, "%d\n", val); } static ssize_t max_comp_streams_store(struct device *dev, From ba6d7663fa3c9b310f33b9b8a18743af8b3727c9 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Mon, 7 Apr 2014 15:38:24 -0700 Subject: [PATCH 52/82] zram: support REQ_DISCARD zram is ram based block device and can be used by backend of filesystem. When filesystem deletes a file, it normally doesn't do anything on data block of that file. It just marks on metadata of that file. This behavior has no problem on disk based block device, but has problems on ram based block device, since we can't free memory used for data block. To overcome this disadvantage, there is REQ_DISCARD functionality. If block device support REQ_DISCARD and filesystem is mounted with discard option, filesystem sends REQ_DISCARD to block device whenever some data blocks are discarded. All we have to do is to handle this request. This patch implements to flag up QUEUE_FLAG_DISCARD and handle this REQ_DISCARD request. 
With it, we can free memory used by zram if it isn't used. [akpm@linux-foundation.org: tweak comments] Signed-off-by: Joonsoo Kim Cc: Minchan Kim Cc: Nitin Gupta Cc: Sergey Senozhatsky Cc: Jerome Marchand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit f4659d8e620d08bd1a84a8aec5d2f5294a242764) Signed-off-by: Alex Shi Conflicts: drivers/block/zram/zram_drv.c Conflicts solution: keep use old bio struct, and bio_for_each_segment() --- drivers/block/zram/zram_drv.c | 62 +++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 031598bc14b4..19cf51ad48ef 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -550,6 +550,47 @@ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, return ret; } +/* + * zram_bio_discard - handler on discard request + * @index: physical block index in PAGE_SIZE units + * @offset: byte offset within physical block + */ +static void zram_bio_discard(struct zram *zram, u32 index, + int offset, struct bio *bio) +{ + size_t n = bio->bi_size; + + /* + * zram manages data in physical block size units. Because logical block + * size isn't identical with physical block size on some arch, we + * could get a discard request pointing to a specific offset within a + * certain physical block. Although we can handle this request by + * reading that physiclal block and decompressing and partially zeroing + * and re-compressing and then re-storing it, this isn't reasonable + * because our intent with a discard request is to save memory. So + * skipping this logical block is appropriate here. + */ + if (offset) { + if (n < offset) + return; + + n -= offset; + index++; + } + + while (n >= PAGE_SIZE) { + /* + * Discard request can be large so the lock hold times could be + * lengthy. So take the lock once per page. 
+ */ + write_lock(&zram->meta->tb_lock); + zram_free_page(zram, index); + write_unlock(&zram->meta->tb_lock); + index++; + n -= PAGE_SIZE; + } +} + static void zram_reset_device(struct zram *zram, bool reset_capacity) { size_t index; @@ -684,6 +725,12 @@ static void __zram_make_request(struct zram *zram, struct bio *bio) index = bio->bi_sector >> SECTORS_PER_PAGE_SHIFT; offset = (bio->bi_sector & (SECTORS_PER_PAGE - 1)) << SECTOR_SHIFT; + if (unlikely(bio->bi_rw & REQ_DISCARD)) { + zram_bio_discard(zram, index, offset, bio); + bio_endio(bio, 0); + return; + } + bio_for_each_segment(bvec, bio, i) { int max_transfer_size = PAGE_SIZE - offset; @@ -853,6 +900,21 @@ static int create_device(struct zram *zram, int device_id) ZRAM_LOGICAL_BLOCK_SIZE); blk_queue_io_min(zram->disk->queue, PAGE_SIZE); blk_queue_io_opt(zram->disk->queue, PAGE_SIZE); + zram->disk->queue->limits.discard_granularity = PAGE_SIZE; + zram->disk->queue->limits.max_discard_sectors = UINT_MAX; + /* + * zram_bio_discard() will clear all logical blocks if logical block + * size is identical with physical block size(PAGE_SIZE). But if it is + * different, we will skip discarding some parts of logical blocks in + * the part of the request range which isn't aligned to physical block + * size. So we can't ensure that all discarded logical blocks are + * zeroed. + */ + if (ZRAM_LOGICAL_BLOCK_SIZE == PAGE_SIZE) + zram->disk->queue->limits.discard_zeroes_data = 1; + else + zram->disk->queue->limits.discard_zeroes_data = 0; + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, zram->disk->queue); add_disk(zram->disk); From c14a95869402ecb41d86ffac66723575bf15f21c Mon Sep 17 00:00:00 2001 From: Kyungsik Lee Date: Mon, 8 Jul 2013 16:01:45 -0700 Subject: [PATCH 53/82] decompressor: add LZ4 decompressor module Add support for LZ4 decompression in the Linux Kernel. LZ4 Decompression APIs for kernel are based on LZ4 implementation by Yann Collet. Benchmark Results(PATCH v3) Compiler: Linaro ARM gcc 4.6.2 1. 
ARMv7, 1.5GHz based board Kernel: linux 3.4 Uncompressed Kernel Size: 14MB Compressed Size Decompression Speed LZO 6.7MB 20.1MB/s, 25.2MB/s(UA) LZ4 7.3MB 29.1MB/s, 45.6MB/s(UA) 2. ARMv7, 1.7GHz based board Kernel: linux 3.7 Uncompressed Kernel Size: 14MB Compressed Size Decompression Speed LZO 6.0MB 34.1MB/s, 52.2MB/s(UA) LZ4 6.5MB 86.7MB/s - UA: Unaligned memory Access support - Latest patch set for LZO applied This patch set is for adding support for LZ4-compressed Kernel. LZ4 is a very fast lossless compression algorithm and it also features an extremely fast decoder [1]. But we have five decompressors already, and the question that arises is where we stop adding new ones. This issue has been discussed and a conclusion was reached [2]. Russell King said that we should have: - one decompressor which is the fastest - one decompressor for the highest compression ratio - one popular decompressor (eg conventional gzip) If we have a replacement one for one of these, then it should do exactly that: replace it. The benchmark shows an 8% increase in image size vs a 66% increase in decompression speed compared to LZO (which has been known as the fastest decompressor in the Kernel). Therefore the "fast but may not be small" compression title has clearly been taken by LZ4 [3]. [1] http://code.google.com/p/lz4/ [2] http://thread.gmane.org/gmane.linux.kbuild.devel/9157 [3] http://thread.gmane.org/gmane.linux.kbuild.devel/9347 LZ4 homepage: http://fastcompression.blogspot.com/p/lz4.html LZ4 source repository: http://code.google.com/p/lz4/ Signed-off-by: Kyungsik Lee Signed-off-by: Yann Collet Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Russell King Cc: Borislav Petkov Cc: Florian Fainelli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit cffb78b0e0b3a30b059b27a1d97500cf6464efa9) Signed-off-by: Alex Shi --- include/linux/lz4.h | 51 ++++++ lib/lz4/lz4_decompress.c | 326 +++++++++++++++++++++++++++++++++++++++ lib/lz4/lz4defs.h | 94 +++++++++++ 3 files changed, 471 insertions(+) create mode 100644 include/linux/lz4.h create mode 100644 lib/lz4/lz4_decompress.c create mode 100644 lib/lz4/lz4defs.h diff --git a/include/linux/lz4.h b/include/linux/lz4.h new file mode 100644 index 000000000000..7f6c75a093f8 --- /dev/null +++ b/include/linux/lz4.h @@ -0,0 +1,51 @@ +#ifndef __LZ4_H__ +#define __LZ4_H__ +/* + * LZ4 Kernel Interface + * + * Copyright (C) 2013, LG Electronics, Kyungsik Lee + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/* + * lz4_compressbound() + * Provides the maximum size that LZ4 may output in a "worst case" scenario + * (input data not compressible) + */ +static inline size_t lz4_compressbound(size_t isize) +{ + return isize + (isize / 255) + 16; +} + +/* + * lz4_decompress() + * src : source address of the compressed data + * src_len : is the input size, whcih is returned after decompress done + * dest : output buffer address of the decompressed data + * actual_dest_len: is the size of uncompressed data, supposing it's known + * return : Success if return 0 + * Error if return (< 0) + * note : Destination buffer must be already allocated. 
+ * slightly faster than lz4_decompress_unknownoutputsize() + */ +int lz4_decompress(const char *src, size_t *src_len, char *dest, + size_t actual_dest_len); + +/* + * lz4_decompress_unknownoutputsize() + * src : source address of the compressed data + * src_len : is the input size, therefore the compressed size + * dest : output buffer address of the decompressed data + * dest_len: is the max size of the destination buffer, which is + * returned with actual size of decompressed data after + * decompress done + * return : Success if return 0 + * Error if return (< 0) + * note : Destination buffer must be already allocated. + */ +int lz4_decompress_unknownoutputsize(const char *src, size_t src_len, + char *dest, size_t *dest_len); +#endif diff --git a/lib/lz4/lz4_decompress.c b/lib/lz4/lz4_decompress.c new file mode 100644 index 000000000000..dcc89753af65 --- /dev/null +++ b/lib/lz4/lz4_decompress.c @@ -0,0 +1,326 @@ +/* + * LZ4 Decompressor for Linux kernel + * + * Copyright (C) 2013 LG Electronics Co., Ltd. (http://www.lge.com/) + * + * Based on LZ4 implementation by Yann Collet. + * + * LZ4 - Fast LZ compression algorithm + * Copyright (C) 2011-2012, Yann Collet. + * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * You can contact the author at : + * - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html + * - LZ4 source repository : http://code.google.com/p/lz4/ + */ + +#ifndef STATIC +#include +#include +#endif +#include + +#include + +#include "lz4defs.h" + +static int lz4_uncompress(const char *source, char *dest, int osize) +{ + const BYTE *ip = (const BYTE *) source; + const BYTE *ref; + BYTE *op = (BYTE *) dest; + BYTE * const oend = op + osize; + BYTE *cpy; + unsigned token; + size_t length; + size_t dec32table[] = {0, 3, 2, 3, 0, 0, 0, 0}; +#if LZ4_ARCH64 + size_t dec64table[] = {0, 0, 0, -1, 0, 1, 2, 3}; +#endif + + while (1) { + + /* get runlength */ + token = *ip++; + length = (token >> ML_BITS); + if (length == RUN_MASK) { + size_t len; + + len = *ip++; + for (; len == 255; length += 255) + len = *ip++; + length += len; + } + + /* copy literals */ + cpy = op + length; + if (unlikely(cpy > oend - COPYLENGTH)) { + /* + * Error: not enough place for another match + * (min 4) + 5 literals + */ + if (cpy != oend) + goto _output_error; + + memcpy(op, ip, length); + ip += length; + break; /* EOF */ + } + LZ4_WILDCOPY(ip, op, cpy); + ip -= (op - cpy); + op = cpy; + + 
/* get offset */ + LZ4_READ_LITTLEENDIAN_16(ref, cpy, ip); + ip += 2; + + /* Error: offset create reference outside destination buffer */ + if (unlikely(ref < (BYTE *const) dest)) + goto _output_error; + + /* get matchlength */ + length = token & ML_MASK; + if (length == ML_MASK) { + for (; *ip == 255; length += 255) + ip++; + length += *ip++; + } + + /* copy repeated sequence */ + if (unlikely((op - ref) < STEPSIZE)) { +#if LZ4_ARCH64 + size_t dec64 = dec64table[op - ref]; +#else + const int dec64 = 0; +#endif + op[0] = ref[0]; + op[1] = ref[1]; + op[2] = ref[2]; + op[3] = ref[3]; + op += 4; + ref += 4; + ref -= dec32table[op-ref]; + PUT4(ref, op); + op += STEPSIZE - 4; + ref -= dec64; + } else { + LZ4_COPYSTEP(ref, op); + } + cpy = op + length - (STEPSIZE - 4); + if (cpy > (oend - COPYLENGTH)) { + + /* Error: request to write beyond destination buffer */ + if (cpy > oend) + goto _output_error; + LZ4_SECURECOPY(ref, op, (oend - COPYLENGTH)); + while (op < cpy) + *op++ = *ref++; + op = cpy; + /* + * Check EOF (should never happen, since last 5 bytes + * are supposed to be literals) + */ + if (op == oend) + goto _output_error; + continue; + } + LZ4_SECURECOPY(ref, op, cpy); + op = cpy; /* correction */ + } + /* end of decoding */ + return (int) (((char *)ip) - source); + + /* write overflow error detected */ +_output_error: + return (int) (-(((char *)ip) - source)); +} + +static int lz4_uncompress_unknownoutputsize(const char *source, char *dest, + int isize, size_t maxoutputsize) +{ + const BYTE *ip = (const BYTE *) source; + const BYTE *const iend = ip + isize; + const BYTE *ref; + + + BYTE *op = (BYTE *) dest; + BYTE * const oend = op + maxoutputsize; + BYTE *cpy; + + size_t dec32table[] = {0, 3, 2, 3, 0, 0, 0, 0}; +#if LZ4_ARCH64 + size_t dec64table[] = {0, 0, 0, -1, 0, 1, 2, 3}; +#endif + + /* Main Loop */ + while (ip < iend) { + + unsigned token; + size_t length; + + /* get runlength */ + token = *ip++; + length = (token >> ML_BITS); + if (length == RUN_MASK) 
{ + int s = 255; + while ((ip < iend) && (s == 255)) { + s = *ip++; + length += s; + } + } + /* copy literals */ + cpy = op + length; + if ((cpy > oend - COPYLENGTH) || + (ip + length > iend - COPYLENGTH)) { + + if (cpy > oend) + goto _output_error;/* writes beyond buffer */ + + if (ip + length != iend) + goto _output_error;/* + * Error: LZ4 format requires + * to consume all input + * at this stage + */ + memcpy(op, ip, length); + op += length; + break;/* Necessarily EOF, due to parsing restrictions */ + } + LZ4_WILDCOPY(ip, op, cpy); + ip -= (op - cpy); + op = cpy; + + /* get offset */ + LZ4_READ_LITTLEENDIAN_16(ref, cpy, ip); + ip += 2; + if (ref < (BYTE * const) dest) + goto _output_error; + /* + * Error : offset creates reference + * outside of destination buffer + */ + + /* get matchlength */ + length = (token & ML_MASK); + if (length == ML_MASK) { + while (ip < iend) { + int s = *ip++; + length += s; + if (s == 255) + continue; + break; + } + } + + /* copy repeated sequence */ + if (unlikely((op - ref) < STEPSIZE)) { +#if LZ4_ARCH64 + size_t dec64 = dec64table[op - ref]; +#else + const int dec64 = 0; +#endif + op[0] = ref[0]; + op[1] = ref[1]; + op[2] = ref[2]; + op[3] = ref[3]; + op += 4; + ref += 4; + ref -= dec32table[op - ref]; + PUT4(ref, op); + op += STEPSIZE - 4; + ref -= dec64; + } else { + LZ4_COPYSTEP(ref, op); + } + cpy = op + length - (STEPSIZE-4); + if (cpy > oend - COPYLENGTH) { + if (cpy > oend) + goto _output_error; /* write outside of buf */ + + LZ4_SECURECOPY(ref, op, (oend - COPYLENGTH)); + while (op < cpy) + *op++ = *ref++; + op = cpy; + /* + * Check EOF (should never happen, since last 5 bytes + * are supposed to be literals) + */ + if (op == oend) + goto _output_error; + continue; + } + LZ4_SECURECOPY(ref, op, cpy); + op = cpy; /* correction */ + } + /* end of decoding */ + return (int) (((char *) op) - dest); + + /* write overflow error detected */ +_output_error: + return (int) (-(((char *) ip) - source)); +} + +int 
lz4_decompress(const char *src, size_t *src_len, char *dest, + size_t actual_dest_len) +{ + int ret = -1; + int input_len = 0; + + input_len = lz4_uncompress(src, dest, actual_dest_len); + if (input_len < 0) + goto exit_0; + *src_len = input_len; + + return 0; +exit_0: + return ret; +} +#ifndef STATIC +EXPORT_SYMBOL_GPL(lz4_decompress); +#endif + +int lz4_decompress_unknownoutputsize(const char *src, size_t src_len, + char *dest, size_t *dest_len) +{ + int ret = -1; + int out_len = 0; + + out_len = lz4_uncompress_unknownoutputsize(src, dest, src_len, + *dest_len); + if (out_len < 0) + goto exit_0; + *dest_len = out_len; + + return 0; +exit_0: + return ret; +} +#ifndef STATIC +EXPORT_SYMBOL_GPL(lz4_decompress_unknownoutputsize); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("LZ4 Decompressor"); +#endif diff --git a/lib/lz4/lz4defs.h b/lib/lz4/lz4defs.h new file mode 100644 index 000000000000..43ac31d63f36 --- /dev/null +++ b/lib/lz4/lz4defs.h @@ -0,0 +1,94 @@ +/* + * lz4defs.h -- architecture specific defines + * + * Copyright (C) 2013, LG Electronics, Kyungsik Lee + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +/* + * Detects 64 bits mode + */ +#if (defined(__x86_64__) || defined(__x86_64) || defined(__amd64__) \ + || defined(__ppc64__) || defined(__LP64__)) +#define LZ4_ARCH64 1 +#else +#define LZ4_ARCH64 0 +#endif + +/* + * Architecture-specific macros + */ +#define BYTE u8 +#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) \ + || defined(CONFIG_ARM) && __LINUX_ARM_ARCH__ >= 6 \ + && defined(ARM_EFFICIENT_UNALIGNED_ACCESS) +typedef struct _U32_S { u32 v; } U32_S; +typedef struct _U64_S { u64 v; } U64_S; + +#define A32(x) (((U32_S *)(x))->v) +#define A64(x) (((U64_S *)(x))->v) + +#define PUT4(s, d) (A32(d) = A32(s)) +#define PUT8(s, d) (A64(d) = A64(s)) +#else /* CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS */ + +#define PUT4(s, d) \ + put_unaligned(get_unaligned((const u32 *) s), (u32 *) d) +#define PUT8(s, d) \ + put_unaligned(get_unaligned((const u64 *) s), (u64 *) d) +#endif + +#define COPYLENGTH 8 +#define ML_BITS 4 +#define ML_MASK ((1U << ML_BITS) - 1) +#define RUN_BITS (8 - ML_BITS) +#define RUN_MASK ((1U << RUN_BITS) - 1) + +#if LZ4_ARCH64/* 64-bit */ +#define STEPSIZE 8 + +#define LZ4_COPYSTEP(s, d) \ + do { \ + PUT8(s, d); \ + d += 8; \ + s += 8; \ + } while (0) + +#define LZ4_COPYPACKET(s, d) LZ4_COPYSTEP(s, d) + +#define LZ4_SECURECOPY(s, d, e) \ + do { \ + if (d < e) { \ + LZ4_WILDCOPY(s, d, e); \ + } \ + } while (0) + +#else /* 32-bit */ +#define STEPSIZE 4 + +#define LZ4_COPYSTEP(s, d) \ + do { \ + PUT4(s, d); \ + d += 4; \ + s += 4; \ + } while (0) + +#define LZ4_COPYPACKET(s, d) \ + do { \ + LZ4_COPYSTEP(s, d); \ + LZ4_COPYSTEP(s, d); \ + } while (0) + +#define LZ4_SECURECOPY LZ4_WILDCOPY +#endif + +#define LZ4_READ_LITTLEENDIAN_16(d, s, p) \ + (d = s - get_unaligned_le16(p)) + +#define LZ4_WILDCOPY(s, d, e) \ + do { \ + LZ4_COPYPACKET(s, d); \ + } while (d < e) From 2da344e89ce0d5827a3eb3f306d63a62f0e6b03e Mon Sep 17 00:00:00 2001 From: Chanho Min Date: Mon, 8 Jul 2013 16:01:49 -0700 Subject: [PATCH 54/82] lib: add lz4 compressor module This 
patchset is for supporting LZ4 compression and the crypto API using it. As shown below, the size of the compressed data is a little bit bigger, but the compression speed is faster when unaligned memory access is enabled. We can use lz4 de/compression through the crypto API as well. Also, it will be useful for other potential users of lz4 compression. lz4 Compression Benchmark: Compiler: ARM gcc 4.6.4 ARMv7, 1 GHz based board Kernel: linux 3.4 Uncompressed data Size: 101 MB Compressed Size compression Speed LZO 72.1MB 32.1MB/s, 33.0MB/s(UA) LZ4 75.1MB 30.4MB/s, 35.9MB/s(UA) LZ4HC 59.8MB 2.4MB/s, 2.5MB/s(UA) - UA: Unaligned memory Access support - Latest patch set for LZO applied This patch: Add support for LZ4 compression in the Linux Kernel. LZ4 Compression APIs for kernel are based on LZ4 implementation by Yann Collet and were changed for kernel coding style. LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html LZ4 source repository : http://code.google.com/p/lz4/ svn revision : r90 Two APIs are added: lz4_compress() supports basic lz4 compression, whereas lz4hc_compress() supports high compression, where CPU performance gets lower but the compression ratio gets higher. Also, we require pre-allocated working memory of the defined size, and the destination buffer must be allocated with the size given by lz4_compressbound(). [akpm@linux-foundation.org: make lz4_compresshcctx() static] Signed-off-by: Chanho Min Cc: "Darrick J.
Wong" Cc: Bob Pearson Cc: Richard Weinberger Cc: Herbert Xu Cc: Yann Collet Cc: Kyungsik Lee Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit c72ac7a1a926dbffb59daf0f275450e5eecce16f) Signed-off-by: Alex Shi --- include/linux/lz4.h | 36 +++ lib/Kconfig | 9 + lib/Makefile | 3 + lib/lz4/Makefile | 3 + lib/lz4/lz4_compress.c | 443 ++++++++++++++++++++++++++++++++ lib/lz4/lz4defs.h | 66 ++++- lib/lz4/lz4hc_compress.c | 539 +++++++++++++++++++++++++++++++++++++++ 7 files changed, 1097 insertions(+), 2 deletions(-) create mode 100644 lib/lz4/Makefile create mode 100644 lib/lz4/lz4_compress.c create mode 100644 lib/lz4/lz4hc_compress.c diff --git a/include/linux/lz4.h b/include/linux/lz4.h index 7f6c75a093f8..d21c13f10a64 100644 --- a/include/linux/lz4.h +++ b/include/linux/lz4.h @@ -9,6 +9,8 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ +#define LZ4_MEM_COMPRESS (4096 * sizeof(unsigned char *)) +#define LZ4HC_MEM_COMPRESS (65538 * sizeof(unsigned char *)) /* * lz4_compressbound() @@ -20,6 +22,40 @@ static inline size_t lz4_compressbound(size_t isize) return isize + (isize / 255) + 16; } +/* + * lz4_compress() + * src : source address of the original data + * src_len : size of the original data + * dst : output buffer address of the compressed data + * This requires 'dst' of size LZ4_COMPRESSBOUND. + * dst_len : is the output size, which is returned after compress done + * workmem : address of the working memory. + * This requires 'workmem' of size LZ4_MEM_COMPRESS. + * return : Success if return 0 + * Error if return (< 0) + * note : Destination buffer and workmem must be already allocated with + * the defined size. 
+ */ +int lz4_compress(const unsigned char *src, size_t src_len, + unsigned char *dst, size_t *dst_len, void *wrkmem); + + /* + * lz4hc_compress() + * src : source address of the original data + * src_len : size of the original data + * dst : output buffer address of the compressed data + * This requires 'dst' of size LZ4_COMPRESSBOUND. + * dst_len : is the output size, which is returned after compress done + * workmem : address of the working memory. + * This requires 'workmem' of size LZ4HC_MEM_COMPRESS. + * return : Success if return 0 + * Error if return (< 0) + * note : Destination buffer and workmem must be already allocated with + * the defined size. + */ +int lz4hc_compress(const unsigned char *src, size_t src_len, + unsigned char *dst, size_t *dst_len, void *wrkmem); + /* * lz4_decompress() * src : source address of the compressed data diff --git a/lib/Kconfig b/lib/Kconfig index fe01d418b09a..06d94d885877 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -189,6 +189,15 @@ config LZO_COMPRESS config LZO_DECOMPRESS tristate +config LZ4_COMPRESS + tristate + +config LZ4HC_COMPRESS + tristate + +config LZ4_DECOMPRESS + tristate + source "lib/xz/Kconfig" # diff --git a/lib/Makefile b/lib/Makefile index 9efe480b975e..b6d7848a526f 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -76,6 +76,9 @@ obj-$(CONFIG_REED_SOLOMON) += reed_solomon/ obj-$(CONFIG_BCH) += bch.o obj-$(CONFIG_LZO_COMPRESS) += lzo/ obj-$(CONFIG_LZO_DECOMPRESS) += lzo/ +obj-$(CONFIG_LZ4_COMPRESS) += lz4/ +obj-$(CONFIG_LZ4HC_COMPRESS) += lz4/ +obj-$(CONFIG_LZ4_DECOMPRESS) += lz4/ obj-$(CONFIG_XZ_DEC) += xz/ obj-$(CONFIG_RAID6_PQ) += raid6/ diff --git a/lib/lz4/Makefile b/lib/lz4/Makefile new file mode 100644 index 000000000000..8085d04e9309 --- /dev/null +++ b/lib/lz4/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_LZ4_COMPRESS) += lz4_compress.o +obj-$(CONFIG_LZ4HC_COMPRESS) += lz4hc_compress.o +obj-$(CONFIG_LZ4_DECOMPRESS) += lz4_decompress.o diff --git a/lib/lz4/lz4_compress.c b/lib/lz4/lz4_compress.c new 
file mode 100644 index 000000000000..fd94058bd7f9 --- /dev/null +++ b/lib/lz4/lz4_compress.c @@ -0,0 +1,443 @@ +/* + * LZ4 - Fast LZ compression algorithm + * Copyright (C) 2011-2012, Yann Collet. + * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * You can contact the author at : + * - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html + * - LZ4 source repository : http://code.google.com/p/lz4/ + * + * Changed for kernel use by: + * Chanho Min + */ + +#include +#include +#include +#include +#include "lz4defs.h" + +/* + * LZ4_compressCtx : + * ----------------- + * Compress 'isize' bytes from 'source' into an output buffer 'dest' of + * maximum size 'maxOutputSize'. * If it cannot achieve it, compression + * will stop, and result of the function will be zero. + * return : the number of bytes written in buffer 'dest', or 0 if the + * compression fails + */ +static inline int lz4_compressctx(void *ctx, + const char *source, + char *dest, + int isize, + int maxoutputsize) +{ + HTYPE *hashtable = (HTYPE *)ctx; + const u8 *ip = (u8 *)source; +#if LZ4_ARCH64 + const BYTE * const base = ip; +#else + const int base = 0; +#endif + const u8 *anchor = ip; + const u8 *const iend = ip + isize; + const u8 *const mflimit = iend - MFLIMIT; + #define MATCHLIMIT (iend - LASTLITERALS) + + u8 *op = (u8 *) dest; + u8 *const oend = op + maxoutputsize; + int length; + const int skipstrength = SKIPSTRENGTH; + u32 forwardh; + int lastrun; + + /* Init */ + if (isize < MINLENGTH) + goto _last_literals; + + memset((void *)hashtable, 0, LZ4_MEM_COMPRESS); + + /* First Byte */ + hashtable[LZ4_HASH_VALUE(ip)] = ip - base; + ip++; + forwardh = LZ4_HASH_VALUE(ip); + + /* Main Loop */ + for (;;) { + int findmatchattempts = (1U << skipstrength) + 3; + const u8 *forwardip = ip; + const u8 *ref; + u8 *token; + + /* Find a match */ + do { + u32 h = forwardh; + int step = findmatchattempts++ >> skipstrength; + ip = forwardip; + forwardip = ip + step; + + if (unlikely(forwardip > mflimit)) + goto _last_literals; + + forwardh = LZ4_HASH_VALUE(forwardip); + ref = base + hashtable[h]; + hashtable[h] = ip - base; + } while ((ref < ip - MAX_DISTANCE) || (A32(ref) != A32(ip))); + + /* Catch up */ + while ((ip > anchor) && (ref > (u8 
*)source) && + unlikely(ip[-1] == ref[-1])) { + ip--; + ref--; + } + + /* Encode Literal length */ + length = (int)(ip - anchor); + token = op++; + /* check output limit */ + if (unlikely(op + length + (2 + 1 + LASTLITERALS) + + (length >> 8) > oend)) + return 0; + + if (length >= (int)RUN_MASK) { + int len; + *token = (RUN_MASK << ML_BITS); + len = length - RUN_MASK; + for (; len > 254 ; len -= 255) + *op++ = 255; + *op++ = (u8)len; + } else + *token = (length << ML_BITS); + + /* Copy Literals */ + LZ4_BLINDCOPY(anchor, op, length); +_next_match: + /* Encode Offset */ + LZ4_WRITE_LITTLEENDIAN_16(op, (u16)(ip - ref)); + + /* Start Counting */ + ip += MINMATCH; + /* MinMatch verified */ + ref += MINMATCH; + anchor = ip; + while (likely(ip < MATCHLIMIT - (STEPSIZE - 1))) { + #if LZ4_ARCH64 + u64 diff = A64(ref) ^ A64(ip); + #else + u32 diff = A32(ref) ^ A32(ip); + #endif + if (!diff) { + ip += STEPSIZE; + ref += STEPSIZE; + continue; + } + ip += LZ4_NBCOMMONBYTES(diff); + goto _endcount; + } + #if LZ4_ARCH64 + if ((ip < (MATCHLIMIT - 3)) && (A32(ref) == A32(ip))) { + ip += 4; + ref += 4; + } + #endif + if ((ip < (MATCHLIMIT - 1)) && (A16(ref) == A16(ip))) { + ip += 2; + ref += 2; + } + if ((ip < MATCHLIMIT) && (*ref == *ip)) + ip++; +_endcount: + /* Encode MatchLength */ + length = (int)(ip - anchor); + /* Check output limit */ + if (unlikely(op + (1 + LASTLITERALS) + (length >> 8) > oend)) + return 0; + if (length >= (int)ML_MASK) { + *token += ML_MASK; + length -= ML_MASK; + for (; length > 509 ; length -= 510) { + *op++ = 255; + *op++ = 255; + } + if (length > 254) { + length -= 255; + *op++ = 255; + } + *op++ = (u8)length; + } else + *token += length; + + /* Test end of chunk */ + if (ip > mflimit) { + anchor = ip; + break; + } + + /* Fill table */ + hashtable[LZ4_HASH_VALUE(ip-2)] = ip - 2 - base; + + /* Test next position */ + ref = base + hashtable[LZ4_HASH_VALUE(ip)]; + hashtable[LZ4_HASH_VALUE(ip)] = ip - base; + if ((ref > ip - (MAX_DISTANCE + 1)) && 
(A32(ref) == A32(ip))) { + token = op++; + *token = 0; + goto _next_match; + } + + /* Prepare next loop */ + anchor = ip++; + forwardh = LZ4_HASH_VALUE(ip); + } + +_last_literals: + /* Encode Last Literals */ + lastrun = (int)(iend - anchor); + if (((char *)op - dest) + lastrun + 1 + + ((lastrun + 255 - RUN_MASK) / 255) > (u32)maxoutputsize) + return 0; + + if (lastrun >= (int)RUN_MASK) { + *op++ = (RUN_MASK << ML_BITS); + lastrun -= RUN_MASK; + for (; lastrun > 254 ; lastrun -= 255) + *op++ = 255; + *op++ = (u8)lastrun; + } else + *op++ = (lastrun << ML_BITS); + memcpy(op, anchor, iend - anchor); + op += iend - anchor; + + /* End */ + return (int)(((char *)op) - dest); +} + +static inline int lz4_compress64kctx(void *ctx, + const char *source, + char *dest, + int isize, + int maxoutputsize) +{ + u16 *hashtable = (u16 *)ctx; + const u8 *ip = (u8 *) source; + const u8 *anchor = ip; + const u8 *const base = ip; + const u8 *const iend = ip + isize; + const u8 *const mflimit = iend - MFLIMIT; + #define MATCHLIMIT (iend - LASTLITERALS) + + u8 *op = (u8 *) dest; + u8 *const oend = op + maxoutputsize; + int len, length; + const int skipstrength = SKIPSTRENGTH; + u32 forwardh; + int lastrun; + + /* Init */ + if (isize < MINLENGTH) + goto _last_literals; + + memset((void *)hashtable, 0, LZ4_MEM_COMPRESS); + + /* First Byte */ + ip++; + forwardh = LZ4_HASH64K_VALUE(ip); + + /* Main Loop */ + for (;;) { + int findmatchattempts = (1U << skipstrength) + 3; + const u8 *forwardip = ip; + const u8 *ref; + u8 *token; + + /* Find a match */ + do { + u32 h = forwardh; + int step = findmatchattempts++ >> skipstrength; + ip = forwardip; + forwardip = ip + step; + + if (forwardip > mflimit) + goto _last_literals; + + forwardh = LZ4_HASH64K_VALUE(forwardip); + ref = base + hashtable[h]; + hashtable[h] = (u16)(ip - base); + } while (A32(ref) != A32(ip)); + + /* Catch up */ + while ((ip > anchor) && (ref > (u8 *)source) + && (ip[-1] == ref[-1])) { + ip--; + ref--; + } + + /* Encode Literal 
length */ + length = (int)(ip - anchor); + token = op++; + /* Check output limit */ + if (unlikely(op + length + (2 + 1 + LASTLITERALS) + + (length >> 8) > oend)) + return 0; + if (length >= (int)RUN_MASK) { + *token = (RUN_MASK << ML_BITS); + len = length - RUN_MASK; + for (; len > 254 ; len -= 255) + *op++ = 255; + *op++ = (u8)len; + } else + *token = (length << ML_BITS); + + /* Copy Literals */ + LZ4_BLINDCOPY(anchor, op, length); + +_next_match: + /* Encode Offset */ + LZ4_WRITE_LITTLEENDIAN_16(op, (u16)(ip - ref)); + + /* Start Counting */ + ip += MINMATCH; + /* MinMatch verified */ + ref += MINMATCH; + anchor = ip; + + while (ip < MATCHLIMIT - (STEPSIZE - 1)) { + #if LZ4_ARCH64 + u64 diff = A64(ref) ^ A64(ip); + #else + u32 diff = A32(ref) ^ A32(ip); + #endif + + if (!diff) { + ip += STEPSIZE; + ref += STEPSIZE; + continue; + } + ip += LZ4_NBCOMMONBYTES(diff); + goto _endcount; + } + #if LZ4_ARCH64 + if ((ip < (MATCHLIMIT - 3)) && (A32(ref) == A32(ip))) { + ip += 4; + ref += 4; + } + #endif + if ((ip < (MATCHLIMIT - 1)) && (A16(ref) == A16(ip))) { + ip += 2; + ref += 2; + } + if ((ip < MATCHLIMIT) && (*ref == *ip)) + ip++; +_endcount: + + /* Encode MatchLength */ + len = (int)(ip - anchor); + /* Check output limit */ + if (unlikely(op + (1 + LASTLITERALS) + (len >> 8) > oend)) + return 0; + if (len >= (int)ML_MASK) { + *token += ML_MASK; + len -= ML_MASK; + for (; len > 509 ; len -= 510) { + *op++ = 255; + *op++ = 255; + } + if (len > 254) { + len -= 255; + *op++ = 255; + } + *op++ = (u8)len; + } else + *token += len; + + /* Test end of chunk */ + if (ip > mflimit) { + anchor = ip; + break; + } + + /* Fill table */ + hashtable[LZ4_HASH64K_VALUE(ip-2)] = (u16)(ip - 2 - base); + + /* Test next position */ + ref = base + hashtable[LZ4_HASH64K_VALUE(ip)]; + hashtable[LZ4_HASH64K_VALUE(ip)] = (u16)(ip - base); + if (A32(ref) == A32(ip)) { + token = op++; + *token = 0; + goto _next_match; + } + + /* Prepare next loop */ + anchor = ip++; + forwardh = 
LZ4_HASH64K_VALUE(ip); + } + +_last_literals: + /* Encode Last Literals */ + lastrun = (int)(iend - anchor); + if (op + lastrun + 1 + (lastrun - RUN_MASK + 255) / 255 > oend) + return 0; + if (lastrun >= (int)RUN_MASK) { + *op++ = (RUN_MASK << ML_BITS); + lastrun -= RUN_MASK; + for (; lastrun > 254 ; lastrun -= 255) + *op++ = 255; + *op++ = (u8)lastrun; + } else + *op++ = (lastrun << ML_BITS); + memcpy(op, anchor, iend - anchor); + op += iend - anchor; + /* End */ + return (int)(((char *)op) - dest); +} + +int lz4_compress(const unsigned char *src, size_t src_len, + unsigned char *dst, size_t *dst_len, void *wrkmem) +{ + int ret = -1; + int out_len = 0; + + if (src_len < LZ4_64KLIMIT) + out_len = lz4_compress64kctx(wrkmem, src, dst, src_len, + lz4_compressbound(src_len)); + else + out_len = lz4_compressctx(wrkmem, src, dst, src_len, + lz4_compressbound(src_len)); + + if (out_len < 0) + goto exit; + + *dst_len = out_len; + + return 0; +exit: + return ret; +} +EXPORT_SYMBOL_GPL(lz4_compress); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("LZ4 compressor"); diff --git a/lib/lz4/lz4defs.h b/lib/lz4/lz4defs.h index 43ac31d63f36..abcecdc2d0f2 100644 --- a/lib/lz4/lz4defs.h +++ b/lib/lz4/lz4defs.h @@ -22,23 +22,40 @@ * Architecture-specific macros */ #define BYTE u8 +typedef struct _U16_S { u16 v; } U16_S; +typedef struct _U32_S { u32 v; } U32_S; +typedef struct _U64_S { u64 v; } U64_S; #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) \ || defined(CONFIG_ARM) && __LINUX_ARM_ARCH__ >= 6 \ && defined(ARM_EFFICIENT_UNALIGNED_ACCESS) -typedef struct _U32_S { u32 v; } U32_S; -typedef struct _U64_S { u64 v; } U64_S; +#define A16(x) (((U16_S *)(x))->v) #define A32(x) (((U32_S *)(x))->v) #define A64(x) (((U64_S *)(x))->v) #define PUT4(s, d) (A32(d) = A32(s)) #define PUT8(s, d) (A64(d) = A64(s)) +#define LZ4_WRITE_LITTLEENDIAN_16(p, v) \ + do { \ + A16(p) = v; \ + p += 2; \ + } while (0) #else /* CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS */ +#define A64(x) get_unaligned((u64 
*)&(((U16_S *)(x))->v)) +#define A32(x) get_unaligned((u32 *)&(((U16_S *)(x))->v)) +#define A16(x) get_unaligned((u16 *)&(((U16_S *)(x))->v)) + #define PUT4(s, d) \ put_unaligned(get_unaligned((const u32 *) s), (u32 *) d) #define PUT8(s, d) \ put_unaligned(get_unaligned((const u64 *) s), (u64 *) d) + +#define LZ4_WRITE_LITTLEENDIAN_16(p, v) \ + do { \ + put_unaligned(v, (u16 *)(p)); \ + p += 2; \ + } while (0) #endif #define COPYLENGTH 8 @@ -46,6 +63,29 @@ typedef struct _U64_S { u64 v; } U64_S; #define ML_MASK ((1U << ML_BITS) - 1) #define RUN_BITS (8 - ML_BITS) #define RUN_MASK ((1U << RUN_BITS) - 1) +#define MEMORY_USAGE 14 +#define MINMATCH 4 +#define SKIPSTRENGTH 6 +#define LASTLITERALS 5 +#define MFLIMIT (COPYLENGTH + MINMATCH) +#define MINLENGTH (MFLIMIT + 1) +#define MAXD_LOG 16 +#define MAXD (1 << MAXD_LOG) +#define MAXD_MASK (u32)(MAXD - 1) +#define MAX_DISTANCE (MAXD - 1) +#define HASH_LOG (MAXD_LOG - 1) +#define HASHTABLESIZE (1 << HASH_LOG) +#define MAX_NB_ATTEMPTS 256 +#define OPTIMAL_ML (int)((ML_MASK-1)+MINMATCH) +#define LZ4_64KLIMIT ((1<<16) + (MFLIMIT - 1)) +#define HASHLOG64K ((MEMORY_USAGE - 2) + 1) +#define HASH64KTABLESIZE (1U << HASHLOG64K) +#define LZ4_HASH_VALUE(p) (((A32(p)) * 2654435761U) >> \ + ((MINMATCH * 8) - (MEMORY_USAGE-2))) +#define LZ4_HASH64K_VALUE(p) (((A32(p)) * 2654435761U) >> \ + ((MINMATCH * 8) - HASHLOG64K)) +#define HASH_VALUE(p) (((A32(p)) * 2654435761U) >> \ + ((MINMATCH * 8) - HASH_LOG)) #if LZ4_ARCH64/* 64-bit */ #define STEPSIZE 8 @@ -65,6 +105,13 @@ typedef struct _U64_S { u64 v; } U64_S; LZ4_WILDCOPY(s, d, e); \ } \ } while (0) +#define HTYPE u32 + +#ifdef __BIG_ENDIAN +#define LZ4_NBCOMMONBYTES(val) (__builtin_clzll(val) >> 3) +#else +#define LZ4_NBCOMMONBYTES(val) (__builtin_ctzll(val) >> 3) +#endif #else /* 32-bit */ #define STEPSIZE 4 @@ -83,6 +130,14 @@ typedef struct _U64_S { u64 v; } U64_S; } while (0) #define LZ4_SECURECOPY LZ4_WILDCOPY +#define HTYPE const u8* + +#ifdef __BIG_ENDIAN +#define 
LZ4_NBCOMMONBYTES(val) (__builtin_clz(val) >> 3) +#else +#define LZ4_NBCOMMONBYTES(val) (__builtin_ctz(val) >> 3) +#endif + #endif #define LZ4_READ_LITTLEENDIAN_16(d, s, p) \ @@ -92,3 +147,10 @@ typedef struct _U64_S { u64 v; } U64_S; do { \ LZ4_COPYPACKET(s, d); \ } while (d < e) + +#define LZ4_BLINDCOPY(s, d, l) \ + do { \ + u8 *e = (d) + l; \ + LZ4_WILDCOPY(s, d, e); \ + d = e; \ + } while (0) diff --git a/lib/lz4/lz4hc_compress.c b/lib/lz4/lz4hc_compress.c new file mode 100644 index 000000000000..eb1a74f5e368 --- /dev/null +++ b/lib/lz4/lz4hc_compress.c @@ -0,0 +1,539 @@ +/* + * LZ4 HC - High Compression Mode of LZ4 + * Copyright (C) 2011-2012, Yann Collet. + * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * You can contact the author at : + * - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html + * - LZ4 source repository : http://code.google.com/p/lz4/ + * + * Changed for kernel use by: + * Chanho Min + */ + +#include +#include +#include +#include +#include "lz4defs.h" + +struct lz4hc_data { + const u8 *base; + HTYPE hashtable[HASHTABLESIZE]; + u16 chaintable[MAXD]; + const u8 *nexttoupdate; +} __attribute__((__packed__)); + +static inline int lz4hc_init(struct lz4hc_data *hc4, const u8 *base) +{ + memset((void *)hc4->hashtable, 0, sizeof(hc4->hashtable)); + memset(hc4->chaintable, 0xFF, sizeof(hc4->chaintable)); + +#if LZ4_ARCH64 + hc4->nexttoupdate = base + 1; +#else + hc4->nexttoupdate = base; +#endif + hc4->base = base; + return 1; +} + +/* Update chains up to ip (excluded) */ +static inline void lz4hc_insert(struct lz4hc_data *hc4, const u8 *ip) +{ + u16 *chaintable = hc4->chaintable; + HTYPE *hashtable = hc4->hashtable; +#if LZ4_ARCH64 + const BYTE * const base = hc4->base; +#else + const int base = 0; +#endif + + while (hc4->nexttoupdate < ip) { + const u8 *p = hc4->nexttoupdate; + size_t delta = p - (hashtable[HASH_VALUE(p)] + base); + if (delta > MAX_DISTANCE) + delta = MAX_DISTANCE; + chaintable[(size_t)(p) & MAXD_MASK] = (u16)delta; + hashtable[HASH_VALUE(p)] = (p) - base; + hc4->nexttoupdate++; + } +} + +static inline size_t lz4hc_commonlength(const u8 *p1, const u8 *p2, + const u8 *const matchlimit) +{ + 
const u8 *p1t = p1; + + while (p1t < matchlimit - (STEPSIZE - 1)) { +#if LZ4_ARCH64 + u64 diff = A64(p2) ^ A64(p1t); +#else + u32 diff = A32(p2) ^ A32(p1t); +#endif + if (!diff) { + p1t += STEPSIZE; + p2 += STEPSIZE; + continue; + } + p1t += LZ4_NBCOMMONBYTES(diff); + return p1t - p1; + } +#if LZ4_ARCH64 + if ((p1t < (matchlimit-3)) && (A32(p2) == A32(p1t))) { + p1t += 4; + p2 += 4; + } +#endif + + if ((p1t < (matchlimit - 1)) && (A16(p2) == A16(p1t))) { + p1t += 2; + p2 += 2; + } + if ((p1t < matchlimit) && (*p2 == *p1t)) + p1t++; + return p1t - p1; +} + +static inline int lz4hc_insertandfindbestmatch(struct lz4hc_data *hc4, + const u8 *ip, const u8 *const matchlimit, const u8 **matchpos) +{ + u16 *const chaintable = hc4->chaintable; + HTYPE *const hashtable = hc4->hashtable; + const u8 *ref; +#if LZ4_ARCH64 + const BYTE * const base = hc4->base; +#else + const int base = 0; +#endif + int nbattempts = MAX_NB_ATTEMPTS; + size_t repl = 0, ml = 0; + u16 delta; + + /* HC4 match finder */ + lz4hc_insert(hc4, ip); + ref = hashtable[HASH_VALUE(ip)] + base; + + /* potential repetition */ + if (ref >= ip-4) { + /* confirmed */ + if (A32(ref) == A32(ip)) { + delta = (u16)(ip-ref); + repl = ml = lz4hc_commonlength(ip + MINMATCH, + ref + MINMATCH, matchlimit) + MINMATCH; + *matchpos = ref; + } + ref -= (size_t)chaintable[(size_t)(ref) & MAXD_MASK]; + } + + while ((ref >= ip - MAX_DISTANCE) && nbattempts) { + nbattempts--; + if (*(ref + ml) == *(ip + ml)) { + if (A32(ref) == A32(ip)) { + size_t mlt = + lz4hc_commonlength(ip + MINMATCH, + ref + MINMATCH, matchlimit) + MINMATCH; + if (mlt > ml) { + ml = mlt; + *matchpos = ref; + } + } + } + ref -= (size_t)chaintable[(size_t)(ref) & MAXD_MASK]; + } + + /* Complete table */ + if (repl) { + const BYTE *ptr = ip; + const BYTE *end; + end = ip + repl - (MINMATCH-1); + /* Pre-Load */ + while (ptr < end - delta) { + chaintable[(size_t)(ptr) & MAXD_MASK] = delta; + ptr++; + } + do { + chaintable[(size_t)(ptr) & MAXD_MASK] = delta; + /* 
Head of chain */ + hashtable[HASH_VALUE(ptr)] = (ptr) - base; + ptr++; + } while (ptr < end); + hc4->nexttoupdate = end; + } + + return (int)ml; +} + +static inline int lz4hc_insertandgetwidermatch(struct lz4hc_data *hc4, + const u8 *ip, const u8 *startlimit, const u8 *matchlimit, int longest, + const u8 **matchpos, const u8 **startpos) +{ + u16 *const chaintable = hc4->chaintable; + HTYPE *const hashtable = hc4->hashtable; +#if LZ4_ARCH64 + const BYTE * const base = hc4->base; +#else + const int base = 0; +#endif + const u8 *ref; + int nbattempts = MAX_NB_ATTEMPTS; + int delta = (int)(ip - startlimit); + + /* First Match */ + lz4hc_insert(hc4, ip); + ref = hashtable[HASH_VALUE(ip)] + base; + + while ((ref >= ip - MAX_DISTANCE) && (ref >= hc4->base) + && (nbattempts)) { + nbattempts--; + if (*(startlimit + longest) == *(ref - delta + longest)) { + if (A32(ref) == A32(ip)) { + const u8 *reft = ref + MINMATCH; + const u8 *ipt = ip + MINMATCH; + const u8 *startt = ip; + + while (ipt < matchlimit-(STEPSIZE - 1)) { + #if LZ4_ARCH64 + u64 diff = A64(reft) ^ A64(ipt); + #else + u32 diff = A32(reft) ^ A32(ipt); + #endif + + if (!diff) { + ipt += STEPSIZE; + reft += STEPSIZE; + continue; + } + ipt += LZ4_NBCOMMONBYTES(diff); + goto _endcount; + } + #if LZ4_ARCH64 + if ((ipt < (matchlimit - 3)) + && (A32(reft) == A32(ipt))) { + ipt += 4; + reft += 4; + } + ipt += 2; + #endif + if ((ipt < (matchlimit - 1)) + && (A16(reft) == A16(ipt))) { + reft += 2; + } + if ((ipt < matchlimit) && (*reft == *ipt)) + ipt++; +_endcount: + reft = ref; + + while ((startt > startlimit) + && (reft > hc4->base) + && (startt[-1] == reft[-1])) { + startt--; + reft--; + } + + if ((ipt - startt) > longest) { + longest = (int)(ipt - startt); + *matchpos = reft; + *startpos = startt; + } + } + } + ref -= (size_t)chaintable[(size_t)(ref) & MAXD_MASK]; + } + return longest; +} + +static inline int lz4_encodesequence(const u8 **ip, u8 **op, const u8 **anchor, + int ml, const u8 *ref) +{ + int length, len; + 
u8 *token; + + /* Encode Literal length */ + length = (int)(*ip - *anchor); + token = (*op)++; + if (length >= (int)RUN_MASK) { + *token = (RUN_MASK << ML_BITS); + len = length - RUN_MASK; + for (; len > 254 ; len -= 255) + *(*op)++ = 255; + *(*op)++ = (u8)len; + } else + *token = (length << ML_BITS); + + /* Copy Literals */ + LZ4_BLINDCOPY(*anchor, *op, length); + + /* Encode Offset */ + LZ4_WRITE_LITTLEENDIAN_16(*op, (u16)(*ip - ref)); + + /* Encode MatchLength */ + len = (int)(ml - MINMATCH); + if (len >= (int)ML_MASK) { + *token += ML_MASK; + len -= ML_MASK; + for (; len > 509 ; len -= 510) { + *(*op)++ = 255; + *(*op)++ = 255; + } + if (len > 254) { + len -= 255; + *(*op)++ = 255; + } + *(*op)++ = (u8)len; + } else + *token += len; + + /* Prepare next loop */ + *ip += ml; + *anchor = *ip; + + return 0; +} + +static int lz4_compresshcctx(struct lz4hc_data *ctx, + const char *source, + char *dest, + int isize) +{ + const u8 *ip = (const u8 *)source; + const u8 *anchor = ip; + const u8 *const iend = ip + isize; + const u8 *const mflimit = iend - MFLIMIT; + const u8 *const matchlimit = (iend - LASTLITERALS); + + u8 *op = (u8 *)dest; + + int ml, ml2, ml3, ml0; + const u8 *ref = NULL; + const u8 *start2 = NULL; + const u8 *ref2 = NULL; + const u8 *start3 = NULL; + const u8 *ref3 = NULL; + const u8 *start0; + const u8 *ref0; + int lastrun; + + ip++; + + /* Main Loop */ + while (ip < mflimit) { + ml = lz4hc_insertandfindbestmatch(ctx, ip, matchlimit, (&ref)); + if (!ml) { + ip++; + continue; + } + + /* saved, in case we would skip too much */ + start0 = ip; + ref0 = ref; + ml0 = ml; +_search2: + if (ip+ml < mflimit) + ml2 = lz4hc_insertandgetwidermatch(ctx, ip + ml - 2, + ip + 1, matchlimit, ml, &ref2, &start2); + else + ml2 = ml; + /* No better match */ + if (ml2 == ml) { + lz4_encodesequence(&ip, &op, &anchor, ml, ref); + continue; + } + + if (start0 < ip) { + /* empirical */ + if (start2 < ip + ml0) { + ip = start0; + ref = ref0; + ml = ml0; + } + } + /* + * Here, 
start0==ip + * First Match too small : removed + */ + if ((start2 - ip) < 3) { + ml = ml2; + ip = start2; + ref = ref2; + goto _search2; + } + +_search3: + /* + * Currently we have : + * ml2 > ml1, and + * ip1+3 <= ip2 (usually < ip1+ml1) + */ + if ((start2 - ip) < OPTIMAL_ML) { + int correction; + int new_ml = ml; + if (new_ml > OPTIMAL_ML) + new_ml = OPTIMAL_ML; + if (ip + new_ml > start2 + ml2 - MINMATCH) + new_ml = (int)(start2 - ip) + ml2 - MINMATCH; + correction = new_ml - (int)(start2 - ip); + if (correction > 0) { + start2 += correction; + ref2 += correction; + ml2 -= correction; + } + } + /* + * Now, we have start2 = ip+new_ml, + * with new_ml=min(ml, OPTIMAL_ML=18) + */ + if (start2 + ml2 < mflimit) + ml3 = lz4hc_insertandgetwidermatch(ctx, + start2 + ml2 - 3, start2, matchlimit, + ml2, &ref3, &start3); + else + ml3 = ml2; + + /* No better match : 2 sequences to encode */ + if (ml3 == ml2) { + /* ip & ref are known; Now for ml */ + if (start2 < ip+ml) + ml = (int)(start2 - ip); + + /* Now, encode 2 sequences */ + lz4_encodesequence(&ip, &op, &anchor, ml, ref); + ip = start2; + lz4_encodesequence(&ip, &op, &anchor, ml2, ref2); + continue; + } + + /* Not enough space for match 2 : remove it */ + if (start3 < ip + ml + 3) { + /* + * can write Seq1 immediately ==> Seq2 is removed, + * so Seq3 becomes Seq1 + */ + if (start3 >= (ip + ml)) { + if (start2 < ip + ml) { + int correction = + (int)(ip + ml - start2); + start2 += correction; + ref2 += correction; + ml2 -= correction; + if (ml2 < MINMATCH) { + start2 = start3; + ref2 = ref3; + ml2 = ml3; + } + } + + lz4_encodesequence(&ip, &op, &anchor, ml, ref); + ip = start3; + ref = ref3; + ml = ml3; + + start0 = start2; + ref0 = ref2; + ml0 = ml2; + goto _search2; + } + + start2 = start3; + ref2 = ref3; + ml2 = ml3; + goto _search3; + } + + /* + * OK, now we have 3 ascending matches; let's write at least + * the first one ip & ref are known; Now for ml + */ + if (start2 < ip + ml) { + if ((start2 - ip) < 
(int)ML_MASK) { + int correction; + if (ml > OPTIMAL_ML) + ml = OPTIMAL_ML; + if (ip + ml > start2 + ml2 - MINMATCH) + ml = (int)(start2 - ip) + ml2 + - MINMATCH; + correction = ml - (int)(start2 - ip); + if (correction > 0) { + start2 += correction; + ref2 += correction; + ml2 -= correction; + } + } else + ml = (int)(start2 - ip); + } + lz4_encodesequence(&ip, &op, &anchor, ml, ref); + + ip = start2; + ref = ref2; + ml = ml2; + + start2 = start3; + ref2 = ref3; + ml2 = ml3; + + goto _search3; + } + + /* Encode Last Literals */ + lastrun = (int)(iend - anchor); + if (lastrun >= (int)RUN_MASK) { + *op++ = (RUN_MASK << ML_BITS); + lastrun -= RUN_MASK; + for (; lastrun > 254 ; lastrun -= 255) + *op++ = 255; + *op++ = (u8) lastrun; + } else + *op++ = (lastrun << ML_BITS); + memcpy(op, anchor, iend - anchor); + op += iend - anchor; + /* End */ + return (int) (((char *)op) - dest); +} + +int lz4hc_compress(const unsigned char *src, size_t src_len, + unsigned char *dst, size_t *dst_len, void *wrkmem) +{ + int ret = -1; + int out_len = 0; + + struct lz4hc_data *hc4 = (struct lz4hc_data *)wrkmem; + lz4hc_init(hc4, (const u8 *)src); + out_len = lz4_compresshcctx((struct lz4hc_data *)hc4, (const u8 *)src, + (char *)dst, (int)src_len); + + if (out_len < 0) + goto exit; + + *dst_len = out_len; + return 0; + +exit: + return ret; +} +EXPORT_SYMBOL_GPL(lz4hc_compress); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("LZ4HC compressor"); From 6de155507ea389d7863f34fe53b41a86a59228bf Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 11 Sep 2013 14:26:32 -0700 Subject: [PATCH 55/82] lz4: fix compression/decompression signedness mismatch LZ4 compression and decompression functions require different in signedness input/output parameters: unsigned char for compression and signed char for decompression. Change decompression API to require "(const) unsigned char *". 
Signed-off-by: Sergey Senozhatsky Cc: Kyungsik Lee Cc: Geert Uytterhoeven Cc: Yann Collet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit b34081f1cd59585451efaa69e1dff1b9507e6c89) Signed-off-by: Alex Shi --- include/linux/lz4.h | 8 ++++---- lib/lz4/lz4_decompress.c | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/include/linux/lz4.h b/include/linux/lz4.h index d21c13f10a64..4356686b0a39 100644 --- a/include/linux/lz4.h +++ b/include/linux/lz4.h @@ -67,8 +67,8 @@ int lz4hc_compress(const unsigned char *src, size_t src_len, * note : Destination buffer must be already allocated. * slightly faster than lz4_decompress_unknownoutputsize() */ -int lz4_decompress(const char *src, size_t *src_len, char *dest, - size_t actual_dest_len); +int lz4_decompress(const unsigned char *src, size_t *src_len, + unsigned char *dest, size_t actual_dest_len); /* * lz4_decompress_unknownoutputsize() @@ -82,6 +82,6 @@ int lz4_decompress(const char *src, size_t *src_len, char *dest, * Error if return (< 0) * note : Destination buffer must be already allocated. 
*/ -int lz4_decompress_unknownoutputsize(const char *src, size_t src_len, - char *dest, size_t *dest_len); +int lz4_decompress_unknownoutputsize(const unsigned char *src, size_t src_len, + unsigned char *dest, size_t *dest_len); #endif diff --git a/lib/lz4/lz4_decompress.c b/lib/lz4/lz4_decompress.c index dcc89753af65..677d1ea4833d 100644 --- a/lib/lz4/lz4_decompress.c +++ b/lib/lz4/lz4_decompress.c @@ -283,8 +283,8 @@ static int lz4_uncompress_unknownoutputsize(const char *source, char *dest, return (int) (-(((char *) ip) - source)); } -int lz4_decompress(const char *src, size_t *src_len, char *dest, - size_t actual_dest_len) +int lz4_decompress(const unsigned char *src, size_t *src_len, + unsigned char *dest, size_t actual_dest_len) { int ret = -1; int input_len = 0; @@ -302,8 +302,8 @@ int lz4_decompress(const char *src, size_t *src_len, char *dest, EXPORT_SYMBOL_GPL(lz4_decompress); #endif -int lz4_decompress_unknownoutputsize(const char *src, size_t src_len, - char *dest, size_t *dest_len) +int lz4_decompress_unknownoutputsize(const unsigned char *src, size_t src_len, + unsigned char *dest, size_t *dest_len) { int ret = -1; int out_len = 0; From ce58d51a0e9b366876270f03555c209917d3b80d Mon Sep 17 00:00:00 2001 From: Weijie Yang Date: Wed, 4 Jun 2014 16:11:06 -0700 Subject: [PATCH 56/82] zram: correct offset usage in zram_bio_discard We want to skip the physical block(PAGE_SIZE) which is partially covered by the discard bio, so we check the remaining size and subtract it if there is a need to goto the next physical block. The current offset usage in zram_bio_discard is incorrect, it will cause its upper filesystem breakdown. Consider the following scenario: On some architecture or config, PAGE_SIZE is 64K for example, filesystem is set up on zram disk without PAGE_SIZE aligned, a discard bio leads to a offset = 4K and size=72K, normally, it should not really discard any physical block as it partially cover two physical blocks. 
However, with the current offset usage, it will discard the second physical block and free its memory, which will cause filesystem breakdown. This patch corrects the offset usage in zram_bio_discard. Signed-off-by: Weijie Yang Cc: Minchan Kim Cc: Nitin Gupta Acked-by: Joonsoo Kim Cc: Sergey Senozhatsky Cc: Bob Liu Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 38515c73398a4c58059ecf1087e844561b58ee0f) Signed-off-by: Alex Shi --- drivers/block/zram/zram_drv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 19cf51ad48ef..efb6ff2a3735 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -571,10 +571,10 @@ static void zram_bio_discard(struct zram *zram, u32 index, * skipping this logical block is appropriate here. */ if (offset) { - if (n < offset) + if (n <= (PAGE_SIZE - offset)) return; - n -= offset; + n -= (PAGE_SIZE - offset); index++; } From c398e6a3df7562a1a70e2ad487dcda2c3cb635e9 Mon Sep 17 00:00:00 2001 From: Weijie Yang Date: Wed, 4 Jun 2014 16:11:08 -0700 Subject: [PATCH 57/82] zsmalloc: fixup trivial zs size classes value in comments According to calculation, ZS_SIZE_CLASSES value is 255 on systems with 4K page size, not 254. The old value probably failed to count ZS_MIN_ALLOC_SIZE itself. This patch fixes this trivial issue in the comments. Signed-off-by: Weijie Yang Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 7eb52512a977854eca51d9b692c2f3be8a0e5eeb) Signed-off-by: Alex Shi --- mm/zsmalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index b728f10e353b..cee5c54c500f 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -129,7 +129,7 @@ #define ZS_MAX_ALLOC_SIZE PAGE_SIZE /* - * On systems with 4K page size, this gives 254 size classes!
There is a + * On systems with 4K page size, this gives 255 size classes! There is a * trader-off here: * - Large number of size classes is potentially wasteful as free page are * spread across these classes From 888210267e226f437abb7fd0604530032a98cafa Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 2 Jul 2014 15:22:36 -0700 Subject: [PATCH 58/82] zram: revalidate disk after capacity change Alexander reported that mkswap on /dev/zram0 fails if another process has the block device file open. The steps are as follows: 0. Reset the unused zram device. 1. Use a program that opens /dev/zram0 with O_RDWR and sleeps until killed. 2. While that program sleeps, echo the correct value to /sys/block/zram0/disksize. 3. Verify (e.g. in /proc/partitions) that the disk size is applied correctly. It is. 4. While that program still sleeps, attempt to mkswap /dev/zram0. This fails: mkswap: error: swap area needs to be at least 40 KiB When I investigated, the size that mkswap obtained via ioctl(fd, BLKGETSIZE64, xxx) for the block device was zero, although zram0 had the right size after step 2. The reason is that zram didn't revalidate the disk after changing its capacity, so the size in the block device's inode is not up to date until every open file on it is closed. This patch should fix the BUG. Signed-off-by: Minchan Kim Reported-by: Alexander E. Patrakov Tested-by: Alexander E.
Patrakov Reviewed-by: Sergey Senozhatsky Cc: Nitin Gupta Acked-by: Jerome Marchand Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 2e32baea46ce542c561a519414c840295b229c8f) Signed-off-by: Alex Shi --- drivers/block/zram/zram_drv.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index efb6ff2a3735..1e14825bd02b 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -621,8 +621,10 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity) memset(&zram->stats, 0, sizeof(zram->stats)); zram->disksize = 0; - if (reset_capacity) + if (reset_capacity) { set_capacity(zram->disk, 0); + revalidate_disk(zram->disk); + } up_write(&zram->init_lock); } @@ -663,6 +665,7 @@ static ssize_t disksize_store(struct device *dev, zram->comp = comp; zram->disksize = disksize; set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT); + revalidate_disk(zram->disk); up_write(&zram->init_lock); return len; From d022fbe2a0a481cb0d0c41fcacc47d62c3a16581 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 23 Jul 2014 14:00:04 -0700 Subject: [PATCH 59/82] zram: avoid lockdep splat by revalidate_disk Sasha reported lockdep warning [1] introduced by [2]. It could be fixed by doing disk revalidation out of the init_lock. It's okay because disk capacity change is protected by init_lock so that revalidate_disk always sees up-to-date value so there is no race. [1] https://lkml.org/lkml/2014/7/3/735 [2] zram: revalidate disk after capacity change Fixes 2e32baea46ce ("zram: revalidate disk after capacity change"). Signed-off-by: Minchan Kim Reported-by: Sasha Levin Cc: "Alexander E. 
Patrakov" Cc: Nitin Gupta Cc: Jerome Marchand Cc: Sergey Senozhatsky CC: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit b4c5c60920e3b0c4598f43e7317559f6aec51531) Signed-off-by: Alex Shi --- drivers/block/zram/zram_drv.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 1e14825bd02b..674b8517694d 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -621,11 +621,18 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity) memset(&zram->stats, 0, sizeof(zram->stats)); zram->disksize = 0; - if (reset_capacity) { + if (reset_capacity) set_capacity(zram->disk, 0); - revalidate_disk(zram->disk); - } + up_write(&zram->init_lock); + + /* + * Revalidate disk out of the init_lock to avoid lockdep splat. + * It's okay because disk's capacity is protected by init_lock + * so that revalidate_disk always sees up-to-date capacity. + */ + if (reset_capacity) + revalidate_disk(zram->disk); } static ssize_t disksize_store(struct device *dev, @@ -665,8 +672,15 @@ static ssize_t disksize_store(struct device *dev, zram->comp = comp; zram->disksize = disksize; set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT); - revalidate_disk(zram->disk); up_write(&zram->init_lock); + + /* + * Revalidate disk out of the init_lock to avoid lockdep splat. + * It's okay because disk's capacity is protected by init_lock + * so that revalidate_disk always sees up-to-date capacity. + */ + revalidate_disk(zram->disk); + return len; out_destroy_comp: From e11cb6668f1a349250873337858a502e2ab73c8e Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 6 Aug 2014 16:08:25 -0700 Subject: [PATCH 60/82] zram: rename struct `table' to `zram_table_entry' Andrew Morton has recently noted that `struct table' actually represents table entry and, thus, should be renamed. Rename to `zram_table_entry'. 
Signed-off-by: Sergey Senozhatsky Cc: Minchan Kim Cc: Nitin Gupta Cc: Weijie Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit cb8f2eec3c5c87e31219c5e58625b8e890004e48) Signed-off-by: Alex Shi --- drivers/block/zram/zram_drv.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 7f21c145e317..8909f86caf0d 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -62,7 +62,7 @@ enum zram_pageflags { /*-- Data structures */ /* Allocated for each disk page */ -struct table { +struct zram_table_entry { unsigned long handle; u16 size; /* object size (excluding header) */ u8 flags; @@ -82,7 +82,7 @@ struct zram_stats { struct zram_meta { rwlock_t tb_lock; /* protect table */ - struct table *table; + struct zram_table_entry *table; struct zs_pool *mem_pool; }; From 61febc60cc77c7e06ec7d457f21fc9ce9885f7e8 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 6 Aug 2014 16:08:27 -0700 Subject: [PATCH 61/82] zram: remove unused SECTOR_SIZE define Drop SECTOR_SIZE define, because it's not used. 
Signed-off-by: Sergey Senozhatsky Cc: Minchan Kim Cc: Nitin Gupta Cc: Weijie Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit a830eff749eb2bf906783f6bf74a74dad3de3aea) Signed-off-by: Alex Shi --- drivers/block/zram/zram_drv.h | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 8909f86caf0d..c8161bd8969c 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -43,7 +43,6 @@ static const size_t max_zpage_size = PAGE_SIZE / 4 * 3; /*-- End of configurable params */ #define SECTOR_SHIFT 9 -#define SECTOR_SIZE (1 << SECTOR_SHIFT) #define SECTORS_PER_PAGE_SHIFT (PAGE_SHIFT - SECTOR_SHIFT) #define SECTORS_PER_PAGE (1 << SECTORS_PER_PAGE_SHIFT) #define ZRAM_LOGICAL_BLOCK_SHIFT 12 From 244e41f94febd73d877a397a05e626563ebd2ceb Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 6 Aug 2014 16:08:29 -0700 Subject: [PATCH 62/82] zram: use size_t instead of u16 Some architectures (eg, hexagon and PowerPC) could use PAGE_SHIFT of 16 or more. In these cases u16 is not sufficiently large to represent a compressed page's size so use size_t. 
Signed-off-by: Minchan Kim Reported-by: Weijie Yang Acked-by: Sergey Senozhatsky Cc: Jerome Marchand Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 023b409f9dac4cdea3322009f2e592068558690c) Signed-off-by: Alex Shi --- drivers/block/zram/zram_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 674b8517694d..efd9e4f1605c 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -336,7 +336,7 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index) unsigned char *cmem; struct zram_meta *meta = zram->meta; unsigned long handle; - u16 size; + size_t size; read_lock(&meta->tb_lock); handle = meta->table[index].handle; From 3a4d2c69adaee7391fcdbc782913629df2e49726 Mon Sep 17 00:00:00 2001 From: Weijie Yang Date: Wed, 6 Aug 2014 16:08:31 -0700 Subject: [PATCH 63/82] zram: replace global tb_lock with fine grain lock Currently, we use a rwlock tb_lock to protect concurrent access to the whole zram meta table. However, according to the actual access model, there is only a small chance for upper user to access the same table[index], so the current lock granularity is too big. The idea of optimization is to change the lock granularity from whole meta table to per table entry (table -> table[index]), so that we can protect concurrent access to the same table[index], meanwhile allow the maximum concurrency. With this in mind, several kinds of locks which could be used as a per-entry lock were tested and compared: Test environment: x86-64 Intel Core2 Q8400, system memory 4GB, Ubuntu 12.04, kernel v3.15.0-rc3 as base, zram with 4 max_comp_streams LZO. 
iozone test: iozone -t 4 -R -r 16K -s 200M -I +Z (1GB zram with ext4 filesystem, take the average of 10 tests, KB/s) Test base CAS spinlock rwlock bit_spinlock ------------------------------------------------------------------- Initial write 1381094 1425435 1422860 1423075 1421521 Rewrite 1529479 1641199 1668762 1672855 1654910 Read 8468009 11324979 11305569 11117273 10997202 Re-read 8467476 11260914 11248059 11145336 10906486 Reverse Read 6821393 8106334 8282174 8279195 8109186 Stride read 7191093 8994306 9153982 8961224 9004434 Random read 7156353 8957932 9167098 8980465 8940476 Mixed workload 4172747 5680814 5927825 5489578 5972253 Random write 1483044 1605588 1594329 1600453 1596010 Pwrite 1276644 1303108 1311612 1314228 1300960 Pread 4324337 4632869 4618386 4457870 4500166 To enhance the possibility of access the same table[index] concurrently, set zram a small disksize(10MB) and let threads run with large loop count. fio test: fio --bs=32k --randrepeat=1 --randseed=100 --refill_buffers --scramble_buffers=1 --direct=1 --loops=3000 --numjobs=4 --filename=/dev/zram0 --name=seq-write --rw=write --stonewall --name=seq-read --rw=read --stonewall --name=seq-readwrite --rw=rw --stonewall --name=rand-readwrite --rw=randrw --stonewall (10MB zram raw block device, take the average of 10 tests, KB/s) Test base CAS spinlock rwlock bit_spinlock ------------------------------------------------------------- seq-write 933789 999357 1003298 995961 1001958 seq-read 5634130 6577930 6380861 6243912 6230006 seq-rw 1405687 1638117 1640256 1633903 1634459 rand-rw 1386119 1614664 1617211 1609267 1612471 All the optimization methods show a higher performance than the base, however, it is hard to say which method is the most appropriate. On the other hand, zram is mostly used on small embedded system, so we don't want to increase any memory footprint. 
This patch pick the bit_spinlock method, pack object size and page_flag into an unsigned long table.value, so as to not increase any memory overhead on both 32-bit and 64-bit system. On the third hand, even though different kinds of locks have different performances, we can ignore this difference, because: if zram is used as zram swapfile, the swap subsystem can prevent concurrent access to the same swapslot; if zram is used as zram-blk for set up filesystem on it, the upper filesystem and the page cache also prevent concurrent access of the same block mostly. So we can ignore the different performances among locks. Acked-by: Sergey Senozhatsky Reviewed-by: Davidlohr Bueso Signed-off-by: Weijie Yang Signed-off-by: Minchan Kim Cc: Jerome Marchand Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit d2d5e762c8990c4031890e03565983a05febd64a) Signed-off-by: Alex Shi Conflicts: drivers/block/zram/zram_drv.c Conflicts solution: using old bio struct --- drivers/block/zram/zram_drv.c | 69 +++++++++++++++++++++-------------- drivers/block/zram/zram_drv.h | 24 +++++++++--- 2 files changed, 60 insertions(+), 33 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index efd9e4f1605c..a21f466c4e15 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -183,19 +183,32 @@ static ssize_t comp_algorithm_store(struct device *dev, static int zram_test_flag(struct zram_meta *meta, u32 index, enum zram_pageflags flag) { - return meta->table[index].flags & BIT(flag); + return meta->table[index].value & BIT(flag); } static void zram_set_flag(struct zram_meta *meta, u32 index, enum zram_pageflags flag) { - meta->table[index].flags |= BIT(flag); + meta->table[index].value |= BIT(flag); } static void zram_clear_flag(struct zram_meta *meta, u32 index, enum zram_pageflags flag) { - meta->table[index].flags &= ~BIT(flag); + meta->table[index].value &= ~BIT(flag); +} + +static size_t 
zram_get_obj_size(struct zram_meta *meta, u32 index) +{ + return meta->table[index].value & (BIT(ZRAM_FLAG_SHIFT) - 1); +} + +static void zram_set_obj_size(struct zram_meta *meta, + u32 index, size_t size) +{ + unsigned long flags = meta->table[index].value >> ZRAM_FLAG_SHIFT; + + meta->table[index].value = (flags << ZRAM_FLAG_SHIFT) | size; } static inline int is_partial_io(struct bio_vec *bvec) @@ -254,7 +267,6 @@ static struct zram_meta *zram_meta_alloc(u64 disksize) goto free_table; } - rwlock_init(&meta->tb_lock); return meta; free_table: @@ -303,7 +315,12 @@ static void handle_zero_page(struct bio_vec *bvec) flush_dcache_page(page); } -/* NOTE: caller should hold meta->tb_lock with write-side */ + +/* + * To protect concurrent access to the same index entry, + * caller should hold this table index entry's bit_spinlock to + * indicate this index entry is accessing. + */ static void zram_free_page(struct zram *zram, size_t index) { struct zram_meta *meta = zram->meta; @@ -323,11 +340,12 @@ static void zram_free_page(struct zram *zram, size_t index) zs_free(meta->mem_pool, handle); - atomic64_sub(meta->table[index].size, &zram->stats.compr_data_size); + atomic64_sub(zram_get_obj_size(meta, index), + &zram->stats.compr_data_size); atomic64_dec(&zram->stats.pages_stored); meta->table[index].handle = 0; - meta->table[index].size = 0; + zram_set_obj_size(meta, index, 0); } static int zram_decompress_page(struct zram *zram, char *mem, u32 index) @@ -338,12 +356,12 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index) unsigned long handle; size_t size; - read_lock(&meta->tb_lock); + bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); handle = meta->table[index].handle; - size = meta->table[index].size; + size = zram_get_obj_size(meta, index); if (!handle || zram_test_flag(meta, index, ZRAM_ZERO)) { - read_unlock(&meta->tb_lock); + bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); clear_page(mem); return 0; } @@ -354,7 +372,7 @@ static 
int zram_decompress_page(struct zram *zram, char *mem, u32 index) else ret = zcomp_decompress(zram->comp, cmem, size, mem); zs_unmap_object(meta->mem_pool, handle); - read_unlock(&meta->tb_lock); + bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); /* Should NEVER happen. Return bio error if it does. */ if (unlikely(ret)) { @@ -375,14 +393,14 @@ static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, struct zram_meta *meta = zram->meta; page = bvec->bv_page; - read_lock(&meta->tb_lock); + bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); if (unlikely(!meta->table[index].handle) || zram_test_flag(meta, index, ZRAM_ZERO)) { - read_unlock(&meta->tb_lock); + bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); handle_zero_page(bvec); return 0; } - read_unlock(&meta->tb_lock); + bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); if (is_partial_io(bvec)) /* Use a temporary buffer to decompress the page */ @@ -460,10 +478,10 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, if (page_zero_filled(uncmem)) { kunmap_atomic(user_mem); /* Free memory associated with this sector now. */ - write_lock(&zram->meta->tb_lock); + bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); zram_free_page(zram, index); zram_set_flag(meta, index, ZRAM_ZERO); - write_unlock(&zram->meta->tb_lock); + bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); atomic64_inc(&zram->stats.zero_pages); ret = 0; @@ -513,12 +531,12 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, * Free memory associated with this sector * before overwriting unused sectors. 
*/ - write_lock(&zram->meta->tb_lock); + bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); zram_free_page(zram, index); meta->table[index].handle = handle; - meta->table[index].size = clen; - write_unlock(&zram->meta->tb_lock); + zram_set_obj_size(meta, index, clen); + bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); /* Update stats */ atomic64_add(clen, &zram->stats.compr_data_size); @@ -559,6 +577,7 @@ static void zram_bio_discard(struct zram *zram, u32 index, int offset, struct bio *bio) { size_t n = bio->bi_size; + struct zram_meta *meta = zram->meta; /* * zram manages data in physical block size units. Because logical block @@ -579,13 +598,9 @@ static void zram_bio_discard(struct zram *zram, u32 index, } while (n >= PAGE_SIZE) { - /* - * Discard request can be large so the lock hold times could be - * lengthy. So take the lock once per page. - */ - write_lock(&zram->meta->tb_lock); + bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); zram_free_page(zram, index); - write_unlock(&zram->meta->tb_lock); + bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); index++; n -= PAGE_SIZE; } @@ -819,9 +834,9 @@ static void zram_slot_free_notify(struct block_device *bdev, zram = bdev->bd_disk->private_data; meta = zram->meta; - write_lock(&meta->tb_lock); + bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); zram_free_page(zram, index); - write_unlock(&meta->tb_lock); + bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); atomic64_inc(&zram->stats.notify_free); } diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index c8161bd8969c..5b0afde729cd 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -50,10 +50,24 @@ static const size_t max_zpage_size = PAGE_SIZE / 4 * 3; #define ZRAM_SECTOR_PER_LOGICAL_BLOCK \ (1 << (ZRAM_LOGICAL_BLOCK_SHIFT - SECTOR_SHIFT)) -/* Flags for zram pages (table[page_no].flags) */ + +/* + * The lower ZRAM_FLAG_SHIFT bits of table.value is for + * object size 
(excluding header), the higher bits is for + * zram_pageflags. + * + * zram is mainly used for memory efficiency so we want to keep memory + * footprint small so we can squeeze size and flags into a field. + * The lower ZRAM_FLAG_SHIFT bits is for object size (excluding header), + * the higher bits is for zram_pageflags. + */ +#define ZRAM_FLAG_SHIFT 24 + +/* Flags for zram pages (table[page_no].value) */ enum zram_pageflags { /* Page consists entirely of zeros */ - ZRAM_ZERO, + ZRAM_ZERO = ZRAM_FLAG_SHIFT + 1, + ZRAM_ACCESS, /* page in now accessed */ __NR_ZRAM_PAGEFLAGS, }; @@ -63,9 +77,8 @@ enum zram_pageflags { /* Allocated for each disk page */ struct zram_table_entry { unsigned long handle; - u16 size; /* object size (excluding header) */ - u8 flags; -} __aligned(4); + unsigned long value; +}; struct zram_stats { atomic64_t compr_data_size; /* compressed size of pages stored */ @@ -80,7 +93,6 @@ struct zram_stats { }; struct zram_meta { - rwlock_t tb_lock; /* protect table */ struct zram_table_entry *table; struct zs_pool *mem_pool; }; From 301b07c9f41b4f4111ac53336647788cd89af76d Mon Sep 17 00:00:00 2001 From: Dan Streetman Date: Wed, 6 Aug 2014 16:08:36 -0700 Subject: [PATCH 64/82] mm/zpool: implement common zpool api to zbud/zsmalloc Add zpool api. zpool provides an interface for memory storage, typically of compressed memory. Users can select what backend to use; currently the only implementations are zbud, a low density implementation with up to two compressed pages per storage page, and zsmalloc, a higher density implementation with multiple compressed pages per storage page. 
Signed-off-by: Dan Streetman Tested-by: Seth Jennings Cc: Minchan Kim Cc: Nitin Gupta Cc: Weijie Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit af8d417a04564bca0348e7e3c749ab12a3e837ad) Signed-off-by: Alex Shi Conflicts: mm/Kconfig mm/Makefile --- include/linux/zpool.h | 106 ++++++++++++ mm/Kconfig | 7 + mm/Makefile | 1 + mm/zpool.c | 364 ++++++++++++++++++++++++++++++++++++++++++ mm/zsmalloc.c | 1 - 5 files changed, 478 insertions(+), 1 deletion(-) create mode 100644 include/linux/zpool.h create mode 100644 mm/zpool.c diff --git a/include/linux/zpool.h b/include/linux/zpool.h new file mode 100644 index 000000000000..f14bd75f08b3 --- /dev/null +++ b/include/linux/zpool.h @@ -0,0 +1,106 @@ +/* + * zpool memory storage api + * + * Copyright (C) 2014 Dan Streetman + * + * This is a common frontend for the zbud and zsmalloc memory + * storage pool implementations. Typically, this is used to + * store compressed memory. + */ + +#ifndef _ZPOOL_H_ +#define _ZPOOL_H_ + +struct zpool; + +struct zpool_ops { + int (*evict)(struct zpool *pool, unsigned long handle); +}; + +/* + * Control how a handle is mapped. It will be ignored if the + * implementation does not support it. Its use is optional. + * Note that this does not refer to memory protection, it + * refers to how the memory will be copied in/out if copying + * is necessary during mapping; read-write is the safest as + * it copies the existing memory in on map, and copies the + * changed memory back out on unmap. Write-only does not copy + * in the memory and should only be used for initialization. + * If in doubt, use ZPOOL_MM_DEFAULT which is read-write. 
+ */ +enum zpool_mapmode { + ZPOOL_MM_RW, /* normal read-write mapping */ + ZPOOL_MM_RO, /* read-only (no copy-out at unmap time) */ + ZPOOL_MM_WO, /* write-only (no copy-in at map time) */ + + ZPOOL_MM_DEFAULT = ZPOOL_MM_RW +}; + +struct zpool *zpool_create_pool(char *type, gfp_t gfp, struct zpool_ops *ops); + +char *zpool_get_type(struct zpool *pool); + +void zpool_destroy_pool(struct zpool *pool); + +int zpool_malloc(struct zpool *pool, size_t size, gfp_t gfp, + unsigned long *handle); + +void zpool_free(struct zpool *pool, unsigned long handle); + +int zpool_shrink(struct zpool *pool, unsigned int pages, + unsigned int *reclaimed); + +void *zpool_map_handle(struct zpool *pool, unsigned long handle, + enum zpool_mapmode mm); + +void zpool_unmap_handle(struct zpool *pool, unsigned long handle); + +u64 zpool_get_total_size(struct zpool *pool); + + +/** + * struct zpool_driver - driver implementation for zpool + * @type: name of the driver. + * @list: entry in the list of zpool drivers. + * @create: create a new pool. + * @destroy: destroy a pool. + * @malloc: allocate mem from a pool. + * @free: free mem from a pool. + * @shrink: shrink the pool. + * @map: map a handle. + * @unmap: unmap a handle. + * @total_size: get total size of a pool. + * + * This is created by a zpool implementation and registered + * with zpool. 
+ */ +struct zpool_driver { + char *type; + struct module *owner; + atomic_t refcount; + struct list_head list; + + void *(*create)(gfp_t gfp, struct zpool_ops *ops); + void (*destroy)(void *pool); + + int (*malloc)(void *pool, size_t size, gfp_t gfp, + unsigned long *handle); + void (*free)(void *pool, unsigned long handle); + + int (*shrink)(void *pool, unsigned int pages, + unsigned int *reclaimed); + + void *(*map)(void *pool, unsigned long handle, + enum zpool_mapmode mm); + void (*unmap)(void *pool, unsigned long handle); + + u64 (*total_size)(void *pool); +}; + +void zpool_register_driver(struct zpool_driver *driver); + +int zpool_unregister_driver(struct zpool_driver *driver); + +int zpool_evict(void *pool, unsigned long handle); + +#endif diff --git a/mm/Kconfig b/mm/Kconfig index 86919079b64c..ac85efdeab45 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -478,6 +478,13 @@ config FRONTSWAP If unsure, say Y to enable frontswap. +config ZPOOL + tristate "Common API for compressed memory storage" + default n + help + Compressed memory storage API. This allows using either zbud or + zsmalloc. + config ZSMALLOC bool "Memory allocator for compressed pages" depends on MMU diff --git a/mm/Makefile b/mm/Makefile index b5ae0b0cc26c..fa2e7df37b85 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -58,4 +58,5 @@ obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o obj-$(CONFIG_CLEANCACHE) += cleancache.o obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o +obj-$(CONFIG_ZPOOL) += zpool.o obj-$(CONFIG_ZSMALLOC) += zsmalloc.o diff --git a/mm/zpool.c b/mm/zpool.c new file mode 100644 index 000000000000..e40612a1df00 --- /dev/null +++ b/mm/zpool.c @@ -0,0 +1,364 @@ +/* + * zpool memory storage api + * + * Copyright (C) 2014 Dan Streetman + * + * This is a common frontend for memory storage pool implementations. + * Typically, this is used to store compressed memory. 
+ */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include + +struct zpool { + char *type; + + struct zpool_driver *driver; + void *pool; + struct zpool_ops *ops; + + struct list_head list; +}; + +static LIST_HEAD(drivers_head); +static DEFINE_SPINLOCK(drivers_lock); + +static LIST_HEAD(pools_head); +static DEFINE_SPINLOCK(pools_lock); + +/** + * zpool_register_driver() - register a zpool implementation. + * @driver: driver to register + */ +void zpool_register_driver(struct zpool_driver *driver) +{ + spin_lock(&drivers_lock); + atomic_set(&driver->refcount, 0); + list_add(&driver->list, &drivers_head); + spin_unlock(&drivers_lock); +} +EXPORT_SYMBOL(zpool_register_driver); + +/** + * zpool_unregister_driver() - unregister a zpool implementation. + * @driver: driver to unregister. + * + * Module usage counting is used to prevent using a driver + * while/after unloading, so if this is called from module + * exit function, this should never fail; if called from + * other than the module exit function, and this returns + * failure, the driver is in use and must remain available. + */ +int zpool_unregister_driver(struct zpool_driver *driver) +{ + int ret = 0, refcount; + + spin_lock(&drivers_lock); + refcount = atomic_read(&driver->refcount); + WARN_ON(refcount < 0); + if (refcount > 0) + ret = -EBUSY; + else + list_del(&driver->list); + spin_unlock(&drivers_lock); + + return ret; +} +EXPORT_SYMBOL(zpool_unregister_driver); + +/** + * zpool_evict() - evict callback from a zpool implementation. + * @pool: pool to evict from. + * @handle: handle to evict. + * + * This can be used by zpool implementations to call the + * user's evict zpool_ops struct evict callback. 
+ */ +int zpool_evict(void *pool, unsigned long handle) +{ + struct zpool *zpool; + + spin_lock(&pools_lock); + list_for_each_entry(zpool, &pools_head, list) { + if (zpool->pool == pool) { + spin_unlock(&pools_lock); + if (!zpool->ops || !zpool->ops->evict) + return -EINVAL; + return zpool->ops->evict(zpool, handle); + } + } + spin_unlock(&pools_lock); + + return -ENOENT; +} +EXPORT_SYMBOL(zpool_evict); + +static struct zpool_driver *zpool_get_driver(char *type) +{ + struct zpool_driver *driver; + + spin_lock(&drivers_lock); + list_for_each_entry(driver, &drivers_head, list) { + if (!strcmp(driver->type, type)) { + bool got = try_module_get(driver->owner); + + if (got) + atomic_inc(&driver->refcount); + spin_unlock(&drivers_lock); + return got ? driver : NULL; + } + } + + spin_unlock(&drivers_lock); + return NULL; +} + +static void zpool_put_driver(struct zpool_driver *driver) +{ + atomic_dec(&driver->refcount); + module_put(driver->owner); +} + +/** + * zpool_create_pool() - Create a new zpool + * @type The type of the zpool to create (e.g. zbud, zsmalloc) + * @gfp The GFP flags to use when allocating the pool. + * @ops The optional ops callback. + * + * This creates a new zpool of the specified type. The gfp flags will be + * used when allocating memory, if the implementation supports it. If the + * ops param is NULL, then the created zpool will not be shrinkable. + * + * Implementations must guarantee this to be thread-safe. + * + * Returns: New zpool on success, NULL on failure. 
+ */ +struct zpool *zpool_create_pool(char *type, gfp_t gfp, struct zpool_ops *ops) +{ + struct zpool_driver *driver; + struct zpool *zpool; + + pr_info("creating pool type %s\n", type); + + driver = zpool_get_driver(type); + + if (!driver) { + request_module(type); + driver = zpool_get_driver(type); + } + + if (!driver) { + pr_err("no driver for type %s\n", type); + return NULL; + } + + zpool = kmalloc(sizeof(*zpool), gfp); + if (!zpool) { + pr_err("couldn't create zpool - out of memory\n"); + zpool_put_driver(driver); + return NULL; + } + + zpool->type = driver->type; + zpool->driver = driver; + zpool->pool = driver->create(gfp, ops); + zpool->ops = ops; + + if (!zpool->pool) { + pr_err("couldn't create %s pool\n", type); + zpool_put_driver(driver); + kfree(zpool); + return NULL; + } + + pr_info("created %s pool\n", type); + + spin_lock(&pools_lock); + list_add(&zpool->list, &pools_head); + spin_unlock(&pools_lock); + + return zpool; +} + +/** + * zpool_destroy_pool() - Destroy a zpool + * @pool The zpool to destroy. + * + * Implementations must guarantee this to be thread-safe, + * however only when destroying different pools. The same + * pool should only be destroyed once, and should not be used + * after it is destroyed. + * + * This destroys an existing zpool. The zpool should not be in use. + */ +void zpool_destroy_pool(struct zpool *zpool) +{ + pr_info("destroying pool type %s\n", zpool->type); + + spin_lock(&pools_lock); + list_del(&zpool->list); + spin_unlock(&pools_lock); + zpool->driver->destroy(zpool->pool); + zpool_put_driver(zpool->driver); + kfree(zpool); +} + +/** + * zpool_get_type() - Get the type of the zpool + * @pool The zpool to check + * + * This returns the type of the pool. + * + * Implementations must guarantee this to be thread-safe. + * + * Returns: The type of zpool. + */ +char *zpool_get_type(struct zpool *zpool) +{ + return zpool->type; +} + +/** + * zpool_malloc() - Allocate memory + * @pool The zpool to allocate from. 
+ * @size The amount of memory to allocate. + * @gfp The GFP flags to use when allocating memory. + * @handle Pointer to the handle to set + * + * This allocates the requested amount of memory from the pool. + * The gfp flags will be used when allocating memory, if the + * implementation supports it. The provided @handle will be + * set to the allocated object handle. + * + * Implementations must guarantee this to be thread-safe. + * + * Returns: 0 on success, negative value on error. + */ +int zpool_malloc(struct zpool *zpool, size_t size, gfp_t gfp, + unsigned long *handle) +{ + return zpool->driver->malloc(zpool->pool, size, gfp, handle); +} + +/** + * zpool_free() - Free previously allocated memory + * @pool The zpool that allocated the memory. + * @handle The handle to the memory to free. + * + * This frees previously allocated memory. This does not guarantee + * that the pool will actually free memory, only that the memory + * in the pool will become available for use by the pool. + * + * Implementations must guarantee this to be thread-safe, + * however only when freeing different handles. The same + * handle should only be freed once, and should not be used + * after freeing. + */ +void zpool_free(struct zpool *zpool, unsigned long handle) +{ + zpool->driver->free(zpool->pool, handle); +} + +/** + * zpool_shrink() - Shrink the pool size + * @pool The zpool to shrink. + * @pages The number of pages to shrink the pool. + * @reclaimed The number of pages successfully evicted. + * + * This attempts to shrink the actual memory size of the pool + * by evicting currently used handle(s). If the pool was + * created with no zpool_ops, or the evict call fails for any + * of the handles, this will fail. If non-NULL, the @reclaimed + * parameter will be set to the number of pages reclaimed, + * which may be more than the number of pages requested. + * + * Implementations must guarantee this to be thread-safe. 
+ * + * Returns: 0 on success, negative value on error/failure. + */ +int zpool_shrink(struct zpool *zpool, unsigned int pages, + unsigned int *reclaimed) +{ + return zpool->driver->shrink(zpool->pool, pages, reclaimed); +} + +/** + * zpool_map_handle() - Map a previously allocated handle into memory + * @pool The zpool that the handle was allocated from + * @handle The handle to map + * @mm How the memory should be mapped + * + * This maps a previously allocated handle into memory. The @mm + * param indicates to the implementation how the memory will be + * used, i.e. read-only, write-only, read-write. If the + * implementation does not support it, the memory will be treated + * as read-write. + * + * This may hold locks, disable interrupts, and/or preemption, + * and the zpool_unmap_handle() must be called to undo those + * actions. The code that uses the mapped handle should complete + * its operations on the mapped handle memory quickly and unmap + * as soon as possible. As the implementation may use per-cpu + * data, multiple handles should not be mapped concurrently on + * any cpu. + * + * Returns: A pointer to the handle's mapped memory area. + */ +void *zpool_map_handle(struct zpool *zpool, unsigned long handle, + enum zpool_mapmode mapmode) +{ + return zpool->driver->map(zpool->pool, handle, mapmode); +} + +/** + * zpool_unmap_handle() - Unmap a previously mapped handle + * @pool The zpool that the handle was allocated from + * @handle The handle to unmap + * + * This unmaps a previously mapped handle. Any locks or other + * actions that the implementation took in zpool_map_handle() + * will be undone here. The memory area returned from + * zpool_map_handle() should no longer be used after this.
+ */ +void zpool_unmap_handle(struct zpool *zpool, unsigned long handle) +{ + zpool->driver->unmap(zpool->pool, handle); +} + +/** + * zpool_get_total_size() - The total size of the pool + * @pool The zpool to check + * + * This returns the total size in bytes of the pool. + * + * Returns: Total size of the zpool in bytes. + */ +u64 zpool_get_total_size(struct zpool *zpool) +{ + return zpool->driver->total_size(zpool->pool); +} + +static int __init init_zpool(void) +{ + pr_info("loaded\n"); + return 0; +} + +static void __exit exit_zpool(void) +{ + pr_info("unloaded\n"); +} + +module_init(init_zpool); +module_exit(exit_zpool); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Dan Streetman "); +MODULE_DESCRIPTION("Common API for compressed memory storage"); diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index cee5c54c500f..3078eca4737d 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -239,7 +239,6 @@ struct mapping_area { enum zs_mapmode vm_mm; /* mapping mode */ }; - /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ static DEFINE_PER_CPU(struct mapping_area, zs_map_area); From 7b5c9b29fa4e0b9374511d6f76bd24ea5926ed29 Mon Sep 17 00:00:00 2001 From: Dan Streetman Date: Wed, 6 Aug 2014 16:08:38 -0700 Subject: [PATCH 65/82] mm/zpool: zbud/zsmalloc implement zpool Update zbud and zsmalloc to implement the zpool api. 
[fengguang.wu@intel.com: make functions static] Signed-off-by: Dan Streetman Tested-by: Seth Jennings Cc: Minchan Kim Cc: Nitin Gupta Cc: Weijie Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit c795779df29e180738568d2a5eb3a42f3b5e47f0) Signed-off-by: Alex Shi Conflicts: mm/zbud.c Conflicts solution: remove zbud --- mm/zsmalloc.c | 85 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 3078eca4737d..fc25b58a02f8 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -80,6 +80,7 @@ #include #include #include +#include /* * This must be power of 2 and greater than of equal to sizeof(link_free). @@ -239,6 +240,82 @@ struct mapping_area { enum zs_mapmode vm_mm; /* mapping mode */ }; +/* zpool driver */ + +#ifdef CONFIG_ZPOOL + +static void *zs_zpool_create(gfp_t gfp, struct zpool_ops *zpool_ops) +{ + return zs_create_pool(gfp); +} + +static void zs_zpool_destroy(void *pool) +{ + zs_destroy_pool(pool); +} + +static int zs_zpool_malloc(void *pool, size_t size, gfp_t gfp, + unsigned long *handle) +{ + *handle = zs_malloc(pool, size); + return *handle ? 
0 : -1; +} +static void zs_zpool_free(void *pool, unsigned long handle) +{ + zs_free(pool, handle); +} + +static int zs_zpool_shrink(void *pool, unsigned int pages, + unsigned int *reclaimed) +{ + return -EINVAL; +} + +static void *zs_zpool_map(void *pool, unsigned long handle, + enum zpool_mapmode mm) +{ + enum zs_mapmode zs_mm; + + switch (mm) { + case ZPOOL_MM_RO: + zs_mm = ZS_MM_RO; + break; + case ZPOOL_MM_WO: + zs_mm = ZS_MM_WO; + break; + case ZPOOL_MM_RW: /* fallthru */ + default: + zs_mm = ZS_MM_RW; + break; + } + + return zs_map_object(pool, handle, zs_mm); +} +static void zs_zpool_unmap(void *pool, unsigned long handle) +{ + zs_unmap_object(pool, handle); +} + +static u64 zs_zpool_total_size(void *pool) +{ + return zs_get_total_size_bytes(pool); +} + +static struct zpool_driver zs_zpool_driver = { + .type = "zsmalloc", + .owner = THIS_MODULE, + .create = zs_zpool_create, + .destroy = zs_zpool_destroy, + .malloc = zs_zpool_malloc, + .free = zs_zpool_free, + .shrink = zs_zpool_shrink, + .map = zs_zpool_map, + .unmap = zs_zpool_unmap, + .total_size = zs_zpool_total_size, +}; + +#endif /* CONFIG_ZPOOL */ + /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ static DEFINE_PER_CPU(struct mapping_area, zs_map_area); @@ -779,6 +856,10 @@ static void zs_exit(void) { int cpu; +#ifdef CONFIG_ZPOOL + zpool_unregister_driver(&zs_zpool_driver); +#endif + cpu_notifier_register_begin(); for_each_online_cpu(cpu) @@ -805,6 +886,10 @@ static int zs_init(void) cpu_notifier_register_done(); +#ifdef CONFIG_ZPOOL + zpool_register_driver(&zs_zpool_driver); +#endif + return 0; fail: zs_exit(); From 548233701e2be93471a703b490b6702c000d60bc Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 29 Aug 2014 15:18:37 -0700 Subject: [PATCH 66/82] zram: fix incorrect stat with failed_reads Since we allocate a temporary buffer in zram_bvec_read to handle partial page operations in commit 924bd88d703e ("Staging: zram: allow partial page operations"), our 
->failed_reads value may be incorrect as we do not increase its value when failing to allocate the temporary buffer. Let's fix this issue and correct the annotation of failed_reads. Signed-off-by: Chao Yu Acked-by: Minchan Kim Cc: Nitin Gupta Acked-by: Jerome Marchand Acked-by: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 0cf1e9d6c34d4c82ac3af8015594849814843d36) Signed-off-by: Alex Shi --- drivers/block/zram/zram_drv.c | 10 +++++++--- drivers/block/zram/zram_drv.h | 2 +- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index a21f466c4e15..600aeab83d2a 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -377,7 +377,6 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index) /* Should NEVER happen. Return bio error if it does. */ if (unlikely(ret)) { pr_err("Decompression failed! err=%d, page=%u\n", ret, index); - atomic64_inc(&zram->stats.failed_reads); return ret; } @@ -546,8 +545,6 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, zcomp_strm_release(zram->comp, zstrm); if (is_partial_io(bvec)) kfree(uncmem); - if (ret) - atomic64_inc(&zram->stats.failed_writes); return ret; } @@ -565,6 +562,13 @@ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, ret = zram_bvec_write(zram, bvec, index, offset); } + if (unlikely(ret)) { + if (rw == READ) + atomic64_inc(&zram->stats.failed_reads); + else + atomic64_inc(&zram->stats.failed_writes); + } + return ret; } diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 5b0afde729cd..e0f725c87cc6 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -84,7 +84,7 @@ struct zram_stats { atomic64_t compr_data_size; /* compressed size of pages stored */ atomic64_t num_reads; /* failed + successful */ atomic64_t num_writes; /* --do-- */ - 
atomic64_t failed_reads; /* should NEVER! happen */ + atomic64_t failed_reads; /* can happen when memory is too low */ atomic64_t failed_writes; /* can happen when memory is too low */ atomic64_t invalid_io; /* non-page-aligned I/O requests */ atomic64_t notify_free; /* no. of swap slot free notifications */ From 60fafac3c592636f0bbbc75e20e2d7dcc2284c20 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 29 Aug 2014 15:18:40 -0700 Subject: [PATCH 67/82] mm/zpool: use prefixed module loading To avoid potential format string expansion via module parameters, do not use the zpool type directly in request_module() without a format string. Additionally, to avoid arbitrary modules being loaded via zpool API (e.g. via the zswap_zpool_type module parameter) add a "zpool-" prefix to the requested module, as well as module aliases for the existing zpool types (zbud and zsmalloc). Signed-off-by: Kees Cook Cc: Seth Jennings Cc: Minchan Kim Cc: Nitin Gupta Acked-by: Dan Streetman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 137f8cff505ace6251dc442c7aa973d60c801a79) Signed-off-by: Alex Shi Conflicts: mm/zbud.c Conflicts solution: remove zbud --- mm/zpool.c | 2 +- mm/zsmalloc.c | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/zpool.c b/mm/zpool.c index e40612a1df00..739cdf0d183a 100644 --- a/mm/zpool.c +++ b/mm/zpool.c @@ -150,7 +150,7 @@ struct zpool *zpool_create_pool(char *type, gfp_t gfp, struct zpool_ops *ops) driver = zpool_get_driver(type); if (!driver) { - request_module(type); + request_module("zpool-%s", type); driver = zpool_get_driver(type); } diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index fc25b58a02f8..f40ff48433e2 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -314,6 +314,7 @@ static struct zpool_driver zs_zpool_driver = { .total_size = zs_zpool_total_size, }; +MODULE_ALIAS("zpool-zsmalloc"); #endif /* CONFIG_ZPOOL */ /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ From 
8ccca7290fdce49995c32fc514ff8281d476e417 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 9 Oct 2014 15:29:48 -0700 Subject: [PATCH 68/82] zsmalloc: move pages_allocated to zs_pool Currently, zram has no feature to limit memory so theoretically zram can deplete system memory. Users have asked for a limit several times as even without exhaustion zram makes it hard to control memory usage of the platform. This patchset adds the feature. Patch 1 makes zs_get_total_size_bytes faster because it would be used frequently in later patches for the new feature. Patch 2 changes zs_get_total_size_bytes's return unit from bytes to page so that zsmalloc doesn't need unnecessary operation(ie, << PAGE_SHIFT). Patch 3 adds new feature. I added the feature into zram layer, not zsmalloc because limitation is zram's requirement, not zsmalloc so any other user using zsmalloc(ie, zpool) shouldn't be affected by unnecessary branch of zsmalloc. In future, if every user of zsmalloc wants the feature, then, we could move the feature from client side to zsmalloc easily but vice versa would be painful. Patch 4 adds a new facility to report maximum memory usage of zram so that this avoids user polling frequently via /sys/block/zram0/mem_used_total and ensures transient max are not missed. This patch (of 4): pages_allocated has been counted in size_class structure and when user of zsmalloc want to see total_size_bytes, it should gather all of count from each size_class to report the sum. It's not bad if user don't see the value often but if user start to see the value frequently, it would be not a good deal for performance pov. This patch moves the count from size_class to zs_pool so it could reduce memory footprint (from [255 * 8byte] to [sizeof(atomic_long_t)]).
Signed-off-by: Minchan Kim Reviewed-by: Dan Streetman Cc: Sergey Senozhatsky Cc: Jerome Marchand Cc: Cc: Cc: Luigi Semenzato Cc: Nitin Gupta Cc: Seth Jennings Reviewed-by: David Horner Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 13de8933c96b4557f667c337676f05274e017f83) Signed-off-by: Alex Shi --- mm/zsmalloc.c | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index f40ff48433e2..261be4654848 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -187,9 +187,6 @@ struct size_class { spinlock_t lock; - /* stats */ - u64 pages_allocated; - struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS]; }; @@ -208,6 +205,7 @@ struct zs_pool { struct size_class size_class[ZS_SIZE_CLASSES]; gfp_t flags; /* allocation flags used when growing pool */ + atomic_long_t pages_allocated; }; /* @@ -995,8 +993,9 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size) return 0; set_zspage_mapping(first_page, class->index, ZS_EMPTY); + atomic_long_add(class->pages_per_zspage, + &pool->pages_allocated); spin_lock(&class->lock); - class->pages_allocated += class->pages_per_zspage; } obj = (unsigned long)first_page->freelist; @@ -1049,14 +1048,13 @@ void zs_free(struct zs_pool *pool, unsigned long obj) first_page->inuse--; fullness = fix_fullness_group(pool, first_page); - - if (fullness == ZS_EMPTY) - class->pages_allocated -= class->pages_per_zspage; - spin_unlock(&class->lock); - if (fullness == ZS_EMPTY) + if (fullness == ZS_EMPTY) { + atomic_long_sub(class->pages_per_zspage, + &pool->pages_allocated); free_zspage(first_page); + } } EXPORT_SYMBOL_GPL(zs_free); @@ -1152,12 +1150,7 @@ EXPORT_SYMBOL_GPL(zs_unmap_object); u64 zs_get_total_size_bytes(struct zs_pool *pool) { - int i; - u64 npages = 0; - - for (i = 0; i < ZS_SIZE_CLASSES; i++) - npages += pool->size_class[i].pages_allocated; - + u64 npages = atomic_long_read(&pool->pages_allocated); return npages << 
PAGE_SHIFT; } EXPORT_SYMBOL_GPL(zs_get_total_size_bytes); From 70a5d237c239ddde3c055784980635e39bd93885 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 9 Oct 2014 15:29:50 -0700 Subject: [PATCH 69/82] zsmalloc: change return value unit of zs_get_total_size_bytes zs_get_total_size_bytes returns a amount of memory zsmalloc consumed with *byte unit* but zsmalloc operates *page unit* rather than byte unit so let's change the API so benefit we could get is that reduce unnecessary overhead (ie, change page unit with byte unit) in zsmalloc. Since return type is pages, "zs_get_total_pages" is better than "zs_get_total_size_bytes". Signed-off-by: Minchan Kim Reviewed-by: Dan Streetman Cc: Sergey Senozhatsky Cc: Jerome Marchand Cc: Cc: Cc: Luigi Semenzato Cc: Nitin Gupta Cc: Seth Jennings Cc: David Horner Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 722cdc17232f0f684011407f7cf3c40d39457971) Signed-off-by: Alex Shi --- drivers/block/zram/zram_drv.c | 4 ++-- include/linux/zsmalloc.h | 2 +- mm/zsmalloc.c | 9 ++++----- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 600aeab83d2a..d0717743e2df 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -103,10 +103,10 @@ static ssize_t mem_used_total_show(struct device *dev, down_read(&zram->init_lock); if (init_done(zram)) - val = zs_get_total_size_bytes(meta->mem_pool); + val = zs_get_total_pages(meta->mem_pool); up_read(&zram->init_lock); - return scnprintf(buf, PAGE_SIZE, "%llu\n", val); + return scnprintf(buf, PAGE_SIZE, "%llu\n", val << PAGE_SHIFT); } static ssize_t max_comp_streams_show(struct device *dev, diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h index d3f48686bceb..ebb2841f752e 100644 --- a/include/linux/zsmalloc.h +++ b/include/linux/zsmalloc.h @@ -39,6 +39,6 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, enum 
zs_mapmode mm); void zs_unmap_object(struct zs_pool *pool, unsigned long handle); -u64 zs_get_total_size_bytes(struct zs_pool *pool); +unsigned long zs_get_total_pages(struct zs_pool *pool); #endif diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 261be4654848..a255c6e87cab 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -296,7 +296,7 @@ static void zs_zpool_unmap(void *pool, unsigned long handle) static u64 zs_zpool_total_size(void *pool) { - return zs_get_total_size_bytes(pool); + return zs_get_total_pages(pool) << PAGE_SHIFT; } static struct zpool_driver zs_zpool_driver = { @@ -1148,12 +1148,11 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) } EXPORT_SYMBOL_GPL(zs_unmap_object); -u64 zs_get_total_size_bytes(struct zs_pool *pool) +unsigned long zs_get_total_pages(struct zs_pool *pool) { - u64 npages = atomic_long_read(&pool->pages_allocated); - return npages << PAGE_SHIFT; + return atomic_long_read(&pool->pages_allocated); } -EXPORT_SYMBOL_GPL(zs_get_total_size_bytes); +EXPORT_SYMBOL_GPL(zs_get_total_pages); module_init(zs_init); module_exit(zs_exit); From 3c854b64635888527504dfc898687ab10dad6191 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 9 Oct 2014 15:29:53 -0700 Subject: [PATCH 70/82] zram: zram memory size limitation Since zram has no control feature to limit memory usage, it makes it hard to manage system memory. This patch adds new knob "mem_limit" via sysfs to set up a limit so that zram could fail allocation once it reaches the limit. In addition, user could change the limit in runtime so that he could manage the memory more dynamically. Initial state is no limit so it doesn't break old behavior.
[akpm@linux-foundation.org: fix typo, per Sergey] Signed-off-by: Minchan Kim Cc: Dan Streetman Cc: Sergey Senozhatsky Cc: Jerome Marchand Cc: Cc: Cc: Luigi Semenzato Cc: Nitin Gupta Cc: Seth Jennings Cc: David Horner Cc: Joonsoo Kim Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 9ada9da9573f3460b156b7755c093e30b258eacb) Signed-off-by: Alex Shi --- Documentation/ABI/testing/sysfs-block-zram | 9 +++++ Documentation/blockdev/zram.txt | 24 ++++++++++-- drivers/block/zram/zram_drv.c | 45 ++++++++++++++++++++++ drivers/block/zram/zram_drv.h | 5 +++ 4 files changed, 79 insertions(+), 4 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-block-zram b/Documentation/ABI/testing/sysfs-block-zram index 2775966c2d12..f861cbfab6b1 100644 --- a/Documentation/ABI/testing/sysfs-block-zram +++ b/Documentation/ABI/testing/sysfs-block-zram @@ -127,3 +127,12 @@ Description: efficiency can be calculated using compr_data_size and this statistic. Unit: bytes + +What: /sys/block/zram/mem_limit +Date: August 2014 +Contact: Minchan Kim +Description: + The mem_limit file is read/write and specifies the maximum + amount of memory ZRAM can use to store the compressed data. The + limit could be changed in run time and "0" means disable the + limit. No limit is the initial state. Unit: bytes diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt index 2db1687a4b10..4331ebf94bf0 100644 --- a/Documentation/blockdev/zram.txt +++ b/Documentation/blockdev/zram.txt @@ -74,14 +74,30 @@ There is little point creating a zram of greater than twice the size of memory since we expect a 2:1 compression ratio. Note that zram uses about 0.1% of the size of the disk when not in use so a huge zram is wasteful. -5) Activate: +5) Set memory limit: Optional + Set memory limit by writing the value to sysfs node 'mem_limit'. + The value can be either in bytes or you can use mem suffixes. 
+ In addition, you could change the value in runtime. + Examples: + # limit /dev/zram0 with 50MB memory + echo $((50*1024*1024)) > /sys/block/zram0/mem_limit + + # Using mem suffixes + echo 256K > /sys/block/zram0/mem_limit + echo 512M > /sys/block/zram0/mem_limit + echo 1G > /sys/block/zram0/mem_limit + + # To disable memory limit + echo 0 > /sys/block/zram0/mem_limit + +6) Activate: mkswap /dev/zram0 swapon /dev/zram0 mkfs.ext4 /dev/zram1 mount /dev/zram1 /tmp -6) Stats: +7) Stats: Per-device statistics are exported as various nodes under /sys/block/zram/ disksize @@ -95,11 +111,11 @@ size of the disk when not in use so a huge zram is wasteful. compr_data_size mem_used_total -7) Deactivate: +8) Deactivate: swapoff /dev/zram0 umount /dev/zram1 -8) Reset: +9) Reset: Write any positive value to 'reset' sysfs node echo 1 > /sys/block/zram0/reset echo 1 > /sys/block/zram1/reset diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index d0717743e2df..3f4da06c89c0 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -122,6 +122,37 @@ static ssize_t max_comp_streams_show(struct device *dev, return scnprintf(buf, PAGE_SIZE, "%d\n", val); } +static ssize_t mem_limit_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + u64 val; + struct zram *zram = dev_to_zram(dev); + + down_read(&zram->init_lock); + val = zram->limit_pages; + up_read(&zram->init_lock); + + return scnprintf(buf, PAGE_SIZE, "%llu\n", val << PAGE_SHIFT); +} + +static ssize_t mem_limit_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + u64 limit; + char *tmp; + struct zram *zram = dev_to_zram(dev); + + limit = memparse(buf, &tmp); + if (buf == tmp) /* no chars parsed, invalid input */ + return -EINVAL; + + down_write(&zram->init_lock); + zram->limit_pages = PAGE_ALIGN(limit) >> PAGE_SHIFT; + up_write(&zram->init_lock); + + return len; +} + static ssize_t max_comp_streams_store(struct device *dev, 
struct device_attribute *attr, const char *buf, size_t len) { @@ -512,6 +543,14 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, ret = -ENOMEM; goto out; } + + if (zram->limit_pages && + zs_get_total_pages(meta->mem_pool) > zram->limit_pages) { + zs_free(meta->mem_pool, handle); + ret = -ENOMEM; + goto out; + } + cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_WO); if ((clen == PAGE_SIZE) && !is_partial_io(bvec)) { @@ -616,6 +655,9 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity) struct zram_meta *meta; down_write(&zram->init_lock); + + zram->limit_pages = 0; + if (!init_done(zram)) { up_write(&zram->init_lock); return; @@ -855,6 +897,8 @@ static DEVICE_ATTR(initstate, S_IRUGO, initstate_show, NULL); static DEVICE_ATTR(reset, S_IWUSR, NULL, reset_store); static DEVICE_ATTR(orig_data_size, S_IRUGO, orig_data_size_show, NULL); static DEVICE_ATTR(mem_used_total, S_IRUGO, mem_used_total_show, NULL); +static DEVICE_ATTR(mem_limit, S_IRUGO | S_IWUSR, mem_limit_show, + mem_limit_store); static DEVICE_ATTR(max_comp_streams, S_IRUGO | S_IWUSR, max_comp_streams_show, max_comp_streams_store); static DEVICE_ATTR(comp_algorithm, S_IRUGO | S_IWUSR, @@ -883,6 +927,7 @@ static struct attribute *zram_disk_attrs[] = { &dev_attr_orig_data_size.attr, &dev_attr_compr_data_size.attr, &dev_attr_mem_used_total.attr, + &dev_attr_mem_limit.attr, &dev_attr_max_comp_streams.attr, &dev_attr_comp_algorithm.attr, NULL, diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index e0f725c87cc6..b7aa9c21553f 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -112,6 +112,11 @@ struct zram { u64 disksize; /* bytes */ int max_comp_streams; struct zram_stats stats; + /* + * the number of pages zram can consume for storing compressed data + */ + unsigned long limit_pages; + char compressor[10]; }; #endif From cda6b06454ce34ec784cc45c427eb4b0131581e1 Mon Sep 17 00:00:00 2001 From: Minchan Kim 
Date: Thu, 9 Oct 2014 15:29:55 -0700 Subject: [PATCH 71/82] zram: report maximum used memory Normally, zram user could get maximum memory usage zram consumed via polling mem_used_total with sysfs in userspace. But it has a critical problem because user can miss peak memory usage during update interval of polling. For avoiding that, user should poll it with shorter interval(ie, 0.0000000001s) with mlocking to avoid page fault delay when memory pressure is heavy. It would be troublesome. This patch adds new knob "mem_used_max" so user could see the maximum memory usage easily via reading the knob and reset it via "echo 0 > /sys/block/zram0/mem_used_max". Signed-off-by: Minchan Kim Reviewed-by: Dan Streetman Cc: Sergey Senozhatsky Cc: Jerome Marchand Cc: Cc: Cc: Luigi Semenzato Cc: Nitin Gupta Cc: Seth Jennings Reviewed-by: David Horner Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 461a8eee6af3b55745be64bea403ed0b743563cf) Signed-off-by: Alex Shi --- Documentation/ABI/testing/sysfs-block-zram | 10 ++++ Documentation/blockdev/zram.txt | 1 + drivers/block/zram/zram_drv.c | 60 +++++++++++++++++++++- drivers/block/zram/zram_drv.h | 1 + 4 files changed, 70 insertions(+), 2 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-block-zram b/Documentation/ABI/testing/sysfs-block-zram index f861cbfab6b1..31db44f01936 100644 --- a/Documentation/ABI/testing/sysfs-block-zram +++ b/Documentation/ABI/testing/sysfs-block-zram @@ -128,6 +128,16 @@ Description: statistic. Unit: bytes +What: /sys/block/zram/mem_used_max +Date: August 2014 +Contact: Minchan Kim +Description: + The mem_used_max file is read/write and specifies the amount + of maximum memory zram have consumed to store compressed data. + For resetting the value, you should write "0". Otherwise, + you could see -EINVAL.
+ Unit: bytes + What: /sys/block/zram/mem_limit Date: August 2014 Contact: Minchan Kim diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt index 4331ebf94bf0..5cd0bd903f54 100644 --- a/Documentation/blockdev/zram.txt +++ b/Documentation/blockdev/zram.txt @@ -110,6 +110,7 @@ size of the disk when not in use so a huge zram is wasteful. orig_data_size compr_data_size mem_used_total + mem_used_max 8) Deactivate: swapoff /dev/zram0 diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 3f4da06c89c0..204b7fa9d78e 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -153,6 +153,41 @@ static ssize_t mem_limit_store(struct device *dev, return len; } +static ssize_t mem_used_max_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + u64 val = 0; + struct zram *zram = dev_to_zram(dev); + + down_read(&zram->init_lock); + if (init_done(zram)) + val = atomic_long_read(&zram->stats.max_used_pages); + up_read(&zram->init_lock); + + return scnprintf(buf, PAGE_SIZE, "%llu\n", val << PAGE_SHIFT); +} + +static ssize_t mem_used_max_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + int err; + unsigned long val; + struct zram *zram = dev_to_zram(dev); + struct zram_meta *meta = zram->meta; + + err = kstrtoul(buf, 10, &val); + if (err || val != 0) + return -EINVAL; + + down_read(&zram->init_lock); + if (init_done(zram)) + atomic_long_set(&zram->stats.max_used_pages, + zs_get_total_pages(meta->mem_pool)); + up_read(&zram->init_lock); + + return len; +} + static ssize_t max_comp_streams_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { @@ -464,6 +499,21 @@ static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, return ret; } +static inline void update_used_max(struct zram *zram, + const unsigned long pages) +{ + int old_max, cur_max; + + old_max = atomic_long_read(&zram->stats.max_used_pages); + + do 
{ + cur_max = old_max; + if (pages > cur_max) + old_max = atomic_long_cmpxchg( + &zram->stats.max_used_pages, cur_max, pages); + } while (old_max != cur_max); +} + static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, int offset) { @@ -475,6 +525,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, struct zram_meta *meta = zram->meta; struct zcomp_strm *zstrm; bool locked = false; + unsigned long alloced_pages; page = bvec->bv_page; if (is_partial_io(bvec)) { @@ -544,13 +595,15 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, goto out; } - if (zram->limit_pages && - zs_get_total_pages(meta->mem_pool) > zram->limit_pages) { + alloced_pages = zs_get_total_pages(meta->mem_pool); + if (zram->limit_pages && alloced_pages > zram->limit_pages) { zs_free(meta->mem_pool, handle); ret = -ENOMEM; goto out; } + update_used_max(zram, alloced_pages); + cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_WO); if ((clen == PAGE_SIZE) && !is_partial_io(bvec)) { @@ -899,6 +952,8 @@ static DEVICE_ATTR(orig_data_size, S_IRUGO, orig_data_size_show, NULL); static DEVICE_ATTR(mem_used_total, S_IRUGO, mem_used_total_show, NULL); static DEVICE_ATTR(mem_limit, S_IRUGO | S_IWUSR, mem_limit_show, mem_limit_store); +static DEVICE_ATTR(mem_used_max, S_IRUGO | S_IWUSR, mem_used_max_show, + mem_used_max_store); static DEVICE_ATTR(max_comp_streams, S_IRUGO | S_IWUSR, max_comp_streams_show, max_comp_streams_store); static DEVICE_ATTR(comp_algorithm, S_IRUGO | S_IWUSR, @@ -928,6 +983,7 @@ static struct attribute *zram_disk_attrs[] = { &dev_attr_compr_data_size.attr, &dev_attr_mem_used_total.attr, &dev_attr_mem_limit.attr, + &dev_attr_mem_used_max.attr, &dev_attr_max_comp_streams.attr, &dev_attr_comp_algorithm.attr, NULL, diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index b7aa9c21553f..c6ee271317f5 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -90,6 
+90,7 @@ struct zram_stats { atomic64_t notify_free; /* no. of swap slot free notifications */ atomic64_t zero_pages; /* no. of zero filled pages */ atomic64_t pages_stored; /* no. of pages currently stored */ + atomic_long_t max_used_pages; /* no. of maximum pages stored */ }; struct zram_meta { From c85f1d31057bdde75716b19e6456a91c5c2273da Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Thu, 9 Oct 2014 15:29:57 -0700 Subject: [PATCH 72/82] zram: use notify_free to account all free notifications `notify_free' device attribute accounts the number of slot free notifications and internally represents the number of zram_free_page() calls. Slot free notifications are sent only when device is used as a swap device, hence `notify_free' is used only for swap devices. Since f4659d8e620d08 (zram: support REQ_DISCARD) ZRAM handles yet another one free notification (also via zram_free_page() call) -- REQ_DISCARD requests, which are sent by a filesystem, whenever some data blocks are discarded. However, there is no way to know the number of notifications in the latter case. Use `notify_free' to account the number of pages freed by zram_bio_discard() and zram_slot_free_notify(). 
Depending on usage scenario `notify_free' represents: a) the number of pages freed because of slot free notifications, which is equal to the number of swap_slot_free_notify() calls, so there is no behaviour change b) the number of pages freed because of REQ_DISCARD notifications Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Acked-by: Jerome Marchand Cc: Nitin Gupta Cc: Chao Yu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 015254daf1753003c19c46b90ee85a963260d270) Signed-off-by: Alex Shi Conflicts: Documentation/ABI/testing/sysfs-block-zram --- Documentation/ABI/testing/sysfs-block-zram | 13 ++++++++----- drivers/block/zram/zram_drv.c | 1 + 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-block-zram b/Documentation/ABI/testing/sysfs-block-zram index 31db44f01936..0c7f4f91c6b5 100644 --- a/Documentation/ABI/testing/sysfs-block-zram +++ b/Documentation/ABI/testing/sysfs-block-zram @@ -76,11 +76,14 @@ What: /sys/block/zram/notify_free Date: August 2010 Contact: Nitin Gupta Description: - The notify_free file is read-only and specifies the number of - swap slot free notifications received by this device. These - notifications are send to a swap block device when a swap slot - is freed. This statistic is applicable only when this disk is - being used as a swap disk. + The notify_free file is read-only. Depending on device usage + scenario it may account a) the number of pages freed because + of swap slot free notifications or b) the number of pages freed + because of REQ_DISCARD requests sent by bio. The former ones + are sent to a swap block device when a swap slot is freed, which + implies that this disk is being used as a swap disk. The latter + ones are sent by filesystem mounted with discard option, + whenever some data blocks are getting discarded. 
What: /sys/block/zram/discard Date: August 2010 diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 204b7fa9d78e..3503019a9672 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -697,6 +697,7 @@ static void zram_bio_discard(struct zram *zram, u32 index, bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); zram_free_page(zram, index); bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); + atomic64_inc(&zram->stats.notify_free); index++; n -= PAGE_SIZE; } From b4f773eb2609e4b4fdf1e4ba11fd5dbc4eb2e655 Mon Sep 17 00:00:00 2001 From: Wang Sheng-Hui Date: Thu, 9 Oct 2014 15:29:59 -0700 Subject: [PATCH 73/82] mm/zsmalloc.c: correct comment for fullness group computation The letter 'f' in "n <= N/f" stands for fullness_threshold_frac, not 1/fullness_threshold_frac. Signed-off-by: Wang Sheng-Hui Acked-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 6dd9737e31504f9377a8a19810ea4922e88516c1) Signed-off-by: Alex Shi --- mm/zsmalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index a255c6e87cab..a4556ec316e4 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -163,7 +163,7 @@ enum fullness_group { * n <= N / f, where * n = number of allocated objects * N = total number of objects zspage can store - * f = 1/fullness_threshold_frac + * f = fullness_threshold_frac * * Similarly, we assign zspage to: * ZS_ALMOST_FULL when n > N / f From b900447eb093095e133ffacb6a7de29660e09eae Mon Sep 17 00:00:00 2001 From: Dan Streetman Date: Thu, 9 Oct 2014 15:30:01 -0700 Subject: [PATCH 74/82] zsmalloc: simplify init_zspage free obj linking Change zsmalloc init_zspage() logic to iterate through each object on each of its pages, checking the offset to verify the object is on the current page before linking it into the zspage. 
The current zsmalloc init_zspage free object linking code has logic that relies on there only being one page per zspage when PAGE_SIZE is a multiple of class->size. It calculates the number of objects for the current page, and iterates through all of them plus one, to account for the assumed partial object at the end of the page. While this currently works, the logic can be simplified to just link the object at each successive offset until the offset is larger than PAGE_SIZE, which does not rely on PAGE_SIZE being a multiple of class->size. Signed-off-by: Dan Streetman Acked-by: Minchan Kim Cc: Sergey Senozhatsky Cc: Nitin Gupta Cc: Seth Jennings Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 5538c562377580947916b3366898f1eb5f53768e) Signed-off-by: Alex Shi --- mm/zsmalloc.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index a4556ec316e4..00c28039b06a 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -594,7 +594,7 @@ static void init_zspage(struct page *first_page, struct size_class *class) while (page) { struct page *next_page; struct link_free *link; - unsigned int i, objs_on_page; + unsigned int i = 1; /* * page->index stores offset of first object starting @@ -607,14 +607,10 @@ static void init_zspage(struct page *first_page, struct size_class *class) link = (struct link_free *)kmap_atomic(page) + off / sizeof(*link); - objs_on_page = (PAGE_SIZE - off) / class->size; - for (i = 1; i <= objs_on_page; i++) { - off += class->size; - if (off < PAGE_SIZE) { - link->next = obj_location_to_handle(page, i); - link += class->size / sizeof(*link); - } + while ((off += class->size) < PAGE_SIZE) { + link->next = obj_location_to_handle(page, i++); + link += class->size / sizeof(*link); } /* @@ -626,7 +622,7 @@ static void init_zspage(struct page *first_page, struct size_class *class) link->next = obj_location_to_handle(next_page, 0); kunmap_atomic(link); page = 
next_page; - off = (off + class->size) % PAGE_SIZE; + off %= PAGE_SIZE; } } From ea56241654a2c7ef9a1122a7ad5c751527ed9b17 Mon Sep 17 00:00:00 2001 From: Weijie Yang Date: Wed, 29 Oct 2014 14:50:57 -0700 Subject: [PATCH 75/82] zram: avoid NULL pointer access in concurrent situation There is a rare NULL pointer bug in mem_used_total_show() and mem_used_max_store() in concurrent situation, like this: zram is not initialized, process A is a mem_used_total reader which runs periodically, while process B try to init zram. process A process B access meta, get a NULL value init zram, done init_done() is true access meta->mem_pool, get a NULL pointer BUG This patch fixes this issue. Signed-off-by: Weijie Yang Acked-by: Minchan Kim Acked-by: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 5a99e95b8d1cd47f6feddcdca6c71d22060df8a2) Signed-off-by: Alex Shi --- drivers/block/zram/zram_drv.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 3503019a9672..bf9fea268db4 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -99,11 +99,12 @@ static ssize_t mem_used_total_show(struct device *dev, { u64 val = 0; struct zram *zram = dev_to_zram(dev); - struct zram_meta *meta = zram->meta; down_read(&zram->init_lock); - if (init_done(zram)) + if (init_done(zram)) { + struct zram_meta *meta = zram->meta; val = zs_get_total_pages(meta->mem_pool); + } up_read(&zram->init_lock); return scnprintf(buf, PAGE_SIZE, "%llu\n", val << PAGE_SHIFT); @@ -173,16 +174,17 @@ static ssize_t mem_used_max_store(struct device *dev, int err; unsigned long val; struct zram *zram = dev_to_zram(dev); - struct zram_meta *meta = zram->meta; err = kstrtoul(buf, 10, &val); if (err || val != 0) return -EINVAL; down_read(&zram->init_lock); - if (init_done(zram)) + if (init_done(zram)) { + struct zram_meta *meta = zram->meta; 
atomic_long_set(&zram->stats.max_used_pages, zs_get_total_pages(meta->mem_pool)); + } up_read(&zram->init_lock); return len; From 7aecdc886e2e9bacfaf2377067072fa63d649b21 Mon Sep 17 00:00:00 2001 From: Weijie Yang Date: Thu, 13 Nov 2014 15:19:05 -0800 Subject: [PATCH 76/82] zram: avoid kunmap_atomic() of a NULL pointer zram could kunmap_atomic() a NULL pointer in a rare situation: a zram page becomes a full-zeroed page after a partial write io. The current code doesn't handle this case and performs kunmap_atomic() on a NULL pointer, which panics the kernel. This patch fixes this issue. Signed-off-by: Weijie Yang Cc: Sergey Senozhatsky Cc: Dan Streetman Cc: Nitin Gupta Cc: Weijie Yang Acked-by: Jerome Marchand Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit c406515239376fc93a30d5d03192182160cbd3fb) Signed-off-by: Alex Shi --- drivers/block/zram/zram_drv.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index bf9fea268db4..8a1266ce8bbe 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -559,7 +559,8 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, } if (page_zero_filled(uncmem)) { - kunmap_atomic(user_mem); + if (user_mem) + kunmap_atomic(user_mem); /* Free memory associated with this sector now. */ bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); zram_free_page(zram, index); From 48eaa2e2b66fffbfa6eefa46c34cfbf47ccc1fce Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Fri, 12 Dec 2014 16:56:44 -0800 Subject: [PATCH 77/82] zsmalloc: merge size_class to reduce fragmentation zsmalloc has many size_classes to reduce fragmentation and they are in 16 bytes unit, for example, 16, 32, 48, etc., if PAGE_SIZE is 4096. And, zsmalloc has constraint that each zspage has 4 pages at maximum. In this situation, we can see interesting aspect. 
Let's think about the size_classes for 1488, 1472, ..., 1376. To prevent external fragmentation, they use 4 pages per zspage and so they can all contain 11 objects at maximum. 16384 (4096 * 4) = 1488 * 11 + remains 16384 (4096 * 4) = 1472 * 11 + remains 16384 (4096 * 4) = ... 16384 (4096 * 4) = 1376 * 11 + remains It means that they have the same characteristics and classification between them isn't needed. If we use one size_class for them, we can reduce fragmentation and save some memory. Since both the 1488 and 1472 sized classes can only fit 11 objects into 4 pages, and an object that's 1472 bytes can fit into an object that's 1488 bytes, merging these classes to always use objects that are 1488 bytes will reduce the total number of size classes. And reducing the total number of size classes reduces overall fragmentation, because a wider range of compressed pages can fit into a single size class, leaving fewer unused objects in each size class. For this purpose, this patch implements size_class merging. If there is a size_class that has the same pages_per_zspage and the same number of objects per zspage as the previous size_class, we don't create a new size_class. Instead, we reuse the previous size_class, which has the same characteristics. This way, the above example sizes (1488, 1472, ..., 1376) use just one size_class, so we can get much better memory utilization. Below is the result of my simple test. TEST ENV: EXT4 on zram, mount with discard option WORKLOAD: untar kernel source code, remove directory in descending order in size. (drivers arch fs sound include net Documentation firmware kernel tools) Each line represents orig_data_size, compr_data_size, mem_used_total, fragmentation overhead (mem_used - compr_data_size) and overhead ratio (overhead to compr_data_size), respectively, after untar and remove operation is executed.
* untar-nomerge.out orig_size compr_size used_size overhead overhead_ratio 525.88MB 199.16MB 210.23MB 11.08MB 5.56% 288.32MB 97.43MB 105.63MB 8.20MB 8.41% 177.32MB 61.12MB 69.40MB 8.28MB 13.55% 146.47MB 47.32MB 56.10MB 8.78MB 18.55% 124.16MB 38.85MB 48.41MB 9.55MB 24.58% 103.93MB 31.68MB 40.93MB 9.25MB 29.21% 84.34MB 22.86MB 32.72MB 9.86MB 43.13% 66.87MB 14.83MB 23.83MB 9.00MB 60.70% 60.67MB 11.11MB 18.60MB 7.49MB 67.48% 55.86MB 8.83MB 16.61MB 7.77MB 88.03% 53.32MB 8.01MB 15.32MB 7.31MB 91.24% * untar-merge.out orig_size compr_size used_size overhead overhead_ratio 526.23MB 199.18MB 209.81MB 10.64MB 5.34% 288.68MB 97.45MB 104.08MB 6.63MB 6.80% 177.68MB 61.14MB 66.93MB 5.79MB 9.47% 146.83MB 47.34MB 52.79MB 5.45MB 11.51% 124.52MB 38.87MB 44.30MB 5.43MB 13.96% 104.29MB 31.70MB 36.83MB 5.13MB 16.19% 84.70MB 22.88MB 27.92MB 5.04MB 22.04% 67.11MB 14.83MB 19.26MB 4.43MB 29.86% 60.82MB 11.10MB 14.90MB 3.79MB 34.17% 55.90MB 8.82MB 12.61MB 3.79MB 42.97% 53.32MB 8.01MB 11.73MB 3.73MB 46.53% As you can see above result, merged one has better utilization (overhead ratio, 5th column) and uses less memory (mem_used_total, 3rd column). 
Signed-off-by: Joonsoo Kim Cc: Minchan Kim Cc: Nitin Gupta Cc: Jerome Marchand Cc: Sergey Senozhatsky Reviewed-by: Dan Streetman Cc: Luigi Semenzato Cc: Cc: "seungho1.park" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 9eec4cd53f9865b733dc78cf5f6465871beed014) Signed-off-by: Alex Shi --- mm/zsmalloc.c | 80 ++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 66 insertions(+), 14 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 00c28039b06a..91eac3cf17e8 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -202,7 +202,7 @@ struct link_free { }; struct zs_pool { - struct size_class size_class[ZS_SIZE_CLASSES]; + struct size_class *size_class[ZS_SIZE_CLASSES]; gfp_t flags; /* allocation flags used when growing pool */ atomic_long_t pages_allocated; @@ -434,7 +434,7 @@ static enum fullness_group fix_fullness_group(struct zs_pool *pool, if (newfg == currfg) goto out; - class = &pool->size_class[class_idx]; + class = pool->size_class[class_idx]; remove_zspage(page, class, currfg); insert_zspage(page, class, newfg); set_zspage_mapping(page, class_idx, newfg); @@ -891,6 +891,23 @@ static int zs_init(void) return notifier_to_errno(ret); } +static unsigned int get_maxobj_per_zspage(int size, int pages_per_zspage) +{ + return pages_per_zspage * PAGE_SIZE / size; +} + +static bool can_merge(struct size_class *prev, int size, int pages_per_zspage) +{ + if (prev->pages_per_zspage != pages_per_zspage) + return false; + + if (get_maxobj_per_zspage(prev->size, prev->pages_per_zspage) + != get_maxobj_per_zspage(size, pages_per_zspage)) + return false; + + return true; +} + /** * zs_create_pool - Creates an allocation pool to work from. 
* @flags: allocation flags used to allocate pool metadata @@ -911,25 +928,56 @@ struct zs_pool *zs_create_pool(gfp_t flags) if (!pool) return NULL; - for (i = 0; i < ZS_SIZE_CLASSES; i++) { + /* + * Iterate reversly, because, size of size_class that we want to use + * for merging should be larger or equal to current size. + */ + for (i = ZS_SIZE_CLASSES - 1; i >= 0; i--) { int size; + int pages_per_zspage; struct size_class *class; + struct size_class *prev_class; size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA; if (size > ZS_MAX_ALLOC_SIZE) size = ZS_MAX_ALLOC_SIZE; + pages_per_zspage = get_pages_per_zspage(size); + + /* + * size_class is used for normal zsmalloc operation such + * as alloc/free for that size. Although it is natural that we + * have one size_class for each size, there is a chance that we + * can get more memory utilization if we use one size_class for + * many different sizes whose size_class have same + * characteristics. So, we makes size_class point to + * previous size_class if possible. 
+ */ + if (i < ZS_SIZE_CLASSES - 1) { + prev_class = pool->size_class[i + 1]; + if (can_merge(prev_class, size, pages_per_zspage)) { + pool->size_class[i] = prev_class; + continue; + } + } + + class = kzalloc(sizeof(struct size_class), GFP_KERNEL); + if (!class) + goto err; - class = &pool->size_class[i]; class->size = size; class->index = i; + class->pages_per_zspage = pages_per_zspage; spin_lock_init(&class->lock); - class->pages_per_zspage = get_pages_per_zspage(size); - + pool->size_class[i] = class; } pool->flags = flags; return pool; + +err: + zs_destroy_pool(pool); + return NULL; } EXPORT_SYMBOL_GPL(zs_create_pool); @@ -939,7 +987,13 @@ void zs_destroy_pool(struct zs_pool *pool) for (i = 0; i < ZS_SIZE_CLASSES; i++) { int fg; - struct size_class *class = &pool->size_class[i]; + struct size_class *class = pool->size_class[i]; + + if (!class) + continue; + + if (class->index != i) + continue; for (fg = 0; fg < _ZS_NR_FULLNESS_GROUPS; fg++) { if (class->fullness_list[fg]) { @@ -948,6 +1002,7 @@ void zs_destroy_pool(struct zs_pool *pool) class->size, fg); } } + kfree(class); } kfree(pool); } @@ -966,7 +1021,6 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size) { unsigned long obj; struct link_free *link; - int class_idx; struct size_class *class; struct page *first_page, *m_page; @@ -975,9 +1029,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size) if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE)) return 0; - class_idx = get_size_class_index(size); - class = &pool->size_class[class_idx]; - BUG_ON(class_idx != class->index); + class = pool->size_class[get_size_class_index(size)]; spin_lock(&class->lock); first_page = find_get_zspage(class); @@ -1030,7 +1082,7 @@ void zs_free(struct zs_pool *pool, unsigned long obj) first_page = get_first_page(f_page); get_zspage_mapping(first_page, &class_idx, &fullness); - class = &pool->size_class[class_idx]; + class = pool->size_class[class_idx]; f_offset = obj_idx_to_offset(f_page, f_objidx, class->size); 
spin_lock(&class->lock); @@ -1091,7 +1143,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, obj_handle_to_location(handle, &page, &obj_idx); get_zspage_mapping(get_first_page(page), &class_idx, &fg); - class = &pool->size_class[class_idx]; + class = pool->size_class[class_idx]; off = obj_idx_to_offset(page, obj_idx, class->size); area = &get_cpu_var(zs_map_area); @@ -1125,7 +1177,7 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) obj_handle_to_location(handle, &page, &obj_idx); get_zspage_mapping(get_first_page(page), &class_idx, &fg); - class = &pool->size_class[class_idx]; + class = pool->size_class[class_idx]; off = obj_idx_to_offset(page, obj_idx, class->size); area = &__get_cpu_var(zs_map_area); From 211d013d9e8abbd172c43ace2b6d5ed928a4dd6a Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Fri, 12 Dec 2014 16:56:56 -0800 Subject: [PATCH 78/82] zsmalloc: fix zs_init cpu notifier error handling Mahendran Ganesh reported that zpool-enabled zsmalloc should not call zpool_unregister_driver() from zs_init() if cpu notifier registration has failed, because error handling is performed before we register the driver via zpool_register_driver() call. Factor out cpu notifier registration and unregistration code and fix zs_init() error handling. 
link: http://lkml.iu.edu//hypermail/linux/kernel/1411.1/04156.html [akpm@linux-foundation.org: squash bogus gcc warning] [akpm@linux-foundation.org: use __init and __exit] Signed-off-by: Sergey Senozhatsky Reported-by: Mahendran Ganesh Cc: Minchan Kim Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit b1b00a5b8a6cf32e3973507decf1216709b55072) Signed-off-by: Alex Shi --- mm/zsmalloc.c | 39 ++++++++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 91eac3cf17e8..f3d9a14a23f6 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -847,14 +847,10 @@ static struct notifier_block zs_cpu_nb = { .notifier_call = zs_cpu_notifier }; -static void zs_exit(void) +static void zs_unregister_cpu_notifier(void) { int cpu; -#ifdef CONFIG_ZPOOL - zpool_unregister_driver(&zs_zpool_driver); -#endif - cpu_notifier_register_begin(); for_each_online_cpu(cpu) @@ -864,31 +860,44 @@ static void zs_exit(void) cpu_notifier_register_done(); } -static int zs_init(void) +static int zs_register_cpu_notifier(void) { - int cpu, ret; + int cpu, uninitialized_var(ret); cpu_notifier_register_begin(); __register_cpu_notifier(&zs_cpu_nb); for_each_online_cpu(cpu) { ret = zs_cpu_notifier(NULL, CPU_UP_PREPARE, (void *)(long)cpu); - if (notifier_to_errno(ret)) { - cpu_notifier_register_done(); - goto fail; - } + if (notifier_to_errno(ret)) + break; } cpu_notifier_register_done(); + return notifier_to_errno(ret); +} + +static void __exit zs_exit(void) +{ +#ifdef CONFIG_ZPOOL + zpool_unregister_driver(&zs_zpool_driver); +#endif + zs_unregister_cpu_notifier(); +} + +static int __init zs_init(void) +{ + int ret = zs_register_cpu_notifier(); + + if (ret) { + zs_unregister_cpu_notifier(); + return ret; + } #ifdef CONFIG_ZPOOL zpool_register_driver(&zs_zpool_driver); #endif - return 0; -fail: - zs_exit(); - return notifier_to_errno(ret); } static unsigned int get_maxobj_per_zspage(int size, 
int pages_per_zspage) From 5b0af867b453597531c989254f75be634b8e9197 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Fri, 12 Dec 2014 16:56:58 -0800 Subject: [PATCH 79/82] zsmalloc: correct fragile [kmap|kunmap]_atomic use The kunmap_atomic should use virtual address getting by kmap_atomic. However, some pieces of code in zsmalloc uses modified address, not the one got by kmap_atomic for kunmap_atomic. It's okay for working because zsmalloc modifies the address inner PAGE_SIZE bounday so it works with current kmap_atomic's implementation. But it's still fragile with potential changing of kmap_atomic so let's correct it. I got a subtle bug when I implemented a new feature of zsmalloc (compaction) due to a link's mishandling (the link was over page boundary). Although it was totally my mistake, it took a while to find the cause because an unpredictable kmapped address was unmapped causing an almost random crash. Signed-off-by: Minchan Kim Cc: Nitin Gupta Cc: Sergey Senozhatsky Cc: Dan Streetman Cc: Seth Jennings Cc: Jerome Marchand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit af4ee5e977acb150371c28bd85cb7e34cac48b13) Signed-off-by: Alex Shi --- mm/zsmalloc.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index f3d9a14a23f6..7031e12fcf2b 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -595,6 +595,7 @@ static void init_zspage(struct page *first_page, struct size_class *class) struct page *next_page; struct link_free *link; unsigned int i = 1; + void *vaddr; /* * page->index stores offset of first object starting @@ -605,8 +606,8 @@ static void init_zspage(struct page *first_page, struct size_class *class) if (page != first_page) page->index = off; - link = (struct link_free *)kmap_atomic(page) + - off / sizeof(*link); + vaddr = kmap_atomic(page); + link = (struct link_free *)vaddr + off / sizeof(*link); while ((off += class->size) < PAGE_SIZE) { link->next 
= obj_location_to_handle(page, i++); @@ -620,7 +621,7 @@ static void init_zspage(struct page *first_page, struct size_class *class) */ next_page = get_next_page(page); link->next = obj_location_to_handle(next_page, 0); - kunmap_atomic(link); + kunmap_atomic(vaddr); page = next_page; off %= PAGE_SIZE; } @@ -1031,6 +1032,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size) unsigned long obj; struct link_free *link; struct size_class *class; + void *vaddr; struct page *first_page, *m_page; unsigned long m_objidx, m_offset; @@ -1059,11 +1061,11 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size) obj_handle_to_location(obj, &m_page, &m_objidx); m_offset = obj_idx_to_offset(m_page, m_objidx, class->size); - link = (struct link_free *)kmap_atomic(m_page) + - m_offset / sizeof(*link); + vaddr = kmap_atomic(m_page); + link = (struct link_free *)vaddr + m_offset / sizeof(*link); first_page->freelist = link->next; memset(link, POISON_INUSE, sizeof(*link)); - kunmap_atomic(link); + kunmap_atomic(vaddr); first_page->inuse++; /* Now move the zspage to another fullness group, if required */ @@ -1079,6 +1081,7 @@ void zs_free(struct zs_pool *pool, unsigned long obj) struct link_free *link; struct page *first_page, *f_page; unsigned long f_objidx, f_offset; + void *vaddr; int class_idx; struct size_class *class; @@ -1097,10 +1100,10 @@ void zs_free(struct zs_pool *pool, unsigned long obj) spin_lock(&class->lock); /* Insert this object in containing zspage's freelist */ - link = (struct link_free *)((unsigned char *)kmap_atomic(f_page) - + f_offset); + vaddr = kmap_atomic(f_page); + link = (struct link_free *)(vaddr + f_offset); link->next = first_page->freelist; - kunmap_atomic(link); + kunmap_atomic(vaddr); first_page->freelist = (void *)obj; first_page->inuse--; From 60acd81baccac53be98e07c140a0579a3607be9d Mon Sep 17 00:00:00 2001 From: Mahendran Ganesh Date: Fri, 12 Dec 2014 16:57:04 -0800 Subject: [PATCH 80/82] mm/zram: correct ZRAM_ZERO flag bit position 
In struct zram_table_entry, the element *value* contains obj size and obj zram flags. Bit 0 to bit (ZRAM_FLAG_SHIFT - 1) represent obj size, and bit ZRAM_FLAG_SHIFT to the highest bit of unsigned long represent obj zram_flags. So the first zram flag(ZRAM_ZERO) should be from ZRAM_FLAG_SHIFT instead of (ZRAM_FLAG_SHIFT + 1). This patch fixes this cosmetic issue. Also fix a typo, "page in now accessed" -> "page is now accessed" Signed-off-by: Mahendran Ganesh Acked-by: Minchan Kim Acked-by: Weijie Yang Acked-by: Sergey Senozhatsky Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit d49b1c254c997195872a9e8913660a788298921e) Signed-off-by: Alex Shi --- drivers/block/zram/zram_drv.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index c6ee271317f5..b05a816b09ac 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -66,8 +66,8 @@ static const size_t max_zpage_size = PAGE_SIZE / 4 * 3; /* Flags for zram pages (table[page_no].value) */ enum zram_pageflags { /* Page consists entirely of zeros */ - ZRAM_ZERO = ZRAM_FLAG_SHIFT + 1, - ZRAM_ACCESS, /* page in now accessed */ + ZRAM_ZERO = ZRAM_FLAG_SHIFT, + ZRAM_ACCESS, /* page is now accessed */ __NR_ZRAM_PAGEFLAGS, }; From 837e91c8966924792bf5ba5ef9a62eaa55489ff7 Mon Sep 17 00:00:00 2001 From: Ganesh Mahendran Date: Thu, 12 Feb 2015 15:00:33 -0800 Subject: [PATCH 81/82] zram: free meta table in zram_meta_free zram_meta_alloc() and zram_meta_free() are a pair. In zram_meta_alloc(), meta table is allocated. So it is better to free it in zram_meta_free().
Signed-off-by: Ganesh Mahendran Acked-by: Minchan Kim Acked-by: Sergey Senozhatsky Cc: Nitin Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit 1fec117281d9f5349c35279c9521f4096fa33357) Signed-off-by: Alex Shi --- drivers/block/zram/zram_drv.c | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 8a1266ce8bbe..3e075d65a158 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -308,8 +308,21 @@ static inline int valid_io_request(struct zram *zram, struct bio *bio) return 1; } -static void zram_meta_free(struct zram_meta *meta) +static void zram_meta_free(struct zram_meta *meta, u64 disksize) { + size_t num_pages = disksize >> PAGE_SHIFT; + size_t index; + + /* Free all pages that are still in this zram device */ + for (index = 0; index < num_pages; index++) { + unsigned long handle = meta->table[index].handle; + + if (!handle) + continue; + + zs_free(meta->mem_pool, handle); + } + zs_destroy_pool(meta->mem_pool); vfree(meta->table); kfree(meta); @@ -708,9 +721,6 @@ static void zram_bio_discard(struct zram *zram, u32 index, static void zram_reset_device(struct zram *zram, bool reset_capacity) { - size_t index; - struct zram_meta *meta; - down_write(&zram->init_lock); zram->limit_pages = 0; @@ -720,20 +730,9 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity) return; } - meta = zram->meta; - /* Free all pages that are still in this zram device */ - for (index = 0; index < zram->disksize >> PAGE_SHIFT; index++) { - unsigned long handle = meta->table[index].handle; - if (!handle) - continue; - - zs_free(meta->mem_pool, handle); - } - zcomp_destroy(zram->comp); zram->max_comp_streams = 1; - - zram_meta_free(zram->meta); + zram_meta_free(zram->meta, zram->disksize); zram->meta = NULL; /* Reset stats */ memset(&zram->stats, 0, sizeof(zram->stats)); @@ -805,7 +804,7 @@ 
static ssize_t disksize_store(struct device *dev, up_write(&zram->init_lock); zcomp_destroy(comp); out_free_meta: - zram_meta_free(meta); + zram_meta_free(meta, disksize); return err; } From 043787104c5b1386a96c317f2e82ecff257dd9c5 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Thu, 12 Feb 2015 15:00:36 -0800 Subject: [PATCH 82/82] zram: fix umount-reset_store-mount race condition Ganesh Mahendran was the first one who proposed to use bdev->bd_mutex to avoid ->bd_holders race condition: CPU0 CPU1 umount /* zram->init_done is true */ reset_store() bdev->bd_holders == 0 mount ... zram_make_request() zram_reset_device() However, his solution required some considerable amount of code movement, which we can avoid. Apart from using bdev->bd_mutex in reset_store(), this patch also simplifies zram_reset_device(). zram_reset_device() has a bool parameter reset_capacity which tells it whether disk capacity and the disk itself should be reset. There are two zram_reset_device() callers: -- zram_exit() passes reset_capacity=false -- reset_store() passes reset_capacity=true So we can move reset_capacity-sensitive work out of zram_reset_device() and perform it unconditionally in reset_store(). This also lets us drop reset_capacity parameter from zram_reset_device() and pass zram pointer only. 
Signed-off-by: Sergey Senozhatsky Reported-by: Ganesh Mahendran Cc: Minchan Kim Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds (cherry picked from commit ba6b17d68c8e3aa8d55d0474299cb931965c5ea5) Signed-off-by: Alex Shi --- drivers/block/zram/zram_drv.c | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 3e075d65a158..45e2e85815ab 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -719,7 +719,7 @@ static void zram_bio_discard(struct zram *zram, u32 index, } } -static void zram_reset_device(struct zram *zram, bool reset_capacity) +static void zram_reset_device(struct zram *zram) { down_write(&zram->init_lock); @@ -738,18 +738,7 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity) memset(&zram->stats, 0, sizeof(zram->stats)); zram->disksize = 0; - if (reset_capacity) - set_capacity(zram->disk, 0); - up_write(&zram->init_lock); - - /* - * Revalidate disk out of the init_lock to avoid lockdep splat. - * It's okay because disk's capacity is protected by init_lock - * so that revalidate_disk always sees up-to-date capacity. - */ - if (reset_capacity) - revalidate_disk(zram->disk); } static ssize_t disksize_store(struct device *dev, @@ -822,6 +811,7 @@ static ssize_t reset_store(struct device *dev, if (!bdev) return -ENOMEM; + mutex_lock(&bdev->bd_mutex); /* Do not reset an active device! 
*/ if (bdev->bd_holders) { ret = -EBUSY; @@ -839,12 +829,17 @@ static ssize_t reset_store(struct device *dev, /* Make sure all pending I/O is finished */ fsync_bdev(bdev); + zram_reset_device(zram); + set_capacity(zram->disk, 0); + + mutex_unlock(&bdev->bd_mutex); + revalidate_disk(zram->disk); bdput(bdev); - zram_reset_device(zram, true); return len; out: + mutex_unlock(&bdev->bd_mutex); bdput(bdev); return ret; } @@ -1147,7 +1142,7 @@ static void __exit zram_exit(void) * Shouldn't access zram->disk after destroy_device * because destroy_device already released zram->disk. */ - zram_reset_device(zram, false); + zram_reset_device(zram); } unregister_blkdev(zram_major, "zram");