1/*
2 * Compressed RAM block device
3 *
4 * Copyright (C) 2008, 2009, 2010  Nitin Gupta
5 *               2012, 2013 Minchan Kim
6 *
7 * This code is released using a dual license strategy: BSD/GPL
8 * You can choose the licence that better fits your requirements.
9 *
10 * Released under the terms of 3-clause BSD License
11 * Released under the terms of GNU General Public License Version 2.0
12 *
13 */
14
15#define KMSG_COMPONENT "zram"
16#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
17
18#include <linux/module.h>
19#include <linux/kernel.h>
20#include <linux/bio.h>
21#include <linux/bitops.h>
22#include <linux/blkdev.h>
23#include <linux/buffer_head.h>
24#include <linux/device.h>
25#include <linux/highmem.h>
26#include <linux/slab.h>
27#include <linux/backing-dev.h>
28#include <linux/string.h>
29#include <linux/vmalloc.h>
30#include <linux/err.h>
31#include <linux/idr.h>
32#include <linux/sysfs.h>
33#include <linux/debugfs.h>
34#include <linux/cpuhotplug.h>
35#include <linux/part_stat.h>
36
37#ifdef CONFIG_ZRAM_GROUP
38#include <linux/memcontrol.h>
39#endif
40
41#include "zram_drv.h"
42
43static DEFINE_IDR(zram_index_idr);
/* allocations/lookups in zram_index_idr must be protected by this mutex */
45static DEFINE_MUTEX(zram_index_mutex);
46
47static int zram_major;
48static const char *default_compressor = CONFIG_ZRAM_DEF_COMP;
49
50/* Module params (documentation at end) */
51static unsigned int num_devices = 1;
52/*
 * Pages that compress to a size equal to or greater than this are stored
 * uncompressed in memory.
55 */
56static size_t huge_class_size;
57
58static const struct block_device_operations zram_devops;
59
60static void zram_free_page(struct zram *zram, size_t index);
61static int zram_read_page(struct zram *zram, struct page *page, u32 index,
62			  struct bio *parent);
63
64static inline bool init_done(struct zram *zram)
65{
66	return zram->disksize;
67}
68
69static inline struct zram *dev_to_zram(struct device *dev)
70{
71	return (struct zram *)dev_to_disk(dev)->private_data;
72}
73
74static inline void zram_set_element(struct zram *zram, u32 index,
75			unsigned long element)
76{
77	zram->table[index].element = element;
78}
79
80static unsigned long zram_get_element(struct zram *zram, u32 index)
81{
82	return zram->table[index].element;
83}
84
85static inline bool zram_allocated(struct zram *zram, u32 index)
86{
87	return zram_get_obj_size(zram, index) ||
88			zram_test_flag(zram, index, ZRAM_SAME) ||
89			zram_test_flag(zram, index, ZRAM_WB);
90}
91
92#if PAGE_SIZE != 4096
93static inline bool is_partial_io(struct bio_vec *bvec)
94{
95	return bvec->bv_len != PAGE_SIZE;
96}
97#define ZRAM_PARTIAL_IO		1
98#else
99static inline bool is_partial_io(struct bio_vec *bvec)
100{
101	return false;
102}
103#endif
104
105static inline void zram_set_priority(struct zram *zram, u32 index, u32 prio)
106{
107	prio &= ZRAM_COMP_PRIORITY_MASK;
108	/*
	 * Clear the previous priority value first, in case we are
	 * recompressing an already recompressed page
111	 */
112	zram->table[index].flags &= ~(ZRAM_COMP_PRIORITY_MASK <<
113				      ZRAM_COMP_PRIORITY_BIT1);
114	zram->table[index].flags |= (prio << ZRAM_COMP_PRIORITY_BIT1);
115}
116
117static inline u32 zram_get_priority(struct zram *zram, u32 index)
118{
119	u32 prio = zram->table[index].flags >> ZRAM_COMP_PRIORITY_BIT1;
120
121	return prio & ZRAM_COMP_PRIORITY_MASK;
122}
123
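/*
 * Lock-free update of the max_used_pages watermark: retry the cmpxchg until
 * either the stored maximum is already >= pages or we managed to publish the
 * new maximum.
 */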
124static inline void update_used_max(struct zram *zram,
125					const unsigned long pages)
126{
127	unsigned long cur_max = atomic_long_read(&zram->stats.max_used_pages);
128
129	do {
130		if (cur_max >= pages)
131			return;
132	} while (!atomic_long_try_cmpxchg(&zram->stats.max_used_pages,
133					  &cur_max, pages));
134}
135
136static inline void zram_fill_page(void *ptr, unsigned long len,
137					unsigned long value)
138{
139	WARN_ON_ONCE(!IS_ALIGNED(len, sizeof(unsigned long)));
140	memset_l(ptr, value, len / sizeof(unsigned long));
141}
142
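/*
 * Detect pages whose content is a single repeating machine word (most
 * commonly a zero-filled page). Such pages are not compressed at all; only
 * the repeated value is stored for the slot (see ZRAM_SAME).
 */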
143static bool page_same_filled(void *ptr, unsigned long *element)
144{
145	unsigned long *page;
146	unsigned long val;
147	unsigned int pos, last_pos = PAGE_SIZE / sizeof(*page) - 1;
148
149	page = (unsigned long *)ptr;
150	val = page[0];
151
152	if (val != page[last_pos])
153		return false;
154
155	for (pos = 1; pos < last_pos; pos++) {
156		if (val != page[pos])
157			return false;
158	}
159
160	*element = val;
161
162	return true;
163}
164
165static ssize_t initstate_show(struct device *dev,
166		struct device_attribute *attr, char *buf)
167{
168	u32 val;
169	struct zram *zram = dev_to_zram(dev);
170
171	down_read(&zram->init_lock);
172	val = init_done(zram);
173	up_read(&zram->init_lock);
174
175	return scnprintf(buf, PAGE_SIZE, "%u\n", val);
176}
177
178static ssize_t disksize_show(struct device *dev,
179		struct device_attribute *attr, char *buf)
180{
181	struct zram *zram = dev_to_zram(dev);
182
183	return scnprintf(buf, PAGE_SIZE, "%llu\n", zram->disksize);
184}
185
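/*
 * The limit is parsed with memparse(), so size suffixes are accepted, e.g.
 * (sketch, the sysfs path depends on the device name):
 *	echo 512M > /sys/block/zramX/mem_limit
 * Writing 0 removes the limit (limit_pages == 0 disables the check).
 */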
186static ssize_t mem_limit_store(struct device *dev,
187		struct device_attribute *attr, const char *buf, size_t len)
188{
189	u64 limit;
190	char *tmp;
191	struct zram *zram = dev_to_zram(dev);
192
193	limit = memparse(buf, &tmp);
194	if (buf == tmp) /* no chars parsed, invalid input */
195		return -EINVAL;
196
197	down_write(&zram->init_lock);
198	zram->limit_pages = PAGE_ALIGN(limit) >> PAGE_SHIFT;
199	up_write(&zram->init_lock);
200
201	return len;
202}
203
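/*
 * Only "0" is accepted here; it resets the max_used_pages counter to the
 * current zsmalloc pool size, e.g.:
 *	echo 0 > /sys/block/zramX/mem_used_max
 */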
204static ssize_t mem_used_max_store(struct device *dev,
205		struct device_attribute *attr, const char *buf, size_t len)
206{
207	int err;
208	unsigned long val;
209	struct zram *zram = dev_to_zram(dev);
210
211	err = kstrtoul(buf, 10, &val);
212	if (err || val != 0)
213		return -EINVAL;
214
215	down_read(&zram->init_lock);
216	if (init_done(zram)) {
217		atomic_long_set(&zram->stats.max_used_pages,
218				zs_get_total_pages(zram->mem_pool));
219	}
220	up_read(&zram->init_lock);
221
222	return len;
223}
224
225/*
 * Mark all pages that are older than or equal to cutoff as IDLE.
 * Callers should hold the zram init lock in read mode.
228 */
229static void mark_idle(struct zram *zram, ktime_t cutoff)
230{
231	int is_idle = 1;
232	unsigned long nr_pages = zram->disksize >> PAGE_SHIFT;
233	int index;
234
235	for (index = 0; index < nr_pages; index++) {
236		/*
		 * Do not mark a ZRAM_UNDER_WB slot as ZRAM_IDLE, to close a race.
238		 * See the comment in writeback_store.
239		 */
240		zram_slot_lock(zram, index);
241		if (zram_allocated(zram, index) &&
242				!zram_test_flag(zram, index, ZRAM_UNDER_WB)) {
243#ifdef CONFIG_ZRAM_MEMORY_TRACKING
244			is_idle = !cutoff || ktime_after(cutoff, zram->table[index].ac_time);
245#endif
246			if (is_idle)
247				zram_set_flag(zram, index, ZRAM_IDLE);
248		}
249		zram_slot_unlock(zram, index);
250	}
251}
252
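/*
 * Example usage (the age-in-seconds form needs CONFIG_ZRAM_MEMORY_TRACKING):
 *	echo all > /sys/block/zramX/idle	mark every slot idle
 *	echo 86400 > /sys/block/zramX/idle	mark slots not accessed for a day
 */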
253static ssize_t idle_store(struct device *dev,
254		struct device_attribute *attr, const char *buf, size_t len)
255{
256	struct zram *zram = dev_to_zram(dev);
257	ktime_t cutoff_time = 0;
258	ssize_t rv = -EINVAL;
259
260	if (!sysfs_streq(buf, "all")) {
261		/*
		 * If it did not parse as 'all', try to treat it as an integer
		 * age (in seconds) when memory tracking is enabled.
264		 */
265		u64 age_sec;
266
267		if (IS_ENABLED(CONFIG_ZRAM_MEMORY_TRACKING) && !kstrtoull(buf, 0, &age_sec))
268			cutoff_time = ktime_sub(ktime_get_boottime(),
269					ns_to_ktime(age_sec * NSEC_PER_SEC));
270		else
271			goto out;
272	}
273
274	down_read(&zram->init_lock);
275	if (!init_done(zram))
276		goto out_unlock;
277
278	/*
	 * A cutoff_time of 0 marks everything as idle; this is the
	 * "all" behavior.
281	 */
282	mark_idle(zram, cutoff_time);
283	rv = len;
284
285out_unlock:
286	up_read(&zram->init_lock);
287out:
288	return rv;
289}
290
291#ifdef CONFIG_ZRAM_WRITEBACK
292static ssize_t writeback_limit_enable_store(struct device *dev,
293		struct device_attribute *attr, const char *buf, size_t len)
294{
295	struct zram *zram = dev_to_zram(dev);
296	u64 val;
297	ssize_t ret = -EINVAL;
298
299	if (kstrtoull(buf, 10, &val))
300		return ret;
301
302	down_read(&zram->init_lock);
303	spin_lock(&zram->wb_limit_lock);
304	zram->wb_limit_enable = val;
305	spin_unlock(&zram->wb_limit_lock);
306	up_read(&zram->init_lock);
307	ret = len;
308
309	return ret;
310}
311
312static ssize_t writeback_limit_enable_show(struct device *dev,
313		struct device_attribute *attr, char *buf)
314{
315	bool val;
316	struct zram *zram = dev_to_zram(dev);
317
318	down_read(&zram->init_lock);
319	spin_lock(&zram->wb_limit_lock);
320	val = zram->wb_limit_enable;
321	spin_unlock(&zram->wb_limit_lock);
322	up_read(&zram->init_lock);
323
324	return scnprintf(buf, PAGE_SIZE, "%d\n", val);
325}
326
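/*
 * The writeback budget is accounted in 4KB units (see the
 * 1UL << (PAGE_SHIFT - 12) decrement in writeback_store), e.g.:
 *	echo 1 > /sys/block/zramX/writeback_limit_enable
 *	echo 100000 > /sys/block/zramX/writeback_limit
 */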
327static ssize_t writeback_limit_store(struct device *dev,
328		struct device_attribute *attr, const char *buf, size_t len)
329{
330	struct zram *zram = dev_to_zram(dev);
331	u64 val;
332	ssize_t ret = -EINVAL;
333
334	if (kstrtoull(buf, 10, &val))
335		return ret;
336
337	down_read(&zram->init_lock);
338	spin_lock(&zram->wb_limit_lock);
339	zram->bd_wb_limit = val;
340	spin_unlock(&zram->wb_limit_lock);
341	up_read(&zram->init_lock);
342	ret = len;
343
344	return ret;
345}
346
347static ssize_t writeback_limit_show(struct device *dev,
348		struct device_attribute *attr, char *buf)
349{
350	u64 val;
351	struct zram *zram = dev_to_zram(dev);
352
353	down_read(&zram->init_lock);
354	spin_lock(&zram->wb_limit_lock);
355	val = zram->bd_wb_limit;
356	spin_unlock(&zram->wb_limit_lock);
357	up_read(&zram->init_lock);
358
359	return scnprintf(buf, PAGE_SIZE, "%llu\n", val);
360}
361
362static void reset_bdev(struct zram *zram)
363{
364	struct block_device *bdev;
365
366	if (!zram->backing_dev)
367		return;
368
369	bdev = zram->bdev;
370	blkdev_put(bdev, zram);
	/* hope filp_close flushes all of the IO */
372	filp_close(zram->backing_dev, NULL);
373	zram->backing_dev = NULL;
374	zram->bdev = NULL;
375	zram->disk->fops = &zram_devops;
376	kvfree(zram->bitmap);
377	zram->bitmap = NULL;
378}
379
380static ssize_t backing_dev_show(struct device *dev,
381		struct device_attribute *attr, char *buf)
382{
383	struct file *file;
384	struct zram *zram = dev_to_zram(dev);
385	char *p;
386	ssize_t ret;
387
388	down_read(&zram->init_lock);
389	file = zram->backing_dev;
390	if (!file) {
391		memcpy(buf, "none\n", 5);
392		up_read(&zram->init_lock);
393		return 5;
394	}
395
396	p = file_path(file, buf, PAGE_SIZE - 1);
397	if (IS_ERR(p)) {
398		ret = PTR_ERR(p);
399		goto out;
400	}
401
402	ret = strlen(p);
403	memmove(buf, p, ret);
404	buf[ret++] = '\n';
405out:
406	up_read(&zram->init_lock);
407	return ret;
408}
409
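/*
 * The backing device must be configured before disksize is set (the store
 * below fails with -EBUSY once the device is initialized) and must be a
 * block device, e.g.:
 *	echo /dev/sdXn > /sys/block/zramX/backing_dev
 */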
410static ssize_t backing_dev_store(struct device *dev,
411		struct device_attribute *attr, const char *buf, size_t len)
412{
413	char *file_name;
414	size_t sz;
415	struct file *backing_dev = NULL;
416	struct inode *inode;
417	struct address_space *mapping;
418	unsigned int bitmap_sz;
419	unsigned long nr_pages, *bitmap = NULL;
420	struct block_device *bdev = NULL;
421	int err;
422	struct zram *zram = dev_to_zram(dev);
423
424	file_name = kmalloc(PATH_MAX, GFP_KERNEL);
425	if (!file_name)
426		return -ENOMEM;
427
428	down_write(&zram->init_lock);
429	if (init_done(zram)) {
430		pr_info("Can't setup backing device for initialized device\n");
431		err = -EBUSY;
432		goto out;
433	}
434
435	strscpy(file_name, buf, PATH_MAX);
436	/* ignore trailing newline */
437	sz = strlen(file_name);
438	if (sz > 0 && file_name[sz - 1] == '\n')
439		file_name[sz - 1] = 0x00;
440
441	backing_dev = filp_open(file_name, O_RDWR|O_LARGEFILE, 0);
442	if (IS_ERR(backing_dev)) {
443		err = PTR_ERR(backing_dev);
444		backing_dev = NULL;
445		goto out;
446	}
447
448	mapping = backing_dev->f_mapping;
449	inode = mapping->host;
450
	/* Only block devices are supported at the moment */
452	if (!S_ISBLK(inode->i_mode)) {
453		err = -ENOTBLK;
454		goto out;
455	}
456
457	bdev = blkdev_get_by_dev(inode->i_rdev, BLK_OPEN_READ | BLK_OPEN_WRITE,
458				 zram, NULL);
459	if (IS_ERR(bdev)) {
460		err = PTR_ERR(bdev);
461		bdev = NULL;
462		goto out;
463	}
464
465	nr_pages = i_size_read(inode) >> PAGE_SHIFT;
466	bitmap_sz = BITS_TO_LONGS(nr_pages) * sizeof(long);
467	bitmap = kvzalloc(bitmap_sz, GFP_KERNEL);
468	if (!bitmap) {
469		err = -ENOMEM;
470		goto out;
471	}
472
473	reset_bdev(zram);
474
475	zram->bdev = bdev;
476	zram->backing_dev = backing_dev;
477	zram->bitmap = bitmap;
478	zram->nr_pages = nr_pages;
479	up_write(&zram->init_lock);
480
481	pr_info("setup backing device %s\n", file_name);
482	kfree(file_name);
483
484	return len;
485out:
486	kvfree(bitmap);
487
488	if (bdev)
489		blkdev_put(bdev, zram);
490
491	if (backing_dev)
492		filp_close(backing_dev, NULL);
493
494	up_write(&zram->init_lock);
495
496	kfree(file_name);
497
498	return err;
499}
500
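/*
 * Minimal first-fit allocator over the backing device bitmap. Returns the
 * allocated block index, or 0 on failure; index 0 itself is never handed out
 * so that "no block" can be encoded as 0.
 */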
501static unsigned long alloc_block_bdev(struct zram *zram)
502{
503	unsigned long blk_idx = 1;
504retry:
	/* skip bit 0 to avoid confusion with zram.handle == 0 */
506	blk_idx = find_next_zero_bit(zram->bitmap, zram->nr_pages, blk_idx);
507	if (blk_idx == zram->nr_pages)
508		return 0;
509
510	if (test_and_set_bit(blk_idx, zram->bitmap))
511		goto retry;
512
513	atomic64_inc(&zram->stats.bd_count);
514	return blk_idx;
515}
516
517static void free_block_bdev(struct zram *zram, unsigned long blk_idx)
518{
519	int was_set;
520
521	was_set = test_and_clear_bit(blk_idx, zram->bitmap);
522	WARN_ON_ONCE(!was_set);
523	atomic64_dec(&zram->stats.bd_count);
524}
525
526static void read_from_bdev_async(struct zram *zram, struct page *page,
527			unsigned long entry, struct bio *parent)
528{
529	struct bio *bio;
530
531	bio = bio_alloc(zram->bdev, 1, parent->bi_opf, GFP_NOIO);
532	bio->bi_iter.bi_sector = entry * (PAGE_SIZE >> 9);
533	__bio_add_page(bio, page, PAGE_SIZE, 0);
534	bio_chain(bio, parent);
535	submit_bio(bio);
536}
537
538#define HUGE_WRITEBACK			(1<<0)
539#define IDLE_WRITEBACK			(1<<1)
540#define INCOMPRESSIBLE_WRITEBACK	(1<<2)
541
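/*
 * Write back slots to the backing device. The accepted keywords match the
 * flag tests in the loop below, e.g.:
 *	echo idle > /sys/block/zramX/writeback
 *	echo huge_idle > /sys/block/zramX/writeback
 * A written-back slot is marked ZRAM_WB and its element holds the backing
 * block index instead of a zsmalloc handle.
 */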
542static ssize_t writeback_store(struct device *dev,
543		struct device_attribute *attr, const char *buf, size_t len)
544{
545	struct zram *zram = dev_to_zram(dev);
546	unsigned long nr_pages = zram->disksize >> PAGE_SHIFT;
547	unsigned long index = 0;
548	struct bio bio;
549	struct bio_vec bio_vec;
550	struct page *page;
551	ssize_t ret = len;
552	int mode, err;
553	unsigned long blk_idx = 0;
554
555	if (sysfs_streq(buf, "idle"))
556		mode = IDLE_WRITEBACK;
557	else if (sysfs_streq(buf, "huge"))
558		mode = HUGE_WRITEBACK;
559	else if (sysfs_streq(buf, "huge_idle"))
560		mode = IDLE_WRITEBACK | HUGE_WRITEBACK;
561	else if (sysfs_streq(buf, "incompressible"))
562		mode = INCOMPRESSIBLE_WRITEBACK;
563	else
564		return -EINVAL;
565
566	down_read(&zram->init_lock);
567	if (!init_done(zram)) {
568		ret = -EINVAL;
569		goto release_init_lock;
570	}
571
572	if (!zram->backing_dev) {
573		ret = -ENODEV;
574		goto release_init_lock;
575	}
576
577	page = alloc_page(GFP_KERNEL);
578	if (!page) {
579		ret = -ENOMEM;
580		goto release_init_lock;
581	}
582
583	for (index = 0; index < nr_pages; index++) {
584		spin_lock(&zram->wb_limit_lock);
585		if (zram->wb_limit_enable && !zram->bd_wb_limit) {
586			spin_unlock(&zram->wb_limit_lock);
587			ret = -EIO;
588			break;
589		}
590		spin_unlock(&zram->wb_limit_lock);
591
592		if (!blk_idx) {
593			blk_idx = alloc_block_bdev(zram);
594			if (!blk_idx) {
595				ret = -ENOSPC;
596				break;
597			}
598		}
599
600		zram_slot_lock(zram, index);
601		if (!zram_allocated(zram, index))
602			goto next;
603
604		if (zram_test_flag(zram, index, ZRAM_WB) ||
605				zram_test_flag(zram, index, ZRAM_SAME) ||
606				zram_test_flag(zram, index, ZRAM_UNDER_WB))
607			goto next;
608
609		if (mode & IDLE_WRITEBACK &&
610		    !zram_test_flag(zram, index, ZRAM_IDLE))
611			goto next;
612		if (mode & HUGE_WRITEBACK &&
613		    !zram_test_flag(zram, index, ZRAM_HUGE))
614			goto next;
615		if (mode & INCOMPRESSIBLE_WRITEBACK &&
616		    !zram_test_flag(zram, index, ZRAM_INCOMPRESSIBLE))
617			goto next;
618
619		/*
		 * Clearing ZRAM_UNDER_WB is the caller's duty.
		 * IOW, zram_free_page never clears it.
622		 */
623		zram_set_flag(zram, index, ZRAM_UNDER_WB);
		/* Needed to close the hugepage writeback race */
625		zram_set_flag(zram, index, ZRAM_IDLE);
626		zram_slot_unlock(zram, index);
627		if (zram_read_page(zram, page, index, NULL)) {
628			zram_slot_lock(zram, index);
629			zram_clear_flag(zram, index, ZRAM_UNDER_WB);
630			zram_clear_flag(zram, index, ZRAM_IDLE);
631			zram_slot_unlock(zram, index);
632			continue;
633		}
634
635		bio_init(&bio, zram->bdev, &bio_vec, 1,
636			 REQ_OP_WRITE | REQ_SYNC);
637		bio.bi_iter.bi_sector = blk_idx * (PAGE_SIZE >> 9);
638		__bio_add_page(&bio, page, PAGE_SIZE, 0);
639
640		/*
		 * XXX: A single page IO would be inefficient for write,
		 * but it is not a bad starting point.
643		 */
644		err = submit_bio_wait(&bio);
645		if (err) {
646			zram_slot_lock(zram, index);
647			zram_clear_flag(zram, index, ZRAM_UNDER_WB);
648			zram_clear_flag(zram, index, ZRAM_IDLE);
649			zram_slot_unlock(zram, index);
650			/*
651			 * BIO errors are not fatal, we continue and simply
652			 * attempt to writeback the remaining objects (pages).
653			 * At the same time we need to signal user-space that
654			 * some writes (at least one, but also could be all of
655			 * them) were not successful and we do so by returning
656			 * the most recent BIO error.
657			 */
658			ret = err;
659			continue;
660		}
661
662		atomic64_inc(&zram->stats.bd_writes);
663		/*
		 * We released zram_slot_lock, so we need to check whether the
		 * slot has changed. If the slot was freed, we can catch that
		 * easily via zram_allocated.
		 * A subtle case is a slot that got freed/reallocated/marked as
		 * ZRAM_IDLE again. To close that race, idle_store doesn't
		 * mark ZRAM_IDLE once it finds the slot is ZRAM_UNDER_WB.
		 * Thus, we can close the race by checking the ZRAM_IDLE bit.
671		 */
672		zram_slot_lock(zram, index);
673		if (!zram_allocated(zram, index) ||
674			  !zram_test_flag(zram, index, ZRAM_IDLE)) {
675			zram_clear_flag(zram, index, ZRAM_UNDER_WB);
676			zram_clear_flag(zram, index, ZRAM_IDLE);
677			goto next;
678		}
679
680		zram_free_page(zram, index);
681		zram_clear_flag(zram, index, ZRAM_UNDER_WB);
682		zram_set_flag(zram, index, ZRAM_WB);
683		zram_set_element(zram, index, blk_idx);
684		blk_idx = 0;
685		atomic64_inc(&zram->stats.pages_stored);
686		spin_lock(&zram->wb_limit_lock);
687		if (zram->wb_limit_enable && zram->bd_wb_limit > 0)
			zram->bd_wb_limit -= 1UL << (PAGE_SHIFT - 12);
689		spin_unlock(&zram->wb_limit_lock);
690next:
691		zram_slot_unlock(zram, index);
692	}
693
694	if (blk_idx)
695		free_block_bdev(zram, blk_idx);
696	__free_page(page);
697release_init_lock:
698	up_read(&zram->init_lock);
699
700	return ret;
701}
702
703struct zram_work {
704	struct work_struct work;
705	struct zram *zram;
706	unsigned long entry;
707	struct page *page;
708	int error;
709};
710
711static void zram_sync_read(struct work_struct *work)
712{
713	struct zram_work *zw = container_of(work, struct zram_work, work);
714	struct bio_vec bv;
715	struct bio bio;
716
717	bio_init(&bio, zw->zram->bdev, &bv, 1, REQ_OP_READ);
718	bio.bi_iter.bi_sector = zw->entry * (PAGE_SIZE >> 9);
719	__bio_add_page(&bio, zw->page, PAGE_SIZE, 0);
720	zw->error = submit_bio_wait(&bio);
721}
722
723/*
 * The block layer wants one ->submit_bio to be active at a time, so if we use
 * chained IO with the parent IO in the same context, it's a deadlock. To avoid
 * that, use a worker thread context.
727 */
728static int read_from_bdev_sync(struct zram *zram, struct page *page,
729				unsigned long entry)
730{
731	struct zram_work work;
732
733	work.page = page;
734	work.zram = zram;
735	work.entry = entry;
736
737	INIT_WORK_ONSTACK(&work.work, zram_sync_read);
738	queue_work(system_unbound_wq, &work.work);
739	flush_work(&work.work);
740	destroy_work_on_stack(&work.work);
741
742	return work.error;
743}
744
745static int read_from_bdev(struct zram *zram, struct page *page,
746			unsigned long entry, struct bio *parent)
747{
748	atomic64_inc(&zram->stats.bd_reads);
749	if (!parent) {
750		if (WARN_ON_ONCE(!IS_ENABLED(ZRAM_PARTIAL_IO)))
751			return -EIO;
752		return read_from_bdev_sync(zram, page, entry);
753	}
754	read_from_bdev_async(zram, page, entry, parent);
755	return 0;
756}
757#else
758static inline void reset_bdev(struct zram *zram) {};
759static int read_from_bdev(struct zram *zram, struct page *page,
760			unsigned long entry, struct bio *parent)
761{
762	return -EIO;
763}
764
765static void free_block_bdev(struct zram *zram, unsigned long blk_idx) {};
766#endif
767
768#ifdef CONFIG_ZRAM_MEMORY_TRACKING
769
770static struct dentry *zram_debugfs_root;
771
772static void zram_debugfs_create(void)
773{
774	zram_debugfs_root = debugfs_create_dir("zram", NULL);
775}
776
777static void zram_debugfs_destroy(void)
778{
779	debugfs_remove_recursive(zram_debugfs_root);
780}
781
782static void zram_accessed(struct zram *zram, u32 index)
783{
784	zram_clear_flag(zram, index, ZRAM_IDLE);
785	zram->table[index].ac_time = ktime_get_boottime();
786}
787
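/*
 * debugfs <root>/zram/<disk>/block_state dump; one line per allocated slot:
 * index, last access time, then flag letters in the order used below
 * (s=same, w=written back, h=huge, i=idle, r=recompressed, n=incompressible).
 */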
788static ssize_t read_block_state(struct file *file, char __user *buf,
789				size_t count, loff_t *ppos)
790{
791	char *kbuf;
792	ssize_t index, written = 0;
793	struct zram *zram = file->private_data;
794	unsigned long nr_pages = zram->disksize >> PAGE_SHIFT;
795	struct timespec64 ts;
796
797	kbuf = kvmalloc(count, GFP_KERNEL);
798	if (!kbuf)
799		return -ENOMEM;
800
801	down_read(&zram->init_lock);
802	if (!init_done(zram)) {
803		up_read(&zram->init_lock);
804		kvfree(kbuf);
805		return -EINVAL;
806	}
807
808	for (index = *ppos; index < nr_pages; index++) {
809		int copied;
810
811		zram_slot_lock(zram, index);
812		if (!zram_allocated(zram, index))
813			goto next;
814
815		ts = ktime_to_timespec64(zram->table[index].ac_time);
816		copied = snprintf(kbuf + written, count,
817			"%12zd %12lld.%06lu %c%c%c%c%c%c\n",
818			index, (s64)ts.tv_sec,
819			ts.tv_nsec / NSEC_PER_USEC,
820			zram_test_flag(zram, index, ZRAM_SAME) ? 's' : '.',
821			zram_test_flag(zram, index, ZRAM_WB) ? 'w' : '.',
822			zram_test_flag(zram, index, ZRAM_HUGE) ? 'h' : '.',
823			zram_test_flag(zram, index, ZRAM_IDLE) ? 'i' : '.',
824			zram_get_priority(zram, index) ? 'r' : '.',
825			zram_test_flag(zram, index,
826				       ZRAM_INCOMPRESSIBLE) ? 'n' : '.');
827
828		if (count <= copied) {
829			zram_slot_unlock(zram, index);
830			break;
831		}
832		written += copied;
833		count -= copied;
834next:
835		zram_slot_unlock(zram, index);
836		*ppos += 1;
837	}
838
839	up_read(&zram->init_lock);
840	if (copy_to_user(buf, kbuf, written))
841		written = -EFAULT;
842	kvfree(kbuf);
843
844	return written;
845}
846
847static const struct file_operations proc_zram_block_state_op = {
848	.open = simple_open,
849	.read = read_block_state,
850	.llseek = default_llseek,
851};
852
853static void zram_debugfs_register(struct zram *zram)
854{
855	if (!zram_debugfs_root)
856		return;
857
858	zram->debugfs_dir = debugfs_create_dir(zram->disk->disk_name,
859						zram_debugfs_root);
860	debugfs_create_file("block_state", 0400, zram->debugfs_dir,
861				zram, &proc_zram_block_state_op);
862}
863
864static void zram_debugfs_unregister(struct zram *zram)
865{
866	debugfs_remove_recursive(zram->debugfs_dir);
867}
868#else
869static void zram_debugfs_create(void) {};
870static void zram_debugfs_destroy(void) {};
871static void zram_accessed(struct zram *zram, u32 index)
872{
873	zram_clear_flag(zram, index, ZRAM_IDLE);
874};
875static void zram_debugfs_register(struct zram *zram) {};
876static void zram_debugfs_unregister(struct zram *zram) {};
877#endif
878
879/*
880 * We switched to per-cpu streams and this attr is not needed anymore.
881 * However, we will keep it around for some time, because:
882 * a) we may revert per-cpu streams in the future
883 * b) it's visible to user space and we need to follow our 2 years
884 *    retirement rule; but we already have a number of 'soon to be
 *    altered' attrs, so max_comp_streams needs to wait for the next
886 *    layoff cycle.
887 */
888static ssize_t max_comp_streams_show(struct device *dev,
889		struct device_attribute *attr, char *buf)
890{
891	return scnprintf(buf, PAGE_SIZE, "%d\n", num_online_cpus());
892}
893
894static ssize_t max_comp_streams_store(struct device *dev,
895		struct device_attribute *attr, const char *buf, size_t len)
896{
897	return len;
898}
899
900static void comp_algorithm_set(struct zram *zram, u32 prio, const char *alg)
901{
902	/* Do not free statically defined compression algorithms */
903	if (zram->comp_algs[prio] != default_compressor)
904		kfree(zram->comp_algs[prio]);
905
906	zram->comp_algs[prio] = alg;
907}
908
909static ssize_t __comp_algorithm_show(struct zram *zram, u32 prio, char *buf)
910{
911	ssize_t sz;
912
913	down_read(&zram->init_lock);
914	sz = zcomp_available_show(zram->comp_algs[prio], buf);
915	up_read(&zram->init_lock);
916
917	return sz;
918}
919
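/*
 * Shared helper for the comp_algorithm and recomp_algorithm knobs. The name
 * is validated against zcomp_available_algorithm() and can only be changed
 * while the device is uninitialized, e.g. (assuming the algorithm is built
 * into your kernel):
 *	echo zstd > /sys/block/zramX/comp_algorithm
 */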
920static int __comp_algorithm_store(struct zram *zram, u32 prio, const char *buf)
921{
922	char *compressor;
923	size_t sz;
924
925	sz = strlen(buf);
926	if (sz >= CRYPTO_MAX_ALG_NAME)
927		return -E2BIG;
928
929	compressor = kstrdup(buf, GFP_KERNEL);
930	if (!compressor)
931		return -ENOMEM;
932
933	/* ignore trailing newline */
934	if (sz > 0 && compressor[sz - 1] == '\n')
935		compressor[sz - 1] = 0x00;
936
937	if (!zcomp_available_algorithm(compressor)) {
938		kfree(compressor);
939		return -EINVAL;
940	}
941
942	down_write(&zram->init_lock);
943	if (init_done(zram)) {
944		up_write(&zram->init_lock);
945		kfree(compressor);
946		pr_info("Can't change algorithm for initialized device\n");
947		return -EBUSY;
948	}
949
950	comp_algorithm_set(zram, prio, compressor);
951	up_write(&zram->init_lock);
952	return 0;
953}
954
955static ssize_t comp_algorithm_show(struct device *dev,
956				   struct device_attribute *attr,
957				   char *buf)
958{
959	struct zram *zram = dev_to_zram(dev);
960
961	return __comp_algorithm_show(zram, ZRAM_PRIMARY_COMP, buf);
962}
963
964static ssize_t comp_algorithm_store(struct device *dev,
965				    struct device_attribute *attr,
966				    const char *buf,
967				    size_t len)
968{
969	struct zram *zram = dev_to_zram(dev);
970	int ret;
971
972	ret = __comp_algorithm_store(zram, ZRAM_PRIMARY_COMP, buf);
973	return ret ? ret : len;
974}
975
976#ifdef CONFIG_ZRAM_MULTI_COMP
977static ssize_t recomp_algorithm_show(struct device *dev,
978				     struct device_attribute *attr,
979				     char *buf)
980{
981	struct zram *zram = dev_to_zram(dev);
982	ssize_t sz = 0;
983	u32 prio;
984
985	for (prio = ZRAM_SECONDARY_COMP; prio < ZRAM_MAX_COMPS; prio++) {
986		if (!zram->comp_algs[prio])
987			continue;
988
989		sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2, "#%d: ", prio);
990		sz += __comp_algorithm_show(zram, prio, buf + sz);
991	}
992
993	return sz;
994}
995
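/*
 * Expected input is "algo=<name> priority=<n>" with n in
 * [ZRAM_SECONDARY_COMP, ZRAM_MAX_COMPS), e.g. (the algorithm name is just an
 * example):
 *	echo "algo=zstd priority=1" > /sys/block/zramX/recomp_algorithm
 */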
996static ssize_t recomp_algorithm_store(struct device *dev,
997				      struct device_attribute *attr,
998				      const char *buf,
999				      size_t len)
1000{
1001	struct zram *zram = dev_to_zram(dev);
1002	int prio = ZRAM_SECONDARY_COMP;
1003	char *args, *param, *val;
1004	char *alg = NULL;
1005	int ret;
1006
1007	args = skip_spaces(buf);
1008	while (*args) {
1009		args = next_arg(args, &param, &val);
1010
1011		if (!val || !*val)
1012			return -EINVAL;
1013
1014		if (!strcmp(param, "algo")) {
1015			alg = val;
1016			continue;
1017		}
1018
1019		if (!strcmp(param, "priority")) {
1020			ret = kstrtoint(val, 10, &prio);
1021			if (ret)
1022				return ret;
1023			continue;
1024		}
1025	}
1026
1027	if (!alg)
1028		return -EINVAL;
1029
1030	if (prio < ZRAM_SECONDARY_COMP || prio >= ZRAM_MAX_COMPS)
1031		return -EINVAL;
1032
1033	ret = __comp_algorithm_store(zram, prio, alg);
1034	return ret ? ret : len;
1035}
1036#endif
1037
1038static ssize_t compact_store(struct device *dev,
1039		struct device_attribute *attr, const char *buf, size_t len)
1040{
1041	struct zram *zram = dev_to_zram(dev);
1042
1043	down_read(&zram->init_lock);
1044	if (!init_done(zram)) {
1045		up_read(&zram->init_lock);
1046		return -EINVAL;
1047	}
1048
1049	zs_compact(zram->mem_pool);
1050	up_read(&zram->init_lock);
1051
1052	return len;
1053}
1054
1055static ssize_t io_stat_show(struct device *dev,
1056		struct device_attribute *attr, char *buf)
1057{
1058	struct zram *zram = dev_to_zram(dev);
1059	ssize_t ret;
1060
1061	down_read(&zram->init_lock);
1062	ret = scnprintf(buf, PAGE_SIZE,
1063			"%8llu %8llu 0 %8llu\n",
1064			(u64)atomic64_read(&zram->stats.failed_reads),
1065			(u64)atomic64_read(&zram->stats.failed_writes),
1066			(u64)atomic64_read(&zram->stats.notify_free));
1067	up_read(&zram->init_lock);
1068
1069	return ret;
1070}
1071
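/*
 * mm_stat columns, in the order printed below: orig_data_size,
 * compr_data_size, mem_used_total, mem_limit, mem_used_max, same_pages,
 * pages_compacted, huge_pages, huge_pages_since.
 */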
1072static ssize_t mm_stat_show(struct device *dev,
1073		struct device_attribute *attr, char *buf)
1074{
1075	struct zram *zram = dev_to_zram(dev);
1076	struct zs_pool_stats pool_stats;
1077	u64 orig_size, mem_used = 0;
1078	long max_used;
1079	ssize_t ret;
1080
1081	memset(&pool_stats, 0x00, sizeof(struct zs_pool_stats));
1082
1083	down_read(&zram->init_lock);
1084	if (init_done(zram)) {
1085		mem_used = zs_get_total_pages(zram->mem_pool);
1086		zs_pool_stats(zram->mem_pool, &pool_stats);
1087	}
1088
1089	orig_size = atomic64_read(&zram->stats.pages_stored);
1090	max_used = atomic_long_read(&zram->stats.max_used_pages);
1091
1092	ret = scnprintf(buf, PAGE_SIZE,
1093			"%8llu %8llu %8llu %8lu %8ld %8llu %8lu %8llu %8llu\n",
1094			orig_size << PAGE_SHIFT,
1095			(u64)atomic64_read(&zram->stats.compr_data_size),
1096			mem_used << PAGE_SHIFT,
1097			zram->limit_pages << PAGE_SHIFT,
1098			max_used << PAGE_SHIFT,
1099			(u64)atomic64_read(&zram->stats.same_pages),
1100			atomic_long_read(&pool_stats.pages_compacted),
1101			(u64)atomic64_read(&zram->stats.huge_pages),
1102			(u64)atomic64_read(&zram->stats.huge_pages_since));
1103	up_read(&zram->init_lock);
1104
1105	return ret;
1106}
1107
1108#ifdef CONFIG_ZRAM_WRITEBACK
1109#define FOUR_K(x) ((x) * (1 << (PAGE_SHIFT - 12)))
1110static ssize_t bd_stat_show(struct device *dev,
1111		struct device_attribute *attr, char *buf)
1112{
1113	struct zram *zram = dev_to_zram(dev);
1114	ssize_t ret;
1115
1116	down_read(&zram->init_lock);
1117	ret = scnprintf(buf, PAGE_SIZE,
1118		"%8llu %8llu %8llu\n",
1119			FOUR_K((u64)atomic64_read(&zram->stats.bd_count)),
1120			FOUR_K((u64)atomic64_read(&zram->stats.bd_reads)),
1121			FOUR_K((u64)atomic64_read(&zram->stats.bd_writes)));
1122	up_read(&zram->init_lock);
1123
1124	return ret;
1125}
1126#endif
1127
1128static ssize_t debug_stat_show(struct device *dev,
1129		struct device_attribute *attr, char *buf)
1130{
1131	int version = 1;
1132	struct zram *zram = dev_to_zram(dev);
1133	ssize_t ret;
1134
1135	down_read(&zram->init_lock);
1136	ret = scnprintf(buf, PAGE_SIZE,
1137			"version: %d\n%8llu %8llu\n",
1138			version,
1139			(u64)atomic64_read(&zram->stats.writestall),
1140			(u64)atomic64_read(&zram->stats.miss_free));
1141	up_read(&zram->init_lock);
1142
1143	return ret;
1144}
1145
1146static DEVICE_ATTR_RO(io_stat);
1147static DEVICE_ATTR_RO(mm_stat);
1148#ifdef CONFIG_ZRAM_WRITEBACK
1149static DEVICE_ATTR_RO(bd_stat);
1150#endif
1151static DEVICE_ATTR_RO(debug_stat);
1152
1153#ifdef CONFIG_ZRAM_GROUP
1154static ssize_t group_show(struct device *dev, struct device_attribute *attr, char *buf)
1155{
1156	struct zram *zram = dev_to_zram(dev);
1157	int ret = 0;
1158
1159	down_read(&zram->init_lock);
1160	if (zram->zgrp_ctrl == ZGRP_NONE)
1161		ret = snprintf(buf, PAGE_SIZE - 1, "disable\n");
1162	else if (zram->zgrp_ctrl == ZGRP_TRACK)
1163		ret = snprintf(buf, PAGE_SIZE - 1, "readonly\n");
1164#ifdef CONFIG_ZRAM_GROUP_WRITEBACK
1165	else if (zram->zgrp_ctrl == ZGRP_WRITE)
1166		ret = snprintf(buf, PAGE_SIZE - 1, "readwrite\n");
1167#endif
1168	up_read(&zram->init_lock);
1169
1170	return ret;
1171}
1172
1173static ssize_t group_store(struct device *dev, struct device_attribute *attr,
1174				const char *buf, size_t len)
1175{
1176	struct zram *zram = dev_to_zram(dev);
1177	int ret;
1178#ifdef CONFIG_ZRAM_GROUP_DEBUG
1179	u32 op, gid, index;
1180
1181	ret = sscanf(buf, "%u %u %u", &op, &index, &gid);
1182	if (ret == 3) {
1183		pr_info("op[%u] index[%u] gid[%u].\n", op, index, gid);
1184		group_debug(zram, op, index, gid);
1185		return len;
1186	}
1187#endif
1188
1189	ret = len;
1190	down_write(&zram->init_lock);
1191	if (init_done(zram)) {
1192		pr_info("Can't setup group ctrl for initialized device!\n");
1193		ret = -EBUSY;
1194		goto out;
1195	}
1196	if (!strcmp(buf, "disable\n"))
1197		zram->zgrp_ctrl = ZGRP_NONE;
1198	else if (!strcmp(buf, "readonly\n"))
1199		zram->zgrp_ctrl = ZGRP_TRACK;
1200#ifdef CONFIG_ZRAM_GROUP_WRITEBACK
1201	else if (!strcmp(buf, "readwrite\n"))
1202		zram->zgrp_ctrl = ZGRP_WRITE;
1203#endif
1204	else
1205		ret = -EINVAL;
1206out:
1207	up_write(&zram->init_lock);
1208
1209	return ret;
1210}
1211#endif
1212
1213static void zram_meta_free(struct zram *zram, u64 disksize)
1214{
1215	size_t num_pages = disksize >> PAGE_SHIFT;
1216	size_t index;
1217
1218	/* Free all pages that are still in this zram device */
1219	for (index = 0; index < num_pages; index++)
1220		zram_free_page(zram, index);
1221
1222	zs_destroy_pool(zram->mem_pool);
1223	vfree(zram->table);
1224#ifdef CONFIG_ZRAM_GROUP
1225	zram_group_deinit(zram);
1226#endif
1227}
1228
1229static bool zram_meta_alloc(struct zram *zram, u64 disksize)
1230{
1231	size_t num_pages;
1232
1233	num_pages = disksize >> PAGE_SHIFT;
1234	zram->table = vzalloc(array_size(num_pages, sizeof(*zram->table)));
1235	if (!zram->table)
1236		return false;
1237
1238	zram->mem_pool = zs_create_pool(zram->disk->disk_name);
1239	if (!zram->mem_pool) {
1240		vfree(zram->table);
1241		return false;
1242	}
1243
1244	if (!huge_class_size)
1245		huge_class_size = zs_huge_class_size(zram->mem_pool);
1246#ifdef CONFIG_ZRAM_GROUP
1247	zram_group_init(zram, num_pages);
1248#endif
1249
1250	return true;
1251}
1252
1253/*
 * To protect against concurrent access to the same index entry, the
 * caller should hold this table index entry's bit_spinlock to
 * indicate that this index entry is being accessed.
1257 */
1258static void zram_free_page(struct zram *zram, size_t index)
1259{
1260	unsigned long handle;
1261
1262#ifdef CONFIG_ZRAM_GROUP
1263	zram_group_untrack_obj(zram, index);
1264#endif
1265
1266#ifdef CONFIG_ZRAM_MEMORY_TRACKING
1267	zram->table[index].ac_time = 0;
1268#endif
1269	if (zram_test_flag(zram, index, ZRAM_IDLE))
1270		zram_clear_flag(zram, index, ZRAM_IDLE);
1271
1272	if (zram_test_flag(zram, index, ZRAM_HUGE)) {
1273		zram_clear_flag(zram, index, ZRAM_HUGE);
1274		atomic64_dec(&zram->stats.huge_pages);
1275	}
1276
1277	if (zram_test_flag(zram, index, ZRAM_INCOMPRESSIBLE))
1278		zram_clear_flag(zram, index, ZRAM_INCOMPRESSIBLE);
1279
1280	zram_set_priority(zram, index, 0);
1281
1282	if (zram_test_flag(zram, index, ZRAM_WB)) {
1283		zram_clear_flag(zram, index, ZRAM_WB);
1284		free_block_bdev(zram, zram_get_element(zram, index));
1285		goto out;
1286	}
1287
1288	/*
	 * No memory is allocated for same-element-filled pages.
	 * Simply clear the same-page flag.
1291	 */
1292	if (zram_test_flag(zram, index, ZRAM_SAME)) {
1293		zram_clear_flag(zram, index, ZRAM_SAME);
1294		atomic64_dec(&zram->stats.same_pages);
1295		goto out;
1296	}
1297
1298	handle = zram_get_handle(zram, index);
1299	if (!handle)
1300		return;
1301
1302	zs_free(zram->mem_pool, handle);
1303
1304	atomic64_sub(zram_get_obj_size(zram, index),
1305			&zram->stats.compr_data_size);
1306out:
1307	atomic64_dec(&zram->stats.pages_stored);
1308	zram_set_handle(zram, index, 0);
1309	zram_set_obj_size(zram, index, 0);
1310	WARN_ON_ONCE(zram->table[index].flags &
1311		~(1UL << ZRAM_LOCK | 1UL << ZRAM_UNDER_WB));
1312}
1313
1314/*
1315 * Reads (decompresses if needed) a page from zspool (zsmalloc).
1316 * Corresponding ZRAM slot should be locked.
1317 */
1318static int zram_read_from_zspool(struct zram *zram, struct page *page,
1319				 u32 index)
1320{
1321	struct zcomp_strm *zstrm;
1322	unsigned long handle;
1323	unsigned int size;
1324	void *src, *dst;
1325	u32 prio;
1326	int ret;
1327
1328	handle = zram_get_handle(zram, index);
1329	if (!handle || zram_test_flag(zram, index, ZRAM_SAME)) {
1330		unsigned long value;
1331		void *mem;
1332
1333		value = handle ? zram_get_element(zram, index) : 0;
1334		mem = kmap_atomic(page);
1335		zram_fill_page(mem, PAGE_SIZE, value);
1336		kunmap_atomic(mem);
1337		return 0;
1338	}
1339
1340	size = zram_get_obj_size(zram, index);
1341
1342	if (size != PAGE_SIZE) {
1343		prio = zram_get_priority(zram, index);
1344		zstrm = zcomp_stream_get(zram->comps[prio]);
1345	}
1346
1347	src = zs_map_object(zram->mem_pool, handle, ZS_MM_RO);
1348	if (size == PAGE_SIZE) {
1349		dst = kmap_atomic(page);
1350		memcpy(dst, src, PAGE_SIZE);
1351		kunmap_atomic(dst);
1352		ret = 0;
1353	} else {
1354		dst = kmap_atomic(page);
1355		ret = zcomp_decompress(zstrm, src, size, dst);
1356		kunmap_atomic(dst);
1357		zcomp_stream_put(zram->comps[prio]);
1358	}
1359	zs_unmap_object(zram->mem_pool, handle);
1360	return ret;
1361}
1362
1363static int zram_read_page(struct zram *zram, struct page *page, u32 index,
1364			  struct bio *parent)
1365{
1366	int ret;
1367
1368	zram_slot_lock(zram, index);
1369#ifdef CONFIG_ZRAM_GROUP_WRITEBACK
1370	if (!parent) {
1371		ret = zram_group_fault_obj(zram, index);
1372		if (ret) {
1373			zram_slot_unlock(zram, index);
1374			return ret;
1375		}
1376	}
1377
1378	if (zram_test_flag(zram, index, ZRAM_GWB)) {
1379		zram_slot_unlock(zram, index);
1380		return -EIO;
1381	}
1382#endif
1383	if (!zram_test_flag(zram, index, ZRAM_WB)) {
		/* The slot should be locked throughout the function call */
1385		ret = zram_read_from_zspool(zram, page, index);
1386		zram_slot_unlock(zram, index);
1387	} else {
1388		/*
1389		 * The slot should be unlocked before reading from the backing
1390		 * device.
1391		 */
1392		zram_slot_unlock(zram, index);
1393
1394		ret = read_from_bdev(zram, page, zram_get_element(zram, index),
1395				     parent);
1396	}
1397
1398	/* Should NEVER happen. Return bio error if it does. */
1399	if (WARN_ON(ret < 0))
1400		pr_err("Decompression failed! err=%d, page=%u\n", ret, index);
1401
1402	return ret;
1403}
1404
1405/*
1406 * Use a temporary buffer to decompress the page, as the decompressor
1407 * always expects a full page for the output.
1408 */
1409static int zram_bvec_read_partial(struct zram *zram, struct bio_vec *bvec,
1410				  u32 index, int offset)
1411{
1412	struct page *page = alloc_page(GFP_NOIO);
1413	int ret;
1414
1415	if (!page)
1416		return -ENOMEM;
1417	ret = zram_read_page(zram, page, index, NULL);
1418	if (likely(!ret))
1419		memcpy_to_bvec(bvec, page_address(page) + offset);
1420	__free_page(page);
1421	return ret;
1422}
1423
1424static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
1425			  u32 index, int offset, struct bio *bio)
1426{
1427	if (is_partial_io(bvec))
1428		return zram_bvec_read_partial(zram, bvec, index, offset);
1429	return zram_read_page(zram, bvec->bv_page, index, bio);
1430}
1431
1432static int zram_write_page(struct zram *zram, struct page *page, u32 index)
1433{
1434	int ret = 0;
1435	unsigned long alloced_pages;
1436	unsigned long handle = -ENOMEM;
1437	unsigned int comp_len = 0;
1438	void *src, *dst, *mem;
1439	struct zcomp_strm *zstrm;
1440	unsigned long element = 0;
1441	enum zram_pageflags flags = 0;
1442
1443	mem = kmap_atomic(page);
1444	if (page_same_filled(mem, &element)) {
1445		kunmap_atomic(mem);
1446		/* Free memory associated with this sector now. */
1447		flags = ZRAM_SAME;
1448		atomic64_inc(&zram->stats.same_pages);
1449		goto out;
1450	}
1451	kunmap_atomic(mem);
1452
1453compress_again:
1454	zstrm = zcomp_stream_get(zram->comps[ZRAM_PRIMARY_COMP]);
1455	src = kmap_atomic(page);
1456	ret = zcomp_compress(zstrm, src, &comp_len);
1457	kunmap_atomic(src);
1458
1459	if (unlikely(ret)) {
1460		zcomp_stream_put(zram->comps[ZRAM_PRIMARY_COMP]);
1461		pr_err("Compression failed! err=%d\n", ret);
1462		zs_free(zram->mem_pool, handle);
1463		return ret;
1464	}
1465
1466	if (comp_len >= huge_class_size)
1467		comp_len = PAGE_SIZE;
1468	/*
1469	 * handle allocation has 2 paths:
1470	 * a) fast path is executed with preemption disabled (for
1471	 *  per-cpu streams) and has __GFP_DIRECT_RECLAIM bit clear,
1472	 *  since we can't sleep;
	 * b) slow path enables preemption and attempts to allocate
	 *  the page with the __GFP_DIRECT_RECLAIM bit set. We have to
	 *  put the per-cpu compression stream and, thus, re-do
	 *  the compression once the handle is allocated.
	 *
	 * If we have a 'non-null' handle here then we are coming
	 * from the slow path and the handle has already been allocated.
1480	 */
1481	if (IS_ERR_VALUE(handle))
1482		handle = zs_malloc(zram->mem_pool, comp_len,
1483				__GFP_KSWAPD_RECLAIM |
1484				__GFP_NOWARN |
1485				__GFP_HIGHMEM |
1486				__GFP_MOVABLE);
1487	if (IS_ERR_VALUE(handle)) {
1488		zcomp_stream_put(zram->comps[ZRAM_PRIMARY_COMP]);
1489		atomic64_inc(&zram->stats.writestall);
1490		handle = zs_malloc(zram->mem_pool, comp_len,
1491				GFP_NOIO | __GFP_HIGHMEM |
1492				__GFP_MOVABLE);
1493		if (IS_ERR_VALUE(handle))
1494			return PTR_ERR((void *)handle);
1495
1496		if (comp_len != PAGE_SIZE)
1497			goto compress_again;
1498		/*
		 * If the page is not compressible, we still need to acquire
		 * the lock and execute the code below. The zcomp_stream_get()
		 * call is needed to disable cpu hotplug and grab the zstrm
		 * buffer back, so that the dereference of the zstrm variable
		 * below is valid.
1504		 */
1505		zstrm = zcomp_stream_get(zram->comps[ZRAM_PRIMARY_COMP]);
1506	}
1507
1508	alloced_pages = zs_get_total_pages(zram->mem_pool);
1509	update_used_max(zram, alloced_pages);
1510
1511	if (zram->limit_pages && alloced_pages > zram->limit_pages) {
1512		zcomp_stream_put(zram->comps[ZRAM_PRIMARY_COMP]);
1513		zs_free(zram->mem_pool, handle);
1514		return -ENOMEM;
1515	}
1516
1517	dst = zs_map_object(zram->mem_pool, handle, ZS_MM_WO);
1518
1519	src = zstrm->buffer;
1520	if (comp_len == PAGE_SIZE)
1521		src = kmap_atomic(page);
1522	memcpy(dst, src, comp_len);
1523	if (comp_len == PAGE_SIZE)
1524		kunmap_atomic(src);
1525
1526	zcomp_stream_put(zram->comps[ZRAM_PRIMARY_COMP]);
1527	zs_unmap_object(zram->mem_pool, handle);
1528	atomic64_add(comp_len, &zram->stats.compr_data_size);
1529out:
1530	/*
1531	 * Free memory associated with this sector
1532	 * before overwriting unused sectors.
1533	 */
1534	zram_slot_lock(zram, index);
1535	zram_free_page(zram, index);
1536
1537	if (comp_len == PAGE_SIZE) {
1538		zram_set_flag(zram, index, ZRAM_HUGE);
1539		atomic64_inc(&zram->stats.huge_pages);
1540		atomic64_inc(&zram->stats.huge_pages_since);
1541	}
1542
1543	if (flags) {
1544		zram_set_flag(zram, index, flags);
1545		zram_set_element(zram, index, element);
	} else {
1547		zram_set_handle(zram, index, handle);
1548		zram_set_obj_size(zram, index, comp_len);
1549	}
1550#ifdef CONFIG_ZRAM_GROUP
1551	zram_group_track_obj(zram, index, page_memcg(page));
1552#endif
1553	zram_slot_unlock(zram, index);
1554
1555	/* Update stats */
1556	atomic64_inc(&zram->stats.pages_stored);
1557	return ret;
1558}
1559
1560/*
1561 * This is a partial IO. Read the full page before writing the changes.
1562 */
1563static int zram_bvec_write_partial(struct zram *zram, struct bio_vec *bvec,
1564				   u32 index, int offset, struct bio *bio)
1565{
1566	struct page *page = alloc_page(GFP_NOIO);
1567	int ret;
1568
1569	if (!page)
1570		return -ENOMEM;
1571
1572	ret = zram_read_page(zram, page, index, bio);
1573	if (!ret) {
1574		memcpy_from_bvec(page_address(page) + offset, bvec);
1575		ret = zram_write_page(zram, page, index);
1576	}
1577	__free_page(page);
1578	return ret;
1579}
1580
1581static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec,
1582			   u32 index, int offset, struct bio *bio)
1583{
1584	if (is_partial_io(bvec))
1585		return zram_bvec_write_partial(zram, bvec, index, offset, bio);
1586	return zram_write_page(zram, bvec->bv_page, index);
1587}
1588
1589#ifdef CONFIG_ZRAM_MULTI_COMP
1590/*
1591 * This function will decompress (unless it's ZRAM_HUGE) the page and then
1592 * attempt to compress it using provided compression algorithm priority
1593 * (which is potentially more effective).
1594 *
1595 * Corresponding ZRAM slot should be locked.
1596 */
1597static int zram_recompress(struct zram *zram, u32 index, struct page *page,
1598			   u32 threshold, u32 prio, u32 prio_max)
1599{
1600	struct zcomp_strm *zstrm = NULL;
1601	unsigned long handle_old;
1602	unsigned long handle_new;
1603	unsigned int comp_len_old;
1604	unsigned int comp_len_new;
1605	unsigned int class_index_old;
1606	unsigned int class_index_new;
1607	u32 num_recomps = 0;
1608	void *src, *dst;
1609	int ret;
1610
1611	handle_old = zram_get_handle(zram, index);
1612	if (!handle_old)
1613		return -EINVAL;
1614
1615	comp_len_old = zram_get_obj_size(zram, index);
1616	/*
1617	 * Do not recompress objects that are already "small enough".
1618	 */
1619	if (comp_len_old < threshold)
1620		return 0;
1621
1622	ret = zram_read_from_zspool(zram, page, index);
1623	if (ret)
1624		return ret;
1625
1626	class_index_old = zs_lookup_class_index(zram->mem_pool, comp_len_old);
1627	/*
1628	 * Iterate the secondary comp algorithms list (in order of priority)
1629	 * and try to recompress the page.
1630	 */
1631	for (; prio < prio_max; prio++) {
1632		if (!zram->comps[prio])
1633			continue;
1634
1635		/*
1636		 * Skip if the object is already re-compressed with a higher
1637		 * priority algorithm (or same algorithm).
1638		 */
1639		if (prio <= zram_get_priority(zram, index))
1640			continue;
1641
1642		num_recomps++;
1643		zstrm = zcomp_stream_get(zram->comps[prio]);
1644		src = kmap_atomic(page);
1645		ret = zcomp_compress(zstrm, src, &comp_len_new);
1646		kunmap_atomic(src);
1647
1648		if (ret) {
1649			zcomp_stream_put(zram->comps[prio]);
1650			return ret;
1651		}
1652
1653		class_index_new = zs_lookup_class_index(zram->mem_pool,
1654							comp_len_new);
1655
1656		/* Continue until we make progress */
1657		if (class_index_new >= class_index_old ||
1658		    (threshold && comp_len_new >= threshold)) {
1659			zcomp_stream_put(zram->comps[prio]);
1660			continue;
1661		}
1662
1663		/* Recompression was successful so break out */
1664		break;
1665	}
1666
1667	/*
1668	 * We did not try to recompress, e.g. when we have only one
1669	 * secondary algorithm and the page is already recompressed
	 * using that algorithm.
1671	 */
1672	if (!zstrm)
1673		return 0;
1674
1675	if (class_index_new >= class_index_old) {
1676		/*
1677		 * Secondary algorithms failed to re-compress the page
1678		 * in a way that would save memory, mark the object as
1679		 * incompressible so that we will not try to compress
1680		 * it again.
1681		 *
1682		 * We need to make sure that all secondary algorithms have
1683		 * failed, so we test if the number of recompressions matches
1684		 * the number of active secondary algorithms.
1685		 */
1686		if (num_recomps == zram->num_active_comps - 1)
1687			zram_set_flag(zram, index, ZRAM_INCOMPRESSIBLE);
1688		return 0;
1689	}
1690
1691	/* Successful recompression but above threshold */
1692	if (threshold && comp_len_new >= threshold)
1693		return 0;
1694
1695	/*
	 * No direct reclaim (slow path) for handle allocation and no
	 * re-compression attempt (unlike in zram_bvec_write()) since
	 * we have already stored that object in zsmalloc. If we cannot
	 * allocate memory for the recompressed object then we bail out and
	 * simply keep the old (existing) object in zsmalloc.
1701	 */
1702	handle_new = zs_malloc(zram->mem_pool, comp_len_new,
1703			       __GFP_KSWAPD_RECLAIM |
1704			       __GFP_NOWARN |
1705			       __GFP_HIGHMEM |
1706			       __GFP_MOVABLE);
1707	if (IS_ERR_VALUE(handle_new)) {
1708		zcomp_stream_put(zram->comps[prio]);
1709		return PTR_ERR((void *)handle_new);
1710	}
1711
1712	dst = zs_map_object(zram->mem_pool, handle_new, ZS_MM_WO);
1713	memcpy(dst, zstrm->buffer, comp_len_new);
1714	zcomp_stream_put(zram->comps[prio]);
1715
1716	zs_unmap_object(zram->mem_pool, handle_new);
1717
1718	zram_free_page(zram, index);
1719	zram_set_handle(zram, index, handle_new);
1720	zram_set_obj_size(zram, index, comp_len_new);
1721	zram_set_priority(zram, index, prio);
1722
1723	atomic64_add(comp_len_new, &zram->stats.compr_data_size);
1724	atomic64_inc(&zram->stats.pages_stored);
1725
1726	return 0;
1727}
1728
1729#define RECOMPRESS_IDLE		(1 << 0)
1730#define RECOMPRESS_HUGE		(1 << 1)
1731
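/*
 * Accepted parameters are "type=" (idle, huge, huge_idle), "threshold="
 * (objects with a compressed size below it are not touched) and an optional
 * "algo=" to target one registered secondary algorithm, e.g. (sketch, the
 * algorithm name is just an example):
 *	echo "type=huge_idle threshold=400 algo=zstd" > /sys/block/zramX/recompress
 */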
1732static ssize_t recompress_store(struct device *dev,
1733				struct device_attribute *attr,
1734				const char *buf, size_t len)
1735{
1736	u32 prio = ZRAM_SECONDARY_COMP, prio_max = ZRAM_MAX_COMPS;
1737	struct zram *zram = dev_to_zram(dev);
1738	unsigned long nr_pages = zram->disksize >> PAGE_SHIFT;
1739	char *args, *param, *val, *algo = NULL;
1740	u32 mode = 0, threshold = 0;
1741	unsigned long index;
1742	struct page *page;
1743	ssize_t ret;
1744
1745	args = skip_spaces(buf);
1746	while (*args) {
1747		args = next_arg(args, &param, &val);
1748
1749		if (!val || !*val)
1750			return -EINVAL;
1751
1752		if (!strcmp(param, "type")) {
1753			if (!strcmp(val, "idle"))
1754				mode = RECOMPRESS_IDLE;
1755			if (!strcmp(val, "huge"))
1756				mode = RECOMPRESS_HUGE;
1757			if (!strcmp(val, "huge_idle"))
1758				mode = RECOMPRESS_IDLE | RECOMPRESS_HUGE;
1759			continue;
1760		}
1761
1762		if (!strcmp(param, "threshold")) {
1763			/*
			 * We will re-compress only objects equal to or
			 * greater in size than this threshold.
1766			 */
1767			ret = kstrtouint(val, 10, &threshold);
1768			if (ret)
1769				return ret;
1770			continue;
1771		}
1772
1773		if (!strcmp(param, "algo")) {
1774			algo = val;
1775			continue;
1776		}
1777	}
1778
1779	if (threshold >= huge_class_size)
1780		return -EINVAL;
1781
1782	down_read(&zram->init_lock);
1783	if (!init_done(zram)) {
1784		ret = -EINVAL;
1785		goto release_init_lock;
1786	}
1787
1788	if (algo) {
1789		bool found = false;
1790
1791		for (; prio < ZRAM_MAX_COMPS; prio++) {
1792			if (!zram->comp_algs[prio])
1793				continue;
1794
1795			if (!strcmp(zram->comp_algs[prio], algo)) {
1796				prio_max = min(prio + 1, ZRAM_MAX_COMPS);
1797				found = true;
1798				break;
1799			}
1800		}
1801
1802		if (!found) {
1803			ret = -EINVAL;
1804			goto release_init_lock;
1805		}
1806	}
1807
1808	page = alloc_page(GFP_KERNEL);
1809	if (!page) {
1810		ret = -ENOMEM;
1811		goto release_init_lock;
1812	}
1813
1814	ret = len;
1815	for (index = 0; index < nr_pages; index++) {
1816		int err = 0;
1817
1818		zram_slot_lock(zram, index);
1819
1820		if (!zram_allocated(zram, index))
1821			goto next;
1822
1823		if (mode & RECOMPRESS_IDLE &&
1824		    !zram_test_flag(zram, index, ZRAM_IDLE))
1825			goto next;
1826
1827		if (mode & RECOMPRESS_HUGE &&
1828		    !zram_test_flag(zram, index, ZRAM_HUGE))
1829			goto next;
1830
1831		if (zram_test_flag(zram, index, ZRAM_WB) ||
1832		    zram_test_flag(zram, index, ZRAM_UNDER_WB) ||
1833		    zram_test_flag(zram, index, ZRAM_SAME) ||
1834		    zram_test_flag(zram, index, ZRAM_INCOMPRESSIBLE))
1835			goto next;
1836
1837		err = zram_recompress(zram, index, page, threshold,
1838				      prio, prio_max);
1839next:
1840		zram_slot_unlock(zram, index);
1841		if (err) {
1842			ret = err;
1843			break;
1844		}
1845
1846		cond_resched();
1847	}
1848
1849	__free_page(page);
1850
1851release_init_lock:
1852	up_read(&zram->init_lock);
1853	return ret;
1854}
1855#endif
1856
1857static void zram_bio_discard(struct zram *zram, struct bio *bio)
1858{
1859	size_t n = bio->bi_iter.bi_size;
1860	u32 index = bio->bi_iter.bi_sector >> SECTORS_PER_PAGE_SHIFT;
1861	u32 offset = (bio->bi_iter.bi_sector & (SECTORS_PER_PAGE - 1)) <<
1862			SECTOR_SHIFT;
1863
1864	/*
	 * zram manages data in physical block size units. Because the logical
	 * block size isn't identical to the physical block size on some
	 * architectures, we could get a discard request pointing to a specific
	 * offset within a certain physical block. Although we could handle this
	 * request by reading that physical block, decompressing, partially
	 * zeroing, re-compressing and then re-storing it, this isn't reasonable
	 * because our intent with a discard request is to save memory. So
	 * skipping this logical block is appropriate here.
1873	 */
1874	if (offset) {
1875		if (n <= (PAGE_SIZE - offset))
1876			return;
1877
1878		n -= (PAGE_SIZE - offset);
1879		index++;
1880	}
1881
1882	while (n >= PAGE_SIZE) {
1883		zram_slot_lock(zram, index);
1884		zram_free_page(zram, index);
1885		zram_slot_unlock(zram, index);
1886		atomic64_inc(&zram->stats.notify_free);
1887		index++;
1888		n -= PAGE_SIZE;
1889	}
1890
1891	bio_endio(bio);
1892}
1893
1894static void zram_bio_read(struct zram *zram, struct bio *bio)
1895{
1896	unsigned long start_time = bio_start_io_acct(bio);
1897	struct bvec_iter iter = bio->bi_iter;
1898
1899	do {
1900		u32 index = iter.bi_sector >> SECTORS_PER_PAGE_SHIFT;
1901		u32 offset = (iter.bi_sector & (SECTORS_PER_PAGE - 1)) <<
1902				SECTOR_SHIFT;
1903		struct bio_vec bv = bio_iter_iovec(bio, iter);
1904
1905		bv.bv_len = min_t(u32, bv.bv_len, PAGE_SIZE - offset);
1906
1907		if (zram_bvec_read(zram, &bv, index, offset, bio) < 0) {
1908			atomic64_inc(&zram->stats.failed_reads);
1909			bio->bi_status = BLK_STS_IOERR;
1910			break;
1911		}
1912		flush_dcache_page(bv.bv_page);
1913
1914		zram_slot_lock(zram, index);
1915		zram_accessed(zram, index);
1916		zram_slot_unlock(zram, index);
1917
1918		bio_advance_iter_single(bio, &iter, bv.bv_len);
1919	} while (iter.bi_size);
1920
1921	bio_end_io_acct(bio, start_time);
1922	bio_endio(bio);
1923}
1924
1925static void zram_bio_write(struct zram *zram, struct bio *bio)
1926{
1927	unsigned long start_time = bio_start_io_acct(bio);
1928	struct bvec_iter iter = bio->bi_iter;
1929
1930	do {
1931		u32 index = iter.bi_sector >> SECTORS_PER_PAGE_SHIFT;
1932		u32 offset = (iter.bi_sector & (SECTORS_PER_PAGE - 1)) <<
1933				SECTOR_SHIFT;
1934		struct bio_vec bv = bio_iter_iovec(bio, iter);
1935
1936		bv.bv_len = min_t(u32, bv.bv_len, PAGE_SIZE - offset);
1937
1938		if (zram_bvec_write(zram, &bv, index, offset, bio) < 0) {
1939			atomic64_inc(&zram->stats.failed_writes);
1940			bio->bi_status = BLK_STS_IOERR;
1941			break;
1942		}
1943
1944		zram_slot_lock(zram, index);
1945		zram_accessed(zram, index);
1946		zram_slot_unlock(zram, index);
1947
1948		bio_advance_iter_single(bio, &iter, bv.bv_len);
1949	} while (iter.bi_size);
1950
1951	bio_end_io_acct(bio, start_time);
1952	bio_endio(bio);
1953}
1954
1955/*
1956 * Handler function for all zram I/O requests.
1957 */
1958static void zram_submit_bio(struct bio *bio)
1959{
1960	struct zram *zram = bio->bi_bdev->bd_disk->private_data;
1961
1962	switch (bio_op(bio)) {
1963	case REQ_OP_READ:
1964		zram_bio_read(zram, bio);
1965		break;
1966	case REQ_OP_WRITE:
1967		zram_bio_write(zram, bio);
1968		break;
1969	case REQ_OP_DISCARD:
1970	case REQ_OP_WRITE_ZEROES:
1971		zram_bio_discard(zram, bio);
1972		break;
1973	default:
1974		WARN_ON_ONCE(1);
1975		bio_endio(bio);
1976	}
1977}
1978
1979static void zram_slot_free_notify(struct block_device *bdev,
1980				unsigned long index)
1981{
1982	struct zram *zram;
1983
1984	zram = bdev->bd_disk->private_data;
1985
1986	atomic64_inc(&zram->stats.notify_free);
1987	if (!zram_slot_trylock(zram, index)) {
1988		atomic64_inc(&zram->stats.miss_free);
1989		return;
1990	}
1991
1992	zram_free_page(zram, index);
1993	zram_slot_unlock(zram, index);
1994}
1995
1996static void zram_destroy_comps(struct zram *zram)
1997{
1998	u32 prio;
1999
2000	for (prio = 0; prio < ZRAM_MAX_COMPS; prio++) {
2001		struct zcomp *comp = zram->comps[prio];
2002
2003		zram->comps[prio] = NULL;
2004		if (!comp)
2005			continue;
2006		zcomp_destroy(comp);
2007		zram->num_active_comps--;
2008	}
2009}
2010
2011static void zram_reset_device(struct zram *zram)
2012{
2013	down_write(&zram->init_lock);
2014
2015	zram->limit_pages = 0;
2016
2017	if (!init_done(zram)) {
2018		up_write(&zram->init_lock);
2019		return;
2020	}
2021
2022	set_capacity_and_notify(zram->disk, 0);
2023	part_stat_set_all(zram->disk->part0, 0);
2024
	/* I/O operations on all CPUs are done, so let's free */
2026	zram_meta_free(zram, zram->disksize);
2027	zram->disksize = 0;
2028	zram_destroy_comps(zram);
2029	memset(&zram->stats, 0, sizeof(zram->stats));
2030	reset_bdev(zram);
2031
2032	comp_algorithm_set(zram, ZRAM_PRIMARY_COMP, default_compressor);
2033	up_write(&zram->init_lock);
2034}
2035
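/*
 * disksize is parsed with memparse() and rounded up to a page multiple; it
 * can only be set while the device is uninitialized, e.g.:
 *	echo 1G > /sys/block/zramX/disksize
 */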
2036static ssize_t disksize_store(struct device *dev,
2037		struct device_attribute *attr, const char *buf, size_t len)
2038{
2039	u64 disksize;
2040	struct zcomp *comp;
2041	struct zram *zram = dev_to_zram(dev);
2042	int err;
2043	u32 prio;
2044
2045	disksize = memparse(buf, NULL);
2046	if (!disksize)
2047		return -EINVAL;
2048
2049	down_write(&zram->init_lock);
2050	if (init_done(zram)) {
2051		pr_info("Cannot change disksize for initialized device\n");
2052		err = -EBUSY;
2053		goto out_unlock;
2054	}
2055
2056	disksize = PAGE_ALIGN(disksize);
2057	if (!zram_meta_alloc(zram, disksize)) {
2058		err = -ENOMEM;
2059		goto out_unlock;
2060	}
2061
2062	for (prio = 0; prio < ZRAM_MAX_COMPS; prio++) {
2063		if (!zram->comp_algs[prio])
2064			continue;
2065
2066		comp = zcomp_create(zram->comp_algs[prio]);
2067		if (IS_ERR(comp)) {
2068			pr_err("Cannot initialise %s compressing backend\n",
2069			       zram->comp_algs[prio]);
2070			err = PTR_ERR(comp);
2071			goto out_free_comps;
2072		}
2073
2074		zram->comps[prio] = comp;
2075		zram->num_active_comps++;
2076	}
2077	zram->disksize = disksize;
2078	set_capacity_and_notify(zram->disk, zram->disksize >> SECTOR_SHIFT);
2079	up_write(&zram->init_lock);
2080
2081	return len;
2082
2083out_free_comps:
2084	zram_destroy_comps(zram);
2085	zram_meta_free(zram, disksize);
2086out_unlock:
2087	up_write(&zram->init_lock);
2088	return err;
2089}
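
/*
 * Usage sketch (example values): the disk size is configured once before
 * first use, e.g.
 *
 *	echo 2G > /sys/block/zram0/disksize
 *
 * memparse() understands the usual K/M/G suffixes and the result is rounded
 * up to a PAGE_SIZE multiple above. Resizing an initialized device requires
 * a reset first (see reset_store() below).
 */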
2090
2091static ssize_t reset_store(struct device *dev,
2092		struct device_attribute *attr, const char *buf, size_t len)
2093{
2094	int ret;
2095	unsigned short do_reset;
2096	struct zram *zram;
2097	struct gendisk *disk;
2098
2099	ret = kstrtou16(buf, 10, &do_reset);
2100	if (ret)
2101		return ret;
2102
2103	if (!do_reset)
2104		return -EINVAL;
2105
2106	zram = dev_to_zram(dev);
2107	disk = zram->disk;
2108
2109	mutex_lock(&disk->open_mutex);
2110	/* Do not reset an active device or claimed device */
2111	if (disk_openers(disk) || zram->claim) {
2112		mutex_unlock(&disk->open_mutex);
2113		return -EBUSY;
2114	}
2115
2116	/* From now on, no one can open /dev/zram[0-9] */
2117	zram->claim = true;
2118	mutex_unlock(&disk->open_mutex);
2119
2120	/* Make sure all pending I/O has finished */
2121	sync_blockdev(disk->part0);
2122	zram_reset_device(zram);
2123
2124	mutex_lock(&disk->open_mutex);
2125	zram->claim = false;
2126	mutex_unlock(&disk->open_mutex);
2127
2128	return len;
2129}
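
/*
 * Usage sketch (example values): writing any non-zero value resets the
 * device, e.g.
 *
 *	echo 1 > /sys/block/zram0/reset
 *
 * This frees all stored pages and clears disksize; it fails with -EBUSY
 * while the device is still open (e.g. mounted or active as swap).
 */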
2130
2131static int zram_open(struct gendisk *disk, blk_mode_t mode)
2132{
2133	struct zram *zram = disk->private_data;
2134
2135	WARN_ON(!mutex_is_locked(&disk->open_mutex));
2136
2137	/* zram was claimed for reset, so fail the open request */
2138	if (zram->claim)
2139		return -EBUSY;
2140	return 0;
2141}
2142
2143static const struct block_device_operations zram_devops = {
2144	.open = zram_open,
2145	.submit_bio = zram_submit_bio,
2146	.swap_slot_free_notify = zram_slot_free_notify,
2147	.owner = THIS_MODULE
2148};
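
/*
 * Note: providing ->submit_bio makes zram a bio-based driver, so bios are
 * handled (largely synchronously) in zram_submit_bio() instead of going
 * through the block layer's request queue machinery.
 */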
2149
2150static DEVICE_ATTR_WO(compact);
2151static DEVICE_ATTR_RW(disksize);
2152static DEVICE_ATTR_RO(initstate);
2153static DEVICE_ATTR_WO(reset);
2154static DEVICE_ATTR_WO(mem_limit);
2155static DEVICE_ATTR_WO(mem_used_max);
2156static DEVICE_ATTR_WO(idle);
2157static DEVICE_ATTR_RW(max_comp_streams);
2158static DEVICE_ATTR_RW(comp_algorithm);
2159#ifdef CONFIG_ZRAM_WRITEBACK
2160static DEVICE_ATTR_RW(backing_dev);
2161static DEVICE_ATTR_WO(writeback);
2162static DEVICE_ATTR_RW(writeback_limit);
2163static DEVICE_ATTR_RW(writeback_limit_enable);
2164#endif
2165#ifdef CONFIG_ZRAM_MULTI_COMP
2166static DEVICE_ATTR_RW(recomp_algorithm);
2167static DEVICE_ATTR_WO(recompress);
2168#endif
2169#ifdef CONFIG_ZRAM_GROUP
2170static DEVICE_ATTR_RW(group);
2171#endif
2172
2173static struct attribute *zram_disk_attrs[] = {
2174	&dev_attr_disksize.attr,
2175	&dev_attr_initstate.attr,
2176	&dev_attr_reset.attr,
2177	&dev_attr_compact.attr,
2178	&dev_attr_mem_limit.attr,
2179	&dev_attr_mem_used_max.attr,
2180	&dev_attr_idle.attr,
2181	&dev_attr_max_comp_streams.attr,
2182	&dev_attr_comp_algorithm.attr,
2183#ifdef CONFIG_ZRAM_WRITEBACK
2184	&dev_attr_backing_dev.attr,
2185	&dev_attr_writeback.attr,
2186	&dev_attr_writeback_limit.attr,
2187	&dev_attr_writeback_limit_enable.attr,
2188#endif
2189	&dev_attr_io_stat.attr,
2190	&dev_attr_mm_stat.attr,
2191#ifdef CONFIG_ZRAM_WRITEBACK
2192	&dev_attr_bd_stat.attr,
2193#endif
2194	&dev_attr_debug_stat.attr,
2195#ifdef CONFIG_ZRAM_MULTI_COMP
2196	&dev_attr_recomp_algorithm.attr,
2197	&dev_attr_recompress.attr,
2198#endif
2199#ifdef CONFIG_ZRAM_GROUP
2200	&dev_attr_group.attr,
2201#endif
2202	NULL,
2203};
2204
2205ATTRIBUTE_GROUPS(zram_disk);
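
/*
 * ATTRIBUTE_GROUPS() generates the zram_disk_groups array passed to
 * device_add_disk() in zram_add(), so the attributes above show up under
 * /sys/block/zram<id>/.
 */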
2206
2207/*
2208 * Allocate and initialize a new zram device. Returns a device_id (>= 0)
2209 * on success, or a negative error code otherwise.
2210 */
2211static int zram_add(void)
2212{
2213	struct zram *zram;
2214	int ret, device_id;
2215
2216	zram = kzalloc(sizeof(struct zram), GFP_KERNEL);
2217	if (!zram)
2218		return -ENOMEM;
2219
2220	ret = idr_alloc(&zram_index_idr, zram, 0, 0, GFP_KERNEL);
2221	if (ret < 0)
2222		goto out_free_dev;
2223	device_id = ret;
2224
2225	init_rwsem(&zram->init_lock);
2226#ifdef CONFIG_ZRAM_WRITEBACK
2227	spin_lock_init(&zram->wb_limit_lock);
2228#endif
2229
2230	/* gendisk structure */
2231	zram->disk = blk_alloc_disk(NUMA_NO_NODE);
2232	if (!zram->disk) {
2233		pr_err("Error allocating disk structure for device %d\n",
2234			device_id);
2235		ret = -ENOMEM;
2236		goto out_free_idr;
2237	}
2238
2239	zram->disk->major = zram_major;
2240	zram->disk->first_minor = device_id;
2241	zram->disk->minors = 1;
2242	zram->disk->flags |= GENHD_FL_NO_PART;
2243	zram->disk->fops = &zram_devops;
2244	zram->disk->private_data = zram;
2245	snprintf(zram->disk->disk_name, 16, "zram%d", device_id);
2246
2247	/* Actual capacity is set via sysfs (/sys/block/zram<id>/disksize) */
2248	set_capacity(zram->disk, 0);
2249	/* zram devices resemble non-rotational disks */
2250	blk_queue_flag_set(QUEUE_FLAG_NONROT, zram->disk->queue);
2251	blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, zram->disk->queue);
2252
2253	/*
2254	 * Ensure that we always receive PAGE_SIZE-aligned I/O requests
2255	 * whose size is a multiple of PAGE_SIZE.
2256	 */
2257	blk_queue_physical_block_size(zram->disk->queue, PAGE_SIZE);
2258	blk_queue_logical_block_size(zram->disk->queue,
2259					ZRAM_LOGICAL_BLOCK_SIZE);
2260	blk_queue_io_min(zram->disk->queue, PAGE_SIZE);
2261	blk_queue_io_opt(zram->disk->queue, PAGE_SIZE);
2262	zram->disk->queue->limits.discard_granularity = PAGE_SIZE;
2263	blk_queue_max_discard_sectors(zram->disk->queue, UINT_MAX);
2264
2265	/*
2266	 * zram_bio_discard() clears all logical blocks when the logical block
2267	 * size is identical to the physical block size (PAGE_SIZE). When the
2268	 * two sizes differ, the parts of the request range that are not
2269	 * aligned to the physical block size are skipped when discarding,
2270	 * so we cannot guarantee that all discarded logical blocks are
2271	 * zeroed.
2272	 */
2273	if (ZRAM_LOGICAL_BLOCK_SIZE == PAGE_SIZE)
2274		blk_queue_max_write_zeroes_sectors(zram->disk->queue, UINT_MAX);
2275
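	/*
	 * Note on stable writes: data is compressed straight out of the
	 * source page, so the page must not change while a write is in
	 * flight; otherwise the stored object could disagree with what the
	 * owner later expects to read back.
	 */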
2276	blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, zram->disk->queue);
2277	ret = device_add_disk(NULL, zram->disk, zram_disk_groups);
2278	if (ret)
2279		goto out_cleanup_disk;
2280
2281	comp_algorithm_set(zram, ZRAM_PRIMARY_COMP, default_compressor);
2282
2283	zram_debugfs_register(zram);
2284	pr_info("Added device: %s\n", zram->disk->disk_name);
2285	return device_id;
2286
2287out_cleanup_disk:
2288	put_disk(zram->disk);
2289out_free_idr:
2290	idr_remove(&zram_index_idr, device_id);
2291out_free_dev:
2292	kfree(zram);
2293	return ret;
2294}
2295
2296static int zram_remove(struct zram *zram)
2297{
2298	bool claimed;
2299
2300	mutex_lock(&zram->disk->open_mutex);
2301	if (disk_openers(zram->disk)) {
2302		mutex_unlock(&zram->disk->open_mutex);
2303		return -EBUSY;
2304	}
2305
2306	claimed = zram->claim;
2307	if (!claimed)
2308		zram->claim = true;
2309	mutex_unlock(&zram->disk->open_mutex);
2310
2311	zram_debugfs_unregister(zram);
2312
2313	if (claimed) {
2314		/*
2315		 * If we were claimed by reset_store(), del_gendisk() will
2316		 * wait until reset_store() is done, so there is nothing to do.
2317		 */
2318		;
2319	} else {
2320		/* Make sure all pending I/O has finished */
2321		sync_blockdev(zram->disk->part0);
2322		zram_reset_device(zram);
2323	}
2324
2325	pr_info("Removed device: %s\n", zram->disk->disk_name);
2326
2327	del_gendisk(zram->disk);
2328
2329	/* del_gendisk drains pending reset_store */
2330	WARN_ON_ONCE(claimed && zram->claim);
2331
2332	/*
2333	 * disksize_store() may be called in between zram_reset_device()
2334	 * and del_gendisk(), so run a final reset to avoid leaking
2335	 * anything allocated with disksize_store().
2336	 */
2337	zram_reset_device(zram);
2338
2339	put_disk(zram->disk);
2340	kfree(zram);
2341	return 0;
2342}
2343
2344/* zram-control sysfs attributes */
2345
2346/*
2347 * NOTE: hot_add is not a usual read-only sysfs attribute, in the sense
2348 * that reading from this file does alter the state of the system: it
2349 * creates a new, uninitialized zram device and returns that device's
2350 * device_id (or an error code if it fails to create a new device).
2351 */
2352static ssize_t hot_add_show(const struct class *class,
2353			const struct class_attribute *attr,
2354			char *buf)
2355{
2356	int ret;
2357
2358	mutex_lock(&zram_index_mutex);
2359	ret = zram_add();
2360	mutex_unlock(&zram_index_mutex);
2361
2362	if (ret < 0)
2363		return ret;
2364	return scnprintf(buf, PAGE_SIZE, "%d\n", ret);
2365}
2366/* This attribute must be set to 0400, so CLASS_ATTR_RO() cannot be used */
2367static struct class_attribute class_attr_hot_add =
2368	__ATTR(hot_add, 0400, hot_add_show, NULL);
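
/*
 * Usage sketch (example output): reading the file allocates a new device
 * and prints its id, e.g.
 *
 *	cat /sys/class/zram-control/hot_add
 *	4
 *
 * after which /dev/zram4 exists and can be configured through its sysfs
 * attributes.
 */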
2369
2370static ssize_t hot_remove_store(const struct class *class,
2371			const struct class_attribute *attr,
2372			const char *buf,
2373			size_t count)
2374{
2375	struct zram *zram;
2376	int ret, dev_id;
2377
2378	/* dev_id is gendisk->first_minor, which is `int' */
2379	ret = kstrtoint(buf, 10, &dev_id);
2380	if (ret)
2381		return ret;
2382	if (dev_id < 0)
2383		return -EINVAL;
2384
2385	mutex_lock(&zram_index_mutex);
2386
2387	zram = idr_find(&zram_index_idr, dev_id);
2388	if (zram) {
2389		ret = zram_remove(zram);
2390		if (!ret)
2391			idr_remove(&zram_index_idr, dev_id);
2392	} else {
2393		ret = -ENODEV;
2394	}
2395
2396	mutex_unlock(&zram_index_mutex);
2397	return ret ? ret : count;
2398}
2399static CLASS_ATTR_WO(hot_remove);
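
/*
 * Usage sketch (example values): writing a device id removes that device
 * if it is not busy, e.g.
 *
 *	echo 4 > /sys/class/zram-control/hot_remove
 *
 * This fails with -EBUSY if /dev/zram4 is still open and with -ENODEV if
 * no such device exists.
 */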
2400
2401static struct attribute *zram_control_class_attrs[] = {
2402	&class_attr_hot_add.attr,
2403	&class_attr_hot_remove.attr,
2404	NULL,
2405};
2406ATTRIBUTE_GROUPS(zram_control_class);
2407
2408static struct class zram_control_class = {
2409	.name		= "zram-control",
2410	.class_groups	= zram_control_class_groups,
2411};
2412
2413static int zram_remove_cb(int id, void *ptr, void *data)
2414{
2415	WARN_ON_ONCE(zram_remove(ptr));
2416	return 0;
2417}
2418
2419static void destroy_devices(void)
2420{
2421	class_unregister(&zram_control_class);
2422	idr_for_each(&zram_index_idr, &zram_remove_cb, NULL);
2423	zram_debugfs_destroy();
2424	idr_destroy(&zram_index_idr);
2425	unregister_blkdev(zram_major, "zram");
2426	cpuhp_remove_multi_state(CPUHP_ZCOMP_PREPARE);
2427}
2428
2429static int __init zram_init(void)
2430{
2431	int ret;
2432
2433	BUILD_BUG_ON(__NR_ZRAM_PAGEFLAGS > BITS_PER_LONG);
2434
2435	ret = cpuhp_setup_state_multi(CPUHP_ZCOMP_PREPARE, "block/zram:prepare",
2436				      zcomp_cpu_up_prepare, zcomp_cpu_dead);
2437	if (ret < 0)
2438		return ret;
2439
2440	ret = class_register(&zram_control_class);
2441	if (ret) {
2442		pr_err("Unable to register zram-control class\n");
2443		cpuhp_remove_multi_state(CPUHP_ZCOMP_PREPARE);
2444		return ret;
2445	}
2446
2447	zram_debugfs_create();
2448	zram_major = register_blkdev(0, "zram");
2449	if (zram_major <= 0) {
2450		pr_err("Unable to get major number\n");
2451		class_unregister(&zram_control_class);
2452		cpuhp_remove_multi_state(CPUHP_ZCOMP_PREPARE);
2453		return -EBUSY;
2454	}
2455
2456	while (num_devices != 0) {
2457		mutex_lock(&zram_index_mutex);
2458		ret = zram_add();
2459		mutex_unlock(&zram_index_mutex);
2460		if (ret < 0)
2461			goto out_error;
2462		num_devices--;
2463	}
2464
2465	return 0;
2466
2467out_error:
2468	destroy_devices();
2469	return ret;
2470}
2471
2472static void __exit zram_exit(void)
2473{
2474	destroy_devices();
2475}
2476
2477module_init(zram_init);
2478module_exit(zram_exit);
2479
2480module_param(num_devices, uint, 0);
2481MODULE_PARM_DESC(num_devices, "Number of pre-created zram devices");
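
/*
 * Usage sketch (example values): num_devices controls how many devices are
 * pre-created at module load, e.g.
 *
 *	modprobe zram num_devices=4
 *
 * creates zram0..zram3; further devices can still be added at runtime via
 * /sys/class/zram-control/hot_add.
 */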
2482
2483MODULE_LICENSE("Dual BSD/GPL");
2484MODULE_AUTHOR("Nitin Gupta <ngupta@vflare.org>");
2485MODULE_DESCRIPTION("Compressed RAM Block Device");
2486