// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2018 HUAWEI, Inc.
 *             https://www.huawei.com/
 * Copyright (C) 2022 Alibaba Cloud
 */
#include "compress.h"
#include <linux/psi.h>
#include <linux/cpuhotplug.h>
#include <trace/events/erofs.h>

#define Z_EROFS_PCLUSTER_MAX_PAGES	(Z_EROFS_PCLUSTER_MAX_SIZE / PAGE_SIZE)
#define Z_EROFS_INLINE_BVECS		2

/*
 * Leave a type here in case another tagged pointer is
 * introduced later.
 */
typedef void *z_erofs_next_pcluster_t;

struct z_erofs_bvec {
	struct page *page;
	int offset;
	unsigned int end;
};

#define __Z_EROFS_BVSET(name, total) \
struct name { \
	/* points to the next page which contains the following bvecs */ \
	struct page *nextpage; \
	struct z_erofs_bvec bvec[total]; \
}
__Z_EROFS_BVSET(z_erofs_bvset,);
__Z_EROFS_BVSET(z_erofs_bvset_inline, Z_EROFS_INLINE_BVECS);

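/*
 * For illustration (sizing math not spelled out in the original comments,
 * assuming 4 KiB pages and 64-bit pointers): struct z_erofs_bvec is 16 bytes
 * and the leading nextpage pointer is 8 bytes, so one full bvset page holds
 * (4096 - 8) / 16 = 255 bvecs; z_erofs_bvset_flip() below recomputes this
 * as iter->nr each time the iterator advances to the next bvset page.
 */
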
/*
 * Structure fields follow one of the following exclusion rules.
 *
 * I: Modifiable by initialization/destruction paths and read-only
 *    for everyone else;
 *
 * L: Field should be protected by the pcluster lock;
 *
 * A: Field should be accessed/updated atomically by parallelized code.
 */
struct z_erofs_pcluster {
	struct erofs_workgroup obj;
	struct mutex lock;

	/* A: point to next chained pcluster or TAILs */
	z_erofs_next_pcluster_t next;

	/* L: the maximum decompression size of this round */
	unsigned int length;

	/* L: total number of bvecs */
	unsigned int vcnt;

	/* I: page offset of start position of decompression */
	unsigned short pageofs_out;

	/* I: page offset of inline compressed data */
	unsigned short pageofs_in;

	union {
		/* L: inline a certain number of bvecs for bootstrap */
		struct z_erofs_bvset_inline bvset;

		/* I: can be used to free the pcluster by RCU */
		struct rcu_head rcu;
	};

	union {
		/* I: physical cluster size in pages */
		unsigned short pclusterpages;

		/* I: tailpacking inline compressed size */
		unsigned short tailpacking_size;
	};

	/* I: compression algorithm format */
	unsigned char algorithmformat;

	/* L: whether partial decompression or not */
	bool partial;

	/* L: whether there are multiple pageofs_out values or not */
	bool multibases;

	/* A: compressed bvecs (can be cached or inplaced pages) */
	struct z_erofs_bvec compressed_bvecs[];
};

/* the end of a chain of pclusters */
#define Z_EROFS_PCLUSTER_TAIL           ((void *) 0x700 + POISON_POINTER_DELTA)
#define Z_EROFS_PCLUSTER_NIL            (NULL)

struct z_erofs_decompressqueue {
	struct super_block *sb;
	atomic_t pending_bios;
	z_erofs_next_pcluster_t head;

	union {
		struct completion done;
		struct work_struct work;
		struct kthread_work kthread_work;
	} u;
	bool eio, sync;
};

static inline bool z_erofs_is_inline_pcluster(struct z_erofs_pcluster *pcl)
{
	return !pcl->obj.index;
}

static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl)
{
	if (z_erofs_is_inline_pcluster(pcl))
		return 1;
	return pcl->pclusterpages;
}

/*
 * bit 30: I/O error occurred on this page
 * bit 0 - 29: remaining parts to complete this page
 */
#define Z_EROFS_PAGE_EIO			(1 << 30)

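/*
 * For illustration (a lifecycle sketch inferred from the code below, not
 * stated in the original): for a page with a single decompression part,
 *
 *	z_erofs_onlinepage_init(page);		private == 1 (page walk ref)
 *	z_erofs_onlinepage_split(page);		private == 2 (one part added)
 *	z_erofs_onlinepage_endio(page, 0);	private == 1 (walk finished)
 *	z_erofs_onlinepage_endio(page, 0);	private == 0: SetPageUptodate
 *						+ unlock_page
 *
 * A non-zero err sets the sticky Z_EROFS_PAGE_EIO bit (bit 30), so the
 * final endio still unlocks the page but leaves it !PageUptodate.
 */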
static inline void z_erofs_onlinepage_init(struct page *page)
{
	union {
		atomic_t o;
		unsigned long v;
	} u = { .o = ATOMIC_INIT(1) };

	set_page_private(page, u.v);
	smp_wmb();
	SetPagePrivate(page);
}

static inline void z_erofs_onlinepage_split(struct page *page)
{
	atomic_inc((atomic_t *)&page->private);
}

static void z_erofs_onlinepage_endio(struct page *page, int err)
{
	int orig, v;

	DBG_BUGON(!PagePrivate(page));

	do {
		orig = atomic_read((atomic_t *)&page->private);
		v = (orig - 1) | (err ? Z_EROFS_PAGE_EIO : 0);
	} while (atomic_cmpxchg((atomic_t *)&page->private, orig, v) != orig);

	if (!(v & ~Z_EROFS_PAGE_EIO)) {
		set_page_private(page, 0);
		ClearPagePrivate(page);
		if (!(v & Z_EROFS_PAGE_EIO))
			SetPageUptodate(page);
		unlock_page(page);
	}
}

#define Z_EROFS_ONSTACK_PAGES		32

/*
 * Since the pcluster size is variable with the big pcluster feature,
 * introduce slab pools for the different pcluster sizes.
 */
struct z_erofs_pcluster_slab {
	struct kmem_cache *slab;
	unsigned int maxpages;
	char name[48];
};

#define _PCLP(n) { .maxpages = n }

static struct z_erofs_pcluster_slab pcluster_pool[] __read_mostly = {
	_PCLP(1), _PCLP(4), _PCLP(16), _PCLP(64), _PCLP(128),
	_PCLP(Z_EROFS_PCLUSTER_MAX_PAGES)
};

struct z_erofs_bvec_iter {
	struct page *bvpage;
	struct z_erofs_bvset *bvset;
	unsigned int nr, cur;
};

static struct page *z_erofs_bvec_iter_end(struct z_erofs_bvec_iter *iter)
{
	if (iter->bvpage)
		kunmap_local(iter->bvset);
	return iter->bvpage;
}

static struct page *z_erofs_bvset_flip(struct z_erofs_bvec_iter *iter)
{
	unsigned long base = (unsigned long)((struct z_erofs_bvset *)0)->bvec;
	/* have to access nextpage in advance, otherwise it will be unmapped */
	struct page *nextpage = iter->bvset->nextpage;
	struct page *oldpage;

	DBG_BUGON(!nextpage);
	oldpage = z_erofs_bvec_iter_end(iter);
	iter->bvpage = nextpage;
	iter->bvset = kmap_local_page(nextpage);
	iter->nr = (PAGE_SIZE - base) / sizeof(struct z_erofs_bvec);
	iter->cur = 0;
	return oldpage;
}

static void z_erofs_bvec_iter_begin(struct z_erofs_bvec_iter *iter,
				    struct z_erofs_bvset_inline *bvset,
				    unsigned int bootstrap_nr,
				    unsigned int cur)
{
	*iter = (struct z_erofs_bvec_iter) {
		.nr = bootstrap_nr,
		.bvset = (struct z_erofs_bvset *)bvset,
	};

	while (cur > iter->nr) {
		cur -= iter->nr;
		z_erofs_bvset_flip(iter);
	}
	iter->cur = cur;
}

static int z_erofs_bvec_enqueue(struct z_erofs_bvec_iter *iter,
				struct z_erofs_bvec *bvec,
				struct page **candidate_bvpage,
				struct page **pagepool)
{
	if (iter->cur >= iter->nr) {
		struct page *nextpage = *candidate_bvpage;

		if (!nextpage) {
			nextpage = erofs_allocpage(pagepool, GFP_NOFS);
			if (!nextpage)
				return -ENOMEM;
			set_page_private(nextpage, Z_EROFS_SHORTLIVED_PAGE);
		}
		DBG_BUGON(iter->bvset->nextpage);
		iter->bvset->nextpage = nextpage;
		z_erofs_bvset_flip(iter);

		iter->bvset->nextpage = NULL;
		*candidate_bvpage = NULL;
	}
	iter->bvset->bvec[iter->cur++] = *bvec;
	return 0;
}

static void z_erofs_bvec_dequeue(struct z_erofs_bvec_iter *iter,
				 struct z_erofs_bvec *bvec,
				 struct page **old_bvpage)
{
	if (iter->cur == iter->nr)
		*old_bvpage = z_erofs_bvset_flip(iter);
	else
		*old_bvpage = NULL;
	*bvec = iter->bvset->bvec[iter->cur++];
}

static void z_erofs_destroy_pcluster_pool(void)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) {
		if (!pcluster_pool[i].slab)
			continue;
		kmem_cache_destroy(pcluster_pool[i].slab);
		pcluster_pool[i].slab = NULL;
	}
}

static int z_erofs_create_pcluster_pool(void)
{
	struct z_erofs_pcluster_slab *pcs;
	struct z_erofs_pcluster *a;
	unsigned int size;

	for (pcs = pcluster_pool;
	     pcs < pcluster_pool + ARRAY_SIZE(pcluster_pool); ++pcs) {
		size = struct_size(a, compressed_bvecs, pcs->maxpages);

		sprintf(pcs->name, "erofs_pcluster-%u", pcs->maxpages);
		pcs->slab = kmem_cache_create(pcs->name, size, 0,
					      SLAB_RECLAIM_ACCOUNT, NULL);
		if (pcs->slab)
			continue;

		z_erofs_destroy_pcluster_pool();
		return -ENOMEM;
	}
	return 0;
}

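/*
 * For illustration (worked example, not from the original comments): a
 * compressed extent spanning 10 pages is allocated from the
 * "erofs_pcluster-16" cache, i.e. the first pool in pcluster_pool[] with
 * maxpages >= nrpages; requests larger than Z_EROFS_PCLUSTER_MAX_PAGES
 * fail with -EINVAL.
 */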
static struct z_erofs_pcluster *z_erofs_alloc_pcluster(unsigned int nrpages)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) {
		struct z_erofs_pcluster_slab *pcs = pcluster_pool + i;
		struct z_erofs_pcluster *pcl;

		if (nrpages > pcs->maxpages)
			continue;

		pcl = kmem_cache_zalloc(pcs->slab, GFP_NOFS);
		if (!pcl)
			return ERR_PTR(-ENOMEM);
		pcl->pclusterpages = nrpages;
		return pcl;
	}
	return ERR_PTR(-EINVAL);
}

static void z_erofs_free_pcluster(struct z_erofs_pcluster *pcl)
{
	unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
	int i;

	for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) {
		struct z_erofs_pcluster_slab *pcs = pcluster_pool + i;

		if (pclusterpages > pcs->maxpages)
			continue;

		kmem_cache_free(pcs->slab, pcl);
		return;
	}
	DBG_BUGON(1);
}

static struct workqueue_struct *z_erofs_workqueue __read_mostly;

#ifdef CONFIG_EROFS_FS_PCPU_KTHREAD
static struct kthread_worker __rcu **z_erofs_pcpu_workers;

static void erofs_destroy_percpu_workers(void)
{
	struct kthread_worker *worker;
	unsigned int cpu;

	for_each_possible_cpu(cpu) {
		worker = rcu_dereference_protected(
					z_erofs_pcpu_workers[cpu], 1);
		rcu_assign_pointer(z_erofs_pcpu_workers[cpu], NULL);
		if (worker)
			kthread_destroy_worker(worker);
	}
	kfree(z_erofs_pcpu_workers);
}

static struct kthread_worker *erofs_init_percpu_worker(int cpu)
{
	struct kthread_worker *worker =
		kthread_create_worker_on_cpu(cpu, 0, "erofs_worker/%u", cpu);

	if (IS_ERR(worker))
		return worker;
	if (IS_ENABLED(CONFIG_EROFS_FS_PCPU_KTHREAD_HIPRI))
		sched_set_fifo_low(worker->task);
	return worker;
}

static int erofs_init_percpu_workers(void)
{
	struct kthread_worker *worker;
	unsigned int cpu;

	z_erofs_pcpu_workers = kcalloc(num_possible_cpus(),
			sizeof(struct kthread_worker *), GFP_ATOMIC);
	if (!z_erofs_pcpu_workers)
		return -ENOMEM;

	for_each_online_cpu(cpu) {	/* could miss cpu{off,on}line? */
		worker = erofs_init_percpu_worker(cpu);
		if (!IS_ERR(worker))
			rcu_assign_pointer(z_erofs_pcpu_workers[cpu], worker);
	}
	return 0;
}
#else
static inline void erofs_destroy_percpu_workers(void) {}
static inline int erofs_init_percpu_workers(void) { return 0; }
#endif

#if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_EROFS_FS_PCPU_KTHREAD)
static DEFINE_SPINLOCK(z_erofs_pcpu_worker_lock);
static enum cpuhp_state erofs_cpuhp_state;

static int erofs_cpu_online(unsigned int cpu)
{
	struct kthread_worker *worker, *old;

	worker = erofs_init_percpu_worker(cpu);
	if (IS_ERR(worker))
		return PTR_ERR(worker);

	spin_lock(&z_erofs_pcpu_worker_lock);
	old = rcu_dereference_protected(z_erofs_pcpu_workers[cpu],
			lockdep_is_held(&z_erofs_pcpu_worker_lock));
	if (!old)
		rcu_assign_pointer(z_erofs_pcpu_workers[cpu], worker);
	spin_unlock(&z_erofs_pcpu_worker_lock);
	if (old)
		kthread_destroy_worker(worker);
	return 0;
}

static int erofs_cpu_offline(unsigned int cpu)
{
	struct kthread_worker *worker;

	spin_lock(&z_erofs_pcpu_worker_lock);
	worker = rcu_dereference_protected(z_erofs_pcpu_workers[cpu],
			lockdep_is_held(&z_erofs_pcpu_worker_lock));
	rcu_assign_pointer(z_erofs_pcpu_workers[cpu], NULL);
	spin_unlock(&z_erofs_pcpu_worker_lock);

	synchronize_rcu();
	if (worker)
		kthread_destroy_worker(worker);
	return 0;
}

static int erofs_cpu_hotplug_init(void)
{
	int state;

	state = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
			"fs/erofs:online", erofs_cpu_online, erofs_cpu_offline);
	if (state < 0)
		return state;

	erofs_cpuhp_state = state;
	return 0;
}

static void erofs_cpu_hotplug_destroy(void)
{
	if (erofs_cpuhp_state)
		cpuhp_remove_state_nocalls(erofs_cpuhp_state);
}
#else /* !CONFIG_HOTPLUG_CPU || !CONFIG_EROFS_FS_PCPU_KTHREAD */
static inline int erofs_cpu_hotplug_init(void) { return 0; }
static inline void erofs_cpu_hotplug_destroy(void) {}
#endif

void z_erofs_exit_zip_subsystem(void)
{
	erofs_cpu_hotplug_destroy();
	erofs_destroy_percpu_workers();
	destroy_workqueue(z_erofs_workqueue);
	z_erofs_destroy_pcluster_pool();
}

int __init z_erofs_init_zip_subsystem(void)
{
	int err = z_erofs_create_pcluster_pool();

	if (err)
		goto out_error_pcluster_pool;

	z_erofs_workqueue = alloc_workqueue("erofs_worker",
			WQ_UNBOUND | WQ_HIGHPRI, num_possible_cpus());
	if (!z_erofs_workqueue) {
		err = -ENOMEM;
		goto out_error_workqueue_init;
	}

	err = erofs_init_percpu_workers();
	if (err)
		goto out_error_pcpu_worker;

	err = erofs_cpu_hotplug_init();
	if (err < 0)
		goto out_error_cpuhp_init;
	return err;

out_error_cpuhp_init:
	erofs_destroy_percpu_workers();
out_error_pcpu_worker:
	destroy_workqueue(z_erofs_workqueue);
out_error_workqueue_init:
	z_erofs_destroy_pcluster_pool();
out_error_pcluster_pool:
	return err;
}

enum z_erofs_pclustermode {
	Z_EROFS_PCLUSTER_INFLIGHT,
	/*
	 * a weak form of Z_EROFS_PCLUSTER_FOLLOWED; the difference is that it
	 * could be dispatched into the bypass queue later due to up-to-date
	 * managed pages. All related online pages cannot be reused for
	 * inplace I/O (or bvpage) since the pcluster can be decoded directly
	 * without I/O submission.
	 */
	Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE,
	/*
	 * The pcluster was just linked to a decompression chain by us.  It can
	 * also be linked with the remaining pclusters, which means if the
	 * processing page is the tail page of a pcluster, this pcluster can
	 * safely use the whole page (since the previous pcluster is within the
	 * same chain) for in-place I/O, as illustrated below:
	 *  ___________________________________________________
	 * |  tail (partial) page  |    head (partial) page    |
	 * |  (of the current pcl) |   (of the previous pcl)   |
	 * |___PCLUSTER_FOLLOWED___|_____PCLUSTER_FOLLOWED_____|
	 *
	 * [  (*) the page above can be used as inplace I/O.   ]
	 */
	Z_EROFS_PCLUSTER_FOLLOWED,
};

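/*
 * Note: the relative order of these modes matters; the code below compares
 * them numerically, e.g. "fe->mode < Z_EROFS_PCLUSTER_FOLLOWED" in
 * z_erofs_bind_cache() and "fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED" in
 * z_erofs_attach_page().
 */
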
struct z_erofs_decompress_frontend {
	struct inode *const inode;
	struct erofs_map_blocks map;
	struct z_erofs_bvec_iter biter;

	struct page *pagepool;
	struct page *candidate_bvpage;
	struct z_erofs_pcluster *pcl;
	z_erofs_next_pcluster_t owned_head;
	enum z_erofs_pclustermode mode;

	erofs_off_t headoffset;

	/* a cursor used to pick up inplace I/O pages */
	unsigned int icur;
};

#define DECOMPRESS_FRONTEND_INIT(__i) { \
	.inode = __i, .owned_head = Z_EROFS_PCLUSTER_TAIL, \
	.mode = Z_EROFS_PCLUSTER_FOLLOWED }

static bool z_erofs_should_alloc_cache(struct z_erofs_decompress_frontend *fe)
{
	unsigned int cachestrategy = EROFS_I_SB(fe->inode)->opt.cache_strategy;

	if (cachestrategy <= EROFS_ZIP_CACHE_DISABLED)
		return false;

	if (!(fe->map.m_flags & EROFS_MAP_FULL_MAPPED))
		return true;

	if (cachestrategy >= EROFS_ZIP_CACHE_READAROUND &&
	    fe->map.m_la < fe->headoffset)
		return true;

	return false;
}

static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
{
	struct address_space *mc = MNGD_MAPPING(EROFS_I_SB(fe->inode));
	struct z_erofs_pcluster *pcl = fe->pcl;
	bool shouldalloc = z_erofs_should_alloc_cache(fe);
	bool standalone = true;
	/*
	 * optimistic allocation without direct reclaim, since in-place I/O
	 * can be used under low memory instead.
	 */
	gfp_t gfp = (mapping_gfp_mask(mc) & ~__GFP_DIRECT_RECLAIM) |
			__GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
	unsigned int i;

	if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED)
		return;

	for (i = 0; i < pcl->pclusterpages; ++i) {
		struct page *page;
		void *t;	/* mark pages just found for debugging */
		struct page *newpage = NULL;

		/* the compressed page was loaded before */
		if (READ_ONCE(pcl->compressed_bvecs[i].page))
			continue;

		page = find_get_page(mc, pcl->obj.index + i);

		if (page) {
			t = (void *)((unsigned long)page | 1);
		} else {
			/* I/O is needed, not possible to decompress directly */
			standalone = false;
			if (!shouldalloc)
				continue;

			/*
			 * try to use cached I/O if page allocation
			 * succeeds, or fall back to in-place I/O instead
			 * to avoid any direct reclaim.
			 */
			newpage = erofs_allocpage(&fe->pagepool, gfp);
			if (!newpage)
				continue;
			set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE);
			t = (void *)((unsigned long)newpage | 1);
		}

		if (!cmpxchg_relaxed(&pcl->compressed_bvecs[i].page, NULL, t))
			continue;

		if (page)
			put_page(page);
		else if (newpage)
			erofs_pagepool_add(&fe->pagepool, newpage);
	}

	/*
	 * don't do inplace I/O if all compressed pages are available in
	 * managed cache, since they can be moved into the bypass queue instead.
	 */
	if (standalone)
		fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE;
}

/* called by erofs_shrinker to get rid of all compressed_pages */
int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
				       struct erofs_workgroup *grp)
{
	struct z_erofs_pcluster *const pcl =
		container_of(grp, struct z_erofs_pcluster, obj);
	int i;

	DBG_BUGON(z_erofs_is_inline_pcluster(pcl));
	/*
	 * the workgroup refcount is now frozen at 0, therefore no need
	 * to worry about available decompression users.
	 */
	for (i = 0; i < pcl->pclusterpages; ++i) {
		struct page *page = pcl->compressed_bvecs[i].page;

		if (!page)
			continue;

		/* block other users from reclaiming or migrating the page */
		if (!trylock_page(page))
			return -EBUSY;

		if (!erofs_page_is_managed(sbi, page))
			continue;

		/* barrier is implied in the following 'unlock_page' */
		WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL);
		detach_page_private(page);
		unlock_page(page);
	}
	return 0;
}

static bool z_erofs_cache_release_folio(struct folio *folio, gfp_t gfp)
{
	struct z_erofs_pcluster *pcl = folio_get_private(folio);
	bool ret;
	int i;

	if (!folio_test_private(folio))
		return true;

	ret = false;
	spin_lock(&pcl->obj.lockref.lock);
	if (pcl->obj.lockref.count > 0)
		goto out;

	DBG_BUGON(z_erofs_is_inline_pcluster(pcl));
	for (i = 0; i < pcl->pclusterpages; ++i) {
		if (pcl->compressed_bvecs[i].page == &folio->page) {
			WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL);
			ret = true;
			break;
		}
	}
	if (ret)
		folio_detach_private(folio);
out:
	spin_unlock(&pcl->obj.lockref.lock);
	return ret;
}

/*
 * It will be called only on inode eviction. In case there are still some
 * decompression requests in progress, wait with rescheduling for a bit here.
 * An extra lock could be introduced instead but it seems unnecessary.
 */
static void z_erofs_cache_invalidate_folio(struct folio *folio,
					   size_t offset, size_t length)
{
	const size_t stop = length + offset;

	/* Check for potential overflow in debug mode */
	DBG_BUGON(stop > folio_size(folio) || stop < length);

	if (offset == 0 && stop == folio_size(folio))
		while (!z_erofs_cache_release_folio(folio, GFP_NOFS))
			cond_resched();
}

static const struct address_space_operations z_erofs_cache_aops = {
	.release_folio = z_erofs_cache_release_folio,
	.invalidate_folio = z_erofs_cache_invalidate_folio,
};

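/*
 * For illustration: the managed cache is a pseudo inode whose page cache
 * holds compressed pages shared across pclusters.  z_erofs_bind_cache()
 * and pickup_page_for_submission() look compressed pages up here, while
 * the shrinker path above releases them under memory pressure.
 */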
int erofs_init_managed_cache(struct super_block *sb)
{
	struct inode *const inode = new_inode(sb);

	if (!inode)
		return -ENOMEM;

	set_nlink(inode, 1);
	inode->i_size = OFFSET_MAX;
	inode->i_mapping->a_ops = &z_erofs_cache_aops;
	mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
	EROFS_SB(sb)->managed_cache = inode;
	return 0;
}

static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe,
				   struct z_erofs_bvec *bvec)
{
	struct z_erofs_pcluster *const pcl = fe->pcl;

	while (fe->icur > 0) {
		if (!cmpxchg(&pcl->compressed_bvecs[--fe->icur].page,
			     NULL, bvec->page)) {
			pcl->compressed_bvecs[fe->icur] = *bvec;
			return true;
		}
	}
	return false;
}

/* callers must hold the pcluster lock */
static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe,
			       struct z_erofs_bvec *bvec, bool exclusive)
{
	int ret;

	if (exclusive) {
		/* give priority to inplace I/O, which uses file pages first */
		if (z_erofs_try_inplace_io(fe, bvec))
			return 0;
		/* otherwise, check if it can be used as a bvpage */
		if (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED &&
		    !fe->candidate_bvpage)
			fe->candidate_bvpage = bvec->page;
	}
	ret = z_erofs_bvec_enqueue(&fe->biter, bvec, &fe->candidate_bvpage,
				   &fe->pagepool);
	fe->pcl->vcnt += (ret >= 0);
	return ret;
}

static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f)
{
	struct z_erofs_pcluster *pcl = f->pcl;
	z_erofs_next_pcluster_t *owned_head = &f->owned_head;

	/* type 1, nil pcluster (this pcluster doesn't belong to any chain.) */
	if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_NIL,
		    *owned_head) == Z_EROFS_PCLUSTER_NIL) {
		*owned_head = &pcl->next;
		/* so we can attach this pcluster to our submission chain. */
		f->mode = Z_EROFS_PCLUSTER_FOLLOWED;
		return;
	}

	/* type 2, it belongs to an ongoing chain */
	f->mode = Z_EROFS_PCLUSTER_INFLIGHT;
}

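/*
 * In short: claiming is a single cmpxchg on pcl->next.  If the pcluster is
 * on no chain yet (next == NIL), it is spliced onto the front of our
 * submission chain and marked FOLLOWED; otherwise another chain already
 * owns it, so it is only marked INFLIGHT and that chain will decompress it.
 */
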
static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe)
{
	struct erofs_map_blocks *map = &fe->map;
	bool ztailpacking = map->m_flags & EROFS_MAP_META;
	struct z_erofs_pcluster *pcl;
	struct erofs_workgroup *grp;
	int err;

	if (!(map->m_flags & EROFS_MAP_ENCODED) ||
	    (!ztailpacking && !(map->m_pa >> PAGE_SHIFT))) {
		DBG_BUGON(1);
		return -EFSCORRUPTED;
	}

	/* no available pcluster, let's allocate one */
	pcl = z_erofs_alloc_pcluster(ztailpacking ? 1 :
				     map->m_plen >> PAGE_SHIFT);
	if (IS_ERR(pcl))
		return PTR_ERR(pcl);

	spin_lock_init(&pcl->obj.lockref.lock);
	pcl->obj.lockref.count = 1;	/* one ref for this request */
	pcl->algorithmformat = map->m_algorithmformat;
	pcl->length = 0;
	pcl->partial = true;

	/* new pclusters should be claimed as type 1, primary and followed */
	pcl->next = fe->owned_head;
	pcl->pageofs_out = map->m_la & ~PAGE_MASK;
	fe->mode = Z_EROFS_PCLUSTER_FOLLOWED;

	/*
	 * lock all primary followed works before they become visible to
	 * others; mutex_trylock *never* fails for a new pcluster.
	 */
	mutex_init(&pcl->lock);
	DBG_BUGON(!mutex_trylock(&pcl->lock));

	if (ztailpacking) {
		pcl->obj.index = 0;	/* which indicates ztailpacking */
		pcl->tailpacking_size = map->m_plen;
	} else {
		pcl->obj.index = map->m_pa >> PAGE_SHIFT;

		grp = erofs_insert_workgroup(fe->inode->i_sb, &pcl->obj);
		if (IS_ERR(grp)) {
			err = PTR_ERR(grp);
			goto err_out;
		}

		if (grp != &pcl->obj) {
			fe->pcl = container_of(grp,
					struct z_erofs_pcluster, obj);
			err = -EEXIST;
			goto err_out;
		}
	}
	fe->owned_head = &pcl->next;
	fe->pcl = pcl;
	return 0;

err_out:
	mutex_unlock(&pcl->lock);
	z_erofs_free_pcluster(pcl);
	return err;
}

static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe)
{
	struct erofs_map_blocks *map = &fe->map;
	struct super_block *sb = fe->inode->i_sb;
	erofs_blk_t blknr = erofs_blknr(sb, map->m_pa);
	struct erofs_workgroup *grp = NULL;
	int ret;

	DBG_BUGON(fe->pcl);

	/* must be Z_EROFS_PCLUSTER_TAIL or point to a previous pcluster */
	DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL);

	if (!(map->m_flags & EROFS_MAP_META)) {
		grp = erofs_find_workgroup(sb, blknr);
	} else if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) {
		DBG_BUGON(1);
		return -EFSCORRUPTED;
	}

	if (grp) {
		fe->pcl = container_of(grp, struct z_erofs_pcluster, obj);
		ret = -EEXIST;
	} else {
		ret = z_erofs_register_pcluster(fe);
	}

	if (ret == -EEXIST) {
		mutex_lock(&fe->pcl->lock);
		z_erofs_try_to_claim_pcluster(fe);
	} else if (ret) {
		return ret;
	}

	z_erofs_bvec_iter_begin(&fe->biter, &fe->pcl->bvset,
				Z_EROFS_INLINE_BVECS, fe->pcl->vcnt);
	if (!z_erofs_is_inline_pcluster(fe->pcl)) {
		/* bind cache first when cached decompression is preferred */
		z_erofs_bind_cache(fe);
	} else {
		void *mptr;

		mptr = erofs_read_metabuf(&map->buf, sb, blknr, EROFS_NO_KMAP);
		if (IS_ERR(mptr)) {
			ret = PTR_ERR(mptr);
			erofs_err(sb, "failed to get inline data %d", ret);
			return ret;
		}
		get_page(map->buf.page);
		WRITE_ONCE(fe->pcl->compressed_bvecs[0].page, map->buf.page);
		fe->pcl->pageofs_in = map->m_pa & ~PAGE_MASK;
		fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE;
	}
	/* file-backed inplace I/O pages are traversed in reverse order */
	fe->icur = z_erofs_pclusterpages(fe->pcl);
	return 0;
}

/*
 * Keep in mind that pclusters are freed only after an RCU grace period,
 * so no referenced pcluster can disappear from under a lockless user.
 */
static void z_erofs_rcu_callback(struct rcu_head *head)
{
	z_erofs_free_pcluster(container_of(head,
			struct z_erofs_pcluster, rcu));
}

void erofs_workgroup_free_rcu(struct erofs_workgroup *grp)
{
	struct z_erofs_pcluster *const pcl =
		container_of(grp, struct z_erofs_pcluster, obj);

	call_rcu(&pcl->rcu, z_erofs_rcu_callback);
}

static void z_erofs_pcluster_end(struct z_erofs_decompress_frontend *fe)
{
	struct z_erofs_pcluster *pcl = fe->pcl;

	if (!pcl)
		return;

	z_erofs_bvec_iter_end(&fe->biter);
	mutex_unlock(&pcl->lock);

	if (fe->candidate_bvpage)
		fe->candidate_bvpage = NULL;

	/*
	 * once all pending pages are added, don't hold the pcluster
	 * reference any longer if it isn't hosted by ourselves.
	 */
	if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE)
		erofs_workgroup_put(&pcl->obj);

	fe->pcl = NULL;
}

static int z_erofs_read_fragment(struct super_block *sb, struct page *page,
			unsigned int cur, unsigned int end, erofs_off_t pos)
{
	struct inode *packed_inode = EROFS_SB(sb)->packed_inode;
	struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
	unsigned int cnt;
	u8 *src;

	if (!packed_inode)
		return -EFSCORRUPTED;

	buf.inode = packed_inode;
	for (; cur < end; cur += cnt, pos += cnt) {
		cnt = min_t(unsigned int, end - cur,
			    sb->s_blocksize - erofs_blkoff(sb, pos));
		src = erofs_bread(&buf, erofs_blknr(sb, pos), EROFS_KMAP);
		if (IS_ERR(src)) {
			erofs_put_metabuf(&buf);
			return PTR_ERR(src);
		}
		memcpy_to_page(page, cur, src + erofs_blkoff(sb, pos), cnt);
	}
	erofs_put_metabuf(&buf);
	return 0;
}

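/*
 * For illustration: z_erofs_do_read_page() walks a page backwards from the
 * byte range ending at PAGE_SIZE down to offset 0.  Each pass (re)maps the
 * extent covering [cur, end), attaches the part to its pcluster, then
 * shrinks the window with "end = cur" until the whole page is covered.
 */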
static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
				struct page *page)
{
	struct inode *const inode = fe->inode;
	struct erofs_map_blocks *const map = &fe->map;
	const loff_t offset = page_offset(page);
	bool tight = true, exclusive;
	unsigned int cur, end, len, split;
	int err = 0;

	z_erofs_onlinepage_init(page);

	split = 0;
	end = PAGE_SIZE;
repeat:
	if (offset + end - 1 < map->m_la ||
	    offset + end - 1 >= map->m_la + map->m_llen) {
		z_erofs_pcluster_end(fe);
		map->m_la = offset + end - 1;
		map->m_llen = 0;
		err = z_erofs_map_blocks_iter(inode, map, 0);
		if (err)
			goto out;
	}

	cur = offset > map->m_la ? 0 : map->m_la - offset;
	/* bump split parts first to avoid several separate cases */
	++split;

	if (!(map->m_flags & EROFS_MAP_MAPPED)) {
		zero_user_segment(page, cur, end);
		tight = false;
		goto next_part;
	}

	if (map->m_flags & EROFS_MAP_FRAGMENT) {
		erofs_off_t fpos = offset + cur - map->m_la;

		len = min_t(unsigned int, map->m_llen - fpos, end - cur);
		err = z_erofs_read_fragment(inode->i_sb, page, cur, cur + len,
				EROFS_I(inode)->z_fragmentoff + fpos);
		if (err)
			goto out;
		tight = false;
		goto next_part;
	}

	if (!fe->pcl) {
		err = z_erofs_pcluster_begin(fe);
		if (err)
			goto out;
	}

	/*
	 * Ensure the current partial page belongs to this submission chain
	 * rather than other concurrent submission chains or the no-I/O
	 * (bypass) chain, since those chains are handled asynchronously and
	 * thus the page cannot be used for inplace I/O or bvpage (which must
	 * be processed in strict order.)
	 */
	tight &= (fe->mode > Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE);
	exclusive = (!cur && ((split <= 1) || tight));
	if (cur)
		tight &= (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED);

	err = z_erofs_attach_page(fe, &((struct z_erofs_bvec) {
					.page = page,
					.offset = offset - map->m_la,
					.end = end,
				  }), exclusive);
	if (err)
		goto out;

	z_erofs_onlinepage_split(page);
	if (fe->pcl->pageofs_out != (map->m_la & ~PAGE_MASK))
		fe->pcl->multibases = true;
	if (fe->pcl->length < offset + end - map->m_la) {
		fe->pcl->length = offset + end - map->m_la;
		fe->pcl->pageofs_out = map->m_la & ~PAGE_MASK;
	}
	if ((map->m_flags & EROFS_MAP_FULL_MAPPED) &&
	    !(map->m_flags & EROFS_MAP_PARTIAL_REF) &&
	    fe->pcl->length == map->m_llen)
		fe->pcl->partial = false;
next_part:
	/* shorten the remaining extent to update progress */
	map->m_llen = offset + cur - map->m_la;
	map->m_flags &= ~EROFS_MAP_FULL_MAPPED;

	end = cur;
	if (end > 0)
		goto repeat;

out:
	z_erofs_onlinepage_endio(page, err);
	return err;
}

static bool z_erofs_is_sync_decompress(struct erofs_sb_info *sbi,
				       unsigned int readahead_pages)
{
	/* auto: enable for read_folio, disable for readahead */
	if ((sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_AUTO) &&
	    !readahead_pages)
		return true;

	if ((sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_FORCE_ON) &&
	    (readahead_pages <= sbi->opt.max_sync_decompress_pages))
		return true;

	return false;
}

static bool z_erofs_page_is_invalidated(struct page *page)
{
	return !page->mapping && !z_erofs_is_shortlived_page(page);
}

struct z_erofs_decompress_backend {
	struct page *onstack_pages[Z_EROFS_ONSTACK_PAGES];
	struct super_block *sb;
	struct z_erofs_pcluster *pcl;

	/* pages with the longest decompressed length for deduplication */
	struct page **decompressed_pages;
	/* pages to keep the compressed data */
	struct page **compressed_pages;

	struct list_head decompressed_secondary_bvecs;
	struct page **pagepool;
	unsigned int onstack_used, nr_pages;
};

struct z_erofs_bvec_item {
	struct z_erofs_bvec bvec;
	struct list_head list;
};

static void z_erofs_do_decompressed_bvec(struct z_erofs_decompress_backend *be,
					 struct z_erofs_bvec *bvec)
{
	struct z_erofs_bvec_item *item;
	unsigned int pgnr;

	if (!((bvec->offset + be->pcl->pageofs_out) & ~PAGE_MASK) &&
	    (bvec->end == PAGE_SIZE ||
	     bvec->offset + bvec->end == be->pcl->length)) {
		pgnr = (bvec->offset + be->pcl->pageofs_out) >> PAGE_SHIFT;
		DBG_BUGON(pgnr >= be->nr_pages);
		if (!be->decompressed_pages[pgnr]) {
			be->decompressed_pages[pgnr] = bvec->page;
			return;
		}
	}

	/* (cold path) one pcluster is requested multiple times */
	item = kmalloc(sizeof(*item), GFP_KERNEL | __GFP_NOFAIL);
	item->bvec = *bvec;
	list_add(&item->list, &be->decompressed_secondary_bvecs);
}

static void z_erofs_fill_other_copies(struct z_erofs_decompress_backend *be,
				      int err)
{
	unsigned int off0 = be->pcl->pageofs_out;
	struct list_head *p, *n;

	list_for_each_safe(p, n, &be->decompressed_secondary_bvecs) {
		struct z_erofs_bvec_item *bvi;
		unsigned int end, cur;
		void *dst, *src;

		bvi = container_of(p, struct z_erofs_bvec_item, list);
		cur = bvi->bvec.offset < 0 ? -bvi->bvec.offset : 0;
		end = min_t(unsigned int, be->pcl->length - bvi->bvec.offset,
			    bvi->bvec.end);
		dst = kmap_local_page(bvi->bvec.page);
		while (cur < end) {
			unsigned int pgnr, scur, len;

			pgnr = (bvi->bvec.offset + cur + off0) >> PAGE_SHIFT;
			DBG_BUGON(pgnr >= be->nr_pages);

			scur = bvi->bvec.offset + cur -
					((pgnr << PAGE_SHIFT) - off0);
			len = min_t(unsigned int, end - cur, PAGE_SIZE - scur);
			if (!be->decompressed_pages[pgnr]) {
				err = -EFSCORRUPTED;
				cur += len;
				continue;
			}
			src = kmap_local_page(be->decompressed_pages[pgnr]);
			memcpy(dst + cur, src + scur, len);
			kunmap_local(src);
			cur += len;
		}
		kunmap_local(dst);
		z_erofs_onlinepage_endio(bvi->bvec.page, err);
		list_del(p);
		kfree(bvi);
	}
}

static void z_erofs_parse_out_bvecs(struct z_erofs_decompress_backend *be)
{
	struct z_erofs_pcluster *pcl = be->pcl;
	struct z_erofs_bvec_iter biter;
	struct page *old_bvpage;
	int i;

	z_erofs_bvec_iter_begin(&biter, &pcl->bvset, Z_EROFS_INLINE_BVECS, 0);
	for (i = 0; i < pcl->vcnt; ++i) {
		struct z_erofs_bvec bvec;

		z_erofs_bvec_dequeue(&biter, &bvec, &old_bvpage);

		if (old_bvpage)
			z_erofs_put_shortlivedpage(be->pagepool, old_bvpage);

		DBG_BUGON(z_erofs_page_is_invalidated(bvec.page));
		z_erofs_do_decompressed_bvec(be, &bvec);
	}

	old_bvpage = z_erofs_bvec_iter_end(&biter);
	if (old_bvpage)
		z_erofs_put_shortlivedpage(be->pagepool, old_bvpage);
}

static int z_erofs_parse_in_bvecs(struct z_erofs_decompress_backend *be,
				  bool *overlapped)
{
	struct z_erofs_pcluster *pcl = be->pcl;
	unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
	int i, err = 0;

	*overlapped = false;
	for (i = 0; i < pclusterpages; ++i) {
		struct z_erofs_bvec *bvec = &pcl->compressed_bvecs[i];
		struct page *page = bvec->page;

		/* compressed pages ought to be present before decompressing */
		if (!page) {
			DBG_BUGON(1);
			continue;
		}
		be->compressed_pages[i] = page;

		if (z_erofs_is_inline_pcluster(pcl)) {
			if (!PageUptodate(page))
				err = -EIO;
			continue;
		}

		DBG_BUGON(z_erofs_page_is_invalidated(page));
		if (!z_erofs_is_shortlived_page(page)) {
			if (erofs_page_is_managed(EROFS_SB(be->sb), page)) {
				if (!PageUptodate(page))
					err = -EIO;
				continue;
			}
			z_erofs_do_decompressed_bvec(be, bvec);
			*overlapped = true;
		}
	}

	if (err)
		return err;
	return 0;
}

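/*
 * For illustration: decompression of one pcluster proceeds in three phases
 * under pcl->lock below: (1) gather output pages (z_erofs_parse_out_bvecs)
 * and input pages (z_erofs_parse_in_bvecs), (2) invoke the per-algorithm
 * ->decompress() hook, and (3) release compressed pages, fill secondary
 * copies, complete online pages and reset the pcluster for the next round.
 */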
static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be,
				       int err)
{
	struct erofs_sb_info *const sbi = EROFS_SB(be->sb);
	struct z_erofs_pcluster *pcl = be->pcl;
	unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
	const struct z_erofs_decompressor *decompressor =
				&erofs_decompressors[pcl->algorithmformat];
	unsigned int i, inputsize;
	int err2;
	struct page *page;
	bool overlapped;

	mutex_lock(&pcl->lock);
	be->nr_pages = PAGE_ALIGN(pcl->length + pcl->pageofs_out) >> PAGE_SHIFT;

	/* allocate (de)compressed page arrays if cannot be kept on stack */
	be->decompressed_pages = NULL;
	be->compressed_pages = NULL;
	be->onstack_used = 0;
	if (be->nr_pages <= Z_EROFS_ONSTACK_PAGES) {
		be->decompressed_pages = be->onstack_pages;
		be->onstack_used = be->nr_pages;
		memset(be->decompressed_pages, 0,
		       sizeof(struct page *) * be->nr_pages);
	}

	if (pclusterpages + be->onstack_used <= Z_EROFS_ONSTACK_PAGES)
		be->compressed_pages = be->onstack_pages + be->onstack_used;

	if (!be->decompressed_pages)
		be->decompressed_pages =
			kvcalloc(be->nr_pages, sizeof(struct page *),
				 GFP_KERNEL | __GFP_NOFAIL);
	if (!be->compressed_pages)
		be->compressed_pages =
			kvcalloc(pclusterpages, sizeof(struct page *),
				 GFP_KERNEL | __GFP_NOFAIL);

	z_erofs_parse_out_bvecs(be);
	err2 = z_erofs_parse_in_bvecs(be, &overlapped);
	if (err2)
		err = err2;
	if (err)
		goto out;

	if (z_erofs_is_inline_pcluster(pcl))
		inputsize = pcl->tailpacking_size;
	else
		inputsize = pclusterpages * PAGE_SIZE;

	err = decompressor->decompress(&(struct z_erofs_decompress_req) {
					.sb = be->sb,
					.in = be->compressed_pages,
					.out = be->decompressed_pages,
					.pageofs_in = pcl->pageofs_in,
					.pageofs_out = pcl->pageofs_out,
					.inputsize = inputsize,
					.outputsize = pcl->length,
					.alg = pcl->algorithmformat,
					.inplace_io = overlapped,
					.partial_decoding = pcl->partial,
					.fillgaps = pcl->multibases,
				 }, be->pagepool);

out:
	/* must handle all compressed pages before actual file pages */
	if (z_erofs_is_inline_pcluster(pcl)) {
		page = pcl->compressed_bvecs[0].page;
		WRITE_ONCE(pcl->compressed_bvecs[0].page, NULL);
		put_page(page);
	} else {
		for (i = 0; i < pclusterpages; ++i) {
			/* consider shortlived pages added when decompressing */
			page = be->compressed_pages[i];

			if (erofs_page_is_managed(sbi, page))
				continue;
			(void)z_erofs_put_shortlivedpage(be->pagepool, page);
			WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL);
		}
	}
	if (be->compressed_pages < be->onstack_pages ||
	    be->compressed_pages >= be->onstack_pages + Z_EROFS_ONSTACK_PAGES)
		kvfree(be->compressed_pages);
	z_erofs_fill_other_copies(be, err);

	for (i = 0; i < be->nr_pages; ++i) {
		page = be->decompressed_pages[i];
		if (!page)
			continue;

		DBG_BUGON(z_erofs_page_is_invalidated(page));

		/* recycle all individual short-lived pages */
		if (z_erofs_put_shortlivedpage(be->pagepool, page))
			continue;
		z_erofs_onlinepage_endio(page, err);
	}

	if (be->decompressed_pages != be->onstack_pages)
		kvfree(be->decompressed_pages);

	pcl->length = 0;
	pcl->partial = true;
	pcl->multibases = false;
	pcl->bvset.nextpage = NULL;
	pcl->vcnt = 0;

	/* pcluster lock MUST be taken before the following line */
	WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_NIL);
	mutex_unlock(&pcl->lock);
	return err;
}

static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io,
				     struct page **pagepool)
{
	struct z_erofs_decompress_backend be = {
		.sb = io->sb,
		.pagepool = pagepool,
		.decompressed_secondary_bvecs =
			LIST_HEAD_INIT(be.decompressed_secondary_bvecs),
	};
	z_erofs_next_pcluster_t owned = io->head;

	while (owned != Z_EROFS_PCLUSTER_TAIL) {
		DBG_BUGON(owned == Z_EROFS_PCLUSTER_NIL);

		be.pcl = container_of(owned, struct z_erofs_pcluster, next);
		owned = READ_ONCE(be.pcl->next);

		z_erofs_decompress_pcluster(&be, io->eio ? -EIO : 0);
		if (z_erofs_is_inline_pcluster(be.pcl))
			z_erofs_free_pcluster(be.pcl);
		else
			erofs_workgroup_put(&be.pcl->obj);
	}
}

static void z_erofs_decompressqueue_work(struct work_struct *work)
{
	struct z_erofs_decompressqueue *bgq =
		container_of(work, struct z_erofs_decompressqueue, u.work);
	struct page *pagepool = NULL;

	DBG_BUGON(bgq->head == Z_EROFS_PCLUSTER_TAIL);
	z_erofs_decompress_queue(bgq, &pagepool);
	erofs_release_pages(&pagepool);
	kvfree(bgq);
}

#ifdef CONFIG_EROFS_FS_PCPU_KTHREAD
static void z_erofs_decompressqueue_kthread_work(struct kthread_work *work)
{
	z_erofs_decompressqueue_work((struct work_struct *)work);
}
#endif

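/*
 * For illustration: the endio path calls this with bios == -1 for each
 * completed bio, and the submitter calls it once with the total bio count,
 * so pending_bios reaches zero exactly when the last bio of a fully
 * submitted queue completes; only then is decompression kicked off
 * (inline, on a per-CPU kthread, or on the unbound workqueue).
 */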
static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io,
				       int bios)
{
	struct erofs_sb_info *const sbi = EROFS_SB(io->sb);

	/* wake up the caller thread for sync decompression */
	if (io->sync) {
		if (!atomic_add_return(bios, &io->pending_bios))
			complete(&io->u.done);
		return;
	}

	if (atomic_add_return(bios, &io->pending_bios))
		return;
	/* Use (kthread_)work and sync decompression for atomic contexts only */
	if (!in_task() || irqs_disabled() || rcu_read_lock_any_held()) {
#ifdef CONFIG_EROFS_FS_PCPU_KTHREAD
		struct kthread_worker *worker;

		rcu_read_lock();
		worker = rcu_dereference(
				z_erofs_pcpu_workers[raw_smp_processor_id()]);
		if (!worker) {
			INIT_WORK(&io->u.work, z_erofs_decompressqueue_work);
			queue_work(z_erofs_workqueue, &io->u.work);
		} else {
			kthread_queue_work(worker, &io->u.kthread_work);
		}
		rcu_read_unlock();
#else
		queue_work(z_erofs_workqueue, &io->u.work);
#endif
		/* enable sync decompression for readahead */
		if (sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_AUTO)
			sbi->opt.sync_decompress = EROFS_SYNC_DECOMPRESS_FORCE_ON;
		return;
	}
	z_erofs_decompressqueue_work(&io->u.work);
}

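/*
 * For illustration: pickup_page_for_submission() resolves slot @nr of a
 * pcluster into a page ready for bio submission.  It reuses preallocated
 * or cached pages where possible, revalidates pages still in the managed
 * cache, and otherwise allocates a fresh page which is either added to the
 * page cache or downgraded to a short-lived page.
 */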
static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl,
					       unsigned int nr,
					       struct page **pagepool,
					       struct address_space *mc)
{
	const pgoff_t index = pcl->obj.index;
	gfp_t gfp = mapping_gfp_mask(mc);
	bool tocache = false;

	struct address_space *mapping;
	struct page *oldpage, *page;
	int justfound;

repeat:
	page = READ_ONCE(pcl->compressed_bvecs[nr].page);
	oldpage = page;

	if (!page)
		goto out_allocpage;

	justfound = (unsigned long)page & 1UL;
	page = (struct page *)((unsigned long)page & ~1UL);

	/*
	 * a preallocated cached page, which is used to avoid direct
	 * reclaim; otherwise, the inplace I/O path is taken instead.
	 */
	if (page->private == Z_EROFS_PREALLOCATED_PAGE) {
		WRITE_ONCE(pcl->compressed_bvecs[nr].page, page);
		set_page_private(page, 0);
		tocache = true;
		goto out_tocache;
	}
	mapping = READ_ONCE(page->mapping);

	/*
	 * file-backed online pages in a pcluster are all locked stably,
	 * therefore it is impossible for `mapping' to be NULL.
	 */
	if (mapping && mapping != mc)
		/* ought to be unmanaged pages */
		goto out;

	/* directly return for shortlived page as well */
	if (z_erofs_is_shortlived_page(page))
		goto out;

	lock_page(page);

	/* only true if page reclaim goes wrong, should never happen */
	DBG_BUGON(justfound && PagePrivate(page));

	/* the page is still in managed cache */
	if (page->mapping == mc) {
		WRITE_ONCE(pcl->compressed_bvecs[nr].page, page);

		if (!PagePrivate(page)) {
			/*
			 * impossible to be !PagePrivate(page) for
			 * the current restriction as well if
			 * the page is already in compressed_bvecs[].
			 */
			DBG_BUGON(!justfound);

			justfound = 0;
			set_page_private(page, (unsigned long)pcl);
			SetPagePrivate(page);
		}

		/* no need to submit io if it is already up-to-date */
		if (PageUptodate(page)) {
			unlock_page(page);
			page = NULL;
		}
		goto out;
	}

	/*
	 * the managed page has been truncated; it's unsafe to reuse
	 * this one, so let's allocate a new cache-managed page instead.
	 */
	DBG_BUGON(page->mapping);
	DBG_BUGON(!justfound);

	tocache = true;
	unlock_page(page);
	put_page(page);
out_allocpage:
	page = erofs_allocpage(pagepool, gfp | __GFP_NOFAIL);
	if (oldpage != cmpxchg(&pcl->compressed_bvecs[nr].page,
			       oldpage, page)) {
		erofs_pagepool_add(pagepool, page);
		cond_resched();
		goto repeat;
	}
out_tocache:
	if (!tocache || add_to_page_cache_lru(page, mc, index + nr, gfp)) {
		/* turn it into a short-lived page if that fails (1 ref) */
		set_page_private(page, Z_EROFS_SHORTLIVED_PAGE);
		goto out;
	}
	attach_page_private(page, pcl);
	/* drop a refcount added by allocpage (then we have 2 refs here) */
	put_page(page);

out:	/* the only exit (for tracing and debugging) */
	return page;
}

static struct z_erofs_decompressqueue *jobqueue_init(struct super_block *sb,
			      struct z_erofs_decompressqueue *fgq, bool *fg)
{
	struct z_erofs_decompressqueue *q;

	if (fg && !*fg) {
		q = kvzalloc(sizeof(*q), GFP_KERNEL | __GFP_NOWARN);
		if (!q) {
			*fg = true;
			goto fg_out;
		}
#ifdef CONFIG_EROFS_FS_PCPU_KTHREAD
		kthread_init_work(&q->u.kthread_work,
				  z_erofs_decompressqueue_kthread_work);
#else
		INIT_WORK(&q->u.work, z_erofs_decompressqueue_work);
#endif
	} else {
fg_out:
		q = fgq;
		init_completion(&fgq->u.done);
		atomic_set(&fgq->pending_bios, 0);
		q->eio = false;
		q->sync = true;
	}
	q->sb = sb;
	q->head = Z_EROFS_PCLUSTER_TAIL;
	return q;
}

/* define decompression jobqueue types */
enum {
	JQ_BYPASS,
	JQ_SUBMIT,
	NR_JOBQUEUES,
};

static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl,
				    z_erofs_next_pcluster_t qtail[],
				    z_erofs_next_pcluster_t owned_head)
{
	z_erofs_next_pcluster_t *const submit_qtail = qtail[JQ_SUBMIT];
	z_erofs_next_pcluster_t *const bypass_qtail = qtail[JQ_BYPASS];

	WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_TAIL);

	WRITE_ONCE(*submit_qtail, owned_head);
	WRITE_ONCE(*bypass_qtail, &pcl->next);

	qtail[JQ_BYPASS] = &pcl->next;
}

static void z_erofs_decompressqueue_endio(struct bio *bio)
{
	struct z_erofs_decompressqueue *q = bio->bi_private;
	blk_status_t err = bio->bi_status;
	struct bio_vec *bvec;
	struct bvec_iter_all iter_all;

	bio_for_each_segment_all(bvec, bio, iter_all) {
		struct page *page = bvec->bv_page;

		DBG_BUGON(PageUptodate(page));
		DBG_BUGON(z_erofs_page_is_invalidated(page));

		if (erofs_page_is_managed(EROFS_SB(q->sb), page)) {
			if (!err)
				SetPageUptodate(page);
			unlock_page(page);
		}
	}
	if (err)
		q->eio = true;
	z_erofs_decompress_kickoff(q, -1);
	bio_put(bio);
}

static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
				 struct z_erofs_decompressqueue *fgq,
				 bool *force_fg, bool readahead)
{
	struct super_block *sb = f->inode->i_sb;
	struct address_space *mc = MNGD_MAPPING(EROFS_SB(sb));
	z_erofs_next_pcluster_t qtail[NR_JOBQUEUES];
	struct z_erofs_decompressqueue *q[NR_JOBQUEUES];
	z_erofs_next_pcluster_t owned_head = f->owned_head;
	/* bio is NULL initially, so no need to initialize last_{index,bdev} */
	pgoff_t last_index;
	struct block_device *last_bdev;
	unsigned int nr_bios = 0;
	struct bio *bio = NULL;
	unsigned long pflags;
	int memstall = 0;

	/*
	 * if managed cache is enabled, a bypass jobqueue is needed:
	 * pclusters on that queue don't need to be read from the device.
	 */
	q[JQ_BYPASS] = jobqueue_init(sb, fgq + JQ_BYPASS, NULL);
	q[JQ_SUBMIT] = jobqueue_init(sb, fgq + JQ_SUBMIT, force_fg);

	qtail[JQ_BYPASS] = &q[JQ_BYPASS]->head;
	qtail[JQ_SUBMIT] = &q[JQ_SUBMIT]->head;

	/* by default, all need I/O submission */
	q[JQ_SUBMIT]->head = owned_head;

	do {
		struct erofs_map_dev mdev;
		struct z_erofs_pcluster *pcl;
		pgoff_t cur, end;
		unsigned int i = 0;
		bool bypass = true;

		DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_NIL);
		pcl = container_of(owned_head, struct z_erofs_pcluster, next);
		owned_head = READ_ONCE(pcl->next);

		if (z_erofs_is_inline_pcluster(pcl)) {
			move_to_bypass_jobqueue(pcl, qtail, owned_head);
			continue;
		}

		/* no device id here, thus it will always succeed */
		mdev = (struct erofs_map_dev) {
			.m_pa = erofs_pos(sb, pcl->obj.index),
		};
		(void)erofs_map_dev(sb, &mdev);

		cur = erofs_blknr(sb, mdev.m_pa);
		end = cur + pcl->pclusterpages;

		do {
			struct page *page;

			page = pickup_page_for_submission(pcl, i++,
					&f->pagepool, mc);
			if (!page)
				continue;

			if (bio && (cur != last_index + 1 ||
				    last_bdev != mdev.m_bdev)) {
submit_bio_retry:
				submit_bio(bio);
				if (memstall) {
					psi_memstall_leave(&pflags);
					memstall = 0;
				}
				bio = NULL;
			}

			if (unlikely(PageWorkingset(page)) && !memstall) {
				psi_memstall_enter(&pflags);
				memstall = 1;
			}

			if (!bio) {
				bio = bio_alloc(mdev.m_bdev, BIO_MAX_VECS,
						REQ_OP_READ, GFP_NOIO);
				bio->bi_end_io = z_erofs_decompressqueue_endio;

				last_bdev = mdev.m_bdev;
				bio->bi_iter.bi_sector = (sector_t)cur <<
					(sb->s_blocksize_bits - 9);
				bio->bi_private = q[JQ_SUBMIT];
				if (readahead)
					bio->bi_opf |= REQ_RAHEAD;
				++nr_bios;
			}

			if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE)
				goto submit_bio_retry;

			last_index = cur;
			bypass = false;
		} while (++cur < end);

		if (!bypass)
			qtail[JQ_SUBMIT] = &pcl->next;
		else
			move_to_bypass_jobqueue(pcl, qtail, owned_head);
	} while (owned_head != Z_EROFS_PCLUSTER_TAIL);

	if (bio) {
		submit_bio(bio);
		if (memstall)
			psi_memstall_leave(&pflags);
	}

	/*
	 * although background decompression is preferred, nothing is pending
	 * for submission; don't queue decompression work but drop the
	 * background queue directly instead.
	 */
	if (!*force_fg && !nr_bios) {
		kvfree(q[JQ_SUBMIT]);
		return;
	}
	z_erofs_decompress_kickoff(q[JQ_SUBMIT], nr_bios);
}

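/*
 * For illustration: z_erofs_runqueue() drives one round of I/O plus
 * decompression.  Bypass pclusters (fully cached, no I/O needed) are
 * decompressed immediately in the caller's context; submitted pclusters
 * are either handled asynchronously on bio completion or, with force_fg,
 * waited for and decompressed here as well.
 */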
static void z_erofs_runqueue(struct z_erofs_decompress_frontend *f,
			     bool force_fg, bool ra)
{
	struct z_erofs_decompressqueue io[NR_JOBQUEUES];

	if (f->owned_head == Z_EROFS_PCLUSTER_TAIL)
		return;
	z_erofs_submit_queue(f, io, &force_fg, ra);

	/* handle bypass queue (no i/o pclusters) immediately */
	z_erofs_decompress_queue(&io[JQ_BYPASS], &f->pagepool);

	if (!force_fg)
		return;

	/* wait until all bios are completed */
	wait_for_completion_io(&io[JQ_SUBMIT].u.done);

	/* handle synchronous decompress queue in the caller context */
	z_erofs_decompress_queue(&io[JQ_SUBMIT], &f->pagepool);
}

/*
 * Since partially up-to-date folios are still unsupported for now, we have
 * to use approximate readmore strategies as a start.
 */
static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f,
		struct readahead_control *rac, bool backmost)
{
	struct inode *inode = f->inode;
	struct erofs_map_blocks *map = &f->map;
	erofs_off_t cur, end, headoffset = f->headoffset;
	int err;

	if (backmost) {
		if (rac)
			end = headoffset + readahead_length(rac) - 1;
		else
			end = headoffset + PAGE_SIZE - 1;
		map->m_la = end;
		err = z_erofs_map_blocks_iter(inode, map,
					      EROFS_GET_BLOCKS_READMORE);
		if (err)
			return;

		/* expand ra for the trailing edge if readahead */
		if (rac) {
			cur = round_up(map->m_la + map->m_llen, PAGE_SIZE);
			readahead_expand(rac, headoffset, cur - headoffset);
			return;
		}
		end = round_up(end, PAGE_SIZE);
	} else {
		end = round_up(map->m_la, PAGE_SIZE);

		if (!map->m_llen)
			return;
	}

	cur = map->m_la + map->m_llen - 1;
	while ((cur >= end) && (cur < i_size_read(inode))) {
		pgoff_t index = cur >> PAGE_SHIFT;
		struct page *page;

		page = erofs_grab_cache_page_nowait(inode->i_mapping, index);
		if (page) {
			if (PageUptodate(page))
				unlock_page(page);
			else
				(void)z_erofs_do_read_page(f, page);
			put_page(page);
		}

		if (cur < PAGE_SIZE)
			break;
		cur = (index << PAGE_SHIFT) - 1;
	}
}

static int z_erofs_read_folio(struct file *file, struct folio *folio)
{
	struct inode *const inode = folio->mapping->host;
	struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
	struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
	int err;

	trace_erofs_read_folio(folio, false);
	f.headoffset = (erofs_off_t)folio->index << PAGE_SHIFT;

	z_erofs_pcluster_readmore(&f, NULL, true);
	err = z_erofs_do_read_page(&f, &folio->page);
	z_erofs_pcluster_readmore(&f, NULL, false);
	z_erofs_pcluster_end(&f);

	/* if some compressed clusters are ready, submit them anyway */
	z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, 0), false);

	if (err && err != -EINTR)
		erofs_err(inode->i_sb, "read error %d @ %lu of nid %llu",
			  err, folio->index, EROFS_I(inode)->nid);

	erofs_put_metabuf(&f.map.buf);
	erofs_release_pages(&f.pagepool);
	return err;
}

static void z_erofs_readahead(struct readahead_control *rac)
{
	struct inode *const inode = rac->mapping->host;
	struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
	struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
	struct folio *head = NULL, *folio;
	unsigned int nr_folios;
	int err;

	f.headoffset = readahead_pos(rac);

	z_erofs_pcluster_readmore(&f, rac, true);
	nr_folios = readahead_count(rac);
	trace_erofs_readpages(inode, readahead_index(rac), nr_folios, false);

	while ((folio = readahead_folio(rac))) {
		folio->private = head;
		head = folio;
	}

	/* traverse in reverse order for best metadata I/O performance */
	while (head) {
		folio = head;
		head = folio_get_private(folio);

		err = z_erofs_do_read_page(&f, &folio->page);
		if (err && err != -EINTR)
			erofs_err(inode->i_sb, "readahead error at folio %lu @ nid %llu",
				  folio->index, EROFS_I(inode)->nid);
	}
	z_erofs_pcluster_readmore(&f, rac, false);
	z_erofs_pcluster_end(&f);

	z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, nr_folios), true);
	erofs_put_metabuf(&f.map.buf);
	erofs_release_pages(&f.pagepool);
}

const struct address_space_operations z_erofs_aops = {
	.read_folio = z_erofs_read_folio,
	.readahead = z_erofs_readahead,
};