1// SPDX-License-Identifier: GPL-2.0
2/*
3 * fs/f2fs/segment.c
4 *
5 * Copyright (c) 2012 Samsung Electronics Co., Ltd.
6 *             http://www.samsung.com/
7 */
8#include <linux/fs.h>
9#include <linux/f2fs_fs.h>
10#include <linux/bio.h>
11#include <linux/blkdev.h>
12#include <linux/sched/mm.h>
13#include <linux/prefetch.h>
14#include <linux/kthread.h>
15#include <linux/swap.h>
16#include <linux/timer.h>
17#include <linux/freezer.h>
18#include <linux/sched/signal.h>
19#include <linux/random.h>
20
21#include "f2fs.h"
22#include "segment.h"
23#include "node.h"
24#include "gc.h"
25#include "iostat.h"
26#include <trace/events/f2fs.h>
27
28#define __reverse_ffz(x) __reverse_ffs(~(x))
29
30static struct kmem_cache *discard_entry_slab;
31static struct kmem_cache *discard_cmd_slab;
32static struct kmem_cache *sit_entry_set_slab;
33static struct kmem_cache *revoke_entry_slab;
34
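/*
 * __reverse_ulong() loads BITS_PER_LONG/8 bytes from @str in big-endian
 * order, so the first bitmap byte lands in the most significant byte of
 * the returned word.  For example, f2fs_set_bit(0, bitmap) sets the first
 * byte to 0x80; on a 64-bit kernel that is loaded as 0x8000000000000000UL,
 * and __reverse_ffs() below then reports bit 0, matching the f2fs bit
 * numbering.
 */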
35static unsigned long __reverse_ulong(unsigned char *str)
36{
37	unsigned long tmp = 0;
38	int shift = 24, idx = 0;
39
40#if BITS_PER_LONG == 64
41	shift = 56;
42#endif
43	while (shift >= 0) {
44		tmp |= (unsigned long)str[idx++] << shift;
45		shift -= BITS_PER_BYTE;
46	}
47	return tmp;
48}
49
50/*
51 * __reverse_ffs is copied from include/asm-generic/bitops/__ffs.h since
52 * MSB and LSB are reversed in a byte by f2fs_set_bit.
53 */
54static inline unsigned long __reverse_ffs(unsigned long word)
55{
56	int num = 0;
57
58#if BITS_PER_LONG == 64
59	if ((word & 0xffffffff00000000UL) == 0)
60		num += 32;
61	else
62		word >>= 32;
63#endif
64	if ((word & 0xffff0000) == 0)
65		num += 16;
66	else
67		word >>= 16;
68
69	if ((word & 0xff00) == 0)
70		num += 8;
71	else
72		word >>= 8;
73
74	if ((word & 0xf0) == 0)
75		num += 4;
76	else
77		word >>= 4;
78
79	if ((word & 0xc) == 0)
80		num += 2;
81	else
82		word >>= 2;
83
84	if ((word & 0x2) == 0)
85		num += 1;
86	return num;
87}
88
89/*
90 * __find_rev_next(_zero)_bit is copied from lib/find_next_bit.c because
91 * f2fs_set_bit makes MSB and LSB reversed in a byte.
92 * @size must be an integral multiple of BITS_PER_LONG.
93 * Example:
94 *                             MSB <--> LSB
95 *   f2fs_set_bit(0, bitmap) => 1000 0000
96 *   f2fs_set_bit(7, bitmap) => 0000 0001
97 */
98static unsigned long __find_rev_next_bit(const unsigned long *addr,
99			unsigned long size, unsigned long offset)
100{
101	const unsigned long *p = addr + BIT_WORD(offset);
102	unsigned long result = size;
103	unsigned long tmp;
104
105	if (offset >= size)
106		return size;
107
108	size -= (offset & ~(BITS_PER_LONG - 1));
109	offset %= BITS_PER_LONG;
110
111	while (1) {
112		if (*p == 0)
113			goto pass;
114
115		tmp = __reverse_ulong((unsigned char *)p);
116
117		tmp &= ~0UL >> offset;
118		if (size < BITS_PER_LONG)
119			tmp &= (~0UL << (BITS_PER_LONG - size));
120		if (tmp)
121			goto found;
122pass:
123		if (size <= BITS_PER_LONG)
124			break;
125		size -= BITS_PER_LONG;
126		offset = 0;
127		p++;
128	}
129	return result;
130found:
131	return result - size + __reverse_ffs(tmp);
132}
133
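/*
 * Counterpart of __find_rev_next_bit(): find the first *cleared* bit at or
 * after @offset in the same reversed (MSB-first) bit order.  Words that
 * are all ones are skipped without doing the byte-reversing load.
 */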
134static unsigned long __find_rev_next_zero_bit(const unsigned long *addr,
135			unsigned long size, unsigned long offset)
136{
137	const unsigned long *p = addr + BIT_WORD(offset);
138	unsigned long result = size;
139	unsigned long tmp;
140
141	if (offset >= size)
142		return size;
143
144	size -= (offset & ~(BITS_PER_LONG - 1));
145	offset %= BITS_PER_LONG;
146
147	while (1) {
148		if (*p == ~0UL)
149			goto pass;
150
151		tmp = __reverse_ulong((unsigned char *)p);
152
153		if (offset)
154			tmp |= ~0UL << (BITS_PER_LONG - offset);
155		if (size < BITS_PER_LONG)
156			tmp |= ~0UL >> size;
157		if (tmp != ~0UL)
158			goto found;
159pass:
160		if (size <= BITS_PER_LONG)
161			break;
162		size -= BITS_PER_LONG;
163		offset = 0;
164		p++;
165	}
166	return result;
167found:
168	return result - size + __reverse_ffz(tmp);
169}
170
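/*
 * Decide whether allocation should fall back to SSR (slack space recycling).
 * SSR is never used in LFS mode, is forced under urgent GC or while
 * checkpointing is disabled, and otherwise kicks in once free sections can
 * no longer cover the dirty node/dentry/imeta sections plus the reserved
 * and min_ssr_sections margin.
 */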
171bool f2fs_need_SSR(struct f2fs_sb_info *sbi)
172{
173	int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES);
174	int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS);
175	int imeta_secs = get_blocktype_secs(sbi, F2FS_DIRTY_IMETA);
176
177	if (f2fs_lfs_mode(sbi))
178		return false;
179	if (sbi->gc_mode == GC_URGENT_HIGH)
180		return true;
181	if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED)))
182		return true;
183
184	return free_sections(sbi) <= (node_secs + 2 * dent_secs + imeta_secs +
185			SM_I(sbi)->min_ssr_sections + reserved_sections(sbi));
186}
187
188void f2fs_abort_atomic_write(struct inode *inode, bool clean)
189{
190	struct f2fs_inode_info *fi = F2FS_I(inode);
191
192	if (!f2fs_is_atomic_file(inode))
193		return;
194
195	release_atomic_write_cnt(inode);
196	clear_inode_flag(inode, FI_ATOMIC_COMMITTED);
197	clear_inode_flag(inode, FI_ATOMIC_REPLACE);
198	clear_inode_flag(inode, FI_ATOMIC_FILE);
199	stat_dec_atomic_inode(inode);
200
201	F2FS_I(inode)->atomic_write_task = NULL;
202
203	if (clean) {
204		truncate_inode_pages_final(inode->i_mapping);
205		f2fs_i_size_write(inode, fi->original_i_size);
206		fi->original_i_size = 0;
207	}
208	/* avoid stale dirty inode during eviction */
209	sync_inode_metadata(inode, 0);
210}
211
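/*
 * Swap one data block between an atomic-write inode and its cow_inode.
 * On commit (@recover == false) the block address currently in @inode is
 * saved through @old_addr and replaced by @new_addr taken from the
 * cow_inode; on revoke (@recover == true) the saved address is written
 * back.  An -ENOMEM from the dnode lookup is retried indefinitely.
 */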
212static int __replace_atomic_write_block(struct inode *inode, pgoff_t index,
213			block_t new_addr, block_t *old_addr, bool recover)
214{
215	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
216	struct dnode_of_data dn;
217	struct node_info ni;
218	int err;
219
220retry:
221	set_new_dnode(&dn, inode, NULL, NULL, 0);
222	err = f2fs_get_dnode_of_data(&dn, index, ALLOC_NODE);
223	if (err) {
224		if (err == -ENOMEM) {
225			f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT);
226			goto retry;
227		}
228		return err;
229	}
230
231	err = f2fs_get_node_info(sbi, dn.nid, &ni, false);
232	if (err) {
233		f2fs_put_dnode(&dn);
234		return err;
235	}
236
237	if (recover) {
238		/* dn.data_blkaddr is always valid */
239		if (!__is_valid_data_blkaddr(new_addr)) {
240			if (new_addr == NULL_ADDR)
241				dec_valid_block_count(sbi, inode, 1);
242			f2fs_invalidate_blocks(sbi, dn.data_blkaddr);
243			f2fs_update_data_blkaddr(&dn, new_addr);
244		} else {
245			f2fs_replace_block(sbi, &dn, dn.data_blkaddr,
246				new_addr, ni.version, true, true);
247		}
248	} else {
249		blkcnt_t count = 1;
250
251		err = inc_valid_block_count(sbi, inode, &count, true);
252		if (err) {
253			f2fs_put_dnode(&dn);
254			return err;
255		}
256
257		*old_addr = dn.data_blkaddr;
258		f2fs_truncate_data_blocks_range(&dn, 1);
259		dec_valid_block_count(sbi, F2FS_I(inode)->cow_inode, count);
260
261		f2fs_replace_block(sbi, &dn, dn.data_blkaddr, new_addr,
262					ni.version, true, false);
263	}
264
265	f2fs_put_dnode(&dn);
266
267	trace_f2fs_replace_atomic_write_block(inode, F2FS_I(inode)->cow_inode,
268			index, old_addr ? *old_addr : 0, new_addr, recover);
269	return 0;
270}
271
272static void __complete_revoke_list(struct inode *inode, struct list_head *head,
273					bool revoke)
274{
275	struct revoke_entry *cur, *tmp;
276	pgoff_t start_index = 0;
277	bool truncate = is_inode_flag_set(inode, FI_ATOMIC_REPLACE);
278
279	list_for_each_entry_safe(cur, tmp, head, list) {
280		if (revoke) {
281			__replace_atomic_write_block(inode, cur->index,
282						cur->old_addr, NULL, true);
283		} else if (truncate) {
284			f2fs_truncate_hole(inode, start_index, cur->index);
285			start_index = cur->index + 1;
286		}
287
288		list_del(&cur->list);
289		kmem_cache_free(revoke_entry_slab, cur);
290	}
291
292	if (!revoke && truncate)
293		f2fs_do_truncate_blocks(inode, start_index * PAGE_SIZE, false);
294}
295
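/*
 * Walk every data block of the cow_inode and splice it into the original
 * inode via __replace_atomic_write_block(), recording each displaced
 * address in a revoke list.  If any replacement fails the list is replayed
 * to restore the original blocks; on success the entries are freed, and
 * for FI_ATOMIC_REPLACE the ranges not covered by the commit are truncated
 * away.
 */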
296static int __f2fs_commit_atomic_write(struct inode *inode)
297{
298	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
299	struct f2fs_inode_info *fi = F2FS_I(inode);
300	struct inode *cow_inode = fi->cow_inode;
301	struct revoke_entry *new;
302	struct list_head revoke_list;
303	block_t blkaddr;
304	struct dnode_of_data dn;
305	pgoff_t len = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
306	pgoff_t off = 0, blen, index;
307	int ret = 0, i;
308
309	INIT_LIST_HEAD(&revoke_list);
310
311	while (len) {
312		blen = min_t(pgoff_t, ADDRS_PER_BLOCK(cow_inode), len);
313
314		set_new_dnode(&dn, cow_inode, NULL, NULL, 0);
315		ret = f2fs_get_dnode_of_data(&dn, off, LOOKUP_NODE_RA);
316		if (ret && ret != -ENOENT) {
317			goto out;
318		} else if (ret == -ENOENT) {
319			ret = 0;
320			if (dn.max_level == 0)
321				goto out;
322			goto next;
323		}
324
325		blen = min((pgoff_t)ADDRS_PER_PAGE(dn.node_page, cow_inode),
326				len);
327		index = off;
328		for (i = 0; i < blen; i++, dn.ofs_in_node++, index++) {
329			blkaddr = f2fs_data_blkaddr(&dn);
330
331			if (!__is_valid_data_blkaddr(blkaddr)) {
332				continue;
333			} else if (!f2fs_is_valid_blkaddr(sbi, blkaddr,
334					DATA_GENERIC_ENHANCE)) {
335				f2fs_put_dnode(&dn);
336				ret = -EFSCORRUPTED;
337				f2fs_handle_error(sbi,
338						ERROR_INVALID_BLKADDR);
339				goto out;
340			}
341
342			new = f2fs_kmem_cache_alloc(revoke_entry_slab, GFP_NOFS,
343							true, NULL);
344
345			ret = __replace_atomic_write_block(inode, index, blkaddr,
346							&new->old_addr, false);
347			if (ret) {
348				f2fs_put_dnode(&dn);
349				kmem_cache_free(revoke_entry_slab, new);
350				goto out;
351			}
352
353			f2fs_update_data_blkaddr(&dn, NULL_ADDR);
354			new->index = index;
355			list_add_tail(&new->list, &revoke_list);
356		}
357		f2fs_put_dnode(&dn);
358next:
359		off += blen;
360		len -= blen;
361	}
362
363out:
364	if (ret) {
365		sbi->revoked_atomic_block += fi->atomic_write_cnt;
366	} else {
367		sbi->committed_atomic_block += fi->atomic_write_cnt;
368		set_inode_flag(inode, FI_ATOMIC_COMMITTED);
369	}
370
371	__complete_revoke_list(inode, &revoke_list, ret ? true : false);
372
373	return ret;
374}
375
376int f2fs_commit_atomic_write(struct inode *inode)
377{
378	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
379	struct f2fs_inode_info *fi = F2FS_I(inode);
380	int err;
381
382	err = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX);
383	if (err)
384		return err;
385
386	f2fs_down_write(&fi->i_gc_rwsem[WRITE]);
387	f2fs_lock_op(sbi);
388
389	err = __f2fs_commit_atomic_write(inode);
390
391	f2fs_unlock_op(sbi);
392	f2fs_up_write(&fi->i_gc_rwsem[WRITE]);
393
394	return err;
395}
396
397/*
398 * This function balances dirty node and dentry pages.
399 * In addition, it controls garbage collection.
400 */
401void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need)
402{
403	if (time_to_inject(sbi, FAULT_CHECKPOINT))
404		f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_FAULT_INJECT);
405
406	/* balance_fs_bg may be pending */
407	if (need && excess_cached_nats(sbi))
408		f2fs_balance_fs_bg(sbi, false);
409
410	if (!f2fs_is_checkpoint_ready(sbi))
411		return;
412
413	/*
414	 * If there are too many dirty dir/node pages without enough free
415	 * segments, we should either do GC or end up writing a checkpoint.
416	 */
417	if (has_enough_free_secs(sbi, 0, 0))
418		return;
419
420	if (test_opt(sbi, GC_MERGE) && sbi->gc_thread &&
421				sbi->gc_thread->f2fs_gc_task) {
422		DEFINE_WAIT(wait);
423
424		prepare_to_wait(&sbi->gc_thread->fggc_wq, &wait,
425					TASK_UNINTERRUPTIBLE);
426		wake_up(&sbi->gc_thread->gc_wait_queue_head);
427		io_schedule();
428		finish_wait(&sbi->gc_thread->fggc_wq, &wait);
429	} else {
430		struct f2fs_gc_control gc_control = {
431			.victim_segno = NULL_SEGNO,
432			.init_gc_type = BG_GC,
433			.no_bg_gc = true,
434			.should_migrate_blocks = false,
435			.err_gc_skipped = false,
436			.nr_free_secs = 1 };
437		f2fs_down_write(&sbi->gc_lock);
438		stat_inc_gc_call_count(sbi, FOREGROUND);
439		f2fs_gc(sbi, &gc_control);
440	}
441}
442
443static inline bool excess_dirty_threshold(struct f2fs_sb_info *sbi)
444{
445	int factor = f2fs_rwsem_is_locked(&sbi->cp_rwsem) ? 3 : 2;
446	unsigned int dents = get_pages(sbi, F2FS_DIRTY_DENTS);
447	unsigned int qdata = get_pages(sbi, F2FS_DIRTY_QDATA);
448	unsigned int nodes = get_pages(sbi, F2FS_DIRTY_NODES);
449	unsigned int meta = get_pages(sbi, F2FS_DIRTY_META);
450	unsigned int imeta = get_pages(sbi, F2FS_DIRTY_IMETA);
451	unsigned int threshold = sbi->blocks_per_seg * factor *
452					DEFAULT_DIRTY_THRESHOLD;
453	unsigned int global_threshold = threshold * 3 / 2;
454
455	if (dents >= threshold || qdata >= threshold ||
456		nodes >= threshold || meta >= threshold ||
457		imeta >= threshold)
458		return true;
459	return dents + qdata + nodes + meta + imeta > global_threshold;
460}
461
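/*
 * Background balancing: shrink the read/age extent caches, NAT entries and
 * free nids when memory is tight, then decide whether a checkpoint is worth
 * issuing.  A checkpoint is triggered when dirty metadata or prefree
 * segments pile up, when roll-forward space runs out, when the periodic
 * CP_TIME interval expires on an otherwise idle system, or when cached
 * NAT/ino entries can only be reclaimed by checkpointing.
 */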
462void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg)
463{
464	if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
465		return;
466
467	/* try to shrink extent cache when there is not enough memory */
468	if (!f2fs_available_free_memory(sbi, READ_EXTENT_CACHE))
469		f2fs_shrink_read_extent_tree(sbi,
470				READ_EXTENT_CACHE_SHRINK_NUMBER);
471
472	/* try to shrink age extent cache when there is not enough memory */
473	if (!f2fs_available_free_memory(sbi, AGE_EXTENT_CACHE))
474		f2fs_shrink_age_extent_tree(sbi,
475				AGE_EXTENT_CACHE_SHRINK_NUMBER);
476
477	/* check the # of cached NAT entries */
478	if (!f2fs_available_free_memory(sbi, NAT_ENTRIES))
479		f2fs_try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK);
480
481	if (!f2fs_available_free_memory(sbi, FREE_NIDS))
482		f2fs_try_to_free_nids(sbi, MAX_FREE_NIDS);
483	else
484		f2fs_build_free_nids(sbi, false, false);
485
486	if (excess_dirty_nats(sbi) || excess_dirty_threshold(sbi) ||
487		excess_prefree_segs(sbi) || !f2fs_space_for_roll_forward(sbi))
488		goto do_sync;
489
490	/* there is in-flight background IO or there was a foreground operation recently */
491	if (is_inflight_io(sbi, REQ_TIME) ||
492		(!f2fs_time_over(sbi, REQ_TIME) && f2fs_rwsem_is_locked(&sbi->cp_rwsem)))
493		return;
494
495	/* exceed periodical checkpoint timeout threshold */
496	if (f2fs_time_over(sbi, CP_TIME))
497		goto do_sync;
498
499	/* checkpoint is the only way to shrink partial cached entries */
500	if (f2fs_available_free_memory(sbi, NAT_ENTRIES) &&
501		f2fs_available_free_memory(sbi, INO_ENTRIES))
502		return;
503
504do_sync:
505	if (test_opt(sbi, DATA_FLUSH) && from_bg) {
506		struct blk_plug plug;
507
508		mutex_lock(&sbi->flush_lock);
509
510		blk_start_plug(&plug);
511		f2fs_sync_dirty_inodes(sbi, FILE_INODE, false);
512		blk_finish_plug(&plug);
513
514		mutex_unlock(&sbi->flush_lock);
515	}
516	stat_inc_cp_call_count(sbi, BACKGROUND);
517	f2fs_sync_fs(sbi->sb, 1);
518}
519
520static int __submit_flush_wait(struct f2fs_sb_info *sbi,
521				struct block_device *bdev)
522{
523	int ret = blkdev_issue_flush(bdev);
524
525	trace_f2fs_issue_flush(bdev, test_opt(sbi, NOBARRIER),
526				test_opt(sbi, FLUSH_MERGE), ret);
527	if (!ret)
528		f2fs_update_iostat(sbi, NULL, FS_FLUSH_IO, 0);
529	return ret;
530}
531
532static int submit_flush_wait(struct f2fs_sb_info *sbi, nid_t ino)
533{
534	int ret = 0;
535	int i;
536
537	if (!f2fs_is_multi_device(sbi))
538		return __submit_flush_wait(sbi, sbi->sb->s_bdev);
539
540	for (i = 0; i < sbi->s_ndevs; i++) {
541		if (!f2fs_is_dirty_device(sbi, ino, i, FLUSH_INO))
542			continue;
543		ret = __submit_flush_wait(sbi, FDEV(i).bdev);
544		if (ret)
545			break;
546	}
547	return ret;
548}
549
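/*
 * Flush-merge worker: callers of f2fs_issue_flush() queue flush_cmds on a
 * lock-free llist, and this thread drains the whole list with a single
 * flush per dirty device, then completes every waiter with the shared
 * result.  The list is reversed after llist_del_all() so commands complete
 * in submission order.
 */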
550static int issue_flush_thread(void *data)
551{
552	struct f2fs_sb_info *sbi = data;
553	struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info;
554	wait_queue_head_t *q = &fcc->flush_wait_queue;
555repeat:
556	if (kthread_should_stop())
557		return 0;
558
559	if (!llist_empty(&fcc->issue_list)) {
560		struct flush_cmd *cmd, *next;
561		int ret;
562
563		fcc->dispatch_list = llist_del_all(&fcc->issue_list);
564		fcc->dispatch_list = llist_reverse_order(fcc->dispatch_list);
565
566		cmd = llist_entry(fcc->dispatch_list, struct flush_cmd, llnode);
567
568		ret = submit_flush_wait(sbi, cmd->ino);
569		atomic_inc(&fcc->issued_flush);
570
571		llist_for_each_entry_safe(cmd, next,
572					  fcc->dispatch_list, llnode) {
573			cmd->ret = ret;
574			complete(&cmd->wait);
575		}
576		fcc->dispatch_list = NULL;
577	}
578
579	wait_event_interruptible(*q,
580		kthread_should_stop() || !llist_empty(&fcc->issue_list));
581	goto repeat;
582}
583
584int f2fs_issue_flush(struct f2fs_sb_info *sbi, nid_t ino)
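/*
 * Issue a cache flush for @ino.  NOBARRIER skips the flush entirely;
 * without FLUSH_MERGE, on multi-device setups, or when no other flush is
 * queued, the flush is submitted synchronously by the caller.  Otherwise
 * the command is queued for the flush-merge thread; if that thread has
 * already gone away, whichever caller grabs the pending list services it
 * on the thread's behalf.
 */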
585{
586	struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info;
587	struct flush_cmd cmd;
588	int ret;
589
590	if (test_opt(sbi, NOBARRIER))
591		return 0;
592
593	if (!test_opt(sbi, FLUSH_MERGE)) {
594		atomic_inc(&fcc->queued_flush);
595		ret = submit_flush_wait(sbi, ino);
596		atomic_dec(&fcc->queued_flush);
597		atomic_inc(&fcc->issued_flush);
598		return ret;
599	}
600
601	if (atomic_inc_return(&fcc->queued_flush) == 1 ||
602	    f2fs_is_multi_device(sbi)) {
603		ret = submit_flush_wait(sbi, ino);
604		atomic_dec(&fcc->queued_flush);
605
606		atomic_inc(&fcc->issued_flush);
607		return ret;
608	}
609
610	cmd.ino = ino;
611	init_completion(&cmd.wait);
612
613	llist_add(&cmd.llnode, &fcc->issue_list);
614
615	/*
616	 * update issue_list before we wake up the issue_flush thread; this
617	 * smp_mb() pairs with another barrier in ___wait_event(), see the
618	 * comments of waitqueue_active() for more details.
619	 */
620	smp_mb();
621
622	if (waitqueue_active(&fcc->flush_wait_queue))
623		wake_up(&fcc->flush_wait_queue);
624
625	if (fcc->f2fs_issue_flush) {
626		wait_for_completion(&cmd.wait);
627		atomic_dec(&fcc->queued_flush);
628	} else {
629		struct llist_node *list;
630
631		list = llist_del_all(&fcc->issue_list);
632		if (!list) {
633			wait_for_completion(&cmd.wait);
634			atomic_dec(&fcc->queued_flush);
635		} else {
636			struct flush_cmd *tmp, *next;
637
638			ret = submit_flush_wait(sbi, ino);
639
640			llist_for_each_entry_safe(tmp, next, list, llnode) {
641				if (tmp == &cmd) {
642					cmd.ret = ret;
643					atomic_dec(&fcc->queued_flush);
644					continue;
645				}
646				tmp->ret = ret;
647				complete(&tmp->wait);
648			}
649		}
650	}
651
652	return cmd.ret;
653}
654
655int f2fs_create_flush_cmd_control(struct f2fs_sb_info *sbi)
656{
657	dev_t dev = sbi->sb->s_bdev->bd_dev;
658	struct flush_cmd_control *fcc;
659
660	if (SM_I(sbi)->fcc_info) {
661		fcc = SM_I(sbi)->fcc_info;
662		if (fcc->f2fs_issue_flush)
663			return 0;
664		goto init_thread;
665	}
666
667	fcc = f2fs_kzalloc(sbi, sizeof(struct flush_cmd_control), GFP_KERNEL);
668	if (!fcc)
669		return -ENOMEM;
670	atomic_set(&fcc->issued_flush, 0);
671	atomic_set(&fcc->queued_flush, 0);
672	init_waitqueue_head(&fcc->flush_wait_queue);
673	init_llist_head(&fcc->issue_list);
674	SM_I(sbi)->fcc_info = fcc;
675	if (!test_opt(sbi, FLUSH_MERGE))
676		return 0;
677
678init_thread:
679	fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi,
680				"f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev));
681	if (IS_ERR(fcc->f2fs_issue_flush)) {
682		int err = PTR_ERR(fcc->f2fs_issue_flush);
683
684		fcc->f2fs_issue_flush = NULL;
685		return err;
686	}
687
688	return 0;
689}
690
691void f2fs_destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free)
692{
693	struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info;
694
695	if (fcc && fcc->f2fs_issue_flush) {
696		struct task_struct *flush_thread = fcc->f2fs_issue_flush;
697
698		fcc->f2fs_issue_flush = NULL;
699		kthread_stop(flush_thread);
700	}
701	if (free) {
702		kfree(fcc);
703		SM_I(sbi)->fcc_info = NULL;
704	}
705}
706
707int f2fs_flush_device_cache(struct f2fs_sb_info *sbi)
708{
709	int ret = 0, i;
710
711	if (!f2fs_is_multi_device(sbi))
712		return 0;
713
714	if (test_opt(sbi, NOBARRIER))
715		return 0;
716
717	for (i = 1; i < sbi->s_ndevs; i++) {
718		int count = DEFAULT_RETRY_IO_COUNT;
719
720		if (!f2fs_test_bit(i, (char *)&sbi->dirty_device))
721			continue;
722
723		do {
724			ret = __submit_flush_wait(sbi, FDEV(i).bdev);
725			if (ret)
726				f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT);
727		} while (ret && --count);
728
729		if (ret) {
730			f2fs_stop_checkpoint(sbi, false,
731					STOP_CP_REASON_FLUSH_FAIL);
732			break;
733		}
734
735		spin_lock(&sbi->dev_lock);
736		f2fs_clear_bit(i, (char *)&sbi->dirty_device);
737		spin_unlock(&sbi->dev_lock);
738	}
739
740	return ret;
741}
742
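/*
 * Dirty segment bookkeeping: every non-current segment that is neither
 * empty nor full is tracked in dirty_segmap[DIRTY] plus a per-type map,
 * while fully invalid segments move to dirty_segmap[PRE] (prefree).  With
 * large sections, dirty_secmap additionally records which sections contain
 * partially valid segments.
 */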
743static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
744		enum dirty_type dirty_type)
745{
746	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
747
748	/* need not be added */
749	if (IS_CURSEG(sbi, segno))
750		return;
751
752	if (!test_and_set_bit(segno, dirty_i->dirty_segmap[dirty_type]))
753		dirty_i->nr_dirty[dirty_type]++;
754
755	if (dirty_type == DIRTY) {
756		struct seg_entry *sentry = get_seg_entry(sbi, segno);
757		enum dirty_type t = sentry->type;
758
759		if (unlikely(t >= DIRTY)) {
760			f2fs_bug_on(sbi, 1);
761			return;
762		}
763		if (!test_and_set_bit(segno, dirty_i->dirty_segmap[t]))
764			dirty_i->nr_dirty[t]++;
765
766		if (__is_large_section(sbi)) {
767			unsigned int secno = GET_SEC_FROM_SEG(sbi, segno);
768			block_t valid_blocks =
769				get_valid_blocks(sbi, segno, true);
770
771			f2fs_bug_on(sbi, unlikely(!valid_blocks ||
772					valid_blocks == CAP_BLKS_PER_SEC(sbi)));
773
774			if (!IS_CURSEC(sbi, secno))
775				set_bit(secno, dirty_i->dirty_secmap);
776		}
777	}
778}
779
780static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
781		enum dirty_type dirty_type)
782{
783	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
784	block_t valid_blocks;
785
786	if (test_and_clear_bit(segno, dirty_i->dirty_segmap[dirty_type]))
787		dirty_i->nr_dirty[dirty_type]--;
788
789	if (dirty_type == DIRTY) {
790		struct seg_entry *sentry = get_seg_entry(sbi, segno);
791		enum dirty_type t = sentry->type;
792
793		if (test_and_clear_bit(segno, dirty_i->dirty_segmap[t]))
794			dirty_i->nr_dirty[t]--;
795
796		valid_blocks = get_valid_blocks(sbi, segno, true);
797		if (valid_blocks == 0) {
798			clear_bit(GET_SEC_FROM_SEG(sbi, segno),
799						dirty_i->victim_secmap);
800#ifdef CONFIG_F2FS_CHECK_FS
801			clear_bit(segno, SIT_I(sbi)->invalid_segmap);
802#endif
803		}
804		if (__is_large_section(sbi)) {
805			unsigned int secno = GET_SEC_FROM_SEG(sbi, segno);
806
807			if (!valid_blocks ||
808					valid_blocks == CAP_BLKS_PER_SEC(sbi)) {
809				clear_bit(secno, dirty_i->dirty_secmap);
810				return;
811			}
812
813			if (!IS_CURSEC(sbi, secno))
814				set_bit(secno, dirty_i->dirty_secmap);
815		}
816	}
817}
818
819/*
820 * This must not fail with an error such as -ENOMEM.
821 * Adding a dirty entry into the seglist is not a critical operation.
822 * If a given segment is one of current working segments, it won't be added.
823 */
824static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno)
825{
826	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
827	unsigned short valid_blocks, ckpt_valid_blocks;
828	unsigned int usable_blocks;
829
830	if (segno == NULL_SEGNO || IS_CURSEG(sbi, segno))
831		return;
832
833	usable_blocks = f2fs_usable_blks_in_seg(sbi, segno);
834	mutex_lock(&dirty_i->seglist_lock);
835
836	valid_blocks = get_valid_blocks(sbi, segno, false);
837	ckpt_valid_blocks = get_ckpt_valid_blocks(sbi, segno, false);
838
839	if (valid_blocks == 0 && (!is_sbi_flag_set(sbi, SBI_CP_DISABLED) ||
840		ckpt_valid_blocks == usable_blocks)) {
841		__locate_dirty_segment(sbi, segno, PRE);
842		__remove_dirty_segment(sbi, segno, DIRTY);
843	} else if (valid_blocks < usable_blocks) {
844		__locate_dirty_segment(sbi, segno, DIRTY);
845	} else {
846		/* Recovery routine with SSR needs this */
847		__remove_dirty_segment(sbi, segno, DIRTY);
848	}
849
850	mutex_unlock(&dirty_i->seglist_lock);
851}
852
853/* This moves currently empty dirty segments to prefree. Must hold seglist_lock */
854void f2fs_dirty_to_prefree(struct f2fs_sb_info *sbi)
855{
856	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
857	unsigned int segno;
858
859	mutex_lock(&dirty_i->seglist_lock);
860	for_each_set_bit(segno, dirty_i->dirty_segmap[DIRTY], MAIN_SEGS(sbi)) {
861		if (get_valid_blocks(sbi, segno, false))
862			continue;
863		if (IS_CURSEG(sbi, segno))
864			continue;
865		__locate_dirty_segment(sbi, segno, PRE);
866		__remove_dirty_segment(sbi, segno, DIRTY);
867	}
868	mutex_unlock(&dirty_i->seglist_lock);
869}
870
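/*
 * With checkpoint=disable, blocks invalidated since the last checkpoint
 * cannot be reused yet, so the holes inside dirty segments are effectively
 * lost space.  Report how much of that space exceeds the overprovisioned
 * area, taking the larger of the DATA and NODE holes since the two block
 * types are allocated from separate segments.
 */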
871block_t f2fs_get_unusable_blocks(struct f2fs_sb_info *sbi)
872{
873	int ovp_hole_segs =
874		(overprovision_segments(sbi) - reserved_segments(sbi));
875	block_t ovp_holes = ovp_hole_segs << sbi->log_blocks_per_seg;
876	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
877	block_t holes[2] = {0, 0};	/* DATA and NODE */
878	block_t unusable;
879	struct seg_entry *se;
880	unsigned int segno;
881
882	mutex_lock(&dirty_i->seglist_lock);
883	for_each_set_bit(segno, dirty_i->dirty_segmap[DIRTY], MAIN_SEGS(sbi)) {
884		se = get_seg_entry(sbi, segno);
885		if (IS_NODESEG(se->type))
886			holes[NODE] += f2fs_usable_blks_in_seg(sbi, segno) -
887							se->valid_blocks;
888		else
889			holes[DATA] += f2fs_usable_blks_in_seg(sbi, segno) -
890							se->valid_blocks;
891	}
892	mutex_unlock(&dirty_i->seglist_lock);
893
894	unusable = max(holes[DATA], holes[NODE]);
895	if (unusable > ovp_holes)
896		return unusable - ovp_holes;
897	return 0;
898}
899
900int f2fs_disable_cp_again(struct f2fs_sb_info *sbi, block_t unusable)
901{
902	int ovp_hole_segs =
903		(overprovision_segments(sbi) - reserved_segments(sbi));
904	if (unusable > F2FS_OPTION(sbi).unusable_cap)
905		return -EAGAIN;
906	if (is_sbi_flag_set(sbi, SBI_CP_DISABLED_QUICK) &&
907		dirty_segments(sbi) > ovp_hole_segs)
908		return -EAGAIN;
909	return 0;
910}
911
912/* This is only used by SBI_CP_DISABLED */
913static unsigned int get_free_segment(struct f2fs_sb_info *sbi)
914{
915	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
916	unsigned int segno = 0;
917
918	mutex_lock(&dirty_i->seglist_lock);
919	for_each_set_bit(segno, dirty_i->dirty_segmap[DIRTY], MAIN_SEGS(sbi)) {
920		if (get_valid_blocks(sbi, segno, false))
921			continue;
922		if (get_ckpt_valid_blocks(sbi, segno, false))
923			continue;
924		mutex_unlock(&dirty_i->seglist_lock);
925		return segno;
926	}
927	mutex_unlock(&dirty_i->seglist_lock);
928	return NULL_SEGNO;
929}
930
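/*
 * Discard commands are tracked twice: in an rb-tree (dcc->root) keyed by
 * logical start block so adjacent ranges can be looked up and merged, and
 * on pending lists bucketed by length (pend_list[plist_idx(len)]) so the
 * issue path can work from the largest extents downward.
 */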
931static struct discard_cmd *__create_discard_cmd(struct f2fs_sb_info *sbi,
932		struct block_device *bdev, block_t lstart,
933		block_t start, block_t len)
934{
935	struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
936	struct list_head *pend_list;
937	struct discard_cmd *dc;
938
939	f2fs_bug_on(sbi, !len);
940
941	pend_list = &dcc->pend_list[plist_idx(len)];
942
943	dc = f2fs_kmem_cache_alloc(discard_cmd_slab, GFP_NOFS, true, NULL);
944	INIT_LIST_HEAD(&dc->list);
945	dc->bdev = bdev;
946	dc->di.lstart = lstart;
947	dc->di.start = start;
948	dc->di.len = len;
949	dc->ref = 0;
950	dc->state = D_PREP;
951	dc->queued = 0;
952	dc->error = 0;
953	init_completion(&dc->wait);
954	list_add_tail(&dc->list, pend_list);
955	spin_lock_init(&dc->lock);
956	dc->bio_ref = 0;
957	atomic_inc(&dcc->discard_cmd_cnt);
958	dcc->undiscard_blks += len;
959
960	return dc;
961}
962
963static bool f2fs_check_discard_tree(struct f2fs_sb_info *sbi)
964{
965#ifdef CONFIG_F2FS_CHECK_FS
966	struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
967	struct rb_node *cur = rb_first_cached(&dcc->root), *next;
968	struct discard_cmd *cur_dc, *next_dc;
969
970	while (cur) {
971		next = rb_next(cur);
972		if (!next)
973			return true;
974
975		cur_dc = rb_entry(cur, struct discard_cmd, rb_node);
976		next_dc = rb_entry(next, struct discard_cmd, rb_node);
977
978		if (cur_dc->di.lstart + cur_dc->di.len > next_dc->di.lstart) {
979			f2fs_info(sbi, "broken discard_rbtree, "
980				"cur(%u, %u) next(%u, %u)",
981				cur_dc->di.lstart, cur_dc->di.len,
982				next_dc->di.lstart, next_dc->di.len);
983			return false;
984		}
985		cur = next;
986	}
987#endif
988	return true;
989}
990
991static struct discard_cmd *__lookup_discard_cmd(struct f2fs_sb_info *sbi,
992						block_t blkaddr)
993{
994	struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
995	struct rb_node *node = dcc->root.rb_root.rb_node;
996	struct discard_cmd *dc;
997
998	while (node) {
999		dc = rb_entry(node, struct discard_cmd, rb_node);
1000
1001		if (blkaddr < dc->di.lstart)
1002			node = node->rb_left;
1003		else if (blkaddr >= dc->di.lstart + dc->di.len)
1004			node = node->rb_right;
1005		else
1006			return dc;
1007	}
1008	return NULL;
1009}
1010
1011static struct discard_cmd *__lookup_discard_cmd_ret(struct rb_root_cached *root,
1012				block_t blkaddr,
1013				struct discard_cmd **prev_entry,
1014				struct discard_cmd **next_entry,
1015				struct rb_node ***insert_p,
1016				struct rb_node **insert_parent)
1017{
1018	struct rb_node **pnode = &root->rb_root.rb_node;
1019	struct rb_node *parent = NULL, *tmp_node;
1020	struct discard_cmd *dc;
1021
1022	*insert_p = NULL;
1023	*insert_parent = NULL;
1024	*prev_entry = NULL;
1025	*next_entry = NULL;
1026
1027	if (RB_EMPTY_ROOT(&root->rb_root))
1028		return NULL;
1029
1030	while (*pnode) {
1031		parent = *pnode;
1032		dc = rb_entry(*pnode, struct discard_cmd, rb_node);
1033
1034		if (blkaddr < dc->di.lstart)
1035			pnode = &(*pnode)->rb_left;
1036		else if (blkaddr >= dc->di.lstart + dc->di.len)
1037			pnode = &(*pnode)->rb_right;
1038		else
1039			goto lookup_neighbors;
1040	}
1041
1042	*insert_p = pnode;
1043	*insert_parent = parent;
1044
1045	dc = rb_entry(parent, struct discard_cmd, rb_node);
1046	tmp_node = parent;
1047	if (parent && blkaddr > dc->di.lstart)
1048		tmp_node = rb_next(parent);
1049	*next_entry = rb_entry_safe(tmp_node, struct discard_cmd, rb_node);
1050
1051	tmp_node = parent;
1052	if (parent && blkaddr < dc->di.lstart)
1053		tmp_node = rb_prev(parent);
1054	*prev_entry = rb_entry_safe(tmp_node, struct discard_cmd, rb_node);
1055	return NULL;
1056
1057lookup_neighbors:
1058	/* lookup prev node for merging backward later */
1059	tmp_node = rb_prev(&dc->rb_node);
1060	*prev_entry = rb_entry_safe(tmp_node, struct discard_cmd, rb_node);
1061
1062	/* lookup next node for merging frontward later */
1063	tmp_node = rb_next(&dc->rb_node);
1064	*next_entry = rb_entry_safe(tmp_node, struct discard_cmd, rb_node);
1065	return dc;
1066}
1067
1068static void __detach_discard_cmd(struct discard_cmd_control *dcc,
1069							struct discard_cmd *dc)
1070{
1071	if (dc->state == D_DONE)
1072		atomic_sub(dc->queued, &dcc->queued_discard);
1073
1074	list_del(&dc->list);
1075	rb_erase_cached(&dc->rb_node, &dcc->root);
1076	dcc->undiscard_blks -= dc->di.len;
1077
1078	kmem_cache_free(discard_cmd_slab, dc);
1079
1080	atomic_dec(&dcc->discard_cmd_cnt);
1081}
1082
1083static void __remove_discard_cmd(struct f2fs_sb_info *sbi,
1084							struct discard_cmd *dc)
1085{
1086	struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
1087	unsigned long flags;
1088
1089	trace_f2fs_remove_discard(dc->bdev, dc->di.start, dc->di.len);
1090
1091	spin_lock_irqsave(&dc->lock, flags);
1092	if (dc->bio_ref) {
1093		spin_unlock_irqrestore(&dc->lock, flags);
1094		return;
1095	}
1096	spin_unlock_irqrestore(&dc->lock, flags);
1097
1098	f2fs_bug_on(sbi, dc->ref);
1099
1100	if (dc->error == -EOPNOTSUPP)
1101		dc->error = 0;
1102
1103	if (dc->error)
1104		printk_ratelimited(
1105			"%sF2FS-fs (%s): Issue discard(%u, %u, %u) failed, ret: %d",
1106			KERN_INFO, sbi->sb->s_id,
1107			dc->di.lstart, dc->di.start, dc->di.len, dc->error);
1108	__detach_discard_cmd(dcc, dc);
1109}
1110
1111static void f2fs_submit_discard_endio(struct bio *bio)
1112{
1113	struct discard_cmd *dc = (struct discard_cmd *)bio->bi_private;
1114	unsigned long flags;
1115
1116	spin_lock_irqsave(&dc->lock, flags);
1117	if (!dc->error)
1118		dc->error = blk_status_to_errno(bio->bi_status);
1119	dc->bio_ref--;
1120	if (!dc->bio_ref && dc->state == D_SUBMIT) {
1121		dc->state = D_DONE;
1122		complete_all(&dc->wait);
1123	}
1124	spin_unlock_irqrestore(&dc->lock, flags);
1125	bio_put(bio);
1126}
1127
1128static void __check_sit_bitmap(struct f2fs_sb_info *sbi,
1129				block_t start, block_t end)
1130{
1131#ifdef CONFIG_F2FS_CHECK_FS
1132	struct seg_entry *sentry;
1133	unsigned int segno;
1134	block_t blk = start;
1135	unsigned long offset, size, max_blocks = sbi->blocks_per_seg;
1136	unsigned long *map;
1137
1138	while (blk < end) {
1139		segno = GET_SEGNO(sbi, blk);
1140		sentry = get_seg_entry(sbi, segno);
1141		offset = GET_BLKOFF_FROM_SEG0(sbi, blk);
1142
1143		if (end < START_BLOCK(sbi, segno + 1))
1144			size = GET_BLKOFF_FROM_SEG0(sbi, end);
1145		else
1146			size = max_blocks;
1147		map = (unsigned long *)(sentry->cur_valid_map);
1148		offset = __find_rev_next_bit(map, size, offset);
1149		f2fs_bug_on(sbi, offset != size);
1150		blk = START_BLOCK(sbi, segno + 1);
1151	}
1152#endif
1153}
1154
1155static void __init_discard_policy(struct f2fs_sb_info *sbi,
1156				struct discard_policy *dpolicy,
1157				int discard_type, unsigned int granularity)
1158{
1159	struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
1160
1161	/* common policy */
1162	dpolicy->type = discard_type;
1163	dpolicy->sync = true;
1164	dpolicy->ordered = false;
1165	dpolicy->granularity = granularity;
1166
1167	dpolicy->max_requests = dcc->max_discard_request;
1168	dpolicy->io_aware_gran = dcc->discard_io_aware_gran;
1169	dpolicy->timeout = false;
1170
1171	if (discard_type == DPOLICY_BG) {
1172		dpolicy->min_interval = dcc->min_discard_issue_time;
1173		dpolicy->mid_interval = dcc->mid_discard_issue_time;
1174		dpolicy->max_interval = dcc->max_discard_issue_time;
1175		dpolicy->io_aware = true;
1176		dpolicy->sync = false;
1177		dpolicy->ordered = true;
1178		if (utilization(sbi) > dcc->discard_urgent_util) {
1179			dpolicy->granularity = MIN_DISCARD_GRANULARITY;
1180			if (atomic_read(&dcc->discard_cmd_cnt))
1181				dpolicy->max_interval =
1182					dcc->min_discard_issue_time;
1183		}
1184	} else if (discard_type == DPOLICY_FORCE) {
1185		dpolicy->min_interval = dcc->min_discard_issue_time;
1186		dpolicy->mid_interval = dcc->mid_discard_issue_time;
1187		dpolicy->max_interval = dcc->max_discard_issue_time;
1188		dpolicy->io_aware = false;
1189	} else if (discard_type == DPOLICY_FSTRIM) {
1190		dpolicy->io_aware = false;
1191	} else if (discard_type == DPOLICY_UMOUNT) {
1192		dpolicy->io_aware = false;
1193		/* we need to issue all to keep CP_TRIMMED_FLAG */
1194		dpolicy->granularity = MIN_DISCARD_GRANULARITY;
1195		dpolicy->timeout = true;
1196	}
1197}
1198
1199static void __update_discard_tree_range(struct f2fs_sb_info *sbi,
1200				struct block_device *bdev, block_t lstart,
1201				block_t start, block_t len);
1202
1203#ifdef CONFIG_BLK_DEV_ZONED
1204static void __submit_zone_reset_cmd(struct f2fs_sb_info *sbi,
1205				   struct discard_cmd *dc, blk_opf_t flag,
1206				   struct list_head *wait_list,
1207				   unsigned int *issued)
1208{
1209	struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
1210	struct block_device *bdev = dc->bdev;
1211	struct bio *bio = bio_alloc(bdev, 0, REQ_OP_ZONE_RESET | flag, GFP_NOFS);
1212	unsigned long flags;
1213
1214	trace_f2fs_issue_reset_zone(bdev, dc->di.start);
1215
1216	spin_lock_irqsave(&dc->lock, flags);
1217	dc->state = D_SUBMIT;
1218	dc->bio_ref++;
1219	spin_unlock_irqrestore(&dc->lock, flags);
1220
1221	if (issued)
1222		(*issued)++;
1223
1224	atomic_inc(&dcc->queued_discard);
1225	dc->queued++;
1226	list_move_tail(&dc->list, wait_list);
1227
1228	/* sanity check on discard range */
1229	__check_sit_bitmap(sbi, dc->di.lstart, dc->di.lstart + dc->di.len);
1230
1231	bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(dc->di.start);
1232	bio->bi_private = dc;
1233	bio->bi_end_io = f2fs_submit_discard_endio;
1234	submit_bio(bio);
1235
1236	atomic_inc(&dcc->issued_discard);
1237	f2fs_update_iostat(sbi, NULL, FS_ZONE_RESET_IO, dc->di.len * F2FS_BLKSIZE);
1238}
1239#endif
1240
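/*
 * Split a prepared discard command into bios no larger than the device
 * discard limit and submit them.  Intermediate bios leave the command in
 * D_PARTIAL until the last bio is issued; if dpolicy->max_requests is
 * reached first, the unsubmitted tail is re-queued as a new command.
 */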
1241/* this function is copied from blkdev_issue_discard() in block/blk-lib.c */
1242static int __submit_discard_cmd(struct f2fs_sb_info *sbi,
1243				struct discard_policy *dpolicy,
1244				struct discard_cmd *dc, int *issued)
1245{
1246	struct block_device *bdev = dc->bdev;
1247	unsigned int max_discard_blocks =
1248			SECTOR_TO_BLOCK(bdev_max_discard_sectors(bdev));
1249	struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
1250	struct list_head *wait_list = (dpolicy->type == DPOLICY_FSTRIM) ?
1251					&(dcc->fstrim_list) : &(dcc->wait_list);
1252	blk_opf_t flag = dpolicy->sync ? REQ_SYNC : 0;
1253	block_t lstart, start, len, total_len;
1254	int err = 0;
1255
1256	if (dc->state != D_PREP)
1257		return 0;
1258
1259	if (is_sbi_flag_set(sbi, SBI_NEED_FSCK))
1260		return 0;
1261
1262#ifdef CONFIG_BLK_DEV_ZONED
1263	if (f2fs_sb_has_blkzoned(sbi) && bdev_is_zoned(bdev)) {
1264		int devi = f2fs_bdev_index(sbi, bdev);
1265
1266		if (devi < 0)
1267			return -EINVAL;
1268
1269		if (f2fs_blkz_is_seq(sbi, devi, dc->di.start)) {
1270			__submit_zone_reset_cmd(sbi, dc, flag,
1271						wait_list, issued);
1272			return 0;
1273		}
1274	}
1275#endif
1276
1277	trace_f2fs_issue_discard(bdev, dc->di.start, dc->di.len);
1278
1279	lstart = dc->di.lstart;
1280	start = dc->di.start;
1281	len = dc->di.len;
1282	total_len = len;
1283
1284	dc->di.len = 0;
1285
1286	while (total_len && *issued < dpolicy->max_requests && !err) {
1287		struct bio *bio = NULL;
1288		unsigned long flags;
1289		bool last = true;
1290
1291		if (len > max_discard_blocks) {
1292			len = max_discard_blocks;
1293			last = false;
1294		}
1295
1296		(*issued)++;
1297		if (*issued == dpolicy->max_requests)
1298			last = true;
1299
1300		dc->di.len += len;
1301
1302		if (time_to_inject(sbi, FAULT_DISCARD)) {
1303			err = -EIO;
1304		} else {
1305			err = __blkdev_issue_discard(bdev,
1306					SECTOR_FROM_BLOCK(start),
1307					SECTOR_FROM_BLOCK(len),
1308					GFP_NOFS, &bio);
1309		}
1310		if (err) {
1311			spin_lock_irqsave(&dc->lock, flags);
1312			if (dc->state == D_PARTIAL)
1313				dc->state = D_SUBMIT;
1314			spin_unlock_irqrestore(&dc->lock, flags);
1315
1316			break;
1317		}
1318
1319		f2fs_bug_on(sbi, !bio);
1320
1321		/*
1322		 * the state must be set before submission to avoid it
1323		 * becoming D_DONE right away
1324		 */
1325		spin_lock_irqsave(&dc->lock, flags);
1326		if (last)
1327			dc->state = D_SUBMIT;
1328		else
1329			dc->state = D_PARTIAL;
1330		dc->bio_ref++;
1331		spin_unlock_irqrestore(&dc->lock, flags);
1332
1333		atomic_inc(&dcc->queued_discard);
1334		dc->queued++;
1335		list_move_tail(&dc->list, wait_list);
1336
1337		/* sanity check on discard range */
1338		__check_sit_bitmap(sbi, lstart, lstart + len);
1339
1340		bio->bi_private = dc;
1341		bio->bi_end_io = f2fs_submit_discard_endio;
1342		bio->bi_opf |= flag;
1343		submit_bio(bio);
1344
1345		atomic_inc(&dcc->issued_discard);
1346
1347		f2fs_update_iostat(sbi, NULL, FS_DISCARD_IO, len * F2FS_BLKSIZE);
1348
1349		lstart += len;
1350		start += len;
1351		total_len -= len;
1352		len = total_len;
1353	}
1354
1355	if (!err && len) {
1356		dcc->undiscard_blks -= len;
1357		__update_discard_tree_range(sbi, bdev, lstart, start, len);
1358	}
1359	return err;
1360}
1361
1362static void __insert_discard_cmd(struct f2fs_sb_info *sbi,
1363				struct block_device *bdev, block_t lstart,
1364				block_t start, block_t len)
1365{
1366	struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
1367	struct rb_node **p = &dcc->root.rb_root.rb_node;
1368	struct rb_node *parent = NULL;
1369	struct discard_cmd *dc;
1370	bool leftmost = true;
1371
1372	/* look up rb tree to find parent node */
1373	while (*p) {
1374		parent = *p;
1375		dc = rb_entry(parent, struct discard_cmd, rb_node);
1376
1377		if (lstart < dc->di.lstart) {
1378			p = &(*p)->rb_left;
1379		} else if (lstart >= dc->di.lstart + dc->di.len) {
1380			p = &(*p)->rb_right;
1381			leftmost = false;
1382		} else {
1383			f2fs_bug_on(sbi, 1);
1384		}
1385	}
1386
1387	dc = __create_discard_cmd(sbi, bdev, lstart, start, len);
1388
1389	rb_link_node(&dc->rb_node, parent, p);
1390	rb_insert_color_cached(&dc->rb_node, &dcc->root, leftmost);
1391}
1392
1393static void __relocate_discard_cmd(struct discard_cmd_control *dcc,
1394						struct discard_cmd *dc)
1395{
1396	list_move_tail(&dc->list, &dcc->pend_list[plist_idx(dc->di.len)]);
1397}
1398
1399static void __punch_discard_cmd(struct f2fs_sb_info *sbi,
1400				struct discard_cmd *dc, block_t blkaddr)
1401{
1402	struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
1403	struct discard_info di = dc->di;
1404	bool modified = false;
1405
1406	if (dc->state == D_DONE || dc->di.len == 1) {
1407		__remove_discard_cmd(sbi, dc);
1408		return;
1409	}
1410
1411	dcc->undiscard_blks -= di.len;
1412
1413	if (blkaddr > di.lstart) {
1414		dc->di.len = blkaddr - dc->di.lstart;
1415		dcc->undiscard_blks += dc->di.len;
1416		__relocate_discard_cmd(dcc, dc);
1417		modified = true;
1418	}
1419
1420	if (blkaddr < di.lstart + di.len - 1) {
1421		if (modified) {
1422			__insert_discard_cmd(sbi, dc->bdev, blkaddr + 1,
1423					di.start + blkaddr + 1 - di.lstart,
1424					di.lstart + di.len - 1 - blkaddr);
1425		} else {
1426			dc->di.lstart++;
1427			dc->di.len--;
1428			dc->di.start++;
1429			dcc->undiscard_blks += dc->di.len;
1430			__relocate_discard_cmd(dcc, dc);
1431		}
1432	}
1433}
1434
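/*
 * Add the range [lstart, lstart + len) to the discard tree, merging it
 * with neighbouring commands where possible: a still-pending (D_PREP)
 * command on the same bdev can absorb the new range at its back or front
 * as long as the merged size stays within the device discard limit.
 * Whatever cannot be merged is inserted as a new command.
 */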
1435static void __update_discard_tree_range(struct f2fs_sb_info *sbi,
1436				struct block_device *bdev, block_t lstart,
1437				block_t start, block_t len)
1438{
1439	struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
1440	struct discard_cmd *prev_dc = NULL, *next_dc = NULL;
1441	struct discard_cmd *dc;
1442	struct discard_info di = {0};
1443	struct rb_node **insert_p = NULL, *insert_parent = NULL;
1444	unsigned int max_discard_blocks =
1445			SECTOR_TO_BLOCK(bdev_max_discard_sectors(bdev));
1446	block_t end = lstart + len;
1447
1448	dc = __lookup_discard_cmd_ret(&dcc->root, lstart,
1449				&prev_dc, &next_dc, &insert_p, &insert_parent);
1450	if (dc)
1451		prev_dc = dc;
1452
1453	if (!prev_dc) {
1454		di.lstart = lstart;
1455		di.len = next_dc ? next_dc->di.lstart - lstart : len;
1456		di.len = min(di.len, len);
1457		di.start = start;
1458	}
1459
1460	while (1) {
1461		struct rb_node *node;
1462		bool merged = false;
1463		struct discard_cmd *tdc = NULL;
1464
1465		if (prev_dc) {
1466			di.lstart = prev_dc->di.lstart + prev_dc->di.len;
1467			if (di.lstart < lstart)
1468				di.lstart = lstart;
1469			if (di.lstart >= end)
1470				break;
1471
1472			if (!next_dc || next_dc->di.lstart > end)
1473				di.len = end - di.lstart;
1474			else
1475				di.len = next_dc->di.lstart - di.lstart;
1476			di.start = start + di.lstart - lstart;
1477		}
1478
1479		if (!di.len)
1480			goto next;
1481
1482		if (prev_dc && prev_dc->state == D_PREP &&
1483			prev_dc->bdev == bdev &&
1484			__is_discard_back_mergeable(&di, &prev_dc->di,
1485							max_discard_blocks)) {
1486			prev_dc->di.len += di.len;
1487			dcc->undiscard_blks += di.len;
1488			__relocate_discard_cmd(dcc, prev_dc);
1489			di = prev_dc->di;
1490			tdc = prev_dc;
1491			merged = true;
1492		}
1493
1494		if (next_dc && next_dc->state == D_PREP &&
1495			next_dc->bdev == bdev &&
1496			__is_discard_front_mergeable(&di, &next_dc->di,
1497							max_discard_blocks)) {
1498			next_dc->di.lstart = di.lstart;
1499			next_dc->di.len += di.len;
1500			next_dc->di.start = di.start;
1501			dcc->undiscard_blks += di.len;
1502			__relocate_discard_cmd(dcc, next_dc);
1503			if (tdc)
1504				__remove_discard_cmd(sbi, tdc);
1505			merged = true;
1506		}
1507
1508		if (!merged)
1509			__insert_discard_cmd(sbi, bdev,
1510						di.lstart, di.start, di.len);
1511next:
1512		prev_dc = next_dc;
1513		if (!prev_dc)
1514			break;
1515
1516		node = rb_next(&prev_dc->rb_node);
1517		next_dc = rb_entry_safe(node, struct discard_cmd, rb_node);
1518	}
1519}
1520
1521#ifdef CONFIG_BLK_DEV_ZONED
1522static void __queue_zone_reset_cmd(struct f2fs_sb_info *sbi,
1523		struct block_device *bdev, block_t blkstart, block_t lblkstart,
1524		block_t blklen)
1525{
1526	trace_f2fs_queue_reset_zone(bdev, blkstart);
1527
1528	mutex_lock(&SM_I(sbi)->dcc_info->cmd_lock);
1529	__insert_discard_cmd(sbi, bdev, lblkstart, blkstart, blklen);
1530	mutex_unlock(&SM_I(sbi)->dcc_info->cmd_lock);
1531}
1532#endif
1533
1534static void __queue_discard_cmd(struct f2fs_sb_info *sbi,
1535		struct block_device *bdev, block_t blkstart, block_t blklen)
1536{
1537	block_t lblkstart = blkstart;
1538
1539	if (!f2fs_bdev_support_discard(bdev))
1540		return;
1541
1542	trace_f2fs_queue_discard(bdev, blkstart, blklen);
1543
1544	if (f2fs_is_multi_device(sbi)) {
1545		int devi = f2fs_target_device_index(sbi, blkstart);
1546
1547		blkstart -= FDEV(devi).start_blk;
1548	}
1549	mutex_lock(&SM_I(sbi)->dcc_info->cmd_lock);
1550	__update_discard_tree_range(sbi, bdev, lblkstart, blkstart, blklen);
1551	mutex_unlock(&SM_I(sbi)->dcc_info->cmd_lock);
1552}
1553
1554static void __issue_discard_cmd_orderly(struct f2fs_sb_info *sbi,
1555		struct discard_policy *dpolicy, int *issued)
1556{
1557	struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
1558	struct discard_cmd *prev_dc = NULL, *next_dc = NULL;
1559	struct rb_node **insert_p = NULL, *insert_parent = NULL;
1560	struct discard_cmd *dc;
1561	struct blk_plug plug;
1562	bool io_interrupted = false;
1563
1564	mutex_lock(&dcc->cmd_lock);
1565	dc = __lookup_discard_cmd_ret(&dcc->root, dcc->next_pos,
1566				&prev_dc, &next_dc, &insert_p, &insert_parent);
1567	if (!dc)
1568		dc = next_dc;
1569
1570	blk_start_plug(&plug);
1571
1572	while (dc) {
1573		struct rb_node *node;
1574		int err = 0;
1575
1576		if (dc->state != D_PREP)
1577			goto next;
1578
1579		if (dpolicy->io_aware && !is_idle(sbi, DISCARD_TIME)) {
1580			io_interrupted = true;
1581			break;
1582		}
1583
1584		dcc->next_pos = dc->di.lstart + dc->di.len;
1585		err = __submit_discard_cmd(sbi, dpolicy, dc, issued);
1586
1587		if (*issued >= dpolicy->max_requests)
1588			break;
1589next:
1590		node = rb_next(&dc->rb_node);
1591		if (err)
1592			__remove_discard_cmd(sbi, dc);
1593		dc = rb_entry_safe(node, struct discard_cmd, rb_node);
1594	}
1595
1596	blk_finish_plug(&plug);
1597
1598	if (!dc)
1599		dcc->next_pos = 0;
1600
1601	mutex_unlock(&dcc->cmd_lock);
1602
1603	if (!(*issued) && io_interrupted)
1604		*issued = -1;
1605}
1606static unsigned int __wait_all_discard_cmd(struct f2fs_sb_info *sbi,
1607					struct discard_policy *dpolicy);
1608
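/*
 * Issue pending discard commands according to @dpolicy, scanning the
 * pend_list buckets from the largest extents down to the policy
 * granularity.  Returns the number of requests issued, or -1 when nothing
 * was issued because the device was busy (io_aware).  For DPOLICY_UMOUNT
 * it keeps retrying until everything is issued or the umount discard
 * timeout expires.
 */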
1609static int __issue_discard_cmd(struct f2fs_sb_info *sbi,
1610					struct discard_policy *dpolicy)
1611{
1612	struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
1613	struct list_head *pend_list;
1614	struct discard_cmd *dc, *tmp;
1615	struct blk_plug plug;
1616	int i, issued;
1617	bool io_interrupted = false;
1618
1619	if (dpolicy->timeout)
1620		f2fs_update_time(sbi, UMOUNT_DISCARD_TIMEOUT);
1621
1622retry:
1623	issued = 0;
1624	for (i = MAX_PLIST_NUM - 1; i >= 0; i--) {
1625		if (dpolicy->timeout &&
1626				f2fs_time_over(sbi, UMOUNT_DISCARD_TIMEOUT))
1627			break;
1628
1629		if (i + 1 < dpolicy->granularity)
1630			break;
1631
1632		if (i + 1 < dcc->max_ordered_discard && dpolicy->ordered) {
1633			__issue_discard_cmd_orderly(sbi, dpolicy, &issued);
1634			return issued;
1635		}
1636
1637		pend_list = &dcc->pend_list[i];
1638
1639		mutex_lock(&dcc->cmd_lock);
1640		if (list_empty(pend_list))
1641			goto next;
1642		if (unlikely(dcc->rbtree_check))
1643			f2fs_bug_on(sbi, !f2fs_check_discard_tree(sbi));
1644		blk_start_plug(&plug);
1645		list_for_each_entry_safe(dc, tmp, pend_list, list) {
1646			f2fs_bug_on(sbi, dc->state != D_PREP);
1647
1648			if (dpolicy->timeout &&
1649				f2fs_time_over(sbi, UMOUNT_DISCARD_TIMEOUT))
1650				break;
1651
1652			if (dpolicy->io_aware && i < dpolicy->io_aware_gran &&
1653						!is_idle(sbi, DISCARD_TIME)) {
1654				io_interrupted = true;
1655				break;
1656			}
1657
1658			__submit_discard_cmd(sbi, dpolicy, dc, &issued);
1659
1660			if (issued >= dpolicy->max_requests)
1661				break;
1662		}
1663		blk_finish_plug(&plug);
1664next:
1665		mutex_unlock(&dcc->cmd_lock);
1666
1667		if (issued >= dpolicy->max_requests || io_interrupted)
1668			break;
1669	}
1670
1671	if (dpolicy->type == DPOLICY_UMOUNT && issued) {
1672		__wait_all_discard_cmd(sbi, dpolicy);
1673		goto retry;
1674	}
1675
1676	if (!issued && io_interrupted)
1677		issued = -1;
1678
1679	return issued;
1680}
1681
1682static bool __drop_discard_cmd(struct f2fs_sb_info *sbi)
1683{
1684	struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
1685	struct list_head *pend_list;
1686	struct discard_cmd *dc, *tmp;
1687	int i;
1688	bool dropped = false;
1689
1690	mutex_lock(&dcc->cmd_lock);
1691	for (i = MAX_PLIST_NUM - 1; i >= 0; i--) {
1692		pend_list = &dcc->pend_list[i];
1693		list_for_each_entry_safe(dc, tmp, pend_list, list) {
1694			f2fs_bug_on(sbi, dc->state != D_PREP);
1695			__remove_discard_cmd(sbi, dc);
1696			dropped = true;
1697		}
1698	}
1699	mutex_unlock(&dcc->cmd_lock);
1700
1701	return dropped;
1702}
1703
1704void f2fs_drop_discard_cmd(struct f2fs_sb_info *sbi)
1705{
1706	__drop_discard_cmd(sbi);
1707}
1708
1709static unsigned int __wait_one_discard_bio(struct f2fs_sb_info *sbi,
1710							struct discard_cmd *dc)
1711{
1712	struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
1713	unsigned int len = 0;
1714
1715	wait_for_completion_io(&dc->wait);
1716	mutex_lock(&dcc->cmd_lock);
1717	f2fs_bug_on(sbi, dc->state != D_DONE);
1718	dc->ref--;
1719	if (!dc->ref) {
1720		if (!dc->error)
1721			len = dc->di.len;
1722		__remove_discard_cmd(sbi, dc);
1723	}
1724	mutex_unlock(&dcc->cmd_lock);
1725
1726	return len;
1727}
1728
1729static unsigned int __wait_discard_cmd_range(struct f2fs_sb_info *sbi,
1730						struct discard_policy *dpolicy,
1731						block_t start, block_t end)
1732{
1733	struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
1734	struct list_head *wait_list = (dpolicy->type == DPOLICY_FSTRIM) ?
1735					&(dcc->fstrim_list) : &(dcc->wait_list);
1736	struct discard_cmd *dc = NULL, *iter, *tmp;
1737	unsigned int trimmed = 0;
1738
1739next:
1740	dc = NULL;
1741
1742	mutex_lock(&dcc->cmd_lock);
1743	list_for_each_entry_safe(iter, tmp, wait_list, list) {
1744		if (iter->di.lstart + iter->di.len <= start ||
1745					end <= iter->di.lstart)
1746			continue;
1747		if (iter->di.len < dpolicy->granularity)
1748			continue;
1749		if (iter->state == D_DONE && !iter->ref) {
1750			wait_for_completion_io(&iter->wait);
1751			if (!iter->error)
1752				trimmed += iter->di.len;
1753			__remove_discard_cmd(sbi, iter);
1754		} else {
1755			iter->ref++;
1756			dc = iter;
1757			break;
1758		}
1759	}
1760	mutex_unlock(&dcc->cmd_lock);
1761
1762	if (dc) {
1763		trimmed += __wait_one_discard_bio(sbi, dc);
1764		goto next;
1765	}
1766
1767	return trimmed;
1768}
1769
1770static unsigned int __wait_all_discard_cmd(struct f2fs_sb_info *sbi,
1771						struct discard_policy *dpolicy)
1772{
1773	struct discard_policy dp;
1774	unsigned int discard_blks;
1775
1776	if (dpolicy)
1777		return __wait_discard_cmd_range(sbi, dpolicy, 0, UINT_MAX);
1778
1779	/* wait all */
1780	__init_discard_policy(sbi, &dp, DPOLICY_FSTRIM, MIN_DISCARD_GRANULARITY);
1781	discard_blks = __wait_discard_cmd_range(sbi, &dp, 0, UINT_MAX);
1782	__init_discard_policy(sbi, &dp, DPOLICY_UMOUNT, MIN_DISCARD_GRANULARITY);
1783	discard_blks += __wait_discard_cmd_range(sbi, &dp, 0, UINT_MAX);
1784
1785	return discard_blks;
1786}
1787
1788/* This should be called while holding the global mutex, &sit_i->sentry_lock */
1789static void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr)
1790{
1791	struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
1792	struct discard_cmd *dc;
1793	bool need_wait = false;
1794
1795	mutex_lock(&dcc->cmd_lock);
1796	dc = __lookup_discard_cmd(sbi, blkaddr);
1797#ifdef CONFIG_BLK_DEV_ZONED
1798	if (dc && f2fs_sb_has_blkzoned(sbi) && bdev_is_zoned(dc->bdev)) {
1799		int devi = f2fs_bdev_index(sbi, dc->bdev);
1800
1801		if (devi < 0) {
1802			mutex_unlock(&dcc->cmd_lock);
1803			return;
1804		}
1805
1806		if (f2fs_blkz_is_seq(sbi, devi, dc->di.start)) {
1807			/* force submit zone reset */
1808			if (dc->state == D_PREP)
1809				__submit_zone_reset_cmd(sbi, dc, REQ_SYNC,
1810							&dcc->wait_list, NULL);
1811			dc->ref++;
1812			mutex_unlock(&dcc->cmd_lock);
1813			/* wait zone reset */
1814			__wait_one_discard_bio(sbi, dc);
1815			return;
1816		}
1817	}
1818#endif
1819	if (dc) {
1820		if (dc->state == D_PREP) {
1821			__punch_discard_cmd(sbi, dc, blkaddr);
1822		} else {
1823			dc->ref++;
1824			need_wait = true;
1825		}
1826	}
1827	mutex_unlock(&dcc->cmd_lock);
1828
1829	if (need_wait)
1830		__wait_one_discard_bio(sbi, dc);
1831}
1832
1833void f2fs_stop_discard_thread(struct f2fs_sb_info *sbi)
1834{
1835	struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
1836
1837	if (dcc && dcc->f2fs_issue_discard) {
1838		struct task_struct *discard_thread = dcc->f2fs_issue_discard;
1839
1840		dcc->f2fs_issue_discard = NULL;
1841		kthread_stop(discard_thread);
1842	}
1843}
1844
1845/**
1846 * f2fs_issue_discard_timeout() - Issue all discard cmd within UMOUNT_DISCARD_TIMEOUT
1847 * @sbi: the f2fs_sb_info data for discard cmd to issue
1848 *
1849 * When UMOUNT_DISCARD_TIMEOUT is exceeded, all remaining discard commands will be dropped.
1850 *
1851 * Return: true if all discard cmds were issued or none needed to be issued, false otherwise.
1852 */
1853bool f2fs_issue_discard_timeout(struct f2fs_sb_info *sbi)
1854{
1855	struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
1856	struct discard_policy dpolicy;
1857	bool dropped;
1858
1859	if (!atomic_read(&dcc->discard_cmd_cnt))
1860		return true;
1861
1862	__init_discard_policy(sbi, &dpolicy, DPOLICY_UMOUNT,
1863					dcc->discard_granularity);
1864	__issue_discard_cmd(sbi, &dpolicy);
1865	dropped = __drop_discard_cmd(sbi);
1866
1867	/* just to make sure there is no pending discard commands */
1868	__wait_all_discard_cmd(sbi, NULL);
1869
1870	f2fs_bug_on(sbi, atomic_read(&dcc->discard_cmd_cnt));
1871	return !dropped;
1872}
1873
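/*
 * Background discard worker.  Each round picks an aggressive policy
 * (DPOLICY_FORCE) when GC is urgent or the discard cache is under memory
 * pressure and the normal background policy otherwise, then adjusts its
 * sleep time based on the outcome: min_interval after real work,
 * mid_interval (or the remaining idle time) when interrupted by IO, and
 * max_interval when there was nothing to issue.
 */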
1874static int issue_discard_thread(void *data)
1875{
1876	struct f2fs_sb_info *sbi = data;
1877	struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
1878	wait_queue_head_t *q = &dcc->discard_wait_queue;
1879	struct discard_policy dpolicy;
1880	unsigned int wait_ms = dcc->min_discard_issue_time;
1881	int issued;
1882
1883	set_freezable();
1884
1885	do {
1886		wait_event_interruptible_timeout(*q,
1887				kthread_should_stop() || freezing(current) ||
1888				dcc->discard_wake,
1889				msecs_to_jiffies(wait_ms));
1890
1891		if (sbi->gc_mode == GC_URGENT_HIGH ||
1892			!f2fs_available_free_memory(sbi, DISCARD_CACHE))
1893			__init_discard_policy(sbi, &dpolicy, DPOLICY_FORCE,
1894						MIN_DISCARD_GRANULARITY);
1895		else
1896			__init_discard_policy(sbi, &dpolicy, DPOLICY_BG,
1897						dcc->discard_granularity);
1898
1899		if (dcc->discard_wake)
1900			dcc->discard_wake = false;
1901
1902		/* clean up pending candidates before going to sleep */
1903		if (atomic_read(&dcc->queued_discard))
1904			__wait_all_discard_cmd(sbi, NULL);
1905
1906		if (try_to_freeze())
1907			continue;
1908		if (f2fs_readonly(sbi->sb))
1909			continue;
1910		if (kthread_should_stop())
1911			return 0;
1912		if (is_sbi_flag_set(sbi, SBI_NEED_FSCK) ||
1913			!atomic_read(&dcc->discard_cmd_cnt)) {
1914			wait_ms = dpolicy.max_interval;
1915			continue;
1916		}
1917
1918		sb_start_intwrite(sbi->sb);
1919
1920		issued = __issue_discard_cmd(sbi, &dpolicy);
1921		if (issued > 0) {
1922			__wait_all_discard_cmd(sbi, &dpolicy);
1923			wait_ms = dpolicy.min_interval;
1924		} else if (issued == -1) {
1925			wait_ms = f2fs_time_to_wait(sbi, DISCARD_TIME);
1926			if (!wait_ms)
1927				wait_ms = dpolicy.mid_interval;
1928		} else {
1929			wait_ms = dpolicy.max_interval;
1930		}
1931		if (!atomic_read(&dcc->discard_cmd_cnt))
1932			wait_ms = dpolicy.max_interval;
1933
1934		sb_end_intwrite(sbi->sb);
1935
1936	} while (!kthread_should_stop());
1937	return 0;
1938}
1939
1940#ifdef CONFIG_BLK_DEV_ZONED
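/*
 * On zoned devices a range inside a sequential zone cannot simply be
 * discarded: the whole zone is reset instead, and the request must cover
 * exactly one zone.  During recovery (SBI_POR_DOING) the reset is issued
 * synchronously; otherwise it is queued like a discard command.
 * Conventional zones fall back to the regular discard path.
 */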
1941static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi,
1942		struct block_device *bdev, block_t blkstart, block_t blklen)
1943{
1944	sector_t sector, nr_sects;
1945	block_t lblkstart = blkstart;
1946	int devi = 0;
1947	u64 remainder = 0;
1948
1949	if (f2fs_is_multi_device(sbi)) {
1950		devi = f2fs_target_device_index(sbi, blkstart);
1951		if (blkstart < FDEV(devi).start_blk ||
1952		    blkstart > FDEV(devi).end_blk) {
1953			f2fs_err(sbi, "Invalid block %x", blkstart);
1954			return -EIO;
1955		}
1956		blkstart -= FDEV(devi).start_blk;
1957	}
1958
1959	/* For sequential zones, reset the zone write pointer */
1960	if (f2fs_blkz_is_seq(sbi, devi, blkstart)) {
1961		sector = SECTOR_FROM_BLOCK(blkstart);
1962		nr_sects = SECTOR_FROM_BLOCK(blklen);
1963		div64_u64_rem(sector, bdev_zone_sectors(bdev), &remainder);
1964
1965		if (remainder || nr_sects != bdev_zone_sectors(bdev)) {
1966			f2fs_err(sbi, "(%d) %s: Unaligned zone reset attempted (block %x + %x)",
1967				 devi, sbi->s_ndevs ? FDEV(devi).path : "",
1968				 blkstart, blklen);
1969			return -EIO;
1970		}
1971
1972		if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) {
1973			trace_f2fs_issue_reset_zone(bdev, blkstart);
1974			return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
1975						sector, nr_sects, GFP_NOFS);
1976		}
1977
1978		__queue_zone_reset_cmd(sbi, bdev, blkstart, lblkstart, blklen);
1979		return 0;
1980	}
1981
1982	/* For conventional zones, use regular discard if supported */
1983	__queue_discard_cmd(sbi, bdev, lblkstart, blklen);
1984	return 0;
1985}
1986#endif
1987
1988static int __issue_discard_async(struct f2fs_sb_info *sbi,
1989		struct block_device *bdev, block_t blkstart, block_t blklen)
1990{
1991#ifdef CONFIG_BLK_DEV_ZONED
1992	if (f2fs_sb_has_blkzoned(sbi) && bdev_is_zoned(bdev))
1993		return __f2fs_issue_discard_zone(sbi, bdev, blkstart, blklen);
1994#endif
1995	__queue_discard_cmd(sbi, bdev, blkstart, blklen);
1996	return 0;
1997}
1998
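/*
 * Issue a discard for [blkstart, blkstart + blklen).  On multi-device
 * setups the range is split at device boundaries, and for block-unit
 * discard the per-segment discard bitmap is updated as blocks are
 * covered.
 */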
1999static int f2fs_issue_discard(struct f2fs_sb_info *sbi,
2000				block_t blkstart, block_t blklen)
2001{
2002	sector_t start = blkstart, len = 0;
2003	struct block_device *bdev;
2004	struct seg_entry *se;
2005	unsigned int offset;
2006	block_t i;
2007	int err = 0;
2008
2009	bdev = f2fs_target_device(sbi, blkstart, NULL);
2010
2011	for (i = blkstart; i < blkstart + blklen; i++, len++) {
2012		if (i != start) {
2013			struct block_device *bdev2 =
2014				f2fs_target_device(sbi, i, NULL);
2015
2016			if (bdev2 != bdev) {
2017				err = __issue_discard_async(sbi, bdev,
2018						start, len);
2019				if (err)
2020					return err;
2021				bdev = bdev2;
2022				start = i;
2023				len = 0;
2024			}
2025		}
2026
2027		se = get_seg_entry(sbi, GET_SEGNO(sbi, i));
2028		offset = GET_BLKOFF_FROM_SEG0(sbi, i);
2029
2030		if (f2fs_block_unit_discard(sbi) &&
2031				!f2fs_test_and_set_bit(offset, se->discard_map))
2032			sbi->discard_blks--;
2033	}
2034
2035	if (len)
2036		err = __issue_discard_async(sbi, bdev, start, len);
2037	return err;
2038}
2039
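/*
 * Collect discard candidates for the segment selected by cpc->trim_start.
 * A temporary bitmap of discardable blocks is built from the current,
 * checkpointed and discard bitmaps; when check_only is true the function
 * just reports whether any candidate exists, otherwise the ranges are
 * recorded as discard entries and accounted in dcc->nr_discards.
 */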
2040static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc,
2041							bool check_only)
2042{
2043	int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long);
2044	int max_blocks = sbi->blocks_per_seg;
2045	struct seg_entry *se = get_seg_entry(sbi, cpc->trim_start);
2046	unsigned long *cur_map = (unsigned long *)se->cur_valid_map;
2047	unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map;
2048	unsigned long *discard_map = (unsigned long *)se->discard_map;
2049	unsigned long *dmap = SIT_I(sbi)->tmp_map;
2050	unsigned int start = 0, end = -1;
2051	bool force = (cpc->reason & CP_DISCARD);
2052	struct discard_entry *de = NULL;
2053	struct list_head *head = &SM_I(sbi)->dcc_info->entry_list;
2054	int i;
2055
2056	if (se->valid_blocks == max_blocks || !f2fs_hw_support_discard(sbi) ||
2057			!f2fs_block_unit_discard(sbi))
2058		return false;
2059
2060	if (!force) {
2061		if (!f2fs_realtime_discard_enable(sbi) || !se->valid_blocks ||
2062			SM_I(sbi)->dcc_info->nr_discards >=
2063				SM_I(sbi)->dcc_info->max_discards)
2064			return false;
2065	}
2066
2067	/* SIT_VBLOCK_MAP_SIZE should be a multiple of sizeof(unsigned long) */
2068	for (i = 0; i < entries; i++)
2069		dmap[i] = force ? ~ckpt_map[i] & ~discard_map[i] :
2070				(cur_map[i] ^ ckpt_map[i]) & ckpt_map[i];
2071
2072	while (force || SM_I(sbi)->dcc_info->nr_discards <=
2073				SM_I(sbi)->dcc_info->max_discards) {
2074		start = __find_rev_next_bit(dmap, max_blocks, end + 1);
2075		if (start >= max_blocks)
2076			break;
2077
2078		end = __find_rev_next_zero_bit(dmap, max_blocks, start + 1);
2079		if (force && start && end != max_blocks
2080					&& (end - start) < cpc->trim_minlen)
2081			continue;
2082
2083		if (check_only)
2084			return true;
2085
2086		if (!de) {
2087			de = f2fs_kmem_cache_alloc(discard_entry_slab,
2088						GFP_F2FS_ZERO, true, NULL);
2089			de->start_blkaddr = START_BLOCK(sbi, cpc->trim_start);
2090			list_add_tail(&de->list, head);
2091		}
2092
2093		for (i = start; i < end; i++)
2094			__set_bit_le(i, (void *)de->discard_map);
2095
2096		SM_I(sbi)->dcc_info->nr_discards += end - start;
2097	}
2098	return false;
2099}
2100
2101static void release_discard_addr(struct discard_entry *entry)
2102{
2103	list_del(&entry->list);
2104	kmem_cache_free(discard_entry_slab, entry);
2105}
2106
2107void f2fs_release_discard_addrs(struct f2fs_sb_info *sbi)
2108{
2109	struct list_head *head = &(SM_I(sbi)->dcc_info->entry_list);
2110	struct discard_entry *entry, *this;
2111
2112	/* drop caches */
2113	list_for_each_entry_safe(entry, this, head, list)
2114		release_discard_addr(entry);
2115}
2116
2117/*
2118 * Should call f2fs_clear_prefree_segments after the checkpoint is done.
2119 */
2120static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi)
2121{
2122	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
2123	unsigned int segno;
2124
2125	mutex_lock(&dirty_i->seglist_lock);
2126	for_each_set_bit(segno, dirty_i->dirty_segmap[PRE], MAIN_SEGS(sbi))
2127		__set_test_and_free(sbi, segno, false);
2128	mutex_unlock(&dirty_i->seglist_lock);
2129}
2130
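/*
 * After a checkpoint, clear the prefree state of segments and issue the
 * corresponding discards: whole segments (or whole sections for zoned
 * devices and large-section LFS), followed by the small block-granularity
 * discards collected by add_discard_addrs().
 */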
2131void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi,
2132						struct cp_control *cpc)
2133{
2134	struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
2135	struct list_head *head = &dcc->entry_list;
2136	struct discard_entry *entry, *this;
2137	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
2138	unsigned long *prefree_map = dirty_i->dirty_segmap[PRE];
2139	unsigned int start = 0, end = -1;
2140	unsigned int secno, start_segno;
2141	bool force = (cpc->reason & CP_DISCARD);
2142	bool section_alignment = F2FS_OPTION(sbi).discard_unit ==
2143						DISCARD_UNIT_SECTION;
2144
2145	if (f2fs_lfs_mode(sbi) && __is_large_section(sbi))
2146		section_alignment = true;
2147
2148	mutex_lock(&dirty_i->seglist_lock);
2149
2150	while (1) {
2151		int i;
2152
2153		if (section_alignment && end != -1)
2154			end--;
2155		start = find_next_bit(prefree_map, MAIN_SEGS(sbi), end + 1);
2156		if (start >= MAIN_SEGS(sbi))
2157			break;
2158		end = find_next_zero_bit(prefree_map, MAIN_SEGS(sbi),
2159								start + 1);
2160
2161		if (section_alignment) {
2162			start = rounddown(start, sbi->segs_per_sec);
2163			end = roundup(end, sbi->segs_per_sec);
2164		}
2165
2166		for (i = start; i < end; i++) {
2167			if (test_and_clear_bit(i, prefree_map))
2168				dirty_i->nr_dirty[PRE]--;
2169		}
2170
2171		if (!f2fs_realtime_discard_enable(sbi))
2172			continue;
2173
2174		if (force && start >= cpc->trim_start &&
2175					(end - 1) <= cpc->trim_end)
2176			continue;
2177
2178		/* Should cover 2MB zoned device for zone-based reset */
2179		if (!f2fs_sb_has_blkzoned(sbi) &&
2180		    (!f2fs_lfs_mode(sbi) || !__is_large_section(sbi))) {
2181			f2fs_issue_discard(sbi, START_BLOCK(sbi, start),
2182				(end - start) << sbi->log_blocks_per_seg);
2183			continue;
2184		}
2185next:
2186		secno = GET_SEC_FROM_SEG(sbi, start);
2187		start_segno = GET_SEG_FROM_SEC(sbi, secno);
2188		if (!IS_CURSEC(sbi, secno) &&
2189			!get_valid_blocks(sbi, start, true))
2190			f2fs_issue_discard(sbi, START_BLOCK(sbi, start_segno),
2191				sbi->segs_per_sec << sbi->log_blocks_per_seg);
2192
2193		start = start_segno + sbi->segs_per_sec;
2194		if (start < end)
2195			goto next;
2196		else
2197			end = start - 1;
2198	}
2199	mutex_unlock(&dirty_i->seglist_lock);
2200
2201	if (!f2fs_block_unit_discard(sbi))
2202		goto wakeup;
2203
2204	/* send small discards */
2205	list_for_each_entry_safe(entry, this, head, list) {
2206		unsigned int cur_pos = 0, next_pos, len, total_len = 0;
2207		bool is_valid = test_bit_le(0, entry->discard_map);
2208
2209find_next:
2210		if (is_valid) {
2211			next_pos = find_next_zero_bit_le(entry->discard_map,
2212					sbi->blocks_per_seg, cur_pos);
2213			len = next_pos - cur_pos;
2214
2215			if (f2fs_sb_has_blkzoned(sbi) ||
2216			    (force && len < cpc->trim_minlen))
2217				goto skip;
2218
2219			f2fs_issue_discard(sbi, entry->start_blkaddr + cur_pos,
2220									len);
2221			total_len += len;
2222		} else {
2223			next_pos = find_next_bit_le(entry->discard_map,
2224					sbi->blocks_per_seg, cur_pos);
2225		}
2226skip:
2227		cur_pos = next_pos;
2228		is_valid = !is_valid;
2229
2230		if (cur_pos < sbi->blocks_per_seg)
2231			goto find_next;
2232
2233		release_discard_addr(entry);
2234		dcc->nr_discards -= total_len;
2235	}
2236
2237wakeup:
2238	wake_up_discard_thread(sbi, false);
2239}
2240
2241int f2fs_start_discard_thread(struct f2fs_sb_info *sbi)
2242{
2243	dev_t dev = sbi->sb->s_bdev->bd_dev;
2244	struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
2245	int err = 0;
2246
2247	if (!f2fs_realtime_discard_enable(sbi))
2248		return 0;
2249
2250	dcc->f2fs_issue_discard = kthread_run(issue_discard_thread, sbi,
2251				"f2fs_discard-%u:%u", MAJOR(dev), MINOR(dev));
2252	if (IS_ERR(dcc->f2fs_issue_discard)) {
2253		err = PTR_ERR(dcc->f2fs_issue_discard);
2254		dcc->f2fs_issue_discard = NULL;
2255	}
2256
2257	return err;
2258}
2259
2260static int create_discard_cmd_control(struct f2fs_sb_info *sbi)
2261{
2262	struct discard_cmd_control *dcc;
2263	int err = 0, i;
2264
2265	if (SM_I(sbi)->dcc_info) {
2266		dcc = SM_I(sbi)->dcc_info;
2267		goto init_thread;
2268	}
2269
2270	dcc = f2fs_kzalloc(sbi, sizeof(struct discard_cmd_control), GFP_KERNEL);
2271	if (!dcc)
2272		return -ENOMEM;
2273
2274	dcc->discard_io_aware_gran = MAX_PLIST_NUM;
2275	dcc->discard_granularity = DEFAULT_DISCARD_GRANULARITY;
2276	dcc->max_ordered_discard = DEFAULT_MAX_ORDERED_DISCARD_GRANULARITY;
2277	if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SEGMENT)
2278		dcc->discard_granularity = sbi->blocks_per_seg;
2279	else if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SECTION)
2280		dcc->discard_granularity = BLKS_PER_SEC(sbi);
2281
2282	INIT_LIST_HEAD(&dcc->entry_list);
2283	for (i = 0; i < MAX_PLIST_NUM; i++)
2284		INIT_LIST_HEAD(&dcc->pend_list[i]);
2285	INIT_LIST_HEAD(&dcc->wait_list);
2286	INIT_LIST_HEAD(&dcc->fstrim_list);
2287	mutex_init(&dcc->cmd_lock);
2288	atomic_set(&dcc->issued_discard, 0);
2289	atomic_set(&dcc->queued_discard, 0);
2290	atomic_set(&dcc->discard_cmd_cnt, 0);
2291	dcc->nr_discards = 0;
2292	dcc->max_discards = MAIN_SEGS(sbi) << sbi->log_blocks_per_seg;
2293	dcc->max_discard_request = DEF_MAX_DISCARD_REQUEST;
2294	dcc->min_discard_issue_time = DEF_MIN_DISCARD_ISSUE_TIME;
2295	dcc->mid_discard_issue_time = DEF_MID_DISCARD_ISSUE_TIME;
2296	dcc->max_discard_issue_time = DEF_MAX_DISCARD_ISSUE_TIME;
2297	dcc->discard_urgent_util = DEF_DISCARD_URGENT_UTIL;
2298	dcc->undiscard_blks = 0;
2299	dcc->next_pos = 0;
2300	dcc->root = RB_ROOT_CACHED;
2301	dcc->rbtree_check = false;
2302
2303	init_waitqueue_head(&dcc->discard_wait_queue);
2304	SM_I(sbi)->dcc_info = dcc;
2305init_thread:
2306	err = f2fs_start_discard_thread(sbi);
2307	if (err) {
2308		kfree(dcc);
2309		SM_I(sbi)->dcc_info = NULL;
2310	}
2311
2312	return err;
2313}
2314
2315static void destroy_discard_cmd_control(struct f2fs_sb_info *sbi)
2316{
2317	struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
2318
2319	if (!dcc)
2320		return;
2321
2322	f2fs_stop_discard_thread(sbi);
2323
2324	/*
2325	 * Recovery can cache discard commands, so the error path of
2326	 * fill_super() needs to give them a chance to be handled.
2327	 */
2328	f2fs_issue_discard_timeout(sbi);
2329
2330	kfree(dcc);
2331	SM_I(sbi)->dcc_info = NULL;
2332}
2333
2334static bool __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno)
2335{
2336	struct sit_info *sit_i = SIT_I(sbi);
2337
2338	if (!__test_and_set_bit(segno, sit_i->dirty_sentries_bitmap)) {
2339		sit_i->dirty_sentries++;
2340		return false;
2341	}
2342
2343	return true;
2344}
2345
2346static void __set_sit_entry_type(struct f2fs_sb_info *sbi, int type,
2347					unsigned int segno, int modified)
2348{
2349	struct seg_entry *se = get_seg_entry(sbi, segno);
2350
2351	se->type = type;
2352	if (modified)
2353		__mark_sit_entry_dirty(sbi, segno);
2354}
2355
2356static inline unsigned long long get_segment_mtime(struct f2fs_sb_info *sbi,
2357								block_t blkaddr)
2358{
2359	unsigned int segno = GET_SEGNO(sbi, blkaddr);
2360
2361	if (segno == NULL_SEGNO)
2362		return 0;
2363	return get_seg_entry(sbi, segno)->mtime;
2364}
2365
2366static void update_segment_mtime(struct f2fs_sb_info *sbi, block_t blkaddr,
2367						unsigned long long old_mtime)
2368{
2369	struct seg_entry *se;
2370	unsigned int segno = GET_SEGNO(sbi, blkaddr);
2371	unsigned long long ctime = get_mtime(sbi, false);
2372	unsigned long long mtime = old_mtime ? old_mtime : ctime;
2373
2374	if (segno == NULL_SEGNO)
2375		return;
2376
2377	se = get_seg_entry(sbi, segno);
2378
2379	if (!se->mtime)
2380		se->mtime = mtime;
2381	else
2382		se->mtime = div_u64(se->mtime * se->valid_blocks + mtime,
2383						se->valid_blocks + 1);
2384
2385	if (ctime > SIT_I(sbi)->max_mtime)
2386		SIT_I(sbi)->max_mtime = ctime;
2387}
2388
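/*
 * Apply a delta (del) to the valid block count of the segment containing
 * @blkaddr: update the current (and mirrored) valid bitmaps, the discard
 * bitmap and the checkpoint-valid accounting, then mark the SIT entry
 * dirty.  Bitmap inconsistencies are reported and bugged on.
 */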
2389static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del)
2390{
2391	struct seg_entry *se;
2392	unsigned int segno, offset;
2393	long int new_vblocks;
2394	bool exist;
2395#ifdef CONFIG_F2FS_CHECK_FS
2396	bool mir_exist;
2397#endif
2398
2399	segno = GET_SEGNO(sbi, blkaddr);
2400
2401	se = get_seg_entry(sbi, segno);
2402	new_vblocks = se->valid_blocks + del;
2403	offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr);
2404
2405	f2fs_bug_on(sbi, (new_vblocks < 0 ||
2406			(new_vblocks > f2fs_usable_blks_in_seg(sbi, segno))));
2407
2408	se->valid_blocks = new_vblocks;
2409
2410	/* Update valid block bitmap */
2411	if (del > 0) {
2412		exist = f2fs_test_and_set_bit(offset, se->cur_valid_map);
2413#ifdef CONFIG_F2FS_CHECK_FS
2414		mir_exist = f2fs_test_and_set_bit(offset,
2415						se->cur_valid_map_mir);
2416		if (unlikely(exist != mir_exist)) {
2417			f2fs_err(sbi, "Inconsistent error when setting bitmap, blk:%u, old bit:%d",
2418				 blkaddr, exist);
2419			f2fs_bug_on(sbi, 1);
2420		}
2421#endif
2422		if (unlikely(exist)) {
2423			f2fs_err(sbi, "Bitmap was wrongly set, blk:%u",
2424				 blkaddr);
2425			f2fs_bug_on(sbi, 1);
2426			se->valid_blocks--;
2427			del = 0;
2428		}
2429
2430		if (f2fs_block_unit_discard(sbi) &&
2431				!f2fs_test_and_set_bit(offset, se->discard_map))
2432			sbi->discard_blks--;
2433
2434		/*
2435		 * SSR should never reuse a block which is checkpointed
2436		 * or newly invalidated.
2437		 */
2438		if (!is_sbi_flag_set(sbi, SBI_CP_DISABLED)) {
2439			if (!f2fs_test_and_set_bit(offset, se->ckpt_valid_map))
2440				se->ckpt_valid_blocks++;
2441		}
2442	} else {
2443		exist = f2fs_test_and_clear_bit(offset, se->cur_valid_map);
2444#ifdef CONFIG_F2FS_CHECK_FS
2445		mir_exist = f2fs_test_and_clear_bit(offset,
2446						se->cur_valid_map_mir);
2447		if (unlikely(exist != mir_exist)) {
2448			f2fs_err(sbi, "Inconsistent error when clearing bitmap, blk:%u, old bit:%d",
2449				 blkaddr, exist);
2450			f2fs_bug_on(sbi, 1);
2451		}
2452#endif
2453		if (unlikely(!exist)) {
2454			f2fs_err(sbi, "Bitmap was wrongly cleared, blk:%u",
2455				 blkaddr);
2456			f2fs_bug_on(sbi, 1);
2457			se->valid_blocks++;
2458			del = 0;
2459		} else if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) {
2460			/*
2461			 * If checkpoints are off, we must not reuse data that
2462			 * was used in the previous checkpoint. If it was used
2463			 * before, we must track that to know how much space we
2464			 * really have.
2465			 */
2466			if (f2fs_test_bit(offset, se->ckpt_valid_map)) {
2467				spin_lock(&sbi->stat_lock);
2468				sbi->unusable_block_count++;
2469				spin_unlock(&sbi->stat_lock);
2470			}
2471		}
2472
2473		if (f2fs_block_unit_discard(sbi) &&
2474			f2fs_test_and_clear_bit(offset, se->discard_map))
2475			sbi->discard_blks++;
2476	}
2477	if (!f2fs_test_bit(offset, se->ckpt_valid_map))
2478		se->ckpt_valid_blocks += del;
2479
2480	__mark_sit_entry_dirty(sbi, segno);
2481
2482	/* update total number of valid blocks to be written in ckpt area */
2483	SIT_I(sbi)->written_valid_blocks += del;
2484
2485	if (__is_large_section(sbi))
2486		get_sec_entry(sbi, segno)->valid_blocks += del;
2487}
2488
2489void f2fs_invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr)
2490{
2491	unsigned int segno = GET_SEGNO(sbi, addr);
2492	struct sit_info *sit_i = SIT_I(sbi);
2493
2494	f2fs_bug_on(sbi, addr == NULL_ADDR);
2495	if (addr == NEW_ADDR || addr == COMPRESS_ADDR)
2496		return;
2497
2498	f2fs_invalidate_internal_cache(sbi, addr);
2499
2500	/* add it into sit main buffer */
2501	down_write(&sit_i->sentry_lock);
2502
2503	update_segment_mtime(sbi, addr, 0);
2504	update_sit_entry(sbi, addr, -1);
2505
2506	/* add it into dirty seglist */
2507	locate_dirty_segment(sbi, segno);
2508
2509	up_write(&sit_i->sentry_lock);
2510}
2511
2512bool f2fs_is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr)
2513{
2514	struct sit_info *sit_i = SIT_I(sbi);
2515	unsigned int segno, offset;
2516	struct seg_entry *se;
2517	bool is_cp = false;
2518
2519	if (!__is_valid_data_blkaddr(blkaddr))
2520		return true;
2521
2522	down_read(&sit_i->sentry_lock);
2523
2524	segno = GET_SEGNO(sbi, blkaddr);
2525	se = get_seg_entry(sbi, segno);
2526	offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr);
2527
2528	if (f2fs_test_bit(offset, se->ckpt_valid_map))
2529		is_cp = true;
2530
2531	up_read(&sit_i->sentry_lock);
2532
2533	return is_cp;
2534}
2535
2536static unsigned short f2fs_curseg_valid_blocks(struct f2fs_sb_info *sbi, int type)
2537{
2538	struct curseg_info *curseg = CURSEG_I(sbi, type);
2539
2540	if (sbi->ckpt->alloc_type[type] == SSR)
2541		return sbi->blocks_per_seg;
2542	return curseg->next_blkoff;
2543}
2544
2545/*
2546 * Calculate the number of current summary pages for writing
2547 */
2548int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra)
2549{
2550	int valid_sum_count = 0;
2551	int i, sum_in_page;
2552
2553	for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
2554		if (sbi->ckpt->alloc_type[i] != SSR && for_ra)
2555			valid_sum_count +=
2556				le16_to_cpu(F2FS_CKPT(sbi)->cur_data_blkoff[i]);
2557		else
2558			valid_sum_count += f2fs_curseg_valid_blocks(sbi, i);
2559	}
2560
2561	sum_in_page = (PAGE_SIZE - 2 * SUM_JOURNAL_SIZE -
2562			SUM_FOOTER_SIZE) / SUMMARY_SIZE;
2563	if (valid_sum_count <= sum_in_page)
2564		return 1;
2565	else if ((valid_sum_count - sum_in_page) <=
2566		(PAGE_SIZE - SUM_FOOTER_SIZE) / SUMMARY_SIZE)
2567		return 2;
2568	return 3;
2569}
2570
2571/*
2572 * Caller should put this summary page
2573 */
2574struct page *f2fs_get_sum_page(struct f2fs_sb_info *sbi, unsigned int segno)
2575{
2576	if (unlikely(f2fs_cp_error(sbi)))
2577		return ERR_PTR(-EIO);
2578	return f2fs_get_meta_page_retry(sbi, GET_SUM_BLOCK(sbi, segno));
2579}
2580
2581void f2fs_update_meta_page(struct f2fs_sb_info *sbi,
2582					void *src, block_t blk_addr)
2583{
2584	struct page *page = f2fs_grab_meta_page(sbi, blk_addr);
2585
2586	memcpy(page_address(page), src, PAGE_SIZE);
2587	set_page_dirty(page);
2588	f2fs_put_page(page, 1);
2589}
2590
2591static void write_sum_page(struct f2fs_sb_info *sbi,
2592			struct f2fs_summary_block *sum_blk, block_t blk_addr)
2593{
2594	f2fs_update_meta_page(sbi, (void *)sum_blk, blk_addr);
2595}
2596
2597static void write_current_sum_page(struct f2fs_sb_info *sbi,
2598						int type, block_t blk_addr)
2599{
2600	struct curseg_info *curseg = CURSEG_I(sbi, type);
2601	struct page *page = f2fs_grab_meta_page(sbi, blk_addr);
2602	struct f2fs_summary_block *src = curseg->sum_blk;
2603	struct f2fs_summary_block *dst;
2604
2605	dst = (struct f2fs_summary_block *)page_address(page);
2606	memset(dst, 0, PAGE_SIZE);
2607
2608	mutex_lock(&curseg->curseg_mutex);
2609
2610	down_read(&curseg->journal_rwsem);
2611	memcpy(&dst->journal, curseg->journal, SUM_JOURNAL_SIZE);
2612	up_read(&curseg->journal_rwsem);
2613
2614	memcpy(dst->entries, src->entries, SUM_ENTRY_SIZE);
2615	memcpy(&dst->footer, &src->footer, SUM_FOOTER_SIZE);
2616
2617	mutex_unlock(&curseg->curseg_mutex);
2618
2619	set_page_dirty(page);
2620	f2fs_put_page(page, 1);
2621}
2622
2623static int is_next_segment_free(struct f2fs_sb_info *sbi,
2624				struct curseg_info *curseg, int type)
2625{
2626	unsigned int segno = curseg->segno + 1;
2627	struct free_segmap_info *free_i = FREE_I(sbi);
2628
2629	if (segno < MAIN_SEGS(sbi) && segno % sbi->segs_per_sec)
2630		return !test_bit(segno, free_i->free_segmap);
2631	return 0;
2632}
2633
2634/*
2635 * Find a new segment in the free segment bitmap, following the requested
2636 * allocation direction. This function must succeed, otherwise it is a BUG.
2637 */
2638static void get_new_segment(struct f2fs_sb_info *sbi,
2639			unsigned int *newseg, bool new_sec, int dir)
2640{
2641	struct free_segmap_info *free_i = FREE_I(sbi);
2642	unsigned int segno, secno, zoneno;
2643	unsigned int total_zones = MAIN_SECS(sbi) / sbi->secs_per_zone;
2644	unsigned int hint = GET_SEC_FROM_SEG(sbi, *newseg);
2645	unsigned int old_zoneno = GET_ZONE_FROM_SEG(sbi, *newseg);
2646	unsigned int left_start = hint;
2647	bool init = true;
2648	int go_left = 0;
2649	int i;
2650
2651	spin_lock(&free_i->segmap_lock);
2652
2653	if (!new_sec && ((*newseg + 1) % sbi->segs_per_sec)) {
2654		segno = find_next_zero_bit(free_i->free_segmap,
2655			GET_SEG_FROM_SEC(sbi, hint + 1), *newseg + 1);
2656		if (segno < GET_SEG_FROM_SEC(sbi, hint + 1))
2657			goto got_it;
2658	}
2659find_other_zone:
2660	secno = find_next_zero_bit(free_i->free_secmap, MAIN_SECS(sbi), hint);
2661	if (secno >= MAIN_SECS(sbi)) {
2662		if (dir == ALLOC_RIGHT) {
2663			secno = find_first_zero_bit(free_i->free_secmap,
2664							MAIN_SECS(sbi));
2665			f2fs_bug_on(sbi, secno >= MAIN_SECS(sbi));
2666		} else {
2667			go_left = 1;
2668			left_start = hint - 1;
2669		}
2670	}
2671	if (go_left == 0)
2672		goto skip_left;
2673
2674	while (test_bit(left_start, free_i->free_secmap)) {
2675		if (left_start > 0) {
2676			left_start--;
2677			continue;
2678		}
2679		left_start = find_first_zero_bit(free_i->free_secmap,
2680							MAIN_SECS(sbi));
2681		f2fs_bug_on(sbi, left_start >= MAIN_SECS(sbi));
2682		break;
2683	}
2684	secno = left_start;
2685skip_left:
2686	segno = GET_SEG_FROM_SEC(sbi, secno);
2687	zoneno = GET_ZONE_FROM_SEC(sbi, secno);
2688
2689	/* give up on finding another zone */
2690	if (!init)
2691		goto got_it;
2692	if (sbi->secs_per_zone == 1)
2693		goto got_it;
2694	if (zoneno == old_zoneno)
2695		goto got_it;
2696	if (dir == ALLOC_LEFT) {
2697		if (!go_left && zoneno + 1 >= total_zones)
2698			goto got_it;
2699		if (go_left && zoneno == 0)
2700			goto got_it;
2701	}
2702	for (i = 0; i < NR_CURSEG_TYPE; i++)
2703		if (CURSEG_I(sbi, i)->zone == zoneno)
2704			break;
2705
2706	if (i < NR_CURSEG_TYPE) {
2707		/* zone is in use, try another */
2708		if (go_left)
2709			hint = zoneno * sbi->secs_per_zone - 1;
2710		else if (zoneno + 1 >= total_zones)
2711			hint = 0;
2712		else
2713			hint = (zoneno + 1) * sbi->secs_per_zone;
2714		init = false;
2715		goto find_other_zone;
2716	}
2717got_it:
2718	/* set it as dirty segment in free segmap */
2719	f2fs_bug_on(sbi, test_bit(segno, free_i->free_segmap));
2720	__set_inuse(sbi, segno);
2721	*newseg = segno;
2722	spin_unlock(&free_i->segmap_lock);
2723}
2724
2725static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified)
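/*
 * Switch the current segment to curseg->next_segno: reset next_blkoff,
 * clear the in-memory summary footer and tag it as a data or node
 * summary, and record the segment type in its SIT entry.
 */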
2726{
2727	struct curseg_info *curseg = CURSEG_I(sbi, type);
2728	struct summary_footer *sum_footer;
2729	unsigned short seg_type = curseg->seg_type;
2730
2731	curseg->inited = true;
2732	curseg->segno = curseg->next_segno;
2733	curseg->zone = GET_ZONE_FROM_SEG(sbi, curseg->segno);
2734	curseg->next_blkoff = 0;
2735	curseg->next_segno = NULL_SEGNO;
2736
2737	sum_footer = &(curseg->sum_blk->footer);
2738	memset(sum_footer, 0, sizeof(struct summary_footer));
2739
2740	sanity_check_seg_type(sbi, seg_type);
2741
2742	if (IS_DATASEG(seg_type))
2743		SET_SUM_TYPE(sum_footer, SUM_TYPE_DATA);
2744	if (IS_NODESEG(seg_type))
2745		SET_SUM_TYPE(sum_footer, SUM_TYPE_NODE);
2746	__set_sit_entry_type(sbi, seg_type, curseg->segno, modified);
2747}
2748
2749static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type)
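/*
 * Pick the segment number used as the search hint for the next free
 * segment: a random one in fragmenting modes, the current segment for
 * large sections, segment 0 for an uninitialized log, disabled
 * checkpoint, NOHEAP hot-data/node logs or the "reuse" allocation mode,
 * or the last ALLOC_NEXT victim when one is recorded.
 */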
2750{
2751	struct curseg_info *curseg = CURSEG_I(sbi, type);
2752	unsigned short seg_type = curseg->seg_type;
2753
2754	sanity_check_seg_type(sbi, seg_type);
2755	if (f2fs_need_rand_seg(sbi))
2756		return get_random_u32_below(MAIN_SECS(sbi) * sbi->segs_per_sec);
2757
2758	/* if segs_per_sec is larger than 1, we need to keep the original policy. */
2759	if (__is_large_section(sbi))
2760		return curseg->segno;
2761
2762	/* an inmem log may not be located on any segment after mount */
2763	if (!curseg->inited)
2764		return 0;
2765
2766	if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED)))
2767		return 0;
2768
2769	if (test_opt(sbi, NOHEAP) &&
2770		(seg_type == CURSEG_HOT_DATA || IS_NODESEG(seg_type)))
2771		return 0;
2772
2773	if (SIT_I(sbi)->last_victim[ALLOC_NEXT])
2774		return SIT_I(sbi)->last_victim[ALLOC_NEXT];
2775
2776	/* find segments from 0 to reuse freed segments */
2777	if (F2FS_OPTION(sbi).alloc_mode == ALLOC_MODE_REUSE)
2778		return 0;
2779
2780	return curseg->segno;
2781}
2782
2783/*
2784 * Allocate a current working segment.
2785 * This function always allocates a free segment in LFS manner.
2786 */
2787static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec)
2788{
2789	struct curseg_info *curseg = CURSEG_I(sbi, type);
2790	unsigned short seg_type = curseg->seg_type;
2791	unsigned int segno = curseg->segno;
2792	int dir = ALLOC_LEFT;
2793
2794	if (curseg->inited)
2795		write_sum_page(sbi, curseg->sum_blk,
2796				GET_SUM_BLOCK(sbi, segno));
2797	if (seg_type == CURSEG_WARM_DATA || seg_type == CURSEG_COLD_DATA)
2798		dir = ALLOC_RIGHT;
2799
2800	if (test_opt(sbi, NOHEAP))
2801		dir = ALLOC_RIGHT;
2802
2803	segno = __get_next_segno(sbi, type);
2804	get_new_segment(sbi, &segno, new_sec, dir);
2805	curseg->next_segno = segno;
2806	reset_curseg(sbi, type, 1);
2807	curseg->alloc_type = LFS;
2808	if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK)
2809		curseg->fragment_remained_chunk =
2810				get_random_u32_inclusive(1, sbi->max_fragment_chunk);
2811}
2812
2813static int __next_free_blkoff(struct f2fs_sb_info *sbi,
2814					int segno, block_t start)
2815{
2816	struct seg_entry *se = get_seg_entry(sbi, segno);
2817	int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long);
2818	unsigned long *target_map = SIT_I(sbi)->tmp_map;
2819	unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map;
2820	unsigned long *cur_map = (unsigned long *)se->cur_valid_map;
2821	int i;
2822
2823	for (i = 0; i < entries; i++)
2824		target_map[i] = ckpt_map[i] | cur_map[i];
2825
2826	return __find_rev_next_zero_bit(target_map, sbi->blocks_per_seg, start);
2827}
2828
2829static int f2fs_find_next_ssr_block(struct f2fs_sb_info *sbi,
2830		struct curseg_info *seg)
2831{
2832	return __next_free_blkoff(sbi, seg->segno, seg->next_blkoff + 1);
2833}
2834
2835bool f2fs_segment_has_free_slot(struct f2fs_sb_info *sbi, int segno)
2836{
2837	return __next_free_blkoff(sbi, segno, 0) < sbi->blocks_per_seg;
2838}
2839
2840/*
2841 * This function always allocates a used segment (from the dirty seglist) in an
2842 * SSR manner, so it should recover the existing segment information of valid blocks.
2843 */
2844static void change_curseg(struct f2fs_sb_info *sbi, int type)
2845{
2846	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
2847	struct curseg_info *curseg = CURSEG_I(sbi, type);
2848	unsigned int new_segno = curseg->next_segno;
2849	struct f2fs_summary_block *sum_node;
2850	struct page *sum_page;
2851
2852	write_sum_page(sbi, curseg->sum_blk, GET_SUM_BLOCK(sbi, curseg->segno));
2853
2854	__set_test_and_inuse(sbi, new_segno);
2855
2856	mutex_lock(&dirty_i->seglist_lock);
2857	__remove_dirty_segment(sbi, new_segno, PRE);
2858	__remove_dirty_segment(sbi, new_segno, DIRTY);
2859	mutex_unlock(&dirty_i->seglist_lock);
2860
2861	reset_curseg(sbi, type, 1);
2862	curseg->alloc_type = SSR;
2863	curseg->next_blkoff = __next_free_blkoff(sbi, curseg->segno, 0);
2864
2865	sum_page = f2fs_get_sum_page(sbi, new_segno);
2866	if (IS_ERR(sum_page)) {
2867		/* GC won't be able to use stale summary pages due to cp_error */
2868		memset(curseg->sum_blk, 0, SUM_ENTRY_SIZE);
2869		return;
2870	}
2871	sum_node = (struct f2fs_summary_block *)page_address(sum_page);
2872	memcpy(curseg->sum_blk, sum_node, SUM_ENTRY_SIZE);
2873	f2fs_put_page(sum_page, 1);
2874}
2875
2876static int get_ssr_segment(struct f2fs_sb_info *sbi, int type,
2877				int alloc_mode, unsigned long long age);
2878
2879static void get_atssr_segment(struct f2fs_sb_info *sbi, int type,
2880					int target_type, int alloc_mode,
2881					unsigned long long age)
2882{
2883	struct curseg_info *curseg = CURSEG_I(sbi, type);
2884
2885	curseg->seg_type = target_type;
2886
2887	if (get_ssr_segment(sbi, type, alloc_mode, age)) {
2888		struct seg_entry *se = get_seg_entry(sbi, curseg->next_segno);
2889
2890		curseg->seg_type = se->type;
2891		change_curseg(sbi, type);
2892	} else {
2893		/* allocate cold segment by default */
2894		curseg->seg_type = CURSEG_COLD_DATA;
2895		new_curseg(sbi, type, true);
2896	}
2897	stat_inc_seg_type(sbi, curseg);
2898}
2899
2900static void __f2fs_init_atgc_curseg(struct f2fs_sb_info *sbi)
2901{
2902	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_ALL_DATA_ATGC);
2903
2904	if (!sbi->am.atgc_enabled)
2905		return;
2906
2907	f2fs_down_read(&SM_I(sbi)->curseg_lock);
2908
2909	mutex_lock(&curseg->curseg_mutex);
2910	down_write(&SIT_I(sbi)->sentry_lock);
2911
2912	get_atssr_segment(sbi, CURSEG_ALL_DATA_ATGC, CURSEG_COLD_DATA, SSR, 0);
2913
2914	up_write(&SIT_I(sbi)->sentry_lock);
2915	mutex_unlock(&curseg->curseg_mutex);
2916
2917	f2fs_up_read(&SM_I(sbi)->curseg_lock);
2918}
2919
2920void f2fs_init_inmem_curseg(struct f2fs_sb_info *sbi)
2921{
2922	__f2fs_init_atgc_curseg(sbi);
2923}
2924
2925static void __f2fs_save_inmem_curseg(struct f2fs_sb_info *sbi, int type)
2926{
2927	struct curseg_info *curseg = CURSEG_I(sbi, type);
2928
2929	mutex_lock(&curseg->curseg_mutex);
2930	if (!curseg->inited)
2931		goto out;
2932
2933	if (get_valid_blocks(sbi, curseg->segno, false)) {
2934		write_sum_page(sbi, curseg->sum_blk,
2935				GET_SUM_BLOCK(sbi, curseg->segno));
2936	} else {
2937		mutex_lock(&DIRTY_I(sbi)->seglist_lock);
2938		__set_test_and_free(sbi, curseg->segno, true);
2939		mutex_unlock(&DIRTY_I(sbi)->seglist_lock);
2940	}
2941out:
2942	mutex_unlock(&curseg->curseg_mutex);
2943}
2944
2945void f2fs_save_inmem_curseg(struct f2fs_sb_info *sbi)
2946{
2947	__f2fs_save_inmem_curseg(sbi, CURSEG_COLD_DATA_PINNED);
2948
2949	if (sbi->am.atgc_enabled)
2950		__f2fs_save_inmem_curseg(sbi, CURSEG_ALL_DATA_ATGC);
2951}
2952
2953static void __f2fs_restore_inmem_curseg(struct f2fs_sb_info *sbi, int type)
2954{
2955	struct curseg_info *curseg = CURSEG_I(sbi, type);
2956
2957	mutex_lock(&curseg->curseg_mutex);
2958	if (!curseg->inited)
2959		goto out;
2960	if (get_valid_blocks(sbi, curseg->segno, false))
2961		goto out;
2962
2963	mutex_lock(&DIRTY_I(sbi)->seglist_lock);
2964	__set_test_and_inuse(sbi, curseg->segno);
2965	mutex_unlock(&DIRTY_I(sbi)->seglist_lock);
2966out:
2967	mutex_unlock(&curseg->curseg_mutex);
2968}
2969
2970void f2fs_restore_inmem_curseg(struct f2fs_sb_info *sbi)
2971{
2972	__f2fs_restore_inmem_curseg(sbi, CURSEG_COLD_DATA_PINNED);
2973
2974	if (sbi->am.atgc_enabled)
2975		__f2fs_restore_inmem_curseg(sbi, CURSEG_ALL_DATA_ATGC);
2976}
2977
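/*
 * Select a victim segment for SSR allocation.  Try a victim of the same
 * temperature first, then the other temperatures of the same class
 * (data or node), and finally, when checkpointing is disabled, any
 * completely free segment.  Returns 1 and sets curseg->next_segno on
 * success, 0 otherwise.
 */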
2978static int get_ssr_segment(struct f2fs_sb_info *sbi, int type,
2979				int alloc_mode, unsigned long long age)
2980{
2981	struct curseg_info *curseg = CURSEG_I(sbi, type);
2982	unsigned segno = NULL_SEGNO;
2983	unsigned short seg_type = curseg->seg_type;
2984	int i, cnt;
2985	bool reversed = false;
2986
2987	sanity_check_seg_type(sbi, seg_type);
2988
2989	/* f2fs_need_SSR() already forces us to do this */
2990	if (!f2fs_get_victim(sbi, &segno, BG_GC, seg_type, alloc_mode, age)) {
2991		curseg->next_segno = segno;
2992		return 1;
2993	}
2994
2995	/* For node segments, let's do SSR more intensively */
2996	if (IS_NODESEG(seg_type)) {
2997		if (seg_type >= CURSEG_WARM_NODE) {
2998			reversed = true;
2999			i = CURSEG_COLD_NODE;
3000		} else {
3001			i = CURSEG_HOT_NODE;
3002		}
3003		cnt = NR_CURSEG_NODE_TYPE;
3004	} else {
3005		if (seg_type >= CURSEG_WARM_DATA) {
3006			reversed = true;
3007			i = CURSEG_COLD_DATA;
3008		} else {
3009			i = CURSEG_HOT_DATA;
3010		}
3011		cnt = NR_CURSEG_DATA_TYPE;
3012	}
3013
3014	for (; cnt-- > 0; reversed ? i-- : i++) {
3015		if (i == seg_type)
3016			continue;
3017		if (!f2fs_get_victim(sbi, &segno, BG_GC, i, alloc_mode, age)) {
3018			curseg->next_segno = segno;
3019			return 1;
3020		}
3021	}
3022
3023	/* find valid_blocks=0 in dirty list */
3024	if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) {
3025		segno = get_free_segment(sbi);
3026		if (segno != NULL_SEGNO) {
3027			curseg->next_segno = segno;
3028			return 1;
3029		}
3030	}
3031	return 0;
3032}
3033
3034static bool need_new_seg(struct f2fs_sb_info *sbi, int type)
3035{
3036	struct curseg_info *curseg = CURSEG_I(sbi, type);
3037
3038	if (!is_set_ckpt_flags(sbi, CP_CRC_RECOVERY_FLAG) &&
3039	    curseg->seg_type == CURSEG_WARM_NODE)
3040		return true;
3041	if (curseg->alloc_type == LFS &&
3042	    is_next_segment_free(sbi, curseg, type) &&
3043	    likely(!is_sbi_flag_set(sbi, SBI_CP_DISABLED)))
3044		return true;
3045	if (!f2fs_need_SSR(sbi) || !get_ssr_segment(sbi, type, SSR, 0))
3046		return true;
3047	return false;
3048}
3049
3050void f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type,
3051					unsigned int start, unsigned int end)
3052{
3053	struct curseg_info *curseg = CURSEG_I(sbi, type);
3054	unsigned int segno;
3055
3056	f2fs_down_read(&SM_I(sbi)->curseg_lock);
3057	mutex_lock(&curseg->curseg_mutex);
3058	down_write(&SIT_I(sbi)->sentry_lock);
3059
3060	segno = CURSEG_I(sbi, type)->segno;
3061	if (segno < start || segno > end)
3062		goto unlock;
3063
3064	if (f2fs_need_SSR(sbi) && get_ssr_segment(sbi, type, SSR, 0))
3065		change_curseg(sbi, type);
3066	else
3067		new_curseg(sbi, type, true);
3068
3069	stat_inc_seg_type(sbi, curseg);
3070
3071	locate_dirty_segment(sbi, segno);
3072unlock:
3073	up_write(&SIT_I(sbi)->sentry_lock);
3074
3075	if (segno != curseg->segno)
3076		f2fs_notice(sbi, "For resize: curseg of type %d: %u ==> %u",
3077			    type, segno, curseg->segno);
3078
3079	mutex_unlock(&curseg->curseg_mutex);
3080	f2fs_up_read(&SM_I(sbi)->curseg_lock);
3081}
3082
3083static void __allocate_new_segment(struct f2fs_sb_info *sbi, int type,
3084						bool new_sec, bool force)
3085{
3086	struct curseg_info *curseg = CURSEG_I(sbi, type);
3087	unsigned int old_segno;
3088
3089	if (!force && curseg->inited &&
3090	    !curseg->next_blkoff &&
3091	    !get_valid_blocks(sbi, curseg->segno, new_sec) &&
3092	    !get_ckpt_valid_blocks(sbi, curseg->segno, new_sec))
3093		return;
3094
3095	old_segno = curseg->segno;
3096	new_curseg(sbi, type, true);
3097	stat_inc_seg_type(sbi, curseg);
3098	locate_dirty_segment(sbi, old_segno);
3099}
3100
3101void f2fs_allocate_new_section(struct f2fs_sb_info *sbi, int type, bool force)
3102{
3103	f2fs_down_read(&SM_I(sbi)->curseg_lock);
3104	down_write(&SIT_I(sbi)->sentry_lock);
3105	__allocate_new_segment(sbi, type, true, force);
3106	up_write(&SIT_I(sbi)->sentry_lock);
3107	f2fs_up_read(&SM_I(sbi)->curseg_lock);
3108}
3109
3110void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi)
3111{
3112	int i;
3113
3114	f2fs_down_read(&SM_I(sbi)->curseg_lock);
3115	down_write(&SIT_I(sbi)->sentry_lock);
3116	for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++)
3117		__allocate_new_segment(sbi, i, false, false);
3118	up_write(&SIT_I(sbi)->sentry_lock);
3119	f2fs_up_read(&SM_I(sbi)->curseg_lock);
3120}
3121
3122bool f2fs_exist_trim_candidates(struct f2fs_sb_info *sbi,
3123						struct cp_control *cpc)
3124{
3125	__u64 trim_start = cpc->trim_start;
3126	bool has_candidate = false;
3127
3128	down_write(&SIT_I(sbi)->sentry_lock);
3129	for (; cpc->trim_start <= cpc->trim_end; cpc->trim_start++) {
3130		if (add_discard_addrs(sbi, cpc, true)) {
3131			has_candidate = true;
3132			break;
3133		}
3134	}
3135	up_write(&SIT_I(sbi)->sentry_lock);
3136
3137	cpc->trim_start = trim_start;
3138	return has_candidate;
3139}
3140
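/*
 * Issue the prepared discard commands whose logical start falls within
 * [start, end], walking the discard rbtree.  Commands are submitted in
 * batches of dpolicy->max_requests; between batches the lock is dropped
 * and outstanding commands are waited for.  Returns the total trimmed
 * while waiting for completions.
 */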
3141static unsigned int __issue_discard_cmd_range(struct f2fs_sb_info *sbi,
3142					struct discard_policy *dpolicy,
3143					unsigned int start, unsigned int end)
3144{
3145	struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
3146	struct discard_cmd *prev_dc = NULL, *next_dc = NULL;
3147	struct rb_node **insert_p = NULL, *insert_parent = NULL;
3148	struct discard_cmd *dc;
3149	struct blk_plug plug;
3150	int issued;
3151	unsigned int trimmed = 0;
3152
3153next:
3154	issued = 0;
3155
3156	mutex_lock(&dcc->cmd_lock);
3157	if (unlikely(dcc->rbtree_check))
3158		f2fs_bug_on(sbi, !f2fs_check_discard_tree(sbi));
3159
3160	dc = __lookup_discard_cmd_ret(&dcc->root, start,
3161				&prev_dc, &next_dc, &insert_p, &insert_parent);
3162	if (!dc)
3163		dc = next_dc;
3164
3165	blk_start_plug(&plug);
3166
3167	while (dc && dc->di.lstart <= end) {
3168		struct rb_node *node;
3169		int err = 0;
3170
3171		if (dc->di.len < dpolicy->granularity)
3172			goto skip;
3173
3174		if (dc->state != D_PREP) {
3175			list_move_tail(&dc->list, &dcc->fstrim_list);
3176			goto skip;
3177		}
3178
3179		err = __submit_discard_cmd(sbi, dpolicy, dc, &issued);
3180
3181		if (issued >= dpolicy->max_requests) {
3182			start = dc->di.lstart + dc->di.len;
3183
3184			if (err)
3185				__remove_discard_cmd(sbi, dc);
3186
3187			blk_finish_plug(&plug);
3188			mutex_unlock(&dcc->cmd_lock);
3189			trimmed += __wait_all_discard_cmd(sbi, NULL);
3190			f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT);
3191			goto next;
3192		}
3193skip:
3194		node = rb_next(&dc->rb_node);
3195		if (err)
3196			__remove_discard_cmd(sbi, dc);
3197		dc = rb_entry_safe(node, struct discard_cmd, rb_node);
3198
3199		if (fatal_signal_pending(current))
3200			break;
3201	}
3202
3203	blk_finish_plug(&plug);
3204	mutex_unlock(&dcc->cmd_lock);
3205
3206	return trimmed;
3207}
3208
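/*
 * FITRIM entry point.  Validate the byte range, write a CP_DISCARD
 * checkpoint so prefree segments become discard candidates, and then,
 * unless runtime discard is enabled (in which case the background thread
 * will handle them), issue and wait for the discards covering the range
 * with an FSTRIM policy.  range->len is updated to the trimmed bytes.
 */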
3209int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range)
3210{
3211	__u64 start = F2FS_BYTES_TO_BLK(range->start);
3212	__u64 end = start + F2FS_BYTES_TO_BLK(range->len) - 1;
3213	unsigned int start_segno, end_segno;
3214	block_t start_block, end_block;
3215	struct cp_control cpc;
3216	struct discard_policy dpolicy;
3217	unsigned long long trimmed = 0;
3218	int err = 0;
3219	bool need_align = f2fs_lfs_mode(sbi) && __is_large_section(sbi);
3220
3221	if (start >= MAX_BLKADDR(sbi) || range->len < sbi->blocksize)
3222		return -EINVAL;
3223
3224	if (end < MAIN_BLKADDR(sbi))
3225		goto out;
3226
3227	if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) {
3228		f2fs_warn(sbi, "Found FS corruption, run fsck to fix.");
3229		return -EFSCORRUPTED;
3230	}
3231
3232	/* start/end segment number in main_area */
3233	start_segno = (start <= MAIN_BLKADDR(sbi)) ? 0 : GET_SEGNO(sbi, start);
3234	end_segno = (end >= MAX_BLKADDR(sbi)) ? MAIN_SEGS(sbi) - 1 :
3235						GET_SEGNO(sbi, end);
3236	if (need_align) {
3237		start_segno = rounddown(start_segno, sbi->segs_per_sec);
3238		end_segno = roundup(end_segno + 1, sbi->segs_per_sec) - 1;
3239	}
3240
3241	cpc.reason = CP_DISCARD;
3242	cpc.trim_minlen = max_t(__u64, 1, F2FS_BYTES_TO_BLK(range->minlen));
3243	cpc.trim_start = start_segno;
3244	cpc.trim_end = end_segno;
3245
3246	if (sbi->discard_blks == 0)
3247		goto out;
3248
3249	f2fs_down_write(&sbi->gc_lock);
3250	stat_inc_cp_call_count(sbi, TOTAL_CALL);
3251	err = f2fs_write_checkpoint(sbi, &cpc);
3252	f2fs_up_write(&sbi->gc_lock);
3253	if (err)
3254		goto out;
3255
3256	/*
3257	 * We filed discard candidates, but we don't actually need to wait for
3258	 * all of them, since they'll be issued at idle time along with the runtime
3259	 * discard option. The user configuration presumably relies on runtime
3260	 * discard or periodic fstrim instead.
3261	 */
3262	if (f2fs_realtime_discard_enable(sbi))
3263		goto out;
3264
3265	start_block = START_BLOCK(sbi, start_segno);
3266	end_block = START_BLOCK(sbi, end_segno + 1);
3267
3268	__init_discard_policy(sbi, &dpolicy, DPOLICY_FSTRIM, cpc.trim_minlen);
3269	trimmed = __issue_discard_cmd_range(sbi, &dpolicy,
3270					start_block, end_block);
3271
3272	trimmed += __wait_discard_cmd_range(sbi, &dpolicy,
3273					start_block, end_block);
3274out:
3275	if (!err)
3276		range->len = F2FS_BLK_TO_BYTES(trimmed);
3277	return err;
3278}
3279
3280int f2fs_rw_hint_to_seg_type(enum rw_hint hint)
3281{
3282	switch (hint) {
3283	case WRITE_LIFE_SHORT:
3284		return CURSEG_HOT_DATA;
3285	case WRITE_LIFE_EXTREME:
3286		return CURSEG_COLD_DATA;
3287	default:
3288		return CURSEG_WARM_DATA;
3289	}
3290}
3291
3292static int __get_segment_type_2(struct f2fs_io_info *fio)
3293{
3294	if (fio->type == DATA)
3295		return CURSEG_HOT_DATA;
3296	else
3297		return CURSEG_HOT_NODE;
3298}
3299
3300static int __get_segment_type_4(struct f2fs_io_info *fio)
3301{
3302	if (fio->type == DATA) {
3303		struct inode *inode = fio->page->mapping->host;
3304
3305		if (S_ISDIR(inode->i_mode))
3306			return CURSEG_HOT_DATA;
3307		else
3308			return CURSEG_COLD_DATA;
3309	} else {
3310		if (IS_DNODE(fio->page) && is_cold_node(fio->page))
3311			return CURSEG_WARM_NODE;
3312		else
3313			return CURSEG_COLD_NODE;
3314	}
3315}
3316
3317static int __get_age_segment_type(struct inode *inode, pgoff_t pgofs)
3318{
3319	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
3320	struct extent_info ei = {};
3321
3322	if (f2fs_lookup_age_extent_cache(inode, pgofs, &ei)) {
3323		if (!ei.age)
3324			return NO_CHECK_TYPE;
3325		if (ei.age <= sbi->hot_data_age_threshold)
3326			return CURSEG_HOT_DATA;
3327		if (ei.age <= sbi->warm_data_age_threshold)
3328			return CURSEG_WARM_DATA;
3329		return CURSEG_COLD_DATA;
3330	}
3331	return NO_CHECK_TYPE;
3332}
3333
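/*
 * Temperature selection with six active logs.  Data pages: aligned
 * writes go to the pinned cold log, GC'd pages to ATGC (when enabled)
 * or cold, cold/compressed files to cold; otherwise the age extent
 * cache and file hints decide between hot and warm, falling back to the
 * write hint.  Node pages: direct nodes are warm if cold-flagged and
 * hot otherwise, indirect nodes are cold.
 */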
3334static int __get_segment_type_6(struct f2fs_io_info *fio)
3335{
3336	if (fio->type == DATA) {
3337		struct inode *inode = fio->page->mapping->host;
3338		int type;
3339
3340		if (is_inode_flag_set(inode, FI_ALIGNED_WRITE))
3341			return CURSEG_COLD_DATA_PINNED;
3342
3343		if (page_private_gcing(fio->page)) {
3344			if (fio->sbi->am.atgc_enabled &&
3345				(fio->io_type == FS_DATA_IO) &&
3346				(fio->sbi->gc_mode != GC_URGENT_HIGH))
3347				return CURSEG_ALL_DATA_ATGC;
3348			else
3349				return CURSEG_COLD_DATA;
3350		}
3351		if (file_is_cold(inode) || f2fs_need_compress_data(inode))
3352			return CURSEG_COLD_DATA;
3353
3354		type = __get_age_segment_type(inode, fio->page->index);
3355		if (type != NO_CHECK_TYPE)
3356			return type;
3357
3358		if (file_is_hot(inode) ||
3359				is_inode_flag_set(inode, FI_HOT_DATA) ||
3360				f2fs_is_cow_file(inode))
3361			return CURSEG_HOT_DATA;
3362		return f2fs_rw_hint_to_seg_type(inode->i_write_hint);
3363	} else {
3364		if (IS_DNODE(fio->page))
3365			return is_cold_node(fio->page) ? CURSEG_WARM_NODE :
3366						CURSEG_HOT_NODE;
3367		return CURSEG_COLD_NODE;
3368	}
3369}
3370
3371static int __get_segment_type(struct f2fs_io_info *fio)
3372{
3373	int type = 0;
3374
3375	switch (F2FS_OPTION(fio->sbi).active_logs) {
3376	case 2:
3377		type = __get_segment_type_2(fio);
3378		break;
3379	case 4:
3380		type = __get_segment_type_4(fio);
3381		break;
3382	case 6:
3383		type = __get_segment_type_6(fio);
3384		break;
3385	default:
3386		f2fs_bug_on(fio->sbi, true);
3387	}
3388
3389	if (IS_HOT(type))
3390		fio->temp = HOT;
3391	else if (IS_WARM(type))
3392		fio->temp = WARM;
3393	else
3394		fio->temp = COLD;
3395	return type;
3396}
3397
3398static void f2fs_randomize_chunk(struct f2fs_sb_info *sbi,
3399		struct curseg_info *seg)
3400{
3401	/* To allocate block chunks in different sizes, use a random number */
3402	if (--seg->fragment_remained_chunk > 0)
3403		return;
3404
3405	seg->fragment_remained_chunk =
3406		get_random_u32_inclusive(1, sbi->max_fragment_chunk);
3407	seg->next_blkoff +=
3408		get_random_u32_inclusive(1, sbi->max_fragment_hole);
3409}
3410
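/*
 * Core block allocator.  Reserve the next block of the current segment
 * of @type, record the summary entry, update segment mtime and SIT
 * entries for the new (and old) block address, and replace the current
 * segment with a new or SSR segment once it becomes full.  When @fio is
 * given, it is queued on the per-temperature write list for in-order
 * submission.
 */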
3411void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
3412		block_t old_blkaddr, block_t *new_blkaddr,
3413		struct f2fs_summary *sum, int type,
3414		struct f2fs_io_info *fio)
3415{
3416	struct sit_info *sit_i = SIT_I(sbi);
3417	struct curseg_info *curseg = CURSEG_I(sbi, type);
3418	unsigned long long old_mtime;
3419	bool from_gc = (type == CURSEG_ALL_DATA_ATGC);
3420	struct seg_entry *se = NULL;
3421	bool segment_full = false;
3422
3423	f2fs_down_read(&SM_I(sbi)->curseg_lock);
3424
3425	mutex_lock(&curseg->curseg_mutex);
3426	down_write(&sit_i->sentry_lock);
3427
3428	if (from_gc) {
3429		f2fs_bug_on(sbi, GET_SEGNO(sbi, old_blkaddr) == NULL_SEGNO);
3430		se = get_seg_entry(sbi, GET_SEGNO(sbi, old_blkaddr));
3431		sanity_check_seg_type(sbi, se->type);
3432		f2fs_bug_on(sbi, IS_NODESEG(se->type));
3433	}
3434	*new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
3435
3436	f2fs_bug_on(sbi, curseg->next_blkoff >= sbi->blocks_per_seg);
3437
3438	f2fs_wait_discard_bio(sbi, *new_blkaddr);
3439
3440	curseg->sum_blk->entries[curseg->next_blkoff] = *sum;
3441	if (curseg->alloc_type == SSR) {
3442		curseg->next_blkoff = f2fs_find_next_ssr_block(sbi, curseg);
3443	} else {
3444		curseg->next_blkoff++;
3445		if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK)
3446			f2fs_randomize_chunk(sbi, curseg);
3447	}
3448	if (curseg->next_blkoff >= f2fs_usable_blks_in_seg(sbi, curseg->segno))
3449		segment_full = true;
3450	stat_inc_block_count(sbi, curseg);
3451
3452	if (from_gc) {
3453		old_mtime = get_segment_mtime(sbi, old_blkaddr);
3454	} else {
3455		update_segment_mtime(sbi, old_blkaddr, 0);
3456		old_mtime = 0;
3457	}
3458	update_segment_mtime(sbi, *new_blkaddr, old_mtime);
3459
3460	/*
3461	 * SIT information should be updated before segment allocation,
3462	 * since SSR needs latest valid block information.
3463	 */
3464	update_sit_entry(sbi, *new_blkaddr, 1);
3465	if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO)
3466		update_sit_entry(sbi, old_blkaddr, -1);
3467
3468	/*
3469	 * If the current segment is full, flush it out and replace it with a
3470	 * new segment.
3471	 */
3472	if (segment_full) {
3473		if (from_gc) {
3474			get_atssr_segment(sbi, type, se->type,
3475						AT_SSR, se->mtime);
3476		} else {
3477			if (need_new_seg(sbi, type))
3478				new_curseg(sbi, type, false);
3479			else
3480				change_curseg(sbi, type);
3481			stat_inc_seg_type(sbi, curseg);
3482		}
3483	}
3484	/*
3485	 * segment dirty status should be updated after segment allocation,
3486	 * so we just need to update the status once, after the previous
3487	 * segment has been closed.
3488	 */
3489	locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr));
3490	locate_dirty_segment(sbi, GET_SEGNO(sbi, *new_blkaddr));
3491
3492	if (IS_DATASEG(curseg->seg_type))
3493		atomic64_inc(&sbi->allocated_data_blocks);
3494
3495	up_write(&sit_i->sentry_lock);
3496
3497	if (page && IS_NODESEG(curseg->seg_type)) {
3498		fill_node_footer_blkaddr(page, NEXT_FREE_BLKADDR(sbi, curseg));
3499
3500		f2fs_inode_chksum_set(sbi, page);
3501	}
3502
3503	if (fio) {
3504		struct f2fs_bio_info *io;
3505
3506		if (F2FS_IO_ALIGNED(sbi))
3507			fio->retry = 0;
3508
3509		INIT_LIST_HEAD(&fio->list);
3510		fio->in_list = 1;
3511		io = sbi->write_io[fio->type] + fio->temp;
3512		spin_lock(&io->io_lock);
3513		list_add_tail(&fio->list, &io->io_list);
3514		spin_unlock(&io->io_lock);
3515	}
3516
3517	mutex_unlock(&curseg->curseg_mutex);
3518
3519	f2fs_up_read(&SM_I(sbi)->curseg_lock);
3520}
3521
3522void f2fs_update_device_state(struct f2fs_sb_info *sbi, nid_t ino,
3523					block_t blkaddr, unsigned int blkcnt)
3524{
3525	if (!f2fs_is_multi_device(sbi))
3526		return;
3527
3528	while (1) {
3529		unsigned int devidx = f2fs_target_device_index(sbi, blkaddr);
3530		unsigned int blks = FDEV(devidx).end_blk - blkaddr + 1;
3531
3532		/* update device state for fsync */
3533		f2fs_set_dirty_device(sbi, ino, devidx, FLUSH_INO);
3534
3535		/* update device state for checkpoint */
3536		if (!f2fs_test_bit(devidx, (char *)&sbi->dirty_device)) {
3537			spin_lock(&sbi->dev_lock);
3538			f2fs_set_bit(devidx, (char *)&sbi->dirty_device);
3539			spin_unlock(&sbi->dev_lock);
3540		}
3541
3542		if (blkcnt <= blks)
3543			break;
3544		blkcnt -= blks;
3545		blkaddr += blks;
3546	}
3547}
3548
3549static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio)
3550{
3551	int type = __get_segment_type(fio);
3552	bool keep_order = (f2fs_lfs_mode(fio->sbi) && type == CURSEG_COLD_DATA);
3553
3554	if (keep_order)
3555		f2fs_down_read(&fio->sbi->io_order_lock);
3556reallocate:
3557	f2fs_allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr,
3558			&fio->new_blkaddr, sum, type, fio);
3559	if (GET_SEGNO(fio->sbi, fio->old_blkaddr) != NULL_SEGNO)
3560		f2fs_invalidate_internal_cache(fio->sbi, fio->old_blkaddr);
3561
3562	/* writeout dirty page into bdev */
3563	f2fs_submit_page_write(fio);
3564	if (fio->retry) {
3565		fio->old_blkaddr = fio->new_blkaddr;
3566		goto reallocate;
3567	}
3568
3569	f2fs_update_device_state(fio->sbi, fio->ino, fio->new_blkaddr, 1);
3570
3571	if (keep_order)
3572		f2fs_up_read(&fio->sbi->io_order_lock);
3573}
3574
3575void f2fs_do_write_meta_page(struct f2fs_sb_info *sbi, struct page *page,
3576					enum iostat_type io_type)
3577{
3578	struct f2fs_io_info fio = {
3579		.sbi = sbi,
3580		.type = META,
3581		.temp = HOT,
3582		.op = REQ_OP_WRITE,
3583		.op_flags = REQ_SYNC | REQ_META | REQ_PRIO,
3584		.old_blkaddr = page->index,
3585		.new_blkaddr = page->index,
3586		.page = page,
3587		.encrypted_page = NULL,
3588		.in_list = 0,
3589	};
3590
3591	if (unlikely(page->index >= MAIN_BLKADDR(sbi)))
3592		fio.op_flags &= ~REQ_META;
3593
3594	set_page_writeback(page);
3595	f2fs_submit_page_write(&fio);
3596
3597	stat_inc_meta_count(sbi, page->index);
3598	f2fs_update_iostat(sbi, NULL, io_type, F2FS_BLKSIZE);
3599}
3600
3601void f2fs_do_write_node_page(unsigned int nid, struct f2fs_io_info *fio)
3602{
3603	struct f2fs_summary sum;
3604
3605	set_summary(&sum, nid, 0, 0);
3606	do_write_page(&sum, fio);
3607
3608	f2fs_update_iostat(fio->sbi, NULL, fio->io_type, F2FS_BLKSIZE);
3609}
3610
3611void f2fs_outplace_write_data(struct dnode_of_data *dn,
3612					struct f2fs_io_info *fio)
3613{
3614	struct f2fs_sb_info *sbi = fio->sbi;
3615	struct f2fs_summary sum;
3616
3617	f2fs_bug_on(sbi, dn->data_blkaddr == NULL_ADDR);
3618	if (fio->io_type == FS_DATA_IO || fio->io_type == FS_CP_DATA_IO)
3619		f2fs_update_age_extent_cache(dn);
3620	set_summary(&sum, dn->nid, dn->ofs_in_node, fio->version);
3621	do_write_page(&sum, fio);
3622	f2fs_update_data_blkaddr(dn, fio->new_blkaddr);
3623
3624	f2fs_update_iostat(sbi, dn->inode, fio->io_type, F2FS_BLKSIZE);
3625}
3626
3627int f2fs_inplace_write_data(struct f2fs_io_info *fio)
3628{
3629	int err;
3630	struct f2fs_sb_info *sbi = fio->sbi;
3631	unsigned int segno;
3632
3633	fio->new_blkaddr = fio->old_blkaddr;
3634	/* i/o temperature is needed for passing down write hints */
3635	__get_segment_type(fio);
3636
3637	segno = GET_SEGNO(sbi, fio->new_blkaddr);
3638
3639	if (!IS_DATASEG(get_seg_entry(sbi, segno)->type)) {
3640		set_sbi_flag(sbi, SBI_NEED_FSCK);
3641		f2fs_warn(sbi, "%s: incorrect segment(%u) type, run fsck to fix.",
3642			  __func__, segno);
3643		err = -EFSCORRUPTED;
3644		f2fs_handle_error(sbi, ERROR_INCONSISTENT_SUM_TYPE);
3645		goto drop_bio;
3646	}
3647
3648	if (f2fs_cp_error(sbi)) {
3649		err = -EIO;
3650		goto drop_bio;
3651	}
3652
3653	if (fio->post_read)
3654		f2fs_truncate_meta_inode_pages(sbi, fio->new_blkaddr, 1);
3655
3656	stat_inc_inplace_blocks(fio->sbi);
3657
3658	if (fio->bio && !IS_F2FS_IPU_NOCACHE(sbi))
3659		err = f2fs_merge_page_bio(fio);
3660	else
3661		err = f2fs_submit_page_bio(fio);
3662	if (!err) {
3663		f2fs_update_device_state(fio->sbi, fio->ino,
3664						fio->new_blkaddr, 1);
3665		f2fs_update_iostat(fio->sbi, fio->page->mapping->host,
3666						fio->io_type, F2FS_BLKSIZE);
3667	}
3668
3669	return err;
3670drop_bio:
3671	if (fio->bio && *(fio->bio)) {
3672		struct bio *bio = *(fio->bio);
3673
3674		bio->bi_status = BLK_STS_IOERR;
3675		bio_endio(bio);
3676		*(fio->bio) = NULL;
3677	}
3678	return err;
3679}
3680
3681static inline int __f2fs_get_curseg(struct f2fs_sb_info *sbi,
3682						unsigned int segno)
3683{
3684	int i;
3685
3686	for (i = CURSEG_HOT_DATA; i < NO_CHECK_TYPE; i++) {
3687		if (CURSEG_I(sbi, i)->segno == segno)
3688			break;
3689	}
3690	return i;
3691}
3692
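/*
 * Used by GC and recovery to bind @sum to @new_blkaddr: temporarily make
 * the segment that contains the new address the current one, write the
 * summary at the matching offset, update the SIT entries of the old and
 * new addresses, and optionally restore the previous current segment
 * (recover_curseg).
 */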
3693void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
3694				block_t old_blkaddr, block_t new_blkaddr,
3695				bool recover_curseg, bool recover_newaddr,
3696				bool from_gc)
3697{
3698	struct sit_info *sit_i = SIT_I(sbi);
3699	struct curseg_info *curseg;
3700	unsigned int segno, old_cursegno;
3701	struct seg_entry *se;
3702	int type;
3703	unsigned short old_blkoff;
3704	unsigned char old_alloc_type;
3705
3706	segno = GET_SEGNO(sbi, new_blkaddr);
3707	se = get_seg_entry(sbi, segno);
3708	type = se->type;
3709
3710	f2fs_down_write(&SM_I(sbi)->curseg_lock);
3711
3712	if (!recover_curseg) {
3713		/* for recovery flow */
3714		if (se->valid_blocks == 0 && !IS_CURSEG(sbi, segno)) {
3715			if (old_blkaddr == NULL_ADDR)
3716				type = CURSEG_COLD_DATA;
3717			else
3718				type = CURSEG_WARM_DATA;
3719		}
3720	} else {
3721		if (IS_CURSEG(sbi, segno)) {
3722			/* se->type is volatile due to SSR allocation */
3723			type = __f2fs_get_curseg(sbi, segno);
3724			f2fs_bug_on(sbi, type == NO_CHECK_TYPE);
3725		} else {
3726			type = CURSEG_WARM_DATA;
3727		}
3728	}
3729
3730	f2fs_bug_on(sbi, !IS_DATASEG(type));
3731	curseg = CURSEG_I(sbi, type);
3732
3733	mutex_lock(&curseg->curseg_mutex);
3734	down_write(&sit_i->sentry_lock);
3735
3736	old_cursegno = curseg->segno;
3737	old_blkoff = curseg->next_blkoff;
3738	old_alloc_type = curseg->alloc_type;
3739
3740	/* change the current segment */
3741	if (segno != curseg->segno) {
3742		curseg->next_segno = segno;
3743		change_curseg(sbi, type);
3744	}
3745
3746	curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr);
3747	curseg->sum_blk->entries[curseg->next_blkoff] = *sum;
3748
3749	if (!recover_curseg || recover_newaddr) {
3750		if (!from_gc)
3751			update_segment_mtime(sbi, new_blkaddr, 0);
3752		update_sit_entry(sbi, new_blkaddr, 1);
3753	}
3754	if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) {
3755		f2fs_invalidate_internal_cache(sbi, old_blkaddr);
3756		if (!from_gc)
3757			update_segment_mtime(sbi, old_blkaddr, 0);
3758		update_sit_entry(sbi, old_blkaddr, -1);
3759	}
3760
3761	locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr));
3762	locate_dirty_segment(sbi, GET_SEGNO(sbi, new_blkaddr));
3763
3764	locate_dirty_segment(sbi, old_cursegno);
3765
3766	if (recover_curseg) {
3767		if (old_cursegno != curseg->segno) {
3768			curseg->next_segno = old_cursegno;
3769			change_curseg(sbi, type);
3770		}
3771		curseg->next_blkoff = old_blkoff;
3772		curseg->alloc_type = old_alloc_type;
3773	}
3774
3775	up_write(&sit_i->sentry_lock);
3776	mutex_unlock(&curseg->curseg_mutex);
3777	f2fs_up_write(&SM_I(sbi)->curseg_lock);
3778}
3779
3780void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn,
3781				block_t old_addr, block_t new_addr,
3782				unsigned char version, bool recover_curseg,
3783				bool recover_newaddr)
3784{
3785	struct f2fs_summary sum;
3786
3787	set_summary(&sum, dn->nid, dn->ofs_in_node, version);
3788
3789	f2fs_do_replace_block(sbi, &sum, old_addr, new_addr,
3790					recover_curseg, recover_newaddr, false);
3791
3792	f2fs_update_data_blkaddr(dn, new_addr);
3793}
3794
3795void f2fs_wait_on_page_writeback(struct page *page,
3796				enum page_type type, bool ordered, bool locked)
3797{
3798	if (PageWriteback(page)) {
3799		struct f2fs_sb_info *sbi = F2FS_P_SB(page);
3800
3801		/* submit cached LFS IO */
3802		f2fs_submit_merged_write_cond(sbi, NULL, page, 0, type);
3803		/* submit cached IPU IO */
3804		f2fs_submit_merged_ipu_write(sbi, NULL, page);
3805		if (ordered) {
3806			wait_on_page_writeback(page);
3807			f2fs_bug_on(sbi, locked && PageWriteback(page));
3808		} else {
3809			wait_for_stable_page(page);
3810		}
3811	}
3812}
3813
3814void f2fs_wait_on_block_writeback(struct inode *inode, block_t blkaddr)
3815{
3816	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
3817	struct page *cpage;
3818
3819	if (!f2fs_post_read_required(inode))
3820		return;
3821
3822	if (!__is_valid_data_blkaddr(blkaddr))
3823		return;
3824
3825	cpage = find_lock_page(META_MAPPING(sbi), blkaddr);
3826	if (cpage) {
3827		f2fs_wait_on_page_writeback(cpage, DATA, true, true);
3828		f2fs_put_page(cpage, 1);
3829	}
3830}
3831
3832void f2fs_wait_on_block_writeback_range(struct inode *inode, block_t blkaddr,
3833								block_t len)
3834{
3835	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
3836	block_t i;
3837
3838	if (!f2fs_post_read_required(inode))
3839		return;
3840
3841	for (i = 0; i < len; i++)
3842		f2fs_wait_on_block_writeback(inode, blkaddr + i);
3843
3844	f2fs_truncate_meta_inode_pages(sbi, blkaddr, len);
3845}
3846
3847static int read_compacted_summaries(struct f2fs_sb_info *sbi)
3848{
3849	struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
3850	struct curseg_info *seg_i;
3851	unsigned char *kaddr;
3852	struct page *page;
3853	block_t start;
3854	int i, j, offset;
3855
3856	start = start_sum_block(sbi);
3857
3858	page = f2fs_get_meta_page(sbi, start++);
3859	if (IS_ERR(page))
3860		return PTR_ERR(page);
3861	kaddr = (unsigned char *)page_address(page);
3862
3863	/* Step 1: restore nat cache */
3864	seg_i = CURSEG_I(sbi, CURSEG_HOT_DATA);
3865	memcpy(seg_i->journal, kaddr, SUM_JOURNAL_SIZE);
3866
3867	/* Step 2: restore sit cache */
3868	seg_i = CURSEG_I(sbi, CURSEG_COLD_DATA);
3869	memcpy(seg_i->journal, kaddr + SUM_JOURNAL_SIZE, SUM_JOURNAL_SIZE);
3870	offset = 2 * SUM_JOURNAL_SIZE;
3871
3872	/* Step 3: restore summary entries */
3873	for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
3874		unsigned short blk_off;
3875		unsigned int segno;
3876
3877		seg_i = CURSEG_I(sbi, i);
3878		segno = le32_to_cpu(ckpt->cur_data_segno[i]);
3879		blk_off = le16_to_cpu(ckpt->cur_data_blkoff[i]);
3880		seg_i->next_segno = segno;
3881		reset_curseg(sbi, i, 0);
3882		seg_i->alloc_type = ckpt->alloc_type[i];
3883		seg_i->next_blkoff = blk_off;
3884
3885		if (seg_i->alloc_type == SSR)
3886			blk_off = sbi->blocks_per_seg;
3887
3888		for (j = 0; j < blk_off; j++) {
3889			struct f2fs_summary *s;
3890
3891			s = (struct f2fs_summary *)(kaddr + offset);
3892			seg_i->sum_blk->entries[j] = *s;
3893			offset += SUMMARY_SIZE;
3894			if (offset + SUMMARY_SIZE <= PAGE_SIZE -
3895						SUM_FOOTER_SIZE)
3896				continue;
3897
3898			f2fs_put_page(page, 1);
3899			page = NULL;
3900
3901			page = f2fs_get_meta_page(sbi, start++);
3902			if (IS_ERR(page))
3903				return PTR_ERR(page);
3904			kaddr = (unsigned char *)page_address(page);
3905			offset = 0;
3906		}
3907	}
3908	f2fs_put_page(page, 1);
3909	return 0;
3910}
3911
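/*
 * Restore one current segment of @type from its full summary block,
 * reading it from the checkpoint pack when available or from the SSA,
 * and rebuilding node summary entries when they were not persisted.
 */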
3912static int read_normal_summaries(struct f2fs_sb_info *sbi, int type)
3913{
3914	struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
3915	struct f2fs_summary_block *sum;
3916	struct curseg_info *curseg;
3917	struct page *new;
3918	unsigned short blk_off;
3919	unsigned int segno = 0;
3920	block_t blk_addr = 0;
3921	int err = 0;
3922
3923	/* get segment number and block addr */
3924	if (IS_DATASEG(type)) {
3925		segno = le32_to_cpu(ckpt->cur_data_segno[type]);
3926		blk_off = le16_to_cpu(ckpt->cur_data_blkoff[type -
3927							CURSEG_HOT_DATA]);
3928		if (__exist_node_summaries(sbi))
3929			blk_addr = sum_blk_addr(sbi, NR_CURSEG_PERSIST_TYPE, type);
3930		else
3931			blk_addr = sum_blk_addr(sbi, NR_CURSEG_DATA_TYPE, type);
3932	} else {
3933		segno = le32_to_cpu(ckpt->cur_node_segno[type -
3934							CURSEG_HOT_NODE]);
3935		blk_off = le16_to_cpu(ckpt->cur_node_blkoff[type -
3936							CURSEG_HOT_NODE]);
3937		if (__exist_node_summaries(sbi))
3938			blk_addr = sum_blk_addr(sbi, NR_CURSEG_NODE_TYPE,
3939							type - CURSEG_HOT_NODE);
3940		else
3941			blk_addr = GET_SUM_BLOCK(sbi, segno);
3942	}
3943
3944	new = f2fs_get_meta_page(sbi, blk_addr);
3945	if (IS_ERR(new))
3946		return PTR_ERR(new);
3947	sum = (struct f2fs_summary_block *)page_address(new);
3948
3949	if (IS_NODESEG(type)) {
3950		if (__exist_node_summaries(sbi)) {
3951			struct f2fs_summary *ns = &sum->entries[0];
3952			int i;
3953
3954			for (i = 0; i < sbi->blocks_per_seg; i++, ns++) {
3955				ns->version = 0;
3956				ns->ofs_in_node = 0;
3957			}
3958		} else {
3959			err = f2fs_restore_node_summary(sbi, segno, sum);
3960			if (err)
3961				goto out;
3962		}
3963	}
3964
3965	/* set the uncompleted segment as curseg */
3966	curseg = CURSEG_I(sbi, type);
3967	mutex_lock(&curseg->curseg_mutex);
3968
3969	/* update journal info */
3970	down_write(&curseg->journal_rwsem);
3971	memcpy(curseg->journal, &sum->journal, SUM_JOURNAL_SIZE);
3972	up_write(&curseg->journal_rwsem);
3973
3974	memcpy(curseg->sum_blk->entries, sum->entries, SUM_ENTRY_SIZE);
3975	memcpy(&curseg->sum_blk->footer, &sum->footer, SUM_FOOTER_SIZE);
3976	curseg->next_segno = segno;
3977	reset_curseg(sbi, type, 0);
3978	curseg->alloc_type = ckpt->alloc_type[type];
3979	curseg->next_blkoff = blk_off;
3980	mutex_unlock(&curseg->curseg_mutex);
3981out:
3982	f2fs_put_page(new, 1);
3983	return err;
3984}
3985
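/*
 * Restore all current segment summaries at mount time, handling both the
 * compacted and the normal layouts, then sanity check the number of NAT
 * and SIT journal entries that were brought back.
 */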
3986static int restore_curseg_summaries(struct f2fs_sb_info *sbi)
3987{
3988	struct f2fs_journal *sit_j = CURSEG_I(sbi, CURSEG_COLD_DATA)->journal;
3989	struct f2fs_journal *nat_j = CURSEG_I(sbi, CURSEG_HOT_DATA)->journal;
3990	int type = CURSEG_HOT_DATA;
3991	int err;
3992
3993	if (is_set_ckpt_flags(sbi, CP_COMPACT_SUM_FLAG)) {
3994		int npages = f2fs_npages_for_summary_flush(sbi, true);
3995
3996		if (npages >= 2)
3997			f2fs_ra_meta_pages(sbi, start_sum_block(sbi), npages,
3998							META_CP, true);
3999
4000		/* restore for compacted data summary */
4001		err = read_compacted_summaries(sbi);
4002		if (err)
4003			return err;
4004		type = CURSEG_HOT_NODE;
4005	}
4006
4007	if (__exist_node_summaries(sbi))
4008		f2fs_ra_meta_pages(sbi,
4009				sum_blk_addr(sbi, NR_CURSEG_PERSIST_TYPE, type),
4010				NR_CURSEG_PERSIST_TYPE - type, META_CP, true);
4011
4012	for (; type <= CURSEG_COLD_NODE; type++) {
4013		err = read_normal_summaries(sbi, type);
4014		if (err)
4015			return err;
4016	}
4017
4018	/* sanity check for summary blocks */
4019	if (nats_in_cursum(nat_j) > NAT_JOURNAL_ENTRIES ||
4020			sits_in_cursum(sit_j) > SIT_JOURNAL_ENTRIES) {
4021		f2fs_err(sbi, "invalid journal entries nats %u sits %u",
4022			 nats_in_cursum(nat_j), sits_in_cursum(sit_j));
4023		return -EINVAL;
4024	}
4025
4026	return 0;
4027}
4028
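/*
 * Write the NAT/SIT journals and the data curseg summary entries in
 * compacted form, packing them into consecutive meta pages starting at
 * @blkaddr.
 */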
4029static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr)
4030{
4031	struct page *page;
4032	unsigned char *kaddr;
4033	struct f2fs_summary *summary;
4034	struct curseg_info *seg_i;
4035	int written_size = 0;
4036	int i, j;
4037
4038	page = f2fs_grab_meta_page(sbi, blkaddr++);
4039	kaddr = (unsigned char *)page_address(page);
4040	memset(kaddr, 0, PAGE_SIZE);
4041
4042	/* Step 1: write nat cache */
4043	seg_i = CURSEG_I(sbi, CURSEG_HOT_DATA);
4044	memcpy(kaddr, seg_i->journal, SUM_JOURNAL_SIZE);
4045	written_size += SUM_JOURNAL_SIZE;
4046
4047	/* Step 2: write sit cache */
4048	seg_i = CURSEG_I(sbi, CURSEG_COLD_DATA);
4049	memcpy(kaddr + written_size, seg_i->journal, SUM_JOURNAL_SIZE);
4050	written_size += SUM_JOURNAL_SIZE;
4051
4052	/* Step 3: write summary entries */
4053	for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
4054		seg_i = CURSEG_I(sbi, i);
4055		for (j = 0; j < f2fs_curseg_valid_blocks(sbi, i); j++) {
4056			if (!page) {
4057				page = f2fs_grab_meta_page(sbi, blkaddr++);
4058				kaddr = (unsigned char *)page_address(page);
4059				memset(kaddr, 0, PAGE_SIZE);
4060				written_size = 0;
4061			}
4062			summary = (struct f2fs_summary *)(kaddr + written_size);
4063			*summary = seg_i->sum_blk->entries[j];
4064			written_size += SUMMARY_SIZE;
4065
4066			if (written_size + SUMMARY_SIZE <= PAGE_SIZE -
4067							SUM_FOOTER_SIZE)
4068				continue;
4069
4070			set_page_dirty(page);
4071			f2fs_put_page(page, 1);
4072			page = NULL;
4073		}
4074	}
4075	if (page) {
4076		set_page_dirty(page);
4077		f2fs_put_page(page, 1);
4078	}
4079}
4080
4081static void write_normal_summaries(struct f2fs_sb_info *sbi,
4082					block_t blkaddr, int type)
4083{
4084	int i, end;
4085
4086	if (IS_DATASEG(type))
4087		end = type + NR_CURSEG_DATA_TYPE;
4088	else
4089		end = type + NR_CURSEG_NODE_TYPE;
4090
4091	for (i = type; i < end; i++)
4092		write_current_sum_page(sbi, i, blkaddr + (i - type));
4093}
4094
4095void f2fs_write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk)
4096{
4097	if (is_set_ckpt_flags(sbi, CP_COMPACT_SUM_FLAG))
4098		write_compacted_summaries(sbi, start_blk);
4099	else
4100		write_normal_summaries(sbi, start_blk, CURSEG_HOT_DATA);
4101}
4102
4103void f2fs_write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk)
4104{
4105	write_normal_summaries(sbi, start_blk, CURSEG_HOT_NODE);
4106}
4107
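/*
 * Look up the NAT or SIT journal entry matching @val.  Return its index,
 * or, when @alloc is set and space remains, the index of a newly
 * reserved slot; -1 otherwise.
 */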
4108int f2fs_lookup_journal_in_cursum(struct f2fs_journal *journal, int type,
4109					unsigned int val, int alloc)
4110{
4111	int i;
4112
4113	if (type == NAT_JOURNAL) {
4114		for (i = 0; i < nats_in_cursum(journal); i++) {
4115			if (le32_to_cpu(nid_in_journal(journal, i)) == val)
4116				return i;
4117		}
4118		if (alloc && __has_cursum_space(journal, 1, NAT_JOURNAL))
4119			return update_nats_in_cursum(journal, 1);
4120	} else if (type == SIT_JOURNAL) {
4121		for (i = 0; i < sits_in_cursum(journal); i++)
4122			if (le32_to_cpu(segno_in_journal(journal, i)) == val)
4123				return i;
4124		if (alloc && __has_cursum_space(journal, 1, SIT_JOURNAL))
4125			return update_sits_in_cursum(journal, 1);
4126	}
4127	return -1;
4128}
4129
4130static struct page *get_current_sit_page(struct f2fs_sb_info *sbi,
4131					unsigned int segno)
4132{
4133	return f2fs_get_meta_page(sbi, current_sit_addr(sbi, segno));
4134}
4135
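/*
 * Grab the meta page for the next (shadow) copy of the SIT block that
 * covers @start, fill it from the in-memory segment entries, dirty it
 * and flip the SIT bitmap so this copy becomes the valid one.
 */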
4136static struct page *get_next_sit_page(struct f2fs_sb_info *sbi,
4137					unsigned int start)
4138{
4139	struct sit_info *sit_i = SIT_I(sbi);
4140	struct page *page;
4141	pgoff_t src_off, dst_off;
4142
4143	src_off = current_sit_addr(sbi, start);
4144	dst_off = next_sit_addr(sbi, src_off);
4145
4146	page = f2fs_grab_meta_page(sbi, dst_off);
4147	seg_info_to_sit_page(sbi, page, start);
4148
4149	set_page_dirty(page);
4150	set_to_next_sit(sit_i, start);
4151
4152	return page;
4153}
4154
4155static struct sit_entry_set *grab_sit_entry_set(void)
4156{
4157	struct sit_entry_set *ses =
4158			f2fs_kmem_cache_alloc(sit_entry_set_slab,
4159						GFP_NOFS, true, NULL);
4160
4161	ses->entry_cnt = 0;
4162	INIT_LIST_HEAD(&ses->set_list);
4163	return ses;
4164}
4165
4166static void release_sit_entry_set(struct sit_entry_set *ses)
4167{
4168	list_del(&ses->set_list);
4169	kmem_cache_free(sit_entry_set_slab, ses);
4170}
4171
4172static void adjust_sit_entry_set(struct sit_entry_set *ses,
4173						struct list_head *head)
4174{
4175	struct sit_entry_set *next = ses;
4176
4177	if (list_is_last(&ses->set_list, head))
4178		return;
4179
4180	list_for_each_entry_continue(next, head, set_list)
4181		if (ses->entry_cnt <= next->entry_cnt) {
4182			list_move_tail(&ses->set_list, &next->set_list);
4183			return;
4184		}
4185
4186	list_move_tail(&ses->set_list, head);
4187}
4188
4189static void add_sit_entry(unsigned int segno, struct list_head *head)
4190{
4191	struct sit_entry_set *ses;
4192	unsigned int start_segno = START_SEGNO(segno);
4193
4194	list_for_each_entry(ses, head, set_list) {
4195		if (ses->start_segno == start_segno) {
4196			ses->entry_cnt++;
4197			adjust_sit_entry_set(ses, head);
4198			return;
4199		}
4200	}
4201
4202	ses = grab_sit_entry_set();
4203
4204	ses->start_segno = start_segno;
4205	ses->entry_cnt++;
4206	list_add(&ses->set_list, head);
4207}
4208
4209static void add_sits_in_set(struct f2fs_sb_info *sbi)
4210{
4211	struct f2fs_sm_info *sm_info = SM_I(sbi);
4212	struct list_head *set_list = &sm_info->sit_entry_set;
4213	unsigned long *bitmap = SIT_I(sbi)->dirty_sentries_bitmap;
4214	unsigned int segno;
4215
4216	for_each_set_bit(segno, bitmap, MAIN_SEGS(sbi))
4217		add_sit_entry(segno, set_list);
4218}
4219
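/*
 * Drain the SIT journal: mark every journaled segment dirty, add a set
 * entry for segments that were not dirty already, and reset the journal
 * entry count to zero.
 */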
4220static void remove_sits_in_journal(struct f2fs_sb_info *sbi)
4221{
4222	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA);
4223	struct f2fs_journal *journal = curseg->journal;
4224	int i;
4225
4226	down_write(&curseg->journal_rwsem);
4227	for (i = 0; i < sits_in_cursum(journal); i++) {
4228		unsigned int segno;
4229		bool dirtied;
4230
4231		segno = le32_to_cpu(segno_in_journal(journal, i));
4232		dirtied = __mark_sit_entry_dirty(sbi, segno);
4233
4234		if (!dirtied)
4235			add_sit_entry(segno, &SM_I(sbi)->sit_entry_set);
4236	}
4237	update_sits_in_cursum(journal, -i);
4238	up_write(&curseg->journal_rwsem);
4239}
4240
4241/*
4242 * CP calls this function, which flushes SIT entries including sit_journal,
4243 * and moves prefree segs to free segs.
4244 */
4245void f2fs_flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
4246{
4247	struct sit_info *sit_i = SIT_I(sbi);
4248	unsigned long *bitmap = sit_i->dirty_sentries_bitmap;
4249	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA);
4250	struct f2fs_journal *journal = curseg->journal;
4251	struct sit_entry_set *ses, *tmp;
4252	struct list_head *head = &SM_I(sbi)->sit_entry_set;
4253	bool to_journal = !is_sbi_flag_set(sbi, SBI_IS_RESIZEFS);
4254	struct seg_entry *se;
4255
4256	down_write(&sit_i->sentry_lock);
4257
4258	if (!sit_i->dirty_sentries)
4259		goto out;
4260
4261	/*
4262	 * add and account sit entries of dirty bitmap in sit entry
4263	 * set temporarily
4264	 */
4265	add_sits_in_set(sbi);
4266
4267	/*
4268	 * if there is not enough space in the journal to store dirty sit
4269	 * entries, remove all entries from the journal and add and account
4270	 * them in the sit entry set.
4271	 */
4272	if (!__has_cursum_space(journal, sit_i->dirty_sentries, SIT_JOURNAL) ||
4273								!to_journal)
4274		remove_sits_in_journal(sbi);
4275
4276	/*
4277	 * there are two steps to flush sit entries:
4278	 * #1, flush sit entries to journal in current cold data summary block.
4279	 * #2, flush sit entries to sit page.
4280	 */
4281	list_for_each_entry_safe(ses, tmp, head, set_list) {
4282		struct page *page = NULL;
4283		struct f2fs_sit_block *raw_sit = NULL;
4284		unsigned int start_segno = ses->start_segno;
4285		unsigned int end = min(start_segno + SIT_ENTRY_PER_BLOCK,
4286						(unsigned long)MAIN_SEGS(sbi));
4287		unsigned int segno = start_segno;
4288
4289		if (to_journal &&
4290			!__has_cursum_space(journal, ses->entry_cnt, SIT_JOURNAL))
4291			to_journal = false;
4292
4293		if (to_journal) {
4294			down_write(&curseg->journal_rwsem);
4295		} else {
4296			page = get_next_sit_page(sbi, start_segno);
4297			raw_sit = page_address(page);
4298		}
4299
4300		/* flush dirty sit entries in region of current sit set */
4301		for_each_set_bit_from(segno, bitmap, end) {
4302			int offset, sit_offset;
4303
4304			se = get_seg_entry(sbi, segno);
4305#ifdef CONFIG_F2FS_CHECK_FS
4306			if (memcmp(se->cur_valid_map, se->cur_valid_map_mir,
4307						SIT_VBLOCK_MAP_SIZE))
4308				f2fs_bug_on(sbi, 1);
4309#endif
4310
4311			/* add discard candidates */
4312			if (!(cpc->reason & CP_DISCARD)) {
4313				cpc->trim_start = segno;
4314				add_discard_addrs(sbi, cpc, false);
4315			}
4316
4317			if (to_journal) {
4318				offset = f2fs_lookup_journal_in_cursum(journal,
4319							SIT_JOURNAL, segno, 1);
4320				f2fs_bug_on(sbi, offset < 0);
4321				segno_in_journal(journal, offset) =
4322							cpu_to_le32(segno);
4323				seg_info_to_raw_sit(se,
4324					&sit_in_journal(journal, offset));
4325				check_block_count(sbi, segno,
4326					&sit_in_journal(journal, offset));
4327			} else {
4328				sit_offset = SIT_ENTRY_OFFSET(sit_i, segno);
4329				seg_info_to_raw_sit(se,
4330						&raw_sit->entries[sit_offset]);
4331				check_block_count(sbi, segno,
4332						&raw_sit->entries[sit_offset]);
4333			}
4334
4335			__clear_bit(segno, bitmap);
4336			sit_i->dirty_sentries--;
4337			ses->entry_cnt--;
4338		}
4339
4340		if (to_journal)
4341			up_write(&curseg->journal_rwsem);
4342		else
4343			f2fs_put_page(page, 1);
4344
4345		f2fs_bug_on(sbi, ses->entry_cnt);
4346		release_sit_entry_set(ses);
4347	}
4348
4349	f2fs_bug_on(sbi, !list_empty(head));
4350	f2fs_bug_on(sbi, sit_i->dirty_sentries);
4351out:
4352	if (cpc->reason & CP_DISCARD) {
4353		__u64 trim_start = cpc->trim_start;
4354
4355		for (; cpc->trim_start <= cpc->trim_end; cpc->trim_start++)
4356			add_discard_addrs(sbi, cpc, false);
4357
4358		cpc->trim_start = trim_start;
4359	}
4360	up_write(&sit_i->sentry_lock);
4361
4362	set_prefree_as_free_segments(sbi);
4363}
4364
4365static int build_sit_info(struct f2fs_sb_info *sbi)
4366{
4367	struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
4368	struct sit_info *sit_i;
4369	unsigned int sit_segs, start;
4370	char *src_bitmap, *bitmap;
4371	unsigned int bitmap_size, main_bitmap_size, sit_bitmap_size;
4372	unsigned int discard_map = f2fs_block_unit_discard(sbi) ? 1 : 0;
4373
4374	/* allocate memory for SIT information */
4375	sit_i = f2fs_kzalloc(sbi, sizeof(struct sit_info), GFP_KERNEL);
4376	if (!sit_i)
4377		return -ENOMEM;
4378
4379	SM_I(sbi)->sit_info = sit_i;
4380
4381	sit_i->sentries =
4382		f2fs_kvzalloc(sbi, array_size(sizeof(struct seg_entry),
4383					      MAIN_SEGS(sbi)),
4384			      GFP_KERNEL);
4385	if (!sit_i->sentries)
4386		return -ENOMEM;
4387
4388	main_bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi));
4389	sit_i->dirty_sentries_bitmap = f2fs_kvzalloc(sbi, main_bitmap_size,
4390								GFP_KERNEL);
4391	if (!sit_i->dirty_sentries_bitmap)
4392		return -ENOMEM;
4393
4394#ifdef CONFIG_F2FS_CHECK_FS
4395	bitmap_size = MAIN_SEGS(sbi) * SIT_VBLOCK_MAP_SIZE * (3 + discard_map);
4396#else
4397	bitmap_size = MAIN_SEGS(sbi) * SIT_VBLOCK_MAP_SIZE * (2 + discard_map);
4398#endif
4399	sit_i->bitmap = f2fs_kvzalloc(sbi, bitmap_size, GFP_KERNEL);
4400	if (!sit_i->bitmap)
4401		return -ENOMEM;
4402
4403	bitmap = sit_i->bitmap;
4404
4405	for (start = 0; start < MAIN_SEGS(sbi); start++) {
4406		sit_i->sentries[start].cur_valid_map = bitmap;
4407		bitmap += SIT_VBLOCK_MAP_SIZE;
4408
4409		sit_i->sentries[start].ckpt_valid_map = bitmap;
4410		bitmap += SIT_VBLOCK_MAP_SIZE;
4411
4412#ifdef CONFIG_F2FS_CHECK_FS
4413		sit_i->sentries[start].cur_valid_map_mir = bitmap;
4414		bitmap += SIT_VBLOCK_MAP_SIZE;
4415#endif
4416
4417		if (discard_map) {
4418			sit_i->sentries[start].discard_map = bitmap;
4419			bitmap += SIT_VBLOCK_MAP_SIZE;
4420		}
4421	}
4422
4423	sit_i->tmp_map = f2fs_kzalloc(sbi, SIT_VBLOCK_MAP_SIZE, GFP_KERNEL);
4424	if (!sit_i->tmp_map)
4425		return -ENOMEM;
4426
4427	if (__is_large_section(sbi)) {
4428		sit_i->sec_entries =
4429			f2fs_kvzalloc(sbi, array_size(sizeof(struct sec_entry),
4430						      MAIN_SECS(sbi)),
4431				      GFP_KERNEL);
4432		if (!sit_i->sec_entries)
4433			return -ENOMEM;
4434	}
4435
4436	/* get information related to SIT */
4437	sit_segs = le32_to_cpu(raw_super->segment_count_sit) >> 1;
4438
4439	/* setup SIT bitmap from checkpoint pack */
4440	sit_bitmap_size = __bitmap_size(sbi, SIT_BITMAP);
4441	src_bitmap = __bitmap_ptr(sbi, SIT_BITMAP);
4442
4443	sit_i->sit_bitmap = kmemdup(src_bitmap, sit_bitmap_size, GFP_KERNEL);
4444	if (!sit_i->sit_bitmap)
4445		return -ENOMEM;
4446
4447#ifdef CONFIG_F2FS_CHECK_FS
4448	sit_i->sit_bitmap_mir = kmemdup(src_bitmap,
4449					sit_bitmap_size, GFP_KERNEL);
4450	if (!sit_i->sit_bitmap_mir)
4451		return -ENOMEM;
4452
4453	sit_i->invalid_segmap = f2fs_kvzalloc(sbi,
4454					main_bitmap_size, GFP_KERNEL);
4455	if (!sit_i->invalid_segmap)
4456		return -ENOMEM;
4457#endif
4458
4459	sit_i->sit_base_addr = le32_to_cpu(raw_super->sit_blkaddr);
4460	sit_i->sit_blocks = sit_segs << sbi->log_blocks_per_seg;
4461	sit_i->written_valid_blocks = 0;
4462	sit_i->bitmap_size = sit_bitmap_size;
4463	sit_i->dirty_sentries = 0;
4464	sit_i->sents_per_block = SIT_ENTRY_PER_BLOCK;
4465	sit_i->elapsed_time = le64_to_cpu(sbi->ckpt->elapsed_time);
4466	sit_i->mounted_time = ktime_get_boottime_seconds();
4467	init_rwsem(&sit_i->sentry_lock);
4468	return 0;
4469}
4470
4471static int build_free_segmap(struct f2fs_sb_info *sbi)
4472{
4473	struct free_segmap_info *free_i;
4474	unsigned int bitmap_size, sec_bitmap_size;
4475
4476	/* allocate memory for free segmap information */
4477	free_i = f2fs_kzalloc(sbi, sizeof(struct free_segmap_info), GFP_KERNEL);
4478	if (!free_i)
4479		return -ENOMEM;
4480
4481	SM_I(sbi)->free_info = free_i;
4482
4483	bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi));
4484	free_i->free_segmap = f2fs_kvmalloc(sbi, bitmap_size, GFP_KERNEL);
4485	if (!free_i->free_segmap)
4486		return -ENOMEM;
4487
4488	sec_bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi));
4489	free_i->free_secmap = f2fs_kvmalloc(sbi, sec_bitmap_size, GFP_KERNEL);
4490	if (!free_i->free_secmap)
4491		return -ENOMEM;
4492
4493	/* set all segments as dirty temporarily */
4494	memset(free_i->free_segmap, 0xff, bitmap_size);
4495	memset(free_i->free_secmap, 0xff, sec_bitmap_size);
4496
4497	/* init free segmap information */
4498	free_i->start_segno = GET_SEGNO_FROM_SEG0(sbi, MAIN_BLKADDR(sbi));
4499	free_i->free_segments = 0;
4500	free_i->free_sections = 0;
4501	spin_lock_init(&free_i->segmap_lock);
4502	return 0;
4503}
4504
4505static int build_curseg(struct f2fs_sb_info *sbi)
4506{
4507	struct curseg_info *array;
4508	int i;
4509
4510	array = f2fs_kzalloc(sbi, array_size(NR_CURSEG_TYPE,
4511					sizeof(*array)), GFP_KERNEL);
4512	if (!array)
4513		return -ENOMEM;
4514
4515	SM_I(sbi)->curseg_array = array;
4516
4517	for (i = 0; i < NO_CHECK_TYPE; i++) {
4518		mutex_init(&array[i].curseg_mutex);
4519		array[i].sum_blk = f2fs_kzalloc(sbi, PAGE_SIZE, GFP_KERNEL);
4520		if (!array[i].sum_blk)
4521			return -ENOMEM;
4522		init_rwsem(&array[i].journal_rwsem);
4523		array[i].journal = f2fs_kzalloc(sbi,
4524				sizeof(struct f2fs_journal), GFP_KERNEL);
4525		if (!array[i].journal)
4526			return -ENOMEM;
4527		if (i < NR_PERSISTENT_LOG)
4528			array[i].seg_type = CURSEG_HOT_DATA + i;
4529		else if (i == CURSEG_COLD_DATA_PINNED)
4530			array[i].seg_type = CURSEG_COLD_DATA;
4531		else if (i == CURSEG_ALL_DATA_ATGC)
4532			array[i].seg_type = CURSEG_COLD_DATA;
4533		array[i].segno = NULL_SEGNO;
4534		array[i].next_blkoff = 0;
4535		array[i].inited = false;
4536	}
4537	return restore_curseg_summaries(sbi);
4538}
4539
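/*
 * Build the in-memory segment entries from the on-disk SIT blocks, then
 * overlay the newer entries kept in the SIT journal, checking block
 * counts and segment types and cross-checking the totals against the
 * checkpointed node and user block counts.
 */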
4540static int build_sit_entries(struct f2fs_sb_info *sbi)
4541{
4542	struct sit_info *sit_i = SIT_I(sbi);
4543	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA);
4544	struct f2fs_journal *journal = curseg->journal;
4545	struct seg_entry *se;
4546	struct f2fs_sit_entry sit;
4547	int sit_blk_cnt = SIT_BLK_CNT(sbi);
4548	unsigned int i, start, end;
4549	unsigned int readed, start_blk = 0;
4550	int err = 0;
4551	block_t sit_valid_blocks[2] = {0, 0};
4552
4553	do {
4554		readed = f2fs_ra_meta_pages(sbi, start_blk, BIO_MAX_VECS,
4555							META_SIT, true);
4556
4557		start = start_blk * sit_i->sents_per_block;
4558		end = (start_blk + readed) * sit_i->sents_per_block;
4559
4560		for (; start < end && start < MAIN_SEGS(sbi); start++) {
4561			struct f2fs_sit_block *sit_blk;
4562			struct page *page;
4563
4564			se = &sit_i->sentries[start];
4565			page = get_current_sit_page(sbi, start);
4566			if (IS_ERR(page))
4567				return PTR_ERR(page);
4568			sit_blk = (struct f2fs_sit_block *)page_address(page);
4569			sit = sit_blk->entries[SIT_ENTRY_OFFSET(sit_i, start)];
4570			f2fs_put_page(page, 1);
4571
4572			err = check_block_count(sbi, start, &sit);
4573			if (err)
4574				return err;
4575			seg_info_from_raw_sit(se, &sit);
4576
4577			if (se->type >= NR_PERSISTENT_LOG) {
4578				f2fs_err(sbi, "Invalid segment type: %u, segno: %u",
4579							se->type, start);
4580				f2fs_handle_error(sbi,
4581						ERROR_INCONSISTENT_SUM_TYPE);
4582				return -EFSCORRUPTED;
4583			}
4584
4585			sit_valid_blocks[SE_PAGETYPE(se)] += se->valid_blocks;
4586
4587			if (f2fs_block_unit_discard(sbi)) {
4588				/* build discard map only one time */
4589				if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) {
4590					memset(se->discard_map, 0xff,
4591						SIT_VBLOCK_MAP_SIZE);
4592				} else {
4593					memcpy(se->discard_map,
4594						se->cur_valid_map,
4595						SIT_VBLOCK_MAP_SIZE);
4596					sbi->discard_blks +=
4597						sbi->blocks_per_seg -
4598						se->valid_blocks;
4599				}
4600			}
4601
4602			if (__is_large_section(sbi))
4603				get_sec_entry(sbi, start)->valid_blocks +=
4604							se->valid_blocks;
4605		}
4606		start_blk += readed;
4607	} while (start_blk < sit_blk_cnt);
4608
4609	down_read(&curseg->journal_rwsem);
4610	for (i = 0; i < sits_in_cursum(journal); i++) {
4611		unsigned int old_valid_blocks;
4612
4613		start = le32_to_cpu(segno_in_journal(journal, i));
4614		if (start >= MAIN_SEGS(sbi)) {
4615			f2fs_err(sbi, "Wrong journal entry on segno %u",
4616				 start);
4617			err = -EFSCORRUPTED;
4618			f2fs_handle_error(sbi, ERROR_CORRUPTED_JOURNAL);
4619			break;
4620		}
4621
4622		se = &sit_i->sentries[start];
4623		sit = sit_in_journal(journal, i);
4624
4625		old_valid_blocks = se->valid_blocks;
4626
4627		sit_valid_blocks[SE_PAGETYPE(se)] -= old_valid_blocks;
4628
4629		err = check_block_count(sbi, start, &sit);
4630		if (err)
4631			break;
4632		seg_info_from_raw_sit(se, &sit);
4633
4634		if (se->type >= NR_PERSISTENT_LOG) {
4635			f2fs_err(sbi, "Invalid segment type: %u, segno: %u",
4636							se->type, start);
4637			err = -EFSCORRUPTED;
4638			f2fs_handle_error(sbi, ERROR_INCONSISTENT_SUM_TYPE);
4639			break;
4640		}
4641
4642		sit_valid_blocks[SE_PAGETYPE(se)] += se->valid_blocks;
4643
4644		if (f2fs_block_unit_discard(sbi)) {
4645			if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) {
4646				memset(se->discard_map, 0xff, SIT_VBLOCK_MAP_SIZE);
4647			} else {
4648				memcpy(se->discard_map, se->cur_valid_map,
4649							SIT_VBLOCK_MAP_SIZE);
4650				sbi->discard_blks += old_valid_blocks;
4651				sbi->discard_blks -= se->valid_blocks;
4652			}
4653		}
4654
4655		if (__is_large_section(sbi)) {
4656			get_sec_entry(sbi, start)->valid_blocks +=
4657							se->valid_blocks;
4658			get_sec_entry(sbi, start)->valid_blocks -=
4659							old_valid_blocks;
4660		}
4661	}
4662	up_read(&curseg->journal_rwsem);
4663
4664	if (err)
4665		return err;
4666
4667	if (sit_valid_blocks[NODE] != valid_node_count(sbi)) {
4668		f2fs_err(sbi, "SIT is corrupted node# %u vs %u",
4669			 sit_valid_blocks[NODE], valid_node_count(sbi));
4670		f2fs_handle_error(sbi, ERROR_INCONSISTENT_NODE_COUNT);
4671		return -EFSCORRUPTED;
4672	}
4673
4674	if (sit_valid_blocks[DATA] + sit_valid_blocks[NODE] >
4675				valid_user_blocks(sbi)) {
4676		f2fs_err(sbi, "SIT is corrupted data# %u %u vs %u",
4677			 sit_valid_blocks[DATA], sit_valid_blocks[NODE],
4678			 valid_user_blocks(sbi));
4679		f2fs_handle_error(sbi, ERROR_INCONSISTENT_BLOCK_COUNT);
4680		return -EFSCORRUPTED;
4681	}
4682
4683	return 0;
4684}
4685
4686static void init_free_segmap(struct f2fs_sb_info *sbi)
4687{
4688	unsigned int start;
4689	int type;
4690	struct seg_entry *sentry;
4691
4692	for (start = 0; start < MAIN_SEGS(sbi); start++) {
4693		if (f2fs_usable_blks_in_seg(sbi, start) == 0)
4694			continue;
4695		sentry = get_seg_entry(sbi, start);
4696		if (!sentry->valid_blocks)
4697			__set_free(sbi, start);
4698		else
4699			SIT_I(sbi)->written_valid_blocks +=
4700						sentry->valid_blocks;
4701	}
4702
4703	/* mark the current segments as in use */
4704	for (type = CURSEG_HOT_DATA; type <= CURSEG_COLD_NODE; type++) {
4705		struct curseg_info *curseg_t = CURSEG_I(sbi, type);
4706
4707		__set_test_and_inuse(sbi, curseg_t->segno);
4708	}
4709}
4710
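/*
 * Mark partially valid in-use segments as dirty; for large sections,
 * also record partially valid, non-current sections in dirty_secmap.
 */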
4711static void init_dirty_segmap(struct f2fs_sb_info *sbi)
4712{
4713	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
4714	struct free_segmap_info *free_i = FREE_I(sbi);
4715	unsigned int segno = 0, offset = 0, secno;
4716	block_t valid_blocks, usable_blks_in_seg;
4717
4718	while (1) {
4719		/* find dirty segment based on free segmap */
4720		segno = find_next_inuse(free_i, MAIN_SEGS(sbi), offset);
4721		if (segno >= MAIN_SEGS(sbi))
4722			break;
4723		offset = segno + 1;
4724		valid_blocks = get_valid_blocks(sbi, segno, false);
4725		usable_blks_in_seg = f2fs_usable_blks_in_seg(sbi, segno);
4726		if (valid_blocks == usable_blks_in_seg || !valid_blocks)
4727			continue;
4728		if (valid_blocks > usable_blks_in_seg) {
4729			f2fs_bug_on(sbi, 1);
4730			continue;
4731		}
4732		mutex_lock(&dirty_i->seglist_lock);
4733		__locate_dirty_segment(sbi, segno, DIRTY);
4734		mutex_unlock(&dirty_i->seglist_lock);
4735	}
4736
4737	if (!__is_large_section(sbi))
4738		return;
4739
4740	mutex_lock(&dirty_i->seglist_lock);
4741	for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) {
4742		valid_blocks = get_valid_blocks(sbi, segno, true);
4743		secno = GET_SEC_FROM_SEG(sbi, segno);
4744
4745		if (!valid_blocks || valid_blocks == CAP_BLKS_PER_SEC(sbi))
4746			continue;
4747		if (IS_CURSEC(sbi, secno))
4748			continue;
4749		set_bit(secno, dirty_i->dirty_secmap);
4750	}
4751	mutex_unlock(&dirty_i->seglist_lock);
4752}
4753
4754static int init_victim_secmap(struct f2fs_sb_info *sbi)
4755{
4756	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
4757	unsigned int bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi));
4758
4759	dirty_i->victim_secmap = f2fs_kvzalloc(sbi, bitmap_size, GFP_KERNEL);
4760	if (!dirty_i->victim_secmap)
4761		return -ENOMEM;
4762
4763	dirty_i->pinned_secmap = f2fs_kvzalloc(sbi, bitmap_size, GFP_KERNEL);
4764	if (!dirty_i->pinned_secmap)
4765		return -ENOMEM;
4766
4767	dirty_i->pinned_secmap_cnt = 0;
4768	dirty_i->enable_pin_section = true;
4769	return 0;
4770}
4771
4772static int build_dirty_segmap(struct f2fs_sb_info *sbi)
4773{
4774	struct dirty_seglist_info *dirty_i;
4775	unsigned int bitmap_size, i;
4776
4777	/* allocate memory for dirty segments list information */
4778	dirty_i = f2fs_kzalloc(sbi, sizeof(struct dirty_seglist_info),
4779								GFP_KERNEL);
4780	if (!dirty_i)
4781		return -ENOMEM;
4782
4783	SM_I(sbi)->dirty_info = dirty_i;
4784	mutex_init(&dirty_i->seglist_lock);
4785
4786	bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi));
4787
4788	for (i = 0; i < NR_DIRTY_TYPE; i++) {
4789		dirty_i->dirty_segmap[i] = f2fs_kvzalloc(sbi, bitmap_size,
4790								GFP_KERNEL);
4791		if (!dirty_i->dirty_segmap[i])
4792			return -ENOMEM;
4793	}
4794
4795	if (__is_large_section(sbi)) {
4796		bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi));
4797		dirty_i->dirty_secmap = f2fs_kvzalloc(sbi,
4798						bitmap_size, GFP_KERNEL);
4799		if (!dirty_i->dirty_secmap)
4800			return -ENOMEM;
4801	}
4802
4803	init_dirty_segmap(sbi);
4804	return init_victim_secmap(sbi);
4805}
4806
4807static int sanity_check_curseg(struct f2fs_sb_info *sbi)
4808{
4809	int i;
4810
4811	/*
4812	 * In an LFS/SSR curseg, .next_blkoff should point to an unused blkaddr;
4813	 * in an LFS curseg, all blkaddrs after .next_blkoff should be unused.
4814	 */
4815	for (i = 0; i < NR_PERSISTENT_LOG; i++) {
4816		struct curseg_info *curseg = CURSEG_I(sbi, i);
4817		struct seg_entry *se = get_seg_entry(sbi, curseg->segno);
4818		unsigned int blkofs = curseg->next_blkoff;
4819
4820		if (f2fs_sb_has_readonly(sbi) &&
4821			i != CURSEG_HOT_DATA && i != CURSEG_HOT_NODE)
4822			continue;
4823
4824		sanity_check_seg_type(sbi, curseg->seg_type);
4825
4826		if (curseg->alloc_type != LFS && curseg->alloc_type != SSR) {
4827			f2fs_err(sbi,
4828				 "Current segment has invalid alloc_type:%d",
4829				 curseg->alloc_type);
4830			f2fs_handle_error(sbi, ERROR_INVALID_CURSEG);
4831			return -EFSCORRUPTED;
4832		}
4833
4834		if (f2fs_test_bit(blkofs, se->cur_valid_map))
4835			goto out;
4836
4837		if (curseg->alloc_type == SSR)
4838			continue;
4839
4840		for (blkofs += 1; blkofs < sbi->blocks_per_seg; blkofs++) {
4841			if (!f2fs_test_bit(blkofs, se->cur_valid_map))
4842				continue;
4843out:
4844			f2fs_err(sbi,
4845				 "Current segment's next free block offset is inconsistent with bitmap, logtype:%u, segno:%u, type:%u, next_blkoff:%u, blkofs:%u",
4846				 i, curseg->segno, curseg->alloc_type,
4847				 curseg->next_blkoff, blkofs);
4848			f2fs_handle_error(sbi, ERROR_INVALID_CURSEG);
4849			return -EFSCORRUPTED;
4850		}
4851	}
4852	return 0;
4853}
4854
4855#ifdef CONFIG_BLK_DEV_ZONED
4856
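/*
 * Check that a sequential zone's write pointer is consistent with the
 * last valid block recorded in the SIT.  A mismatched zone with no valid
 * blocks is discarded to reset its pointer; a mismatched zone that still
 * holds valid blocks is finished (or zero-filled) so it is not written
 * again until it gets discarded.
 */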
4857static int check_zone_write_pointer(struct f2fs_sb_info *sbi,
4858				    struct f2fs_dev_info *fdev,
4859				    struct blk_zone *zone)
4860{
4861	unsigned int wp_segno, wp_blkoff, zone_secno, zone_segno, segno;
4862	block_t zone_block, wp_block, last_valid_block;
4863	unsigned int log_sectors_per_block = sbi->log_blocksize - SECTOR_SHIFT;
4864	int i, s, b, ret;
4865	struct seg_entry *se;
4866
4867	if (zone->type != BLK_ZONE_TYPE_SEQWRITE_REQ)
4868		return 0;
4869
4870	wp_block = fdev->start_blk + (zone->wp >> log_sectors_per_block);
4871	wp_segno = GET_SEGNO(sbi, wp_block);
4872	wp_blkoff = wp_block - START_BLOCK(sbi, wp_segno);
4873	zone_block = fdev->start_blk + (zone->start >> log_sectors_per_block);
4874	zone_segno = GET_SEGNO(sbi, zone_block);
4875	zone_secno = GET_SEC_FROM_SEG(sbi, zone_segno);
4876
4877	if (zone_segno >= MAIN_SEGS(sbi))
4878		return 0;
4879
4880	/*
4881	 * Skip checking zones that cursegs point to, since
4882	 * fix_curseg_write_pointer() checks them.
4883	 */
4884	for (i = 0; i < NO_CHECK_TYPE; i++)
4885		if (zone_secno == GET_SEC_FROM_SEG(sbi,
4886						   CURSEG_I(sbi, i)->segno))
4887			return 0;
4888
4889	/*
4890	 * Get last valid block of the zone.
4891	 */
4892	last_valid_block = zone_block - 1;
4893	for (s = sbi->segs_per_sec - 1; s >= 0; s--) {
4894		segno = zone_segno + s;
4895		se = get_seg_entry(sbi, segno);
4896		for (b = sbi->blocks_per_seg - 1; b >= 0; b--)
4897			if (f2fs_test_bit(b, se->cur_valid_map)) {
4898				last_valid_block = START_BLOCK(sbi, segno) + b;
4899				break;
4900			}
4901		if (last_valid_block >= zone_block)
4902			break;
4903	}
4904
4905	/*
4906	 * The write pointer matches the valid blocks or
4907	 * already points to the end of the zone.
4908	 */
4909	if ((last_valid_block + 1 == wp_block) ||
4910			(zone->wp == zone->start + zone->len))
4911		return 0;
4912
4913	if (last_valid_block + 1 == zone_block) {
4914		/*
4915		 * If there is no valid block in the zone and the write pointer
4916		 * is not at the zone start, reset the write pointer.
4917		 */
4918		f2fs_notice(sbi,
4919			    "Zone without valid block has non-zero write "
4920			    "pointer. Reset the write pointer: wp[0x%x,0x%x]",
4921			    wp_segno, wp_blkoff);
4922		ret = __f2fs_issue_discard_zone(sbi, fdev->bdev, zone_block,
4923					zone->len >> log_sectors_per_block);
4924		if (ret)
4925			f2fs_err(sbi, "Discard zone failed: %s (errno=%d)",
4926				 fdev->path, ret);
4927
4928		return ret;
4929	}
4930
4931	/*
4932	 * If there are valid blocks and the write pointer doesn't
4933	 * match them, we need to report the inconsistency and
4934	 * fill the zone to the end to close it. This inconsistency
4935	 * does not cause a write error because the zone will not be selected
4936	 * for write operations until it gets discarded.
4937	 */
4938	f2fs_notice(sbi, "Valid blocks are not aligned with write pointer: "
4939		    "valid block[0x%x,0x%x] wp[0x%x,0x%x]",
4940		    GET_SEGNO(sbi, last_valid_block),
4941		    GET_BLKOFF_FROM_SEG0(sbi, last_valid_block),
4942		    wp_segno, wp_blkoff);
4943
4944	ret = blkdev_zone_mgmt(fdev->bdev, REQ_OP_ZONE_FINISH,
4945				zone->start, zone->len, GFP_NOFS);
4946	if (ret == -EOPNOTSUPP) {
4947		ret = blkdev_issue_zeroout(fdev->bdev, zone->wp,
4948					zone->len - (zone->wp - zone->start),
4949					GFP_NOFS, 0);
4950		if (ret)
4951			f2fs_err(sbi, "Fill up zone failed: %s (errno=%d)",
4952					fdev->path, ret);
4953	} else if (ret) {
4954		f2fs_err(sbi, "Finishing zone failed: %s (errno=%d)",
4955				fdev->path, ret);
4956	}
4957
4958	return ret;
4959}
4960
4961static struct f2fs_dev_info *get_target_zoned_dev(struct f2fs_sb_info *sbi,
4962						  block_t zone_blkaddr)
4963{
4964	int i;
4965
4966	for (i = 0; i < sbi->s_ndevs; i++) {
4967		if (!bdev_is_zoned(FDEV(i).bdev))
4968			continue;
4969		if (sbi->s_ndevs == 1 || (FDEV(i).start_blk <= zone_blkaddr &&
4970				zone_blkaddr <= FDEV(i).end_blk))
4971			return &FDEV(i);
4972	}
4973
4974	return NULL;
4975}
4976
4977static int report_one_zone_cb(struct blk_zone *zone, unsigned int idx,
4978			      void *data)
4979{
4980	memcpy(data, zone, sizeof(struct blk_zone));
4981	return 0;
4982}
4983
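/*
 * Align the curseg of @type with the write pointer of the zone it sits
 * in.  If they disagree, move the curseg to a fresh section, verify the
 * old zone's write pointer, and discard the newly assigned zone if its
 * write pointer is not at the zone start.
 */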
4984static int fix_curseg_write_pointer(struct f2fs_sb_info *sbi, int type)
4985{
4986	struct curseg_info *cs = CURSEG_I(sbi, type);
4987	struct f2fs_dev_info *zbd;
4988	struct blk_zone zone;
4989	unsigned int cs_section, wp_segno, wp_blkoff, wp_sector_off;
4990	block_t cs_zone_block, wp_block;
4991	unsigned int log_sectors_per_block = sbi->log_blocksize - SECTOR_SHIFT;
4992	sector_t zone_sector;
4993	int err;
4994
4995	cs_section = GET_SEC_FROM_SEG(sbi, cs->segno);
4996	cs_zone_block = START_BLOCK(sbi, GET_SEG_FROM_SEC(sbi, cs_section));
4997
4998	zbd = get_target_zoned_dev(sbi, cs_zone_block);
4999	if (!zbd)
5000		return 0;
5001
5002	/* report zone for the sector the curseg points to */
5003	zone_sector = (sector_t)(cs_zone_block - zbd->start_blk)
5004		<< log_sectors_per_block;
5005	err = blkdev_report_zones(zbd->bdev, zone_sector, 1,
5006				  report_one_zone_cb, &zone);
5007	if (err != 1) {
5008		f2fs_err(sbi, "Report zone failed: %s errno=(%d)",
5009			 zbd->path, err);
5010		return err;
5011	}
5012
5013	if (zone.type != BLK_ZONE_TYPE_SEQWRITE_REQ)
5014		return 0;
5015
5016	wp_block = zbd->start_blk + (zone.wp >> log_sectors_per_block);
5017	wp_segno = GET_SEGNO(sbi, wp_block);
5018	wp_blkoff = wp_block - START_BLOCK(sbi, wp_segno);
5019	wp_sector_off = zone.wp & GENMASK(log_sectors_per_block - 1, 0);
5020
5021	if (cs->segno == wp_segno && cs->next_blkoff == wp_blkoff &&
5022		wp_sector_off == 0)
5023		return 0;
5024
5025	f2fs_notice(sbi, "Unaligned curseg[%d] with write pointer: "
5026		    "curseg[0x%x,0x%x] wp[0x%x,0x%x]",
5027		    type, cs->segno, cs->next_blkoff, wp_segno, wp_blkoff);
5028
5029	f2fs_notice(sbi, "Assign new section to curseg[%d]: "
5030		    "curseg[0x%x,0x%x]", type, cs->segno, cs->next_blkoff);
5031
5032	f2fs_allocate_new_section(sbi, type, true);
5033
5034	/* check consistency of the zone the curseg pointed to */
5035	if (check_zone_write_pointer(sbi, zbd, &zone))
5036		return -EIO;
5037
5038	/* check newly assigned zone */
5039	cs_section = GET_SEC_FROM_SEG(sbi, cs->segno);
5040	cs_zone_block = START_BLOCK(sbi, GET_SEG_FROM_SEC(sbi, cs_section));
5041
5042	zbd = get_target_zoned_dev(sbi, cs_zone_block);
5043	if (!zbd)
5044		return 0;
5045
5046	zone_sector = (sector_t)(cs_zone_block - zbd->start_blk)
5047		<< log_sectors_per_block;
5048	err = blkdev_report_zones(zbd->bdev, zone_sector, 1,
5049				  report_one_zone_cb, &zone);
5050	if (err != 1) {
5051		f2fs_err(sbi, "Report zone failed: %s errno=(%d)",
5052			 zbd->path, err);
5053		return err;
5054	}
5055
5056	if (zone.type != BLK_ZONE_TYPE_SEQWRITE_REQ)
5057		return 0;
5058
5059	if (zone.wp != zone.start) {
5060		f2fs_notice(sbi,
5061			    "New zone for curseg[%d] is not yet discarded. "
5062			    "Reset the zone: curseg[0x%x,0x%x]",
5063			    type, cs->segno, cs->next_blkoff);
5064		err = __f2fs_issue_discard_zone(sbi, zbd->bdev,	cs_zone_block,
5065					zone.len >> log_sectors_per_block);
5066		if (err) {
5067			f2fs_err(sbi, "Discard zone failed: %s (errno=%d)",
5068				 zbd->path, err);
5069			return err;
5070		}
5071	}
5072
5073	return 0;
5074}
5075
5076int f2fs_fix_curseg_write_pointer(struct f2fs_sb_info *sbi)
5077{
5078	int i, ret;
5079
5080	for (i = 0; i < NR_PERSISTENT_LOG; i++) {
5081		ret = fix_curseg_write_pointer(sbi, i);
5082		if (ret)
5083			return ret;
5084	}
5085
5086	return 0;
5087}
5088
5089struct check_zone_write_pointer_args {
5090	struct f2fs_sb_info *sbi;
5091	struct f2fs_dev_info *fdev;
5092};
5093
5094static int check_zone_write_pointer_cb(struct blk_zone *zone, unsigned int idx,
5095				      void *data)
5096{
5097	struct check_zone_write_pointer_args *args;
5098
5099	args = (struct check_zone_write_pointer_args *)data;
5100
5101	return check_zone_write_pointer(args->sbi, args->fdev, zone);
5102}
5103
5104int f2fs_check_write_pointer(struct f2fs_sb_info *sbi)
5105{
5106	int i, ret;
5107	struct check_zone_write_pointer_args args;
5108
5109	for (i = 0; i < sbi->s_ndevs; i++) {
5110		if (!bdev_is_zoned(FDEV(i).bdev))
5111			continue;
5112
5113		args.sbi = sbi;
5114		args.fdev = &FDEV(i);
5115		ret = blkdev_report_zones(FDEV(i).bdev, 0, BLK_ALL_ZONES,
5116					  check_zone_write_pointer_cb, &args);
5117		if (ret < 0)
5118			return ret;
5119	}
5120
5121	return 0;
5122}
5123
5124/*
5125 * Return the number of usable blocks in a segment. The number of blocks
5126 * returned is always equal to the number of blocks in a segment for
5127 * segments fully contained within a sequential zone capacity or a
5128 * conventional zone. For segments partially contained in a sequential
5129 * zone capacity, the number of usable blocks up to the zone capacity
5130 * is returned. 0 is returned in all other cases.
5131 */
5132static inline unsigned int f2fs_usable_zone_blks_in_seg(
5133			struct f2fs_sb_info *sbi, unsigned int segno)
5134{
5135	block_t seg_start, sec_start_blkaddr, sec_cap_blkaddr;
5136	unsigned int secno;
5137
5138	if (!sbi->unusable_blocks_per_sec)
5139		return sbi->blocks_per_seg;
5140
5141	secno = GET_SEC_FROM_SEG(sbi, segno);
5142	seg_start = START_BLOCK(sbi, segno);
5143	sec_start_blkaddr = START_BLOCK(sbi, GET_SEG_FROM_SEC(sbi, secno));
5144	sec_cap_blkaddr = sec_start_blkaddr + CAP_BLKS_PER_SEC(sbi);
5145
5146	/*
5147	 * If the segment starts before the zone capacity and spans beyond
5148	 * it, then the usable blocks run from the segment start to the
5149	 * zone capacity. If the segment starts after the zone capacity,
5150	 * then there are no usable blocks.
5151	 */
5152	if (seg_start >= sec_cap_blkaddr)
5153		return 0;
5154	if (seg_start + sbi->blocks_per_seg > sec_cap_blkaddr)
5155		return sec_cap_blkaddr - seg_start;
5156
5157	return sbi->blocks_per_seg;
5158}
5159#else
5160int f2fs_fix_curseg_write_pointer(struct f2fs_sb_info *sbi)
5161{
5162	return 0;
5163}
5164
5165int f2fs_check_write_pointer(struct f2fs_sb_info *sbi)
5166{
5167	return 0;
5168}
5169
5170static inline unsigned int f2fs_usable_zone_blks_in_seg(struct f2fs_sb_info *sbi,
5171							unsigned int segno)
5172{
5173	return 0;
5174}
5175
5176#endif
5177unsigned int f2fs_usable_blks_in_seg(struct f2fs_sb_info *sbi,
5178					unsigned int segno)
5179{
5180	if (f2fs_sb_has_blkzoned(sbi))
5181		return f2fs_usable_zone_blks_in_seg(sbi, segno);
5182
5183	return sbi->blocks_per_seg;
5184}
5185
5186unsigned int f2fs_usable_segs_in_sec(struct f2fs_sb_info *sbi,
5187					unsigned int segno)
5188{
5189	if (f2fs_sb_has_blkzoned(sbi))
5190		return CAP_SEGS_PER_SEC(sbi);
5191
5192	return sbi->segs_per_sec;
5193}
5194
5195/*
5196 * Update min, max modified time for cost-benefit GC algorithm
5197 */
5198static void init_min_max_mtime(struct f2fs_sb_info *sbi)
5199{
5200	struct sit_info *sit_i = SIT_I(sbi);
5201	unsigned int segno;
5202
5203	down_write(&sit_i->sentry_lock);
5204
5205	sit_i->min_mtime = ULLONG_MAX;
5206
5207	for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) {
5208		unsigned int i;
5209		unsigned long long mtime = 0;
5210
5211		for (i = 0; i < sbi->segs_per_sec; i++)
5212			mtime += get_seg_entry(sbi, segno + i)->mtime;
5213
5214		mtime = div_u64(mtime, sbi->segs_per_sec);
5215
5216		if (sit_i->min_mtime > mtime)
5217			sit_i->min_mtime = mtime;
5218	}
5219	sit_i->max_mtime = get_mtime(sbi, false);
5220	sit_i->dirty_max_mtime = 0;
5221	up_write(&sit_i->sentry_lock);
5222}
5223
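/*
 * Allocate and initialize the segment manager: SM info fields, the flush
 * and discard command controls, SIT info, free/dirty segmaps and current
 * segments, ending with curseg sanity checks and min/max mtime setup.
 */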
5224int f2fs_build_segment_manager(struct f2fs_sb_info *sbi)
5225{
5226	struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
5227	struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
5228	struct f2fs_sm_info *sm_info;
5229	int err;
5230
5231	sm_info = f2fs_kzalloc(sbi, sizeof(struct f2fs_sm_info), GFP_KERNEL);
5232	if (!sm_info)
5233		return -ENOMEM;
5234
5235	/* init sm info */
5236	sbi->sm_info = sm_info;
5237	sm_info->seg0_blkaddr = le32_to_cpu(raw_super->segment0_blkaddr);
5238	sm_info->main_blkaddr = le32_to_cpu(raw_super->main_blkaddr);
5239	sm_info->segment_count = le32_to_cpu(raw_super->segment_count);
5240	sm_info->reserved_segments = le32_to_cpu(ckpt->rsvd_segment_count);
5241	sm_info->ovp_segments = le32_to_cpu(ckpt->overprov_segment_count);
5242	sm_info->main_segments = le32_to_cpu(raw_super->segment_count_main);
5243	sm_info->ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr);
5244	sm_info->rec_prefree_segments = sm_info->main_segments *
5245					DEF_RECLAIM_PREFREE_SEGMENTS / 100;
5246	if (sm_info->rec_prefree_segments > DEF_MAX_RECLAIM_PREFREE_SEGMENTS)
5247		sm_info->rec_prefree_segments = DEF_MAX_RECLAIM_PREFREE_SEGMENTS;
5248
5249	if (!f2fs_lfs_mode(sbi))
5250		sm_info->ipu_policy = BIT(F2FS_IPU_FSYNC);
5251	sm_info->min_ipu_util = DEF_MIN_IPU_UTIL;
5252	sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS;
5253	sm_info->min_seq_blocks = sbi->blocks_per_seg;
5254	sm_info->min_hot_blocks = DEF_MIN_HOT_BLOCKS;
5255	sm_info->min_ssr_sections = reserved_sections(sbi);
5256
5257	INIT_LIST_HEAD(&sm_info->sit_entry_set);
5258
5259	init_f2fs_rwsem(&sm_info->curseg_lock);
5260
5261	err = f2fs_create_flush_cmd_control(sbi);
5262	if (err)
5263		return err;
5264
5265	err = create_discard_cmd_control(sbi);
5266	if (err)
5267		return err;
5268
5269	err = build_sit_info(sbi);
5270	if (err)
5271		return err;
5272	err = build_free_segmap(sbi);
5273	if (err)
5274		return err;
5275	err = build_curseg(sbi);
5276	if (err)
5277		return err;
5278
5279	/* reinit free segmap based on SIT */
5280	err = build_sit_entries(sbi);
5281	if (err)
5282		return err;
5283
5284	init_free_segmap(sbi);
5285	err = build_dirty_segmap(sbi);
5286	if (err)
5287		return err;
5288
5289	err = sanity_check_curseg(sbi);
5290	if (err)
5291		return err;
5292
5293	init_min_max_mtime(sbi);
5294	return 0;
5295}
5296
5297static void discard_dirty_segmap(struct f2fs_sb_info *sbi,
5298		enum dirty_type dirty_type)
5299{
5300	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
5301
5302	mutex_lock(&dirty_i->seglist_lock);
5303	kvfree(dirty_i->dirty_segmap[dirty_type]);
5304	dirty_i->nr_dirty[dirty_type] = 0;
5305	mutex_unlock(&dirty_i->seglist_lock);
5306}
5307
5308static void destroy_victim_secmap(struct f2fs_sb_info *sbi)
5309{
5310	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
5311
5312	kvfree(dirty_i->pinned_secmap);
5313	kvfree(dirty_i->victim_secmap);
5314}
5315
5316static void destroy_dirty_segmap(struct f2fs_sb_info *sbi)
5317{
5318	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
5319	int i;
5320
5321	if (!dirty_i)
5322		return;
5323
5324	/* discard pre-free/dirty segments list */
5325	for (i = 0; i < NR_DIRTY_TYPE; i++)
5326		discard_dirty_segmap(sbi, i);
5327
5328	if (__is_large_section(sbi)) {
5329		mutex_lock(&dirty_i->seglist_lock);
5330		kvfree(dirty_i->dirty_secmap);
5331		mutex_unlock(&dirty_i->seglist_lock);
5332	}
5333
5334	destroy_victim_secmap(sbi);
5335	SM_I(sbi)->dirty_info = NULL;
5336	kfree(dirty_i);
5337}
5338
5339static void destroy_curseg(struct f2fs_sb_info *sbi)
5340{
5341	struct curseg_info *array = SM_I(sbi)->curseg_array;
5342	int i;
5343
5344	if (!array)
5345		return;
5346	SM_I(sbi)->curseg_array = NULL;
5347	for (i = 0; i < NR_CURSEG_TYPE; i++) {
5348		kfree(array[i].sum_blk);
5349		kfree(array[i].journal);
5350	}
5351	kfree(array);
5352}
5353
5354static void destroy_free_segmap(struct f2fs_sb_info *sbi)
5355{
5356	struct free_segmap_info *free_i = SM_I(sbi)->free_info;
5357
5358	if (!free_i)
5359		return;
5360	SM_I(sbi)->free_info = NULL;
5361	kvfree(free_i->free_segmap);
5362	kvfree(free_i->free_secmap);
5363	kfree(free_i);
5364}
5365
5366static void destroy_sit_info(struct f2fs_sb_info *sbi)
5367{
5368	struct sit_info *sit_i = SIT_I(sbi);
5369
5370	if (!sit_i)
5371		return;
5372
5373	if (sit_i->sentries)
5374		kvfree(sit_i->bitmap);
5375	kfree(sit_i->tmp_map);
5376
5377	kvfree(sit_i->sentries);
5378	kvfree(sit_i->sec_entries);
5379	kvfree(sit_i->dirty_sentries_bitmap);
5380
5381	SM_I(sbi)->sit_info = NULL;
5382	kvfree(sit_i->sit_bitmap);
5383#ifdef CONFIG_F2FS_CHECK_FS
5384	kvfree(sit_i->sit_bitmap_mir);
5385	kvfree(sit_i->invalid_segmap);
5386#endif
5387	kfree(sit_i);
5388}
5389
5390void f2fs_destroy_segment_manager(struct f2fs_sb_info *sbi)
5391{
5392	struct f2fs_sm_info *sm_info = SM_I(sbi);
5393
5394	if (!sm_info)
5395		return;
5396	f2fs_destroy_flush_cmd_control(sbi, true);
5397	destroy_discard_cmd_control(sbi);
5398	destroy_dirty_segmap(sbi);
5399	destroy_curseg(sbi);
5400	destroy_free_segmap(sbi);
5401	destroy_sit_info(sbi);
5402	sbi->sm_info = NULL;
5403	kfree(sm_info);
5404}
5405
5406int __init f2fs_create_segment_manager_caches(void)
5407{
5408	discard_entry_slab = f2fs_kmem_cache_create("f2fs_discard_entry",
5409			sizeof(struct discard_entry));
5410	if (!discard_entry_slab)
5411		goto fail;
5412
5413	discard_cmd_slab = f2fs_kmem_cache_create("f2fs_discard_cmd",
5414			sizeof(struct discard_cmd));
5415	if (!discard_cmd_slab)
5416		goto destroy_discard_entry;
5417
5418	sit_entry_set_slab = f2fs_kmem_cache_create("f2fs_sit_entry_set",
5419			sizeof(struct sit_entry_set));
5420	if (!sit_entry_set_slab)
5421		goto destroy_discard_cmd;
5422
5423	revoke_entry_slab = f2fs_kmem_cache_create("f2fs_revoke_entry",
5424			sizeof(struct revoke_entry));
5425	if (!revoke_entry_slab)
5426		goto destroy_sit_entry_set;
5427	return 0;
5428
5429destroy_sit_entry_set:
5430	kmem_cache_destroy(sit_entry_set_slab);
5431destroy_discard_cmd:
5432	kmem_cache_destroy(discard_cmd_slab);
5433destroy_discard_entry:
5434	kmem_cache_destroy(discard_entry_slab);
5435fail:
5436	return -ENOMEM;
5437}
5438
5439void f2fs_destroy_segment_manager_caches(void)
5440{
5441	kmem_cache_destroy(sit_entry_set_slab);
5442	kmem_cache_destroy(discard_cmd_slab);
5443	kmem_cache_destroy(discard_entry_slab);
5444	kmem_cache_destroy(revoke_entry_slab);
5445}
5446