xref: /kernel/linux/linux-6.6/fs/btrfs/tree-log.c (revision 62306a36)
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Copyright (C) 2008 Oracle.  All rights reserved.
4 */
5
6#include <linux/sched.h>
7#include <linux/slab.h>
8#include <linux/blkdev.h>
9#include <linux/list_sort.h>
10#include <linux/iversion.h>
11#include "misc.h"
12#include "ctree.h"
13#include "tree-log.h"
14#include "disk-io.h"
15#include "locking.h"
16#include "print-tree.h"
17#include "backref.h"
18#include "compression.h"
19#include "qgroup.h"
20#include "block-group.h"
21#include "space-info.h"
22#include "zoned.h"
23#include "inode-item.h"
24#include "fs.h"
25#include "accessors.h"
26#include "extent-tree.h"
27#include "root-tree.h"
28#include "dir-item.h"
29#include "file-item.h"
30#include "file.h"
31#include "orphan.h"
32#include "tree-checker.h"
33
34#define MAX_CONFLICT_INODES 10
35
36/* magic values for the inode_only field in btrfs_log_inode:
37 *
38 * LOG_INODE_ALL means to log everything
39 * LOG_INODE_EXISTS means to log just enough to recreate the inode
40 * during log replay
41 */
42enum {
43	LOG_INODE_ALL,
44	LOG_INODE_EXISTS,
45};
46
47/*
48 * directory trouble cases
49 *
50 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
51 * log, we must force a full commit before doing an fsync of the directory
52 * where the unlink was done.
53 * ---> record transid of last unlink/rename per directory
54 *
55 * mkdir foo/some_dir
56 * normal commit
57 * rename foo/some_dir foo2/some_dir
58 * mkdir foo/some_dir
59 * fsync foo/some_dir/some_file
60 *
61 * The fsync above will unlink the original some_dir without recording
62 * it in its new location (foo2).  After a crash, some_dir will be gone
63 * unless the fsync of some_file forces a full commit
64 *
65 * 2) we must log any new names for any file or dir that is in the fsync
66 * log. ---> check inode while renaming/linking.
67 *
68 * 2a) we must log any new names for any file or dir during rename
69 * when the directory they are being removed from was logged.
70 * ---> check inode and old parent dir during rename
71 *
72 *  2a is actually the more important variant.  Without the extra logging
73 *  a crash might unlink the old name without recreating the new one
74 *
75 * 3) after a crash, we must go through any directories with a link count
76 * of zero and redo the rm -rf
77 *
78 * mkdir f1/foo
79 * normal commit
80 * rm -rf f1/foo
81 * fsync(f1)
82 *
83 * The directory f1 was fully removed from the FS, but fsync was never
84 * called on f1, only its parent dir.  After a crash the rm -rf must
85 * be replayed.  This must be able to recurse down the entire
86 * directory tree.  The inode link count fixup code takes care of the
87 * ugly details.
88 */
89
90/*
91 * stages for the tree walking.  The first
92 * stage (0) is to only pin down the blocks we find,
93 * the second stage (1) is to make sure that all the inodes
94 * we find in the log are created in the subvolume.
95 *
96 * The last stage is to deal with directories and links and extents
97 * and all the other fun semantics
98 */
99enum {
100	LOG_WALK_PIN_ONLY,
101	LOG_WALK_REPLAY_INODES,
102	LOG_WALK_REPLAY_DIR_INDEX,
103	LOG_WALK_REPLAY_ALL,
104};
105
106static int btrfs_log_inode(struct btrfs_trans_handle *trans,
107			   struct btrfs_inode *inode,
108			   int inode_only,
109			   struct btrfs_log_ctx *ctx);
110static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
111			     struct btrfs_root *root,
112			     struct btrfs_path *path, u64 objectid);
113static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
114				       struct btrfs_root *root,
115				       struct btrfs_root *log,
116				       struct btrfs_path *path,
117				       u64 dirid, int del_all);
118static void wait_log_commit(struct btrfs_root *root, int transid);
119
120/*
121 * tree logging is a special write ahead log used to make sure that
122 * fsyncs and O_SYNCs can happen without doing full tree commits.
123 *
124 * Full tree commits are expensive because they require commonly
125 * modified blocks to be recowed, creating many dirty pages in the
126 * extent tree and a 4x-6x higher write load than ext3.
127 *
128 * Instead of doing a tree commit on every fsync, we use the
129 * key ranges and transaction ids to find items for a given file or directory
130 * that have changed in this transaction.  Those items are copied into
131 * a special tree (one per subvolume root), that tree is written to disk
132 * and then the fsync is considered complete.
133 *
134 * After a crash, items are copied out of the log-tree back into the
135 * subvolume tree.  Any file data extents found are recorded in the extent
136 * allocation tree, and the log-tree freed.
137 *
138 * The log tree is read three times, once to pin down all the extents it is
139 * using in ram, once to create all the inodes logged in the tree,
140 * and once to do all the other items.
141 */
142
143/*
144 * start a sub transaction and set up the log tree;
145 * this increments the log tree writer count to make the people
146 * syncing the tree wait for us to finish
147 */
148static int start_log_trans(struct btrfs_trans_handle *trans,
149			   struct btrfs_root *root,
150			   struct btrfs_log_ctx *ctx)
151{
152	struct btrfs_fs_info *fs_info = root->fs_info;
153	struct btrfs_root *tree_root = fs_info->tree_root;
154	const bool zoned = btrfs_is_zoned(fs_info);
155	int ret = 0;
156	bool created = false;
157
158	/*
159	 * First check if the log root tree was already created. If not, create
160	 * it before locking the root's log_mutex, just to keep lockdep happy.
161	 */
162	if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state)) {
163		mutex_lock(&tree_root->log_mutex);
164		if (!fs_info->log_root_tree) {
165			ret = btrfs_init_log_root_tree(trans, fs_info);
166			if (!ret) {
167				set_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state);
168				created = true;
169			}
170		}
171		mutex_unlock(&tree_root->log_mutex);
172		if (ret)
173			return ret;
174	}
175
176	mutex_lock(&root->log_mutex);
177
178again:
179	if (root->log_root) {
180		int index = (root->log_transid + 1) % 2;
181
182		if (btrfs_need_log_full_commit(trans)) {
183			ret = BTRFS_LOG_FORCE_COMMIT;
184			goto out;
185		}
186
187		if (zoned && atomic_read(&root->log_commit[index])) {
188			wait_log_commit(root, root->log_transid - 1);
189			goto again;
190		}
191
192		if (!root->log_start_pid) {
193			clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
194			root->log_start_pid = current->pid;
195		} else if (root->log_start_pid != current->pid) {
196			set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
197		}
198	} else {
199		/*
200		 * This means fs_info->log_root_tree was already created
201		 * for some other FS trees. Do the full commit not to mix
202		 * nodes from multiple log transactions to do sequential
203		 * writing.
204		 */
205		if (zoned && !created) {
206			ret = BTRFS_LOG_FORCE_COMMIT;
207			goto out;
208		}
209
210		ret = btrfs_add_log_tree(trans, root);
211		if (ret)
212			goto out;
213
214		set_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
215		clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
216		root->log_start_pid = current->pid;
217	}
218
219	atomic_inc(&root->log_writers);
220	if (!ctx->logging_new_name) {
221		int index = root->log_transid % 2;
222		list_add_tail(&ctx->list, &root->log_ctxs[index]);
223		ctx->log_transid = root->log_transid;
224	}
225
226out:
227	mutex_unlock(&root->log_mutex);
228	return ret;
229}
230
231/*
232 * returns 0 if there was a log transaction running and we were able
233 * to join, or returns -ENOENT if there was no transaction
234 * in progress
235 */
236static int join_running_log_trans(struct btrfs_root *root)
237{
238	const bool zoned = btrfs_is_zoned(root->fs_info);
239	int ret = -ENOENT;
240
241	if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state))
242		return ret;
243
244	mutex_lock(&root->log_mutex);
245again:
246	if (root->log_root) {
247		int index = (root->log_transid + 1) % 2;
248
249		ret = 0;
250		if (zoned && atomic_read(&root->log_commit[index])) {
251			wait_log_commit(root, root->log_transid - 1);
252			goto again;
253		}
254		atomic_inc(&root->log_writers);
255	}
256	mutex_unlock(&root->log_mutex);
257	return ret;
258}
259
260/*
261 * This either makes the current running log transaction wait
262 * until you call btrfs_end_log_trans() or it makes any future
263 * log transactions wait until you call btrfs_end_log_trans()
264 */
265void btrfs_pin_log_trans(struct btrfs_root *root)
266{
267	atomic_inc(&root->log_writers);
268}
269
270/*
271 * indicate we're done making changes to the log tree
272 * and wake up anyone waiting to do a sync
273 */
274void btrfs_end_log_trans(struct btrfs_root *root)
275{
276	if (atomic_dec_and_test(&root->log_writers)) {
277		/* atomic_dec_and_test implies a barrier */
278		cond_wake_up_nomb(&root->log_writer_wait);
279	}
280}
281
282/*
283 * the walk control struct is used to pass state down the chain when
284 * processing the log tree.  The stage field tells us which part
285 * of the log tree processing we are currently doing.  The others
286 * are state fields used for that specific part
287 */
288struct walk_control {
289	/* should we free the extent on disk when done?  This is used
290	 * at transaction commit time while freeing a log tree
291	 */
292	int free;
293
294	/* pin only walk, we record which extents on disk belong to the
295	 * log trees
296	 */
297	int pin;
298
299	/* what stage of the replay code we're currently in */
300	int stage;
301
302	/*
303	 * Ignore any items from the inode currently being processed. Needs
304	 * to be set every time we find a BTRFS_INODE_ITEM_KEY and we are in
305	 * the LOG_WALK_REPLAY_INODES stage.
306	 */
307	bool ignore_cur_inode;
308
309	/* the root we are currently replaying */
310	struct btrfs_root *replay_dest;
311
312	/* the trans handle for the current replay */
313	struct btrfs_trans_handle *trans;
314
315	/* the function that gets used to process blocks we find in the
316	 * tree.  Note the extent_buffer might not be up to date when it is
317	 * passed in, and it must be checked or read if you need the data
318	 * inside it
319	 */
320	int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
321			    struct walk_control *wc, u64 gen, int level);
322};
323
324/*
325 * process_func used to pin down extents, write them or wait on them
326 */
327static int process_one_buffer(struct btrfs_root *log,
328			      struct extent_buffer *eb,
329			      struct walk_control *wc, u64 gen, int level)
330{
331	struct btrfs_fs_info *fs_info = log->fs_info;
332	int ret = 0;
333
334	/*
335	 * If this fs is mixed then we need to be able to process the leaves to
336	 * pin down any logged extents, so we have to read the block.
337	 */
338	if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
339		struct btrfs_tree_parent_check check = {
340			.level = level,
341			.transid = gen
342		};
343
344		ret = btrfs_read_extent_buffer(eb, &check);
345		if (ret)
346			return ret;
347	}
348
349	if (wc->pin) {
350		ret = btrfs_pin_extent_for_log_replay(wc->trans, eb->start,
351						      eb->len);
352		if (ret)
353			return ret;
354
355		if (btrfs_buffer_uptodate(eb, gen, 0) &&
356		    btrfs_header_level(eb) == 0)
357			ret = btrfs_exclude_logged_extents(eb);
358	}
359	return ret;
360}
361
362/*
363 * Item overwrite used by replay and tree logging.  eb, slot and key all refer
364 * to the src data we are copying out.
365 *
366 * root is the tree we are copying into, and path is a scratch
367 * path for use in this function (it should be released on entry and
368 * will be released on exit).
369 *
370 * If the key is already in the destination tree the existing item is
371 * overwritten.  If the existing item isn't big enough, it is extended.
372 * If it is too large, it is truncated.
373 *
374 * If the key isn't in the destination yet, a new item is inserted.
375 */
376static int overwrite_item(struct btrfs_trans_handle *trans,
377			  struct btrfs_root *root,
378			  struct btrfs_path *path,
379			  struct extent_buffer *eb, int slot,
380			  struct btrfs_key *key)
381{
382	int ret;
383	u32 item_size;
384	u64 saved_i_size = 0;
385	int save_old_i_size = 0;
386	unsigned long src_ptr;
387	unsigned long dst_ptr;
388	bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;
389
390	/*
391	 * This is only used during log replay, so the root is always from a
392	 * fs/subvolume tree. In case we ever need to support a log root, then
393	 * we'll have to clone the leaf in the path, release the path and use
394	 * the leaf before writing into the log tree. See the comments at
395	 * copy_items() for more details.
396	 */
397	ASSERT(root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID);
398
399	item_size = btrfs_item_size(eb, slot);
400	src_ptr = btrfs_item_ptr_offset(eb, slot);
401
402	/* Look for the key in the destination tree. */
403	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
404	if (ret < 0)
405		return ret;
406
407	if (ret == 0) {
408		char *src_copy;
409		char *dst_copy;
410		u32 dst_size = btrfs_item_size(path->nodes[0],
411						  path->slots[0]);
412		if (dst_size != item_size)
413			goto insert;
414
415		if (item_size == 0) {
416			btrfs_release_path(path);
417			return 0;
418		}
419		dst_copy = kmalloc(item_size, GFP_NOFS);
420		src_copy = kmalloc(item_size, GFP_NOFS);
421		if (!dst_copy || !src_copy) {
422			btrfs_release_path(path);
423			kfree(dst_copy);
424			kfree(src_copy);
425			return -ENOMEM;
426		}
427
428		read_extent_buffer(eb, src_copy, src_ptr, item_size);
429
430		dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
431		read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
432				   item_size);
433		ret = memcmp(dst_copy, src_copy, item_size);
434
435		kfree(dst_copy);
436		kfree(src_copy);
437		/*
438		 * they have the same contents, just return, this saves
439		 * us from cowing blocks in the destination tree and doing
440		 * extra writes that may not have been done by a previous
441		 * sync
442		 */
443		if (ret == 0) {
444			btrfs_release_path(path);
445			return 0;
446		}
447
448		/*
449		 * We need to load the old nbytes into the inode so when we
450		 * replay the extents we've logged we get the right nbytes.
451		 */
452		if (inode_item) {
453			struct btrfs_inode_item *item;
454			u64 nbytes;
455			u32 mode;
456
457			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
458					      struct btrfs_inode_item);
459			nbytes = btrfs_inode_nbytes(path->nodes[0], item);
460			item = btrfs_item_ptr(eb, slot,
461					      struct btrfs_inode_item);
462			btrfs_set_inode_nbytes(eb, item, nbytes);
463
464			/*
465			 * If this is a directory we need to reset the i_size to
466			 * 0 so that we can set it up properly when replaying
467			 * the rest of the items in this log.
468			 */
469			mode = btrfs_inode_mode(eb, item);
470			if (S_ISDIR(mode))
471				btrfs_set_inode_size(eb, item, 0);
472		}
473	} else if (inode_item) {
474		struct btrfs_inode_item *item;
475		u32 mode;
476
477		/*
478		 * New inode, set nbytes to 0 so that the nbytes comes out
479		 * properly when we replay the extents.
480		 */
481		item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
482		btrfs_set_inode_nbytes(eb, item, 0);
483
484		/*
485		 * If this is a directory we need to reset the i_size to 0 so
486		 * that we can set it up properly when replaying the rest of
487		 * the items in this log.
488		 */
489		mode = btrfs_inode_mode(eb, item);
490		if (S_ISDIR(mode))
491			btrfs_set_inode_size(eb, item, 0);
492	}
493insert:
494	btrfs_release_path(path);
495	/* try to insert the key into the destination tree */
496	path->skip_release_on_error = 1;
497	ret = btrfs_insert_empty_item(trans, root, path,
498				      key, item_size);
499	path->skip_release_on_error = 0;
500
501	/* make sure any existing item is the correct size */
502	if (ret == -EEXIST || ret == -EOVERFLOW) {
503		u32 found_size;
504		found_size = btrfs_item_size(path->nodes[0],
505						path->slots[0]);
506		if (found_size > item_size)
507			btrfs_truncate_item(trans, path, item_size, 1);
508		else if (found_size < item_size)
509			btrfs_extend_item(trans, path, item_size - found_size);
510	} else if (ret) {
511		return ret;
512	}
513	dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
514					path->slots[0]);
515
516	/* don't overwrite an existing inode if the generation number
517	 * was logged as zero.  This is done when the tree logging code
518	 * is just logging an inode to make sure it exists after recovery.
519	 *
520	 * Also, don't overwrite i_size on directories during replay.
521	 * log replay inserts and removes directory items based on the
522	 * state of the tree found in the subvolume, and i_size is modified
523	 * as it goes
524	 */
525	if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
526		struct btrfs_inode_item *src_item;
527		struct btrfs_inode_item *dst_item;
528
529		src_item = (struct btrfs_inode_item *)src_ptr;
530		dst_item = (struct btrfs_inode_item *)dst_ptr;
531
532		if (btrfs_inode_generation(eb, src_item) == 0) {
533			struct extent_buffer *dst_eb = path->nodes[0];
534			const u64 ino_size = btrfs_inode_size(eb, src_item);
535
536			/*
537			 * For regular files an ino_size == 0 is used only when
538			 * logging that an inode exists, as part of a directory
539			 * fsync, and the inode wasn't fsynced before. In this
540			 * case don't set the size of the inode in the fs/subvol
541			 * tree, otherwise we would be throwing valid data away.
542			 */
543			if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
544			    S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
545			    ino_size != 0)
546				btrfs_set_inode_size(dst_eb, dst_item, ino_size);
547			goto no_copy;
548		}
549
550		if (S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
551		    S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
552			save_old_i_size = 1;
553			saved_i_size = btrfs_inode_size(path->nodes[0],
554							dst_item);
555		}
556	}
557
558	copy_extent_buffer(path->nodes[0], eb, dst_ptr,
559			   src_ptr, item_size);
560
561	if (save_old_i_size) {
562		struct btrfs_inode_item *dst_item;
563		dst_item = (struct btrfs_inode_item *)dst_ptr;
564		btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
565	}
566
567	/* make sure the generation is filled in */
568	if (key->type == BTRFS_INODE_ITEM_KEY) {
569		struct btrfs_inode_item *dst_item;
570		dst_item = (struct btrfs_inode_item *)dst_ptr;
571		if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
572			btrfs_set_inode_generation(path->nodes[0], dst_item,
573						   trans->transid);
574		}
575	}
576no_copy:
577	btrfs_mark_buffer_dirty(trans, path->nodes[0]);
578	btrfs_release_path(path);
579	return 0;
580}
581
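/*
 * Copy a name of the given length out of an extent buffer into a newly
 * allocated fscrypt_str. Returns 0 on success or -ENOMEM; the caller is
 * responsible for freeing name->name.
 */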
582static int read_alloc_one_name(struct extent_buffer *eb, void *start, int len,
583			       struct fscrypt_str *name)
584{
585	char *buf;
586
587	buf = kmalloc(len, GFP_NOFS);
588	if (!buf)
589		return -ENOMEM;
590
591	read_extent_buffer(eb, buf, (unsigned long)start, len);
592	name->name = buf;
593	name->len = len;
594	return 0;
595}
596
597/*
598 * simple helper to read an inode off the disk from a given root
599 * This can only be called for subvolume roots and not for the log
600 */
601static noinline struct inode *read_one_inode(struct btrfs_root *root,
602					     u64 objectid)
603{
604	struct inode *inode;
605
606	inode = btrfs_iget(root->fs_info->sb, objectid, root);
607	if (IS_ERR(inode))
608		inode = NULL;
609	return inode;
610}
611
612/* replays a single extent in 'eb' at 'slot' with 'key' into the
613 * subvolume 'root'.  path is released on entry and should be released
614 * on exit.
615 *
616 * extents in the log tree have not been allocated out of the extent
617 * tree yet.  So, this completes the allocation, taking a reference
618 * as required if the extent already exists or creating a new extent
619 * if it isn't in the extent allocation tree yet.
620 *
621 * The extent is inserted into the file, dropping any existing extents
622 * from the file that overlap the new one.
623 */
624static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
625				      struct btrfs_root *root,
626				      struct btrfs_path *path,
627				      struct extent_buffer *eb, int slot,
628				      struct btrfs_key *key)
629{
630	struct btrfs_drop_extents_args drop_args = { 0 };
631	struct btrfs_fs_info *fs_info = root->fs_info;
632	int found_type;
633	u64 extent_end;
634	u64 start = key->offset;
635	u64 nbytes = 0;
636	struct btrfs_file_extent_item *item;
637	struct inode *inode = NULL;
638	unsigned long size;
639	int ret = 0;
640
641	item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
642	found_type = btrfs_file_extent_type(eb, item);
643
644	if (found_type == BTRFS_FILE_EXTENT_REG ||
645	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
646		nbytes = btrfs_file_extent_num_bytes(eb, item);
647		extent_end = start + nbytes;
648
649		/*
650		 * We don't add to the inode's nbytes if we are prealloc or a
651		 * hole.
652		 */
653		if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
654			nbytes = 0;
655	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
656		size = btrfs_file_extent_ram_bytes(eb, item);
657		nbytes = btrfs_file_extent_ram_bytes(eb, item);
658		extent_end = ALIGN(start + size,
659				   fs_info->sectorsize);
660	} else {
661		ret = 0;
662		goto out;
663	}
664
665	inode = read_one_inode(root, key->objectid);
666	if (!inode) {
667		ret = -EIO;
668		goto out;
669	}
670
671	/*
672	 * first check to see if we already have this extent in the
673	 * file.  This must be done before the btrfs_drop_extents run
674	 * so we don't try to drop this extent.
675	 */
676	ret = btrfs_lookup_file_extent(trans, root, path,
677			btrfs_ino(BTRFS_I(inode)), start, 0);
678
679	if (ret == 0 &&
680	    (found_type == BTRFS_FILE_EXTENT_REG ||
681	     found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
682		struct btrfs_file_extent_item cmp1;
683		struct btrfs_file_extent_item cmp2;
684		struct btrfs_file_extent_item *existing;
685		struct extent_buffer *leaf;
686
687		leaf = path->nodes[0];
688		existing = btrfs_item_ptr(leaf, path->slots[0],
689					  struct btrfs_file_extent_item);
690
691		read_extent_buffer(eb, &cmp1, (unsigned long)item,
692				   sizeof(cmp1));
693		read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
694				   sizeof(cmp2));
695
696		/*
697		 * we already have a pointer to this exact extent,
698		 * we don't have to do anything
699		 */
700		if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
701			btrfs_release_path(path);
702			goto out;
703		}
704	}
705	btrfs_release_path(path);
706
707	/* drop any overlapping extents */
708	drop_args.start = start;
709	drop_args.end = extent_end;
710	drop_args.drop_cache = true;
711	ret = btrfs_drop_extents(trans, root, BTRFS_I(inode), &drop_args);
712	if (ret)
713		goto out;
714
715	if (found_type == BTRFS_FILE_EXTENT_REG ||
716	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
717		u64 offset;
718		unsigned long dest_offset;
719		struct btrfs_key ins;
720
721		if (btrfs_file_extent_disk_bytenr(eb, item) == 0 &&
722		    btrfs_fs_incompat(fs_info, NO_HOLES))
723			goto update_inode;
724
725		ret = btrfs_insert_empty_item(trans, root, path, key,
726					      sizeof(*item));
727		if (ret)
728			goto out;
729		dest_offset = btrfs_item_ptr_offset(path->nodes[0],
730						    path->slots[0]);
731		copy_extent_buffer(path->nodes[0], eb, dest_offset,
732				(unsigned long)item,  sizeof(*item));
733
734		ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
735		ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
736		ins.type = BTRFS_EXTENT_ITEM_KEY;
737		offset = key->offset - btrfs_file_extent_offset(eb, item);
738
739		/*
740		 * Manually record the dirty extent, as here we did a shallow
741		 * file extent item copy and skipped the normal backref update,
742		 * modifying the extent tree all by ourselves.
743		 * So we need to manually record the dirty extent for qgroup,
744		 * as the owner of the file extent changed from the log tree
745		 * (doesn't affect qgroup) to the fs/file tree (affects qgroup).
746		 */
747		ret = btrfs_qgroup_trace_extent(trans,
748				btrfs_file_extent_disk_bytenr(eb, item),
749				btrfs_file_extent_disk_num_bytes(eb, item));
750		if (ret < 0)
751			goto out;
752
753		if (ins.objectid > 0) {
754			struct btrfs_ref ref = { 0 };
755			u64 csum_start;
756			u64 csum_end;
757			LIST_HEAD(ordered_sums);
758
759			/*
760			 * is this extent already allocated in the extent
761			 * allocation tree?  If so, just add a reference
762			 */
763			ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
764						ins.offset);
765			if (ret < 0) {
766				goto out;
767			} else if (ret == 0) {
768				btrfs_init_generic_ref(&ref,
769						BTRFS_ADD_DELAYED_REF,
770						ins.objectid, ins.offset, 0);
771				btrfs_init_data_ref(&ref,
772						root->root_key.objectid,
773						key->objectid, offset, 0, false);
774				ret = btrfs_inc_extent_ref(trans, &ref);
775				if (ret)
776					goto out;
777			} else {
778				/*
779				 * insert the extent pointer in the extent
780				 * allocation tree
781				 */
782				ret = btrfs_alloc_logged_file_extent(trans,
783						root->root_key.objectid,
784						key->objectid, offset, &ins);
785				if (ret)
786					goto out;
787			}
788			btrfs_release_path(path);
789
790			if (btrfs_file_extent_compression(eb, item)) {
791				csum_start = ins.objectid;
792				csum_end = csum_start + ins.offset;
793			} else {
794				csum_start = ins.objectid +
795					btrfs_file_extent_offset(eb, item);
796				csum_end = csum_start +
797					btrfs_file_extent_num_bytes(eb, item);
798			}
799
800			ret = btrfs_lookup_csums_list(root->log_root,
801						csum_start, csum_end - 1,
802						&ordered_sums, 0, false);
803			if (ret)
804				goto out;
805			/*
806			 * Now delete all existing csums in the csum root that
807			 * cover our range. We do this because we can have an
808			 * extent that is completely referenced by one file
809			 * extent item and partially referenced by another
810			 * file extent item (like after using the clone or
811			 * extent_same ioctls). In this case if we end up doing
812			 * the replay of the one that partially references the
813			 * extent first, and we do not do the csum deletion
814			 * below, we can get 2 csum items in the csum tree that
815			 * overlap each other. For example, imagine our log has
816			 * the two following file extent items:
817			 *
818			 * key (257 EXTENT_DATA 409600)
819			 *     extent data disk byte 12845056 nr 102400
820			 *     extent data offset 20480 nr 20480 ram 102400
821			 *
822			 * key (257 EXTENT_DATA 819200)
823			 *     extent data disk byte 12845056 nr 102400
824			 *     extent data offset 0 nr 102400 ram 102400
825			 *
826			 * Where the second one fully references the 100K extent
827			 * that starts at disk byte 12845056, and the log tree
828			 * has a single csum item that covers the entire range
829			 * of the extent:
830			 *
831			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
832			 *
833			 * After the first file extent item is replayed, the
834			 * csum tree gets the following csum item:
835			 *
836			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
837			 *
838			 * Which covers the 20K sub-range starting at offset 20K
839			 * of our extent. Now when we replay the second file
840			 * extent item, if we do not delete existing csum items
841			 * that cover any of its blocks, we end up getting two
842			 * csum items in our csum tree that overlap each other:
843			 *
844			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
845			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
846			 *
847			 * Which is a problem, because after this anyone trying
848			 * to look up the checksum of any block of our
849			 * extent starting at an offset of 40K or higher, will
850			 * end up looking at the second csum item only, which
851			 * does not contain the checksum for any block starting
852			 * at offset 40K or higher of our extent.
853			 */
854			while (!list_empty(&ordered_sums)) {
855				struct btrfs_ordered_sum *sums;
856				struct btrfs_root *csum_root;
857
858				sums = list_entry(ordered_sums.next,
859						struct btrfs_ordered_sum,
860						list);
861				csum_root = btrfs_csum_root(fs_info,
862							    sums->logical);
863				if (!ret)
864					ret = btrfs_del_csums(trans, csum_root,
865							      sums->logical,
866							      sums->len);
867				if (!ret)
868					ret = btrfs_csum_file_blocks(trans,
869								     csum_root,
870								     sums);
871				list_del(&sums->list);
872				kfree(sums);
873			}
874			if (ret)
875				goto out;
876		} else {
877			btrfs_release_path(path);
878		}
879	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
880		/* inline extents are easy, we just overwrite them */
881		ret = overwrite_item(trans, root, path, eb, slot, key);
882		if (ret)
883			goto out;
884	}
885
886	ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start,
887						extent_end - start);
888	if (ret)
889		goto out;
890
891update_inode:
892	btrfs_update_inode_bytes(BTRFS_I(inode), nbytes, drop_args.bytes_found);
893	ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
894out:
895	iput(inode);
896	return ret;
897}
898
899static int unlink_inode_for_log_replay(struct btrfs_trans_handle *trans,
900				       struct btrfs_inode *dir,
901				       struct btrfs_inode *inode,
902				       const struct fscrypt_str *name)
903{
904	int ret;
905
906	ret = btrfs_unlink_inode(trans, dir, inode, name);
907	if (ret)
908		return ret;
909	/*
910	 * Whenever we need to check if a name exists or not, we check the
911	 * fs/subvolume tree. So after an unlink we must run delayed items, so
912	 * that future checks for a name during log replay see that the name
913	 * does not exist anymore.
914	 */
915	return btrfs_run_delayed_items(trans);
916}
917
918/*
919 * when cleaning up conflicts between the directory names in the
920 * subvolume, directory names in the log and directory names in the
921 * inode back references, we may have to unlink inodes from directories.
922 *
923 * This is a helper function to do the unlink of a specific directory
924 * item
925 */
926static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
927				      struct btrfs_path *path,
928				      struct btrfs_inode *dir,
929				      struct btrfs_dir_item *di)
930{
931	struct btrfs_root *root = dir->root;
932	struct inode *inode;
933	struct fscrypt_str name;
934	struct extent_buffer *leaf;
935	struct btrfs_key location;
936	int ret;
937
938	leaf = path->nodes[0];
939
940	btrfs_dir_item_key_to_cpu(leaf, di, &location);
941	ret = read_alloc_one_name(leaf, di + 1, btrfs_dir_name_len(leaf, di), &name);
942	if (ret)
943		return -ENOMEM;
944
945	btrfs_release_path(path);
946
947	inode = read_one_inode(root, location.objectid);
948	if (!inode) {
949		ret = -EIO;
950		goto out;
951	}
952
953	ret = link_to_fixup_dir(trans, root, path, location.objectid);
954	if (ret)
955		goto out;
956
957	ret = unlink_inode_for_log_replay(trans, dir, BTRFS_I(inode), &name);
958out:
959	kfree(name.name);
960	iput(inode);
961	return ret;
962}
963
964/*
965 * See if a given name and sequence number found in an inode back reference are
966 * already in a directory and correctly point to this inode.
967 *
968 * Returns: < 0 on error, 0 if the directory entry does not exist and 1 if it
969 * exists.
970 */
971static noinline int inode_in_dir(struct btrfs_root *root,
972				 struct btrfs_path *path,
973				 u64 dirid, u64 objectid, u64 index,
974				 struct fscrypt_str *name)
975{
976	struct btrfs_dir_item *di;
977	struct btrfs_key location;
978	int ret = 0;
979
980	di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
981					 index, name, 0);
982	if (IS_ERR(di)) {
983		ret = PTR_ERR(di);
984		goto out;
985	} else if (di) {
986		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
987		if (location.objectid != objectid)
988			goto out;
989	} else {
990		goto out;
991	}
992
993	btrfs_release_path(path);
994	di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, 0);
995	if (IS_ERR(di)) {
996		ret = PTR_ERR(di);
997		goto out;
998	} else if (di) {
999		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
1000		if (location.objectid == objectid)
1001			ret = 1;
1002	}
1003out:
1004	btrfs_release_path(path);
1005	return ret;
1006}
1007
1008/*
1009 * helper function to check a log tree for a named back reference in
1010 * an inode.  This is used to decide if a back reference that is
1011 * found in the subvolume conflicts with what we find in the log.
1012 *
1013 * inode backreferences may have multiple refs in a single item,
1014 * during replay we process one reference at a time, and we don't
1015 * want to delete valid links to a file from the subvolume if that
1016 * link is also in the log.
1017 */
1018static noinline int backref_in_log(struct btrfs_root *log,
1019				   struct btrfs_key *key,
1020				   u64 ref_objectid,
1021				   const struct fscrypt_str *name)
1022{
1023	struct btrfs_path *path;
1024	int ret;
1025
1026	path = btrfs_alloc_path();
1027	if (!path)
1028		return -ENOMEM;
1029
1030	ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
1031	if (ret < 0) {
1032		goto out;
1033	} else if (ret == 1) {
1034		ret = 0;
1035		goto out;
1036	}
1037
1038	if (key->type == BTRFS_INODE_EXTREF_KEY)
1039		ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
1040						       path->slots[0],
1041						       ref_objectid, name);
1042	else
1043		ret = !!btrfs_find_name_in_backref(path->nodes[0],
1044						   path->slots[0], name);
1045out:
1046	btrfs_free_path(path);
1047	return ret;
1048}
1049
1050static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
1051				  struct btrfs_root *root,
1052				  struct btrfs_path *path,
1053				  struct btrfs_root *log_root,
1054				  struct btrfs_inode *dir,
1055				  struct btrfs_inode *inode,
1056				  u64 inode_objectid, u64 parent_objectid,
1057				  u64 ref_index, struct fscrypt_str *name)
1058{
1059	int ret;
1060	struct extent_buffer *leaf;
1061	struct btrfs_dir_item *di;
1062	struct btrfs_key search_key;
1063	struct btrfs_inode_extref *extref;
1064
1065again:
1066	/* Search old style refs */
1067	search_key.objectid = inode_objectid;
1068	search_key.type = BTRFS_INODE_REF_KEY;
1069	search_key.offset = parent_objectid;
1070	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
1071	if (ret == 0) {
1072		struct btrfs_inode_ref *victim_ref;
1073		unsigned long ptr;
1074		unsigned long ptr_end;
1075
1076		leaf = path->nodes[0];
1077
1078		/* are we trying to overwrite a back ref for the root directory?
1079		 * If so, just jump out, we're done
1080		 */
1081		if (search_key.objectid == search_key.offset)
1082			return 1;
1083
1084		/* check all the names in this back reference to see
1085		 * if they are in the log.  If so, we allow them to stay,
1086		 * otherwise they must be unlinked as a conflict
1087		 */
1088		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
1089		ptr_end = ptr + btrfs_item_size(leaf, path->slots[0]);
1090		while (ptr < ptr_end) {
1091			struct fscrypt_str victim_name;
1092
1093			victim_ref = (struct btrfs_inode_ref *)ptr;
1094			ret = read_alloc_one_name(leaf, (victim_ref + 1),
1095				 btrfs_inode_ref_name_len(leaf, victim_ref),
1096				 &victim_name);
1097			if (ret)
1098				return ret;
1099
1100			ret = backref_in_log(log_root, &search_key,
1101					     parent_objectid, &victim_name);
1102			if (ret < 0) {
1103				kfree(victim_name.name);
1104				return ret;
1105			} else if (!ret) {
1106				inc_nlink(&inode->vfs_inode);
1107				btrfs_release_path(path);
1108
1109				ret = unlink_inode_for_log_replay(trans, dir, inode,
1110						&victim_name);
1111				kfree(victim_name.name);
1112				if (ret)
1113					return ret;
1114				goto again;
1115			}
1116			kfree(victim_name.name);
1117
1118			ptr = (unsigned long)(victim_ref + 1) + victim_name.len;
1119		}
1120	}
1121	btrfs_release_path(path);
1122
1123	/* Same search but for extended refs */
1124	extref = btrfs_lookup_inode_extref(NULL, root, path, name,
1125					   inode_objectid, parent_objectid, 0,
1126					   0);
1127	if (IS_ERR(extref)) {
1128		return PTR_ERR(extref);
1129	} else if (extref) {
1130		u32 item_size;
1131		u32 cur_offset = 0;
1132		unsigned long base;
1133		struct inode *victim_parent;
1134
1135		leaf = path->nodes[0];
1136
1137		item_size = btrfs_item_size(leaf, path->slots[0]);
1138		base = btrfs_item_ptr_offset(leaf, path->slots[0]);
1139
1140		while (cur_offset < item_size) {
1141			struct fscrypt_str victim_name;
1142
1143			extref = (struct btrfs_inode_extref *)(base + cur_offset);
1144
1145			if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
1146				goto next;
1147
1148			ret = read_alloc_one_name(leaf, &extref->name,
1149				 btrfs_inode_extref_name_len(leaf, extref),
1150				 &victim_name);
1151			if (ret)
1152				return ret;
1153
1154			search_key.objectid = inode_objectid;
1155			search_key.type = BTRFS_INODE_EXTREF_KEY;
1156			search_key.offset = btrfs_extref_hash(parent_objectid,
1157							      victim_name.name,
1158							      victim_name.len);
1159			ret = backref_in_log(log_root, &search_key,
1160					     parent_objectid, &victim_name);
1161			if (ret < 0) {
1162				kfree(victim_name.name);
1163				return ret;
1164			} else if (!ret) {
1165				ret = -ENOENT;
1166				victim_parent = read_one_inode(root,
1167						parent_objectid);
1168				if (victim_parent) {
1169					inc_nlink(&inode->vfs_inode);
1170					btrfs_release_path(path);
1171
1172					ret = unlink_inode_for_log_replay(trans,
1173							BTRFS_I(victim_parent),
1174							inode, &victim_name);
1175				}
1176				iput(victim_parent);
1177				kfree(victim_name.name);
1178				if (ret)
1179					return ret;
1180				goto again;
1181			}
1182			kfree(victim_name.name);
1183next:
1184			cur_offset += victim_name.len + sizeof(*extref);
1185		}
1186	}
1187	btrfs_release_path(path);
1188
1189	/* look for a conflicting sequence number */
1190	di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
1191					 ref_index, name, 0);
1192	if (IS_ERR(di)) {
1193		return PTR_ERR(di);
1194	} else if (di) {
1195		ret = drop_one_dir_item(trans, path, dir, di);
1196		if (ret)
1197			return ret;
1198	}
1199	btrfs_release_path(path);
1200
1201	/* look for a conflicting name */
1202	di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir), name, 0);
1203	if (IS_ERR(di)) {
1204		return PTR_ERR(di);
1205	} else if (di) {
1206		ret = drop_one_dir_item(trans, path, dir, di);
1207		if (ret)
1208			return ret;
1209	}
1210	btrfs_release_path(path);
1211
1212	return 0;
1213}
1214
1215static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
1216			     struct fscrypt_str *name, u64 *index,
1217			     u64 *parent_objectid)
1218{
1219	struct btrfs_inode_extref *extref;
1220	int ret;
1221
1222	extref = (struct btrfs_inode_extref *)ref_ptr;
1223
1224	ret = read_alloc_one_name(eb, &extref->name,
1225				  btrfs_inode_extref_name_len(eb, extref), name);
1226	if (ret)
1227		return ret;
1228
1229	if (index)
1230		*index = btrfs_inode_extref_index(eb, extref);
1231	if (parent_objectid)
1232		*parent_objectid = btrfs_inode_extref_parent(eb, extref);
1233
1234	return 0;
1235}
1236
1237static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
1238			  struct fscrypt_str *name, u64 *index)
1239{
1240	struct btrfs_inode_ref *ref;
1241	int ret;
1242
1243	ref = (struct btrfs_inode_ref *)ref_ptr;
1244
1245	ret = read_alloc_one_name(eb, ref + 1, btrfs_inode_ref_name_len(eb, ref),
1246				  name);
1247	if (ret)
1248		return ret;
1249
1250	if (index)
1251		*index = btrfs_inode_ref_index(eb, ref);
1252
1253	return 0;
1254}
1255
1256/*
1257 * Take an inode reference item from the log tree and iterate all names from the
1258 * inode reference item in the subvolume tree with the same key (if it exists).
1259 * For any name that is not in the inode reference item from the log tree, do a
1260 * proper unlink of that name (that is, remove its entry from the inode
1261 * reference item and both dir index keys).
1262 */
1263static int unlink_old_inode_refs(struct btrfs_trans_handle *trans,
1264				 struct btrfs_root *root,
1265				 struct btrfs_path *path,
1266				 struct btrfs_inode *inode,
1267				 struct extent_buffer *log_eb,
1268				 int log_slot,
1269				 struct btrfs_key *key)
1270{
1271	int ret;
1272	unsigned long ref_ptr;
1273	unsigned long ref_end;
1274	struct extent_buffer *eb;
1275
1276again:
1277	btrfs_release_path(path);
1278	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
1279	if (ret > 0) {
1280		ret = 0;
1281		goto out;
1282	}
1283	if (ret < 0)
1284		goto out;
1285
1286	eb = path->nodes[0];
1287	ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
1288	ref_end = ref_ptr + btrfs_item_size(eb, path->slots[0]);
1289	while (ref_ptr < ref_end) {
1290		struct fscrypt_str name;
1291		u64 parent_id;
1292
1293		if (key->type == BTRFS_INODE_EXTREF_KEY) {
1294			ret = extref_get_fields(eb, ref_ptr, &name,
1295						NULL, &parent_id);
1296		} else {
1297			parent_id = key->offset;
1298			ret = ref_get_fields(eb, ref_ptr, &name, NULL);
1299		}
1300		if (ret)
1301			goto out;
1302
1303		if (key->type == BTRFS_INODE_EXTREF_KEY)
1304			ret = !!btrfs_find_name_in_ext_backref(log_eb, log_slot,
1305							       parent_id, &name);
1306		else
1307			ret = !!btrfs_find_name_in_backref(log_eb, log_slot, &name);
1308
1309		if (!ret) {
1310			struct inode *dir;
1311
1312			btrfs_release_path(path);
1313			dir = read_one_inode(root, parent_id);
1314			if (!dir) {
1315				ret = -ENOENT;
1316				kfree(name.name);
1317				goto out;
1318			}
1319			ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir),
1320						 inode, &name);
1321			kfree(name.name);
1322			iput(dir);
1323			if (ret)
1324				goto out;
1325			goto again;
1326		}
1327
1328		kfree(name.name);
1329		ref_ptr += name.len;
1330		if (key->type == BTRFS_INODE_EXTREF_KEY)
1331			ref_ptr += sizeof(struct btrfs_inode_extref);
1332		else
1333			ref_ptr += sizeof(struct btrfs_inode_ref);
1334	}
1335	ret = 0;
1336 out:
1337	btrfs_release_path(path);
1338	return ret;
1339}
1340
1341/*
1342 * replay one inode back reference item found in the log tree.
1343 * eb, slot and key refer to the buffer and key found in the log tree.
1344 * root is the destination we are replaying into, and path is for temp
1345 * use by this function.  (it should be released on return).
1346 */
1347static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
1348				  struct btrfs_root *root,
1349				  struct btrfs_root *log,
1350				  struct btrfs_path *path,
1351				  struct extent_buffer *eb, int slot,
1352				  struct btrfs_key *key)
1353{
1354	struct inode *dir = NULL;
1355	struct inode *inode = NULL;
1356	unsigned long ref_ptr;
1357	unsigned long ref_end;
1358	struct fscrypt_str name;
1359	int ret;
1360	int log_ref_ver = 0;
1361	u64 parent_objectid;
1362	u64 inode_objectid;
1363	u64 ref_index = 0;
1364	int ref_struct_size;
1365
1366	ref_ptr = btrfs_item_ptr_offset(eb, slot);
1367	ref_end = ref_ptr + btrfs_item_size(eb, slot);
1368
1369	if (key->type == BTRFS_INODE_EXTREF_KEY) {
1370		struct btrfs_inode_extref *r;
1371
1372		ref_struct_size = sizeof(struct btrfs_inode_extref);
1373		log_ref_ver = 1;
1374		r = (struct btrfs_inode_extref *)ref_ptr;
1375		parent_objectid = btrfs_inode_extref_parent(eb, r);
1376	} else {
1377		ref_struct_size = sizeof(struct btrfs_inode_ref);
1378		parent_objectid = key->offset;
1379	}
1380	inode_objectid = key->objectid;
1381
1382	/*
1383	 * it is possible that we didn't log all the parent directories
1384	 * for a given inode.  If we don't find the dir, just don't
1385	 * copy the back ref in.  The link count fixup code will take
1386	 * care of the rest
1387	 */
1388	dir = read_one_inode(root, parent_objectid);
1389	if (!dir) {
1390		ret = -ENOENT;
1391		goto out;
1392	}
1393
1394	inode = read_one_inode(root, inode_objectid);
1395	if (!inode) {
1396		ret = -EIO;
1397		goto out;
1398	}
1399
1400	while (ref_ptr < ref_end) {
1401		if (log_ref_ver) {
1402			ret = extref_get_fields(eb, ref_ptr, &name,
1403						&ref_index, &parent_objectid);
1404			/*
1405			 * parent object can change from one array
1406			 * item to another.
1407			 */
1408			if (!dir)
1409				dir = read_one_inode(root, parent_objectid);
1410			if (!dir) {
1411				ret = -ENOENT;
1412				goto out;
1413			}
1414		} else {
1415			ret = ref_get_fields(eb, ref_ptr, &name, &ref_index);
1416		}
1417		if (ret)
1418			goto out;
1419
1420		ret = inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)),
1421				   btrfs_ino(BTRFS_I(inode)), ref_index, &name);
1422		if (ret < 0) {
1423			goto out;
1424		} else if (ret == 0) {
1425			/*
1426			 * look for a conflicting back reference in the
1427			 * metadata. if we find one we have to unlink that name
1428			 * of the file before we add our new link.  Later on, we
1429			 * overwrite any existing back reference, and we don't
1430			 * want to create dangling pointers in the directory.
1431			 */
1432			ret = __add_inode_ref(trans, root, path, log,
1433					      BTRFS_I(dir), BTRFS_I(inode),
1434					      inode_objectid, parent_objectid,
1435					      ref_index, &name);
1436			if (ret) {
1437				if (ret == 1)
1438					ret = 0;
1439				goto out;
1440			}
1441
1442			/* insert our name */
1443			ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
1444					     &name, 0, ref_index);
1445			if (ret)
1446				goto out;
1447
1448			ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
1449			if (ret)
1450				goto out;
1451		}
1452		/* Else, ret == 1, we already have a perfect match, we're done. */
1453
1454		ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + name.len;
1455		kfree(name.name);
1456		name.name = NULL;
1457		if (log_ref_ver) {
1458			iput(dir);
1459			dir = NULL;
1460		}
1461	}
1462
1463	/*
1464	 * Before we overwrite the inode reference item in the subvolume tree
1465	 * with the item from the log tree, we must unlink all names from the
1466	 * parent directory that are in the subvolume's tree inode reference
1467	 * item, otherwise we end up with an inconsistent subvolume tree where
1468	 * dir index entries exist for a name but there is no inode reference
1469	 * item with the same name.
1470	 */
1471	ret = unlink_old_inode_refs(trans, root, path, BTRFS_I(inode), eb, slot,
1472				    key);
1473	if (ret)
1474		goto out;
1475
1476	/* finally write the back reference in the inode */
1477	ret = overwrite_item(trans, root, path, eb, slot, key);
1478out:
1479	btrfs_release_path(path);
1480	kfree(name.name);
1481	iput(dir);
1482	iput(inode);
1483	return ret;
1484}
1485
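/*
 * Count the names stored in extended reference (INODE_EXTREF) items for the
 * given inode by walking its extref items in the given root. Returns the
 * number of names found, or a negative errno other than -ENOENT on failure.
 */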
1486static int count_inode_extrefs(struct btrfs_root *root,
1487		struct btrfs_inode *inode, struct btrfs_path *path)
1488{
1489	int ret = 0;
1490	int name_len;
1491	unsigned int nlink = 0;
1492	u32 item_size;
1493	u32 cur_offset = 0;
1494	u64 inode_objectid = btrfs_ino(inode);
1495	u64 offset = 0;
1496	unsigned long ptr;
1497	struct btrfs_inode_extref *extref;
1498	struct extent_buffer *leaf;
1499
1500	while (1) {
1501		ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
1502					    &extref, &offset);
1503		if (ret)
1504			break;
1505
1506		leaf = path->nodes[0];
1507		item_size = btrfs_item_size(leaf, path->slots[0]);
1508		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
1509		cur_offset = 0;
1510
1511		while (cur_offset < item_size) {
1512			extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
1513			name_len = btrfs_inode_extref_name_len(leaf, extref);
1514
1515			nlink++;
1516
1517			cur_offset += name_len + sizeof(*extref);
1518		}
1519
1520		offset++;
1521		btrfs_release_path(path);
1522	}
1523	btrfs_release_path(path);
1524
1525	if (ret < 0 && ret != -ENOENT)
1526		return ret;
1527	return nlink;
1528}
1529
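/*
 * Count the names stored in regular inode reference items for the given
 * inode by walking all of its BTRFS_INODE_REF_KEY items in the given root.
 */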
1530static int count_inode_refs(struct btrfs_root *root,
1531			struct btrfs_inode *inode, struct btrfs_path *path)
1532{
1533	int ret;
1534	struct btrfs_key key;
1535	unsigned int nlink = 0;
1536	unsigned long ptr;
1537	unsigned long ptr_end;
1538	int name_len;
1539	u64 ino = btrfs_ino(inode);
1540
1541	key.objectid = ino;
1542	key.type = BTRFS_INODE_REF_KEY;
1543	key.offset = (u64)-1;
1544
1545	while (1) {
1546		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1547		if (ret < 0)
1548			break;
1549		if (ret > 0) {
1550			if (path->slots[0] == 0)
1551				break;
1552			path->slots[0]--;
1553		}
1554process_slot:
1555		btrfs_item_key_to_cpu(path->nodes[0], &key,
1556				      path->slots[0]);
1557		if (key.objectid != ino ||
1558		    key.type != BTRFS_INODE_REF_KEY)
1559			break;
1560		ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
1561		ptr_end = ptr + btrfs_item_size(path->nodes[0],
1562						   path->slots[0]);
1563		while (ptr < ptr_end) {
1564			struct btrfs_inode_ref *ref;
1565
1566			ref = (struct btrfs_inode_ref *)ptr;
1567			name_len = btrfs_inode_ref_name_len(path->nodes[0],
1568							    ref);
1569			ptr = (unsigned long)(ref + 1) + name_len;
1570			nlink++;
1571		}
1572
1573		if (key.offset == 0)
1574			break;
1575		if (path->slots[0] > 0) {
1576			path->slots[0]--;
1577			goto process_slot;
1578		}
1579		key.offset--;
1580		btrfs_release_path(path);
1581	}
1582	btrfs_release_path(path);
1583
1584	return nlink;
1585}
1586
1587/*
1588 * There are a few corner cases where the link count of the file can't
1589 * be properly maintained during replay.  So, instead of adding
1590 * lots of complexity to the log code, we just scan the backrefs
1591 * for any file that has been through replay.
1592 *
1593 * The scan will update the link count on the inode to reflect the
1594 * number of back refs found.  If it goes down to zero, the iput
1595 * will free the inode.
1596 */
1597static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
1598					   struct btrfs_root *root,
1599					   struct inode *inode)
1600{
1601	struct btrfs_path *path;
1602	int ret;
1603	u64 nlink = 0;
1604	u64 ino = btrfs_ino(BTRFS_I(inode));
1605
1606	path = btrfs_alloc_path();
1607	if (!path)
1608		return -ENOMEM;
1609
1610	ret = count_inode_refs(root, BTRFS_I(inode), path);
1611	if (ret < 0)
1612		goto out;
1613
1614	nlink = ret;
1615
1616	ret = count_inode_extrefs(root, BTRFS_I(inode), path);
1617	if (ret < 0)
1618		goto out;
1619
1620	nlink += ret;
1621
1622	ret = 0;
1623
1624	if (nlink != inode->i_nlink) {
1625		set_nlink(inode, nlink);
1626		ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
1627		if (ret)
1628			goto out;
1629	}
1630	BTRFS_I(inode)->index_cnt = (u64)-1;
1631
1632	if (inode->i_nlink == 0) {
1633		if (S_ISDIR(inode->i_mode)) {
1634			ret = replay_dir_deletes(trans, root, NULL, path,
1635						 ino, 1);
1636			if (ret)
1637				goto out;
1638		}
1639		ret = btrfs_insert_orphan_item(trans, root, ino);
1640		if (ret == -EEXIST)
1641			ret = 0;
1642	}
1643
1644out:
1645	btrfs_free_path(path);
1646	return ret;
1647}
1648
1649static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
1650					    struct btrfs_root *root,
1651					    struct btrfs_path *path)
1652{
1653	int ret;
1654	struct btrfs_key key;
1655	struct inode *inode;
1656
1657	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1658	key.type = BTRFS_ORPHAN_ITEM_KEY;
1659	key.offset = (u64)-1;
1660	while (1) {
1661		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1662		if (ret < 0)
1663			break;
1664
1665		if (ret == 1) {
1666			ret = 0;
1667			if (path->slots[0] == 0)
1668				break;
1669			path->slots[0]--;
1670		}
1671
1672		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1673		if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
1674		    key.type != BTRFS_ORPHAN_ITEM_KEY)
1675			break;
1676
1677		ret = btrfs_del_item(trans, root, path);
1678		if (ret)
1679			break;
1680
1681		btrfs_release_path(path);
1682		inode = read_one_inode(root, key.offset);
1683		if (!inode) {
1684			ret = -EIO;
1685			break;
1686		}
1687
1688		ret = fixup_inode_link_count(trans, root, inode);
1689		iput(inode);
1690		if (ret)
1691			break;
1692
1693		/*
1694		 * fixup on a directory may create new entries,
1695		 * make sure we always look for the highest possible
1696		 * offset
1697		 */
1698		key.offset = (u64)-1;
1699	}
1700	btrfs_release_path(path);
1701	return ret;
1702}
1703
1704
1705/*
1706 * record a given inode in the fixup dir so we can check its link
1707 * count when replay is done.  The link count is incremented here
1708 * so the inode won't go away until we check it
1709 */
1710static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
1711				      struct btrfs_root *root,
1712				      struct btrfs_path *path,
1713				      u64 objectid)
1714{
1715	struct btrfs_key key;
1716	int ret = 0;
1717	struct inode *inode;
1718
1719	inode = read_one_inode(root, objectid);
1720	if (!inode)
1721		return -EIO;
1722
1723	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1724	key.type = BTRFS_ORPHAN_ITEM_KEY;
1725	key.offset = objectid;
1726
1727	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1728
1729	btrfs_release_path(path);
1730	if (ret == 0) {
1731		if (!inode->i_nlink)
1732			set_nlink(inode, 1);
1733		else
1734			inc_nlink(inode);
1735		ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
1736	} else if (ret == -EEXIST) {
1737		ret = 0;
1738	}
1739	iput(inode);
1740
1741	return ret;
1742}
1743
1744/*
1745 * when replaying the log for a directory, we only insert names
1746 * for inodes that actually exist.  This means an fsync on a directory
1747 * does not implicitly fsync all the new files in it
1748 */
1749static noinline int insert_one_name(struct btrfs_trans_handle *trans,
1750				    struct btrfs_root *root,
1751				    u64 dirid, u64 index,
1752				    const struct fscrypt_str *name,
1753				    struct btrfs_key *location)
1754{
1755	struct inode *inode;
1756	struct inode *dir;
1757	int ret;
1758
1759	inode = read_one_inode(root, location->objectid);
1760	if (!inode)
1761		return -ENOENT;
1762
1763	dir = read_one_inode(root, dirid);
1764	if (!dir) {
1765		iput(inode);
1766		return -EIO;
1767	}
1768
1769	ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
1770			     1, index);
1771
1772	/* FIXME, put inode into FIXUP list */
1773
1774	iput(inode);
1775	iput(dir);
1776	return ret;
1777}
1778
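/*
 * Deal with a directory entry in the subvolume that conflicts with an entry
 * found in the log. Returns 1 if the existing entry already matches the
 * logged one (nothing to delete), 0 if it is kept because the inode for the
 * new entry does not exist, and otherwise the result of dropping the entry.
 */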
1779static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans,
1780					struct btrfs_inode *dir,
1781					struct btrfs_path *path,
1782					struct btrfs_dir_item *dst_di,
1783					const struct btrfs_key *log_key,
1784					u8 log_flags,
1785					bool exists)
1786{
1787	struct btrfs_key found_key;
1788
1789	btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
1790	/* The existing dentry points to the same inode, don't delete it. */
1791	if (found_key.objectid == log_key->objectid &&
1792	    found_key.type == log_key->type &&
1793	    found_key.offset == log_key->offset &&
1794	    btrfs_dir_flags(path->nodes[0], dst_di) == log_flags)
1795		return 1;
1796
1797	/*
1798	 * Don't drop the conflicting directory entry if the inode for the new
1799	 * entry doesn't exist.
1800	 */
1801	if (!exists)
1802		return 0;
1803
1804	return drop_one_dir_item(trans, path, dir, dst_di);
1805}
1806
1807/*
1808 * take a single entry in a log directory item and replay it into
1809 * the subvolume.
1810 *
1811 * if a conflicting item exists in the subdirectory already,
1812 * the inode it points to is unlinked and put into the link count
1813 * fix up tree.
1814 *
1815 * If a name from the log points to a file or directory that does
1816 * not exist in the FS, it is skipped.  fsyncs on directories
1817 * do not force down inodes inside that directory, just changes to the
1818 * names or unlinks in a directory.
1819 *
1820 * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
1821 * non-existing inode) and 1 if the name was replayed.
1822 */
1823static noinline int replay_one_name(struct btrfs_trans_handle *trans,
1824				    struct btrfs_root *root,
1825				    struct btrfs_path *path,
1826				    struct extent_buffer *eb,
1827				    struct btrfs_dir_item *di,
1828				    struct btrfs_key *key)
1829{
1830	struct fscrypt_str name;
1831	struct btrfs_dir_item *dir_dst_di;
1832	struct btrfs_dir_item *index_dst_di;
1833	bool dir_dst_matches = false;
1834	bool index_dst_matches = false;
1835	struct btrfs_key log_key;
1836	struct btrfs_key search_key;
1837	struct inode *dir;
1838	u8 log_flags;
1839	bool exists;
1840	int ret;
1841	bool update_size = true;
1842	bool name_added = false;
1843
1844	dir = read_one_inode(root, key->objectid);
1845	if (!dir)
1846		return -EIO;
1847
1848	ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name);
1849	if (ret)
1850		goto out;
1851
1852	log_flags = btrfs_dir_flags(eb, di);
1853	btrfs_dir_item_key_to_cpu(eb, di, &log_key);
1854	ret = btrfs_lookup_inode(trans, root, path, &log_key, 0);
1855	btrfs_release_path(path);
1856	if (ret < 0)
1857		goto out;
1858	exists = (ret == 0);
1859	ret = 0;
1860
1861	dir_dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
1862					   &name, 1);
1863	if (IS_ERR(dir_dst_di)) {
1864		ret = PTR_ERR(dir_dst_di);
1865		goto out;
1866	} else if (dir_dst_di) {
1867		ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path,
1868						   dir_dst_di, &log_key,
1869						   log_flags, exists);
1870		if (ret < 0)
1871			goto out;
1872		dir_dst_matches = (ret == 1);
1873	}
1874
1875	btrfs_release_path(path);
1876
1877	index_dst_di = btrfs_lookup_dir_index_item(trans, root, path,
1878						   key->objectid, key->offset,
1879						   &name, 1);
1880	if (IS_ERR(index_dst_di)) {
1881		ret = PTR_ERR(index_dst_di);
1882		goto out;
1883	} else if (index_dst_di) {
1884		ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path,
1885						   index_dst_di, &log_key,
1886						   log_flags, exists);
1887		if (ret < 0)
1888			goto out;
1889		index_dst_matches = (ret == 1);
1890	}
1891
1892	btrfs_release_path(path);
1893
1894	if (dir_dst_matches && index_dst_matches) {
1895		ret = 0;
1896		update_size = false;
1897		goto out;
1898	}
1899
1900	/*
1901	 * Check if the inode reference exists in the log for the given name,
1902	 * inode and parent inode
1903	 */
1904	search_key.objectid = log_key.objectid;
1905	search_key.type = BTRFS_INODE_REF_KEY;
1906	search_key.offset = key->objectid;
1907	ret = backref_in_log(root->log_root, &search_key, 0, &name);
1908	if (ret < 0) {
1909		goto out;
1910	} else if (ret) {
1911		/* The dentry will be added later. */
1912		ret = 0;
1913		update_size = false;
1914		goto out;
1915	}
1916
1917	search_key.objectid = log_key.objectid;
1918	search_key.type = BTRFS_INODE_EXTREF_KEY;
1919	search_key.offset = key->objectid;
1920	ret = backref_in_log(root->log_root, &search_key, key->objectid, &name);
1921	if (ret < 0) {
1922		goto out;
1923	} else if (ret) {
1924		/* The dentry will be added later. */
1925		ret = 0;
1926		update_size = false;
1927		goto out;
1928	}
1929	btrfs_release_path(path);
1930	ret = insert_one_name(trans, root, key->objectid, key->offset,
1931			      &name, &log_key);
1932	if (ret && ret != -ENOENT && ret != -EEXIST)
1933		goto out;
1934	if (!ret)
1935		name_added = true;
1936	update_size = false;
1937	ret = 0;
1938
1939out:
1940	if (!ret && update_size) {
1941		btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name.len * 2);
1942		ret = btrfs_update_inode(trans, root, BTRFS_I(dir));
1943	}
1944	kfree(name.name);
1945	iput(dir);
1946	if (!ret && name_added)
1947		ret = 1;
1948	return ret;
1949}
1950
1951/* Replay one dir item from a BTRFS_DIR_INDEX_KEY key. */
1952static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
1953					struct btrfs_root *root,
1954					struct btrfs_path *path,
1955					struct extent_buffer *eb, int slot,
1956					struct btrfs_key *key)
1957{
1958	int ret;
1959	struct btrfs_dir_item *di;
1960
1961	/* We only log dir index keys, which only contain a single dir item. */
1962	ASSERT(key->type == BTRFS_DIR_INDEX_KEY);
1963
1964	di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
1965	ret = replay_one_name(trans, root, path, eb, di, key);
1966	if (ret < 0)
1967		return ret;
1968
1969	/*
1970	 * If this entry refers to a non-directory (directories can not have a
1971	 * link count > 1) and it was added in the transaction that was not
1972	 * committed, make sure we fixup the link count of the inode the entry
1973	 * points to. Otherwise something like the following would result in a
1974	 * directory pointing to an inode with a wrong link count that does not
1975	 * account for this dir entry:
1976	 *
1977	 * mkdir testdir
1978	 * touch testdir/foo
1979	 * touch testdir/bar
1980	 * sync
1981	 *
1982	 * ln testdir/bar testdir/bar_link
1983	 * ln testdir/foo testdir/foo_link
1984	 * xfs_io -c "fsync" testdir/bar
1985	 *
1986	 * <power failure>
1987	 *
1988	 * mount fs, log replay happens
1989	 *
1990	 * File foo would remain with a link count of 1 when it has two entries
1991	 * pointing to it in the directory testdir. This would make it impossible
1992	 * to ever delete the parent directory as it would result in stale
1993	 * dentries that can never be deleted.
1994	 */
1995	if (ret == 1 && btrfs_dir_ftype(eb, di) != BTRFS_FT_DIR) {
1996		struct btrfs_path *fixup_path;
1997		struct btrfs_key di_key;
1998
1999		fixup_path = btrfs_alloc_path();
2000		if (!fixup_path)
2001			return -ENOMEM;
2002
2003		btrfs_dir_item_key_to_cpu(eb, di, &di_key);
2004		ret = link_to_fixup_dir(trans, root, fixup_path, di_key.objectid);
2005		btrfs_free_path(fixup_path);
2006	}
2007
2008	return ret;
2009}
2010
2011/*
2012 * directory replay has two parts.  There are the standard directory
2013 * items in the log copied from the subvolume, and range items
2014 * created in the log while the subvolume was logged.
2015 *
2016 * The range items tell us which parts of the key space the log
2017 * is authoritative for.  During replay, if a key in the subvolume
2018 * directory is in a logged range item, but not actually in the log,
2019 * that means it was deleted from the directory before the fsync
2020 * and should be removed.
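 *
 * For example (illustrative): a BTRFS_DIR_LOG_INDEX_KEY item with key
 * offset 100 and an end value of 200 says the log is authoritative for
 * dir index keys 100..200 of that directory, so any subvolume index key
 * in that range with no matching entry in the log is deleted on replay.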
2021 */
2022static noinline int find_dir_range(struct btrfs_root *root,
2023				   struct btrfs_path *path,
2024				   u64 dirid,
2025				   u64 *start_ret, u64 *end_ret)
2026{
2027	struct btrfs_key key;
2028	u64 found_end;
2029	struct btrfs_dir_log_item *item;
2030	int ret;
2031	int nritems;
2032
2033	if (*start_ret == (u64)-1)
2034		return 1;
2035
2036	key.objectid = dirid;
2037	key.type = BTRFS_DIR_LOG_INDEX_KEY;
2038	key.offset = *start_ret;
2039
2040	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2041	if (ret < 0)
2042		goto out;
2043	if (ret > 0) {
2044		if (path->slots[0] == 0)
2045			goto out;
2046		path->slots[0]--;
2047	}
2048	if (ret != 0)
2049		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2050
2051	if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) {
2052		ret = 1;
2053		goto next;
2054	}
2055	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2056			      struct btrfs_dir_log_item);
2057	found_end = btrfs_dir_log_end(path->nodes[0], item);
2058
2059	if (*start_ret >= key.offset && *start_ret <= found_end) {
2060		ret = 0;
2061		*start_ret = key.offset;
2062		*end_ret = found_end;
2063		goto out;
2064	}
2065	ret = 1;
2066next:
2067	/* check the next slot in the tree to see if it is a valid item */
2068	nritems = btrfs_header_nritems(path->nodes[0]);
2069	path->slots[0]++;
2070	if (path->slots[0] >= nritems) {
2071		ret = btrfs_next_leaf(root, path);
2072		if (ret)
2073			goto out;
2074	}
2075
2076	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2077
2078	if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) {
2079		ret = 1;
2080		goto out;
2081	}
2082	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2083			      struct btrfs_dir_log_item);
2084	found_end = btrfs_dir_log_end(path->nodes[0], item);
2085	*start_ret = key.offset;
2086	*end_ret = found_end;
2087	ret = 0;
2088out:
2089	btrfs_release_path(path);
2090	return ret;
2091}
2092
2093/*
2094 * this looks for a given directory item in the log.  If the directory
2095 * item is not in the log, the item is removed and the inode it points
2096 * to is unlinked
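 *
 * For example (illustrative): if the subvolume still has a dir index key
 * (dirid, BTRFS_DIR_INDEX_KEY, 42) for name "foo" but the log has no
 * entry at index 42, the name is unlinked from the directory and the
 * inode it pointed to is added to the link count fixup tree.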
2097 */
2098static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
2099				      struct btrfs_root *log,
2100				      struct btrfs_path *path,
2101				      struct btrfs_path *log_path,
2102				      struct inode *dir,
2103				      struct btrfs_key *dir_key)
2104{
2105	struct btrfs_root *root = BTRFS_I(dir)->root;
2106	int ret;
2107	struct extent_buffer *eb;
2108	int slot;
2109	struct btrfs_dir_item *di;
2110	struct fscrypt_str name;
2111	struct inode *inode = NULL;
2112	struct btrfs_key location;
2113
2114	/*
2115	 * Currently we only log dir index keys. Even if we replay a log created
2116	 * by an older kernel that logged both dir index and dir item keys, all
2117	 * we need to do is process the dir index keys, we (and our caller) can
2118	 * safely ignore dir item keys (key type BTRFS_DIR_ITEM_KEY).
2119	 */
2120	ASSERT(dir_key->type == BTRFS_DIR_INDEX_KEY);
2121
2122	eb = path->nodes[0];
2123	slot = path->slots[0];
2124	di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
2125	ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name);
2126	if (ret)
2127		goto out;
2128
2129	if (log) {
2130		struct btrfs_dir_item *log_di;
2131
2132		log_di = btrfs_lookup_dir_index_item(trans, log, log_path,
2133						     dir_key->objectid,
2134						     dir_key->offset, &name, 0);
2135		if (IS_ERR(log_di)) {
2136			ret = PTR_ERR(log_di);
2137			goto out;
2138		} else if (log_di) {
2139			/* The dentry exists in the log, we have nothing to do. */
2140			ret = 0;
2141			goto out;
2142		}
2143	}
2144
2145	btrfs_dir_item_key_to_cpu(eb, di, &location);
2146	btrfs_release_path(path);
2147	btrfs_release_path(log_path);
2148	inode = read_one_inode(root, location.objectid);
2149	if (!inode) {
2150		ret = -EIO;
2151		goto out;
2152	}
2153
2154	ret = link_to_fixup_dir(trans, root, path, location.objectid);
2155	if (ret)
2156		goto out;
2157
2158	inc_nlink(inode);
2159	ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), BTRFS_I(inode),
2160					  &name);
2161	/*
2162	 * Unlike dir item keys, dir index keys can only have one name (entry) in
2163	 * them, as there are no key collisions since each key has a unique offset
2164	 * (an index number), so we're done.
2165	 */
2166out:
2167	btrfs_release_path(path);
2168	btrfs_release_path(log_path);
2169	kfree(name.name);
2170	iput(inode);
2171	return ret;
2172}
2173
2174static int replay_xattr_deletes(struct btrfs_trans_handle *trans,
2175			      struct btrfs_root *root,
2176			      struct btrfs_root *log,
2177			      struct btrfs_path *path,
2178			      const u64 ino)
2179{
2180	struct btrfs_key search_key;
2181	struct btrfs_path *log_path;
2182	int i;
2183	int nritems;
2184	int ret;
2185
2186	log_path = btrfs_alloc_path();
2187	if (!log_path)
2188		return -ENOMEM;
2189
2190	search_key.objectid = ino;
2191	search_key.type = BTRFS_XATTR_ITEM_KEY;
2192	search_key.offset = 0;
2193again:
2194	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
2195	if (ret < 0)
2196		goto out;
2197process_leaf:
2198	nritems = btrfs_header_nritems(path->nodes[0]);
2199	for (i = path->slots[0]; i < nritems; i++) {
2200		struct btrfs_key key;
2201		struct btrfs_dir_item *di;
2202		struct btrfs_dir_item *log_di;
2203		u32 total_size;
2204		u32 cur;
2205
2206		btrfs_item_key_to_cpu(path->nodes[0], &key, i);
2207		if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) {
2208			ret = 0;
2209			goto out;
2210		}
2211
2212		di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item);
2213		total_size = btrfs_item_size(path->nodes[0], i);
2214		cur = 0;
2215		while (cur < total_size) {
2216			u16 name_len = btrfs_dir_name_len(path->nodes[0], di);
2217			u16 data_len = btrfs_dir_data_len(path->nodes[0], di);
2218			u32 this_len = sizeof(*di) + name_len + data_len;
2219			char *name;
2220
2221			name = kmalloc(name_len, GFP_NOFS);
2222			if (!name) {
2223				ret = -ENOMEM;
2224				goto out;
2225			}
2226			read_extent_buffer(path->nodes[0], name,
2227					   (unsigned long)(di + 1), name_len);
2228
2229			log_di = btrfs_lookup_xattr(NULL, log, log_path, ino,
2230						    name, name_len, 0);
2231			btrfs_release_path(log_path);
2232			if (!log_di) {
2233				/* Doesn't exist in log tree, so delete it. */
2234				btrfs_release_path(path);
2235				di = btrfs_lookup_xattr(trans, root, path, ino,
2236							name, name_len, -1);
2237				kfree(name);
2238				if (IS_ERR(di)) {
2239					ret = PTR_ERR(di);
2240					goto out;
2241				}
2242				ASSERT(di);
2243				ret = btrfs_delete_one_dir_name(trans, root,
2244								path, di);
2245				if (ret)
2246					goto out;
2247				btrfs_release_path(path);
2248				search_key = key;
2249				goto again;
2250			}
2251			kfree(name);
2252			if (IS_ERR(log_di)) {
2253				ret = PTR_ERR(log_di);
2254				goto out;
2255			}
2256			cur += this_len;
2257			di = (struct btrfs_dir_item *)((char *)di + this_len);
2258		}
2259	}
2260	ret = btrfs_next_leaf(root, path);
2261	if (ret > 0)
2262		ret = 0;
2263	else if (ret == 0)
2264		goto process_leaf;
2265out:
2266	btrfs_free_path(log_path);
2267	btrfs_release_path(path);
2268	return ret;
2269}
2270
2271
2272/*
2273 * deletion replay happens before we copy any new directory items
2274 * out of the log or out of backreferences from inodes.  It
2275 * scans the log to find ranges of keys that the log is authoritative for,
2276 * and then scans the directory to find items in those ranges that are
2277 * not present in the log.
2278 *
2279 * Anything we don't find in the log is unlinked and removed from the
2280 * directory.
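 *
 * A rough sketch of the loop below (illustrative, not exact code):
 *
 *   while (find_dir_range(log, path, dirid, &range_start, &range_end) == 0) {
 *           for each dir index key of 'dirid' in [range_start, range_end]:
 *                   check_item_in_log();  // unlinks entries missing from the log
 *           range_start = range_end + 1;
 *   }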
2281 */
2282static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
2283				       struct btrfs_root *root,
2284				       struct btrfs_root *log,
2285				       struct btrfs_path *path,
2286				       u64 dirid, int del_all)
2287{
2288	u64 range_start;
2289	u64 range_end;
2290	int ret = 0;
2291	struct btrfs_key dir_key;
2292	struct btrfs_key found_key;
2293	struct btrfs_path *log_path;
2294	struct inode *dir;
2295
2296	dir_key.objectid = dirid;
2297	dir_key.type = BTRFS_DIR_INDEX_KEY;
2298	log_path = btrfs_alloc_path();
2299	if (!log_path)
2300		return -ENOMEM;
2301
2302	dir = read_one_inode(root, dirid);
2303	/* it isn't an error if the inode isn't there, that can happen
2304	 * because we replay the deletes before we copy in the inode item
2305	 * from the log
2306	 */
2307	if (!dir) {
2308		btrfs_free_path(log_path);
2309		return 0;
2310	}
2311
2312	range_start = 0;
2313	range_end = 0;
2314	while (1) {
2315		if (del_all)
2316			range_end = (u64)-1;
2317		else {
2318			ret = find_dir_range(log, path, dirid,
2319					     &range_start, &range_end);
2320			if (ret < 0)
2321				goto out;
2322			else if (ret > 0)
2323				break;
2324		}
2325
2326		dir_key.offset = range_start;
2327		while (1) {
2328			int nritems;
2329			ret = btrfs_search_slot(NULL, root, &dir_key, path,
2330						0, 0);
2331			if (ret < 0)
2332				goto out;
2333
2334			nritems = btrfs_header_nritems(path->nodes[0]);
2335			if (path->slots[0] >= nritems) {
2336				ret = btrfs_next_leaf(root, path);
2337				if (ret == 1)
2338					break;
2339				else if (ret < 0)
2340					goto out;
2341			}
2342			btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2343					      path->slots[0]);
2344			if (found_key.objectid != dirid ||
2345			    found_key.type != dir_key.type) {
2346				ret = 0;
2347				goto out;
2348			}
2349
2350			if (found_key.offset > range_end)
2351				break;
2352
2353			ret = check_item_in_log(trans, log, path,
2354						log_path, dir,
2355						&found_key);
2356			if (ret)
2357				goto out;
2358			if (found_key.offset == (u64)-1)
2359				break;
2360			dir_key.offset = found_key.offset + 1;
2361		}
2362		btrfs_release_path(path);
2363		if (range_end == (u64)-1)
2364			break;
2365		range_start = range_end + 1;
2366	}
2367	ret = 0;
2368out:
2369	btrfs_release_path(path);
2370	btrfs_free_path(log_path);
2371	iput(dir);
2372	return ret;
2373}
2374
2375/*
2376 * the process_func used to replay items from the log tree.  This
2377 * gets called in two different stages.  The first stage just looks
2378 * for inodes and makes sure they are all copied into the subvolume.
2379 *
2380 * The second stage copies all the other item types from the log into
2381 * the subvolume.  The two stage approach is slower, but gets rid of
2382 * lots of complexity around inodes referencing other inodes that exist
2383 * only in the log (references come from either directory items or inode
2384 * back refs).
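 *
 * An illustrative mapping of the walk stages (see the LOG_WALK_* enum) to
 * what this function replays from each leaf:
 *
 *   LOG_WALK_REPLAY_INODES:    inode items (plus xattr and dir deletion replay)
 *   LOG_WALK_REPLAY_DIR_INDEX: dir index items
 *   LOG_WALK_REPLAY_ALL:       everything else (xattrs, inode refs, extents)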
2385 */
2386static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
2387			     struct walk_control *wc, u64 gen, int level)
2388{
2389	int nritems;
2390	struct btrfs_tree_parent_check check = {
2391		.transid = gen,
2392		.level = level
2393	};
2394	struct btrfs_path *path;
2395	struct btrfs_root *root = wc->replay_dest;
2396	struct btrfs_key key;
2397	int i;
2398	int ret;
2399
2400	ret = btrfs_read_extent_buffer(eb, &check);
2401	if (ret)
2402		return ret;
2403
2404	level = btrfs_header_level(eb);
2405
2406	if (level != 0)
2407		return 0;
2408
2409	path = btrfs_alloc_path();
2410	if (!path)
2411		return -ENOMEM;
2412
2413	nritems = btrfs_header_nritems(eb);
2414	for (i = 0; i < nritems; i++) {
2415		btrfs_item_key_to_cpu(eb, &key, i);
2416
2417		/* inode keys are done during the first stage */
2418		if (key.type == BTRFS_INODE_ITEM_KEY &&
2419		    wc->stage == LOG_WALK_REPLAY_INODES) {
2420			struct btrfs_inode_item *inode_item;
2421			u32 mode;
2422
2423			inode_item = btrfs_item_ptr(eb, i,
2424					    struct btrfs_inode_item);
2425			/*
2426			 * If we have a tmpfile (O_TMPFILE) that got fsync'ed
2427			 * and never got linked before the fsync, skip it, as
2428			 * replaying it is pointless since it would be deleted
2429			 * later. We skip logging tmpfiles, but it's always
2430			 * possible we are replaying a log created with a kernel
2431			 * that used to log tmpfiles.
2432			 */
2433			if (btrfs_inode_nlink(eb, inode_item) == 0) {
2434				wc->ignore_cur_inode = true;
2435				continue;
2436			} else {
2437				wc->ignore_cur_inode = false;
2438			}
2439			ret = replay_xattr_deletes(wc->trans, root, log,
2440						   path, key.objectid);
2441			if (ret)
2442				break;
2443			mode = btrfs_inode_mode(eb, inode_item);
2444			if (S_ISDIR(mode)) {
2445				ret = replay_dir_deletes(wc->trans,
2446					 root, log, path, key.objectid, 0);
2447				if (ret)
2448					break;
2449			}
2450			ret = overwrite_item(wc->trans, root, path,
2451					     eb, i, &key);
2452			if (ret)
2453				break;
2454
2455			/*
2456			 * Before replaying extents, truncate the inode to its
2457			 * size. We need to do it now and not after log replay
2458			 * because before an fsync we can have prealloc extents
2459			 * added beyond the inode's i_size. If we did it after,
2460			 * through orphan cleanup for example, we would drop
2461			 * those prealloc extents just after replaying them.
2462			 */
2463			if (S_ISREG(mode)) {
2464				struct btrfs_drop_extents_args drop_args = { 0 };
2465				struct inode *inode;
2466				u64 from;
2467
2468				inode = read_one_inode(root, key.objectid);
2469				if (!inode) {
2470					ret = -EIO;
2471					break;
2472				}
2473				from = ALIGN(i_size_read(inode),
2474					     root->fs_info->sectorsize);
2475				drop_args.start = from;
2476				drop_args.end = (u64)-1;
2477				drop_args.drop_cache = true;
2478				ret = btrfs_drop_extents(wc->trans, root,
2479							 BTRFS_I(inode),
2480							 &drop_args);
2481				if (!ret) {
2482					inode_sub_bytes(inode,
2483							drop_args.bytes_found);
2484					/* Update the inode's nbytes. */
2485					ret = btrfs_update_inode(wc->trans,
2486							root, BTRFS_I(inode));
2487				}
2488				iput(inode);
2489				if (ret)
2490					break;
2491			}
2492
2493			ret = link_to_fixup_dir(wc->trans, root,
2494						path, key.objectid);
2495			if (ret)
2496				break;
2497		}
2498
2499		if (wc->ignore_cur_inode)
2500			continue;
2501
2502		if (key.type == BTRFS_DIR_INDEX_KEY &&
2503		    wc->stage == LOG_WALK_REPLAY_DIR_INDEX) {
2504			ret = replay_one_dir_item(wc->trans, root, path,
2505						  eb, i, &key);
2506			if (ret)
2507				break;
2508		}
2509
2510		if (wc->stage < LOG_WALK_REPLAY_ALL)
2511			continue;
2512
2513		/* these keys are simply copied */
2514		if (key.type == BTRFS_XATTR_ITEM_KEY) {
2515			ret = overwrite_item(wc->trans, root, path,
2516					     eb, i, &key);
2517			if (ret)
2518				break;
2519		} else if (key.type == BTRFS_INODE_REF_KEY ||
2520			   key.type == BTRFS_INODE_EXTREF_KEY) {
2521			ret = add_inode_ref(wc->trans, root, log, path,
2522					    eb, i, &key);
2523			if (ret && ret != -ENOENT)
2524				break;
2525			ret = 0;
2526		} else if (key.type == BTRFS_EXTENT_DATA_KEY) {
2527			ret = replay_one_extent(wc->trans, root, path,
2528						eb, i, &key);
2529			if (ret)
2530				break;
2531		}
2532		/*
2533		 * We don't log BTRFS_DIR_ITEM_KEY keys anymore, only the
2534		 * BTRFS_DIR_INDEX_KEY items which we use to derive the
2535		 * BTRFS_DIR_ITEM_KEY items. If we are replaying a log from an
2536		 * older kernel with such keys, ignore them.
2537		 */
2538	}
2539	btrfs_free_path(path);
2540	return ret;
2541}
2542
2543/*
2544 * Correctly adjust the reserved bytes occupied by a log tree extent buffer
2545 */
2546static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start)
2547{
2548	struct btrfs_block_group *cache;
2549
2550	cache = btrfs_lookup_block_group(fs_info, start);
2551	if (!cache) {
2552		btrfs_err(fs_info, "unable to find block group for %llu", start);
2553		return;
2554	}
2555
2556	spin_lock(&cache->space_info->lock);
2557	spin_lock(&cache->lock);
2558	cache->reserved -= fs_info->nodesize;
2559	cache->space_info->bytes_reserved -= fs_info->nodesize;
2560	spin_unlock(&cache->lock);
2561	spin_unlock(&cache->space_info->lock);
2562
2563	btrfs_put_block_group(cache);
2564}
2565
2566static int clean_log_buffer(struct btrfs_trans_handle *trans,
2567			    struct extent_buffer *eb)
2568{
2569	int ret;
2570
2571	btrfs_tree_lock(eb);
2572	btrfs_clear_buffer_dirty(trans, eb);
2573	wait_on_extent_buffer_writeback(eb);
2574	btrfs_tree_unlock(eb);
2575
2576	if (trans) {
2577		ret = btrfs_pin_reserved_extent(trans, eb->start, eb->len);
2578		if (ret)
2579			return ret;
2580		btrfs_redirty_list_add(trans->transaction, eb);
2581	} else {
2582		unaccount_log_buffer(eb->fs_info, eb->start);
2583	}
2584
2585	return 0;
2586}
2587
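/*
 * Walk down the log tree along 'path', handing each leaf found at the
 * bottom level to wc->process_func and, if wc->free is set, cleaning it
 * up via clean_log_buffer().  Interior nodes are processed on the way
 * back up by walk_up_log_tree().
 */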
2588static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
2589				   struct btrfs_root *root,
2590				   struct btrfs_path *path, int *level,
2591				   struct walk_control *wc)
2592{
2593	struct btrfs_fs_info *fs_info = root->fs_info;
2594	u64 bytenr;
2595	u64 ptr_gen;
2596	struct extent_buffer *next;
2597	struct extent_buffer *cur;
2598	int ret = 0;
2599
2600	while (*level > 0) {
2601		struct btrfs_tree_parent_check check = { 0 };
2602
2603		cur = path->nodes[*level];
2604
2605		WARN_ON(btrfs_header_level(cur) != *level);
2606
2607		if (path->slots[*level] >=
2608		    btrfs_header_nritems(cur))
2609			break;
2610
2611		bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
2612		ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
2613		check.transid = ptr_gen;
2614		check.level = *level - 1;
2615		check.has_first_key = true;
2616		btrfs_node_key_to_cpu(cur, &check.first_key, path->slots[*level]);
2617
2618		next = btrfs_find_create_tree_block(fs_info, bytenr,
2619						    btrfs_header_owner(cur),
2620						    *level - 1);
2621		if (IS_ERR(next))
2622			return PTR_ERR(next);
2623
2624		if (*level == 1) {
2625			ret = wc->process_func(root, next, wc, ptr_gen,
2626					       *level - 1);
2627			if (ret) {
2628				free_extent_buffer(next);
2629				return ret;
2630			}
2631
2632			path->slots[*level]++;
2633			if (wc->free) {
2634				ret = btrfs_read_extent_buffer(next, &check);
2635				if (ret) {
2636					free_extent_buffer(next);
2637					return ret;
2638				}
2639
2640				ret = clean_log_buffer(trans, next);
2641				if (ret) {
2642					free_extent_buffer(next);
2643					return ret;
2644				}
2645			}
2646			free_extent_buffer(next);
2647			continue;
2648		}
2649		ret = btrfs_read_extent_buffer(next, &check);
2650		if (ret) {
2651			free_extent_buffer(next);
2652			return ret;
2653		}
2654
2655		if (path->nodes[*level-1])
2656			free_extent_buffer(path->nodes[*level-1]);
2657		path->nodes[*level-1] = next;
2658		*level = btrfs_header_level(next);
2659		path->slots[*level] = 0;
2660		cond_resched();
2661	}
2662	path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);
2663
2664	cond_resched();
2665	return 0;
2666}
2667
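/*
 * Walk back up the log tree after walk_down_log_tree(), handing each
 * interior node whose children have all been visited to wc->process_func
 * (and cleaning it up if wc->free is set).  Returns 1 when there is
 * nothing left to walk, 0 when the caller should walk down again.
 */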
2668static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
2669				 struct btrfs_root *root,
2670				 struct btrfs_path *path, int *level,
2671				 struct walk_control *wc)
2672{
2673	int i;
2674	int slot;
2675	int ret;
2676
2677	for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
2678		slot = path->slots[i];
2679		if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
2680			path->slots[i]++;
2681			*level = i;
2682			WARN_ON(*level == 0);
2683			return 0;
2684		} else {
2685			ret = wc->process_func(root, path->nodes[*level], wc,
2686				 btrfs_header_generation(path->nodes[*level]),
2687				 *level);
2688			if (ret)
2689				return ret;
2690
2691			if (wc->free) {
2692				ret = clean_log_buffer(trans, path->nodes[*level]);
2693				if (ret)
2694					return ret;
2695			}
2696			free_extent_buffer(path->nodes[*level]);
2697			path->nodes[*level] = NULL;
2698			*level = i + 1;
2699		}
2700	}
2701	return 1;
2702}
2703
2704/*
2705 * drop the reference count on the tree rooted at 'log'.  This traverses
2706 * the tree freeing any blocks that have a ref count of zero after being
2707 * decremented.
2708 */
2709static int walk_log_tree(struct btrfs_trans_handle *trans,
2710			 struct btrfs_root *log, struct walk_control *wc)
2711{
2712	int ret = 0;
2713	int wret;
2714	int level;
2715	struct btrfs_path *path;
2716	int orig_level;
2717
2718	path = btrfs_alloc_path();
2719	if (!path)
2720		return -ENOMEM;
2721
2722	level = btrfs_header_level(log->node);
2723	orig_level = level;
2724	path->nodes[level] = log->node;
2725	atomic_inc(&log->node->refs);
2726	path->slots[level] = 0;
2727
2728	while (1) {
2729		wret = walk_down_log_tree(trans, log, path, &level, wc);
2730		if (wret > 0)
2731			break;
2732		if (wret < 0) {
2733			ret = wret;
2734			goto out;
2735		}
2736
2737		wret = walk_up_log_tree(trans, log, path, &level, wc);
2738		if (wret > 0)
2739			break;
2740		if (wret < 0) {
2741			ret = wret;
2742			goto out;
2743		}
2744	}
2745
2746	/* was the root node processed? if not, catch it here */
2747	if (path->nodes[orig_level]) {
2748		ret = wc->process_func(log, path->nodes[orig_level], wc,
2749			 btrfs_header_generation(path->nodes[orig_level]),
2750			 orig_level);
2751		if (ret)
2752			goto out;
2753		if (wc->free)
2754			ret = clean_log_buffer(trans, path->nodes[orig_level]);
2755	}
2756
2757out:
2758	btrfs_free_path(path);
2759	return ret;
2760}
2761
2762/*
2763 * helper function to update the item for a given subvolume's log root
2764 * in the tree of log roots
2765 */
2766static int update_log_root(struct btrfs_trans_handle *trans,
2767			   struct btrfs_root *log,
2768			   struct btrfs_root_item *root_item)
2769{
2770	struct btrfs_fs_info *fs_info = log->fs_info;
2771	int ret;
2772
2773	if (log->log_transid == 1) {
2774		/* insert root item on the first sync */
2775		ret = btrfs_insert_root(trans, fs_info->log_root_tree,
2776				&log->root_key, root_item);
2777	} else {
2778		ret = btrfs_update_root(trans, fs_info->log_root_tree,
2779				&log->root_key, root_item);
2780	}
2781	return ret;
2782}
2783
2784static void wait_log_commit(struct btrfs_root *root, int transid)
2785{
2786	DEFINE_WAIT(wait);
2787	int index = transid % 2;
2788
2789	/*
2790	 * we only allow two pending log transactions at a time,
2791	 * so we know that if ours is more than 2 older than the
2792	 * current transaction, we're done
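	 *
	 * For example (illustrative): a waiter for log transid 7 sleeps on
	 * slot 7 % 2 == 1 and stops as soon as log_transid_committed reaches
	 * 7 or the slot's log_commit flag is cleared.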
2793	 */
2794	for (;;) {
2795		prepare_to_wait(&root->log_commit_wait[index],
2796				&wait, TASK_UNINTERRUPTIBLE);
2797
2798		if (!(root->log_transid_committed < transid &&
2799		      atomic_read(&root->log_commit[index])))
2800			break;
2801
2802		mutex_unlock(&root->log_mutex);
2803		schedule();
2804		mutex_lock(&root->log_mutex);
2805	}
2806	finish_wait(&root->log_commit_wait[index], &wait);
2807}
2808
2809static void wait_for_writer(struct btrfs_root *root)
2810{
2811	DEFINE_WAIT(wait);
2812
2813	for (;;) {
2814		prepare_to_wait(&root->log_writer_wait, &wait,
2815				TASK_UNINTERRUPTIBLE);
2816		if (!atomic_read(&root->log_writers))
2817			break;
2818
2819		mutex_unlock(&root->log_mutex);
2820		schedule();
2821		mutex_lock(&root->log_mutex);
2822	}
2823	finish_wait(&root->log_writer_wait, &wait);
2824}
2825
2826static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
2827					struct btrfs_log_ctx *ctx)
2828{
2829	mutex_lock(&root->log_mutex);
2830	list_del_init(&ctx->list);
2831	mutex_unlock(&root->log_mutex);
2832}
2833
2834/*
2835 * Invoked in log mutex context, or the caller must make sure no other
2836 * task can access the list.
2837 */
2838static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
2839					     int index, int error)
2840{
2841	struct btrfs_log_ctx *ctx;
2842	struct btrfs_log_ctx *safe;
2843
2844	list_for_each_entry_safe(ctx, safe, &root->log_ctxs[index], list) {
2845		list_del_init(&ctx->list);
2846		ctx->log_ret = error;
2847	}
2848}
2849
2850/*
2851 * btrfs_sync_log sends a given tree log down to the disk and
2852 * updates the super blocks to record it.  When this call is done,
2853 * you know that any inodes previously logged are safely on disk only
2854 * if it returns 0.
2855 *
2856 * Any other return value means you need to call btrfs_commit_transaction.
2857 * Some of the edge cases for fsyncing directories that have had unlinks
2858 * or renames done in the past mean that sometimes the only safe
2859 * fsync is to commit the whole FS.  When btrfs_sync_log returns
2860 * BTRFS_LOG_FORCE_COMMIT, that has happened.
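 *
 * Illustrative caller pattern (a sketch, not the exact fsync code); the
 * point is that any non-zero return means "fall back to a full commit":
 *
 *   ret = btrfs_sync_log(trans, root, ctx);
 *   if (ret)
 *           ret = btrfs_commit_transaction(trans);
 *   else
 *           ret = btrfs_end_transaction(trans);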
2861 */
2862int btrfs_sync_log(struct btrfs_trans_handle *trans,
2863		   struct btrfs_root *root, struct btrfs_log_ctx *ctx)
2864{
2865	int index1;
2866	int index2;
2867	int mark;
2868	int ret;
2869	struct btrfs_fs_info *fs_info = root->fs_info;
2870	struct btrfs_root *log = root->log_root;
2871	struct btrfs_root *log_root_tree = fs_info->log_root_tree;
2872	struct btrfs_root_item new_root_item;
2873	int log_transid = 0;
2874	struct btrfs_log_ctx root_log_ctx;
2875	struct blk_plug plug;
2876	u64 log_root_start;
2877	u64 log_root_level;
2878
2879	mutex_lock(&root->log_mutex);
2880	log_transid = ctx->log_transid;
2881	if (root->log_transid_committed >= log_transid) {
2882		mutex_unlock(&root->log_mutex);
2883		return ctx->log_ret;
2884	}
2885
2886	index1 = log_transid % 2;
2887	if (atomic_read(&root->log_commit[index1])) {
2888		wait_log_commit(root, log_transid);
2889		mutex_unlock(&root->log_mutex);
2890		return ctx->log_ret;
2891	}
2892	ASSERT(log_transid == root->log_transid);
2893	atomic_set(&root->log_commit[index1], 1);
2894
2895	/* wait for previous tree log sync to complete */
2896	if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
2897		wait_log_commit(root, log_transid - 1);
2898
2899	while (1) {
2900		int batch = atomic_read(&root->log_batch);
2901		/* when we're on an ssd, just kick the log commit out */
2902		if (!btrfs_test_opt(fs_info, SSD) &&
2903		    test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) {
2904			mutex_unlock(&root->log_mutex);
2905			schedule_timeout_uninterruptible(1);
2906			mutex_lock(&root->log_mutex);
2907		}
2908		wait_for_writer(root);
2909		if (batch == atomic_read(&root->log_batch))
2910			break;
2911	}
2912
2913	/* bail out if we need to do a full commit */
2914	if (btrfs_need_log_full_commit(trans)) {
2915		ret = BTRFS_LOG_FORCE_COMMIT;
2916		mutex_unlock(&root->log_mutex);
2917		goto out;
2918	}
2919
2920	if (log_transid % 2 == 0)
2921		mark = EXTENT_DIRTY;
2922	else
2923		mark = EXTENT_NEW;
2924
2925	/* we start IO on all the marked extents here, but we don't actually
2926	 * wait for them until later.
2927	 */
2928	blk_start_plug(&plug);
2929	ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark);
2930	/*
2931	 * -EAGAIN happens when someone, e.g., a concurrent transaction
2932	 *  commit, writes a dirty extent in this tree-log commit. This
2933	 *  concurrent write will create a hole writing out the extents,
2934	 *  and we cannot proceed on a zoned filesystem, requiring
2935	 *  sequential writing. While we could bail out to a full commit
2936	 *  here, we continue instead, hoping the concurrent writing fills
2937	 *  the hole.
2938	 */
2939	if (ret == -EAGAIN && btrfs_is_zoned(fs_info))
2940		ret = 0;
2941	if (ret) {
2942		blk_finish_plug(&plug);
2943		btrfs_set_log_full_commit(trans);
2944		mutex_unlock(&root->log_mutex);
2945		goto out;
2946	}
2947
2948	/*
2949	 * We _must_ update under the root->log_mutex in order to make sure we
2950	 * have a consistent view of the log root we are trying to commit at
2951	 * this moment.
2952	 *
2953	 * We _must_ copy this into a local copy, because we are not holding the
2954	 * log_root_tree->log_mutex yet.  This is important because when we
2955	 * commit the log_root_tree we must have a consistent view of the
2956	 * log_root_tree when we update the super block to point at the
2957	 * log_root_tree bytenr.  If we update the log_root_tree here we'll race
2958	 * with the commit and possibly point at the new block which we may not
2959	 * have written out.
2960	 */
2961	btrfs_set_root_node(&log->root_item, log->node);
2962	memcpy(&new_root_item, &log->root_item, sizeof(new_root_item));
2963
2964	root->log_transid++;
2965	log->log_transid = root->log_transid;
2966	root->log_start_pid = 0;
2967	/*
2968	 * IO has been started, blocks of the log tree have WRITTEN flag set
2969	 * in their headers. New modifications of the log will be written to
2970	 * new positions, so it's safe to allow log writers to go in.
2971	 */
2972	mutex_unlock(&root->log_mutex);
2973
2974	if (btrfs_is_zoned(fs_info)) {
2975		mutex_lock(&fs_info->tree_root->log_mutex);
2976		if (!log_root_tree->node) {
2977			ret = btrfs_alloc_log_tree_node(trans, log_root_tree);
2978			if (ret) {
2979				mutex_unlock(&fs_info->tree_root->log_mutex);
2980				blk_finish_plug(&plug);
2981				goto out;
2982			}
2983		}
2984		mutex_unlock(&fs_info->tree_root->log_mutex);
2985	}
2986
2987	btrfs_init_log_ctx(&root_log_ctx, NULL);
2988
2989	mutex_lock(&log_root_tree->log_mutex);
2990
2991	index2 = log_root_tree->log_transid % 2;
2992	list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
2993	root_log_ctx.log_transid = log_root_tree->log_transid;
2994
2995	/*
2996	 * Now we are safe to update the log_root_tree because we're under the
2997	 * log_mutex, and we're a current writer so we're holding the commit
2998	 * open until we drop the log_mutex.
2999	 */
3000	ret = update_log_root(trans, log, &new_root_item);
3001	if (ret) {
3002		if (!list_empty(&root_log_ctx.list))
3003			list_del_init(&root_log_ctx.list);
3004
3005		blk_finish_plug(&plug);
3006		btrfs_set_log_full_commit(trans);
3007		if (ret != -ENOSPC)
3008			btrfs_err(fs_info,
3009				  "failed to update log for root %llu ret %d",
3010				  root->root_key.objectid, ret);
3011		btrfs_wait_tree_log_extents(log, mark);
3012		mutex_unlock(&log_root_tree->log_mutex);
3013		goto out;
3014	}
3015
3016	if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) {
3017		blk_finish_plug(&plug);
3018		list_del_init(&root_log_ctx.list);
3019		mutex_unlock(&log_root_tree->log_mutex);
3020		ret = root_log_ctx.log_ret;
3021		goto out;
3022	}
3023
3024	index2 = root_log_ctx.log_transid % 2;
3025	if (atomic_read(&log_root_tree->log_commit[index2])) {
3026		blk_finish_plug(&plug);
3027		ret = btrfs_wait_tree_log_extents(log, mark);
3028		wait_log_commit(log_root_tree,
3029				root_log_ctx.log_transid);
3030		mutex_unlock(&log_root_tree->log_mutex);
3031		if (!ret)
3032			ret = root_log_ctx.log_ret;
3033		goto out;
3034	}
3035	ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid);
3036	atomic_set(&log_root_tree->log_commit[index2], 1);
3037
3038	if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
3039		wait_log_commit(log_root_tree,
3040				root_log_ctx.log_transid - 1);
3041	}
3042
3043	/*
3044	 * now that we've moved on to the tree of log tree roots,
3045	 * check the full commit flag again
3046	 */
3047	if (btrfs_need_log_full_commit(trans)) {
3048		blk_finish_plug(&plug);
3049		btrfs_wait_tree_log_extents(log, mark);
3050		mutex_unlock(&log_root_tree->log_mutex);
3051		ret = BTRFS_LOG_FORCE_COMMIT;
3052		goto out_wake_log_root;
3053	}
3054
3055	ret = btrfs_write_marked_extents(fs_info,
3056					 &log_root_tree->dirty_log_pages,
3057					 EXTENT_DIRTY | EXTENT_NEW);
3058	blk_finish_plug(&plug);
3059	/*
3060	 * As described above, -EAGAIN indicates a hole in the extents. We
3061	 * cannot wait for these write outs since waiting would cause a
3062	 * deadlock. Bail out to the full commit instead.
3063	 */
3064	if (ret == -EAGAIN && btrfs_is_zoned(fs_info)) {
3065		btrfs_set_log_full_commit(trans);
3066		btrfs_wait_tree_log_extents(log, mark);
3067		mutex_unlock(&log_root_tree->log_mutex);
3068		goto out_wake_log_root;
3069	} else if (ret) {
3070		btrfs_set_log_full_commit(trans);
3071		mutex_unlock(&log_root_tree->log_mutex);
3072		goto out_wake_log_root;
3073	}
3074	ret = btrfs_wait_tree_log_extents(log, mark);
3075	if (!ret)
3076		ret = btrfs_wait_tree_log_extents(log_root_tree,
3077						  EXTENT_NEW | EXTENT_DIRTY);
3078	if (ret) {
3079		btrfs_set_log_full_commit(trans);
3080		mutex_unlock(&log_root_tree->log_mutex);
3081		goto out_wake_log_root;
3082	}
3083
3084	log_root_start = log_root_tree->node->start;
3085	log_root_level = btrfs_header_level(log_root_tree->node);
3086	log_root_tree->log_transid++;
3087	mutex_unlock(&log_root_tree->log_mutex);
3088
3089	/*
3090	 * Here we are guaranteed that nobody is going to write the superblock
3091	 * for the current transaction before us and that neither we do write
3092	 * our superblock before the previous transaction finishes its commit
3093	 * and writes its superblock, because:
3094	 *
3095	 * 1) We are holding a handle on the current transaction, so nobody
3096	 *    can commit it until we release the handle;
3097	 *
3098	 * 2) Before writing our superblock we acquire the tree_log_mutex, so
3099	 *    if the previous transaction is still committing, and hasn't yet
3100	 *    written its superblock, we wait for it to do it, because a
3101	 *    transaction commit acquires the tree_log_mutex when the commit
3102	 *    begins and releases it only after writing its superblock.
3103	 */
3104	mutex_lock(&fs_info->tree_log_mutex);
3105
3106	/*
3107	 * The previous transaction writeout phase could have failed, and thus
3108	 * marked the fs in an error state.  We must not commit here, as we
3109	 * could have updated our generation in the super_for_commit and
3110	 * writing the super here would result in transid mismatches.  If there
3111	 * is an error here just bail.
3112	 */
3113	if (BTRFS_FS_ERROR(fs_info)) {
3114		ret = -EIO;
3115		btrfs_set_log_full_commit(trans);
3116		btrfs_abort_transaction(trans, ret);
3117		mutex_unlock(&fs_info->tree_log_mutex);
3118		goto out_wake_log_root;
3119	}
3120
3121	btrfs_set_super_log_root(fs_info->super_for_commit, log_root_start);
3122	btrfs_set_super_log_root_level(fs_info->super_for_commit, log_root_level);
3123	ret = write_all_supers(fs_info, 1);
3124	mutex_unlock(&fs_info->tree_log_mutex);
3125	if (ret) {
3126		btrfs_set_log_full_commit(trans);
3127		btrfs_abort_transaction(trans, ret);
3128		goto out_wake_log_root;
3129	}
3130
3131	/*
3132	 * We know there can only be one task here, since we have not yet set
3133	 * root->log_commit[index1] to 0 and any task attempting to sync the
3134	 * log must wait for the previous log transaction to commit if it's
3135	 * still in progress or wait for the current log transaction commit if
3136	 * someone else already started it. We use <= and not < because the
3137	 * first log transaction has an ID of 0.
3138	 */
3139	ASSERT(root->last_log_commit <= log_transid);
3140	root->last_log_commit = log_transid;
3141
3142out_wake_log_root:
3143	mutex_lock(&log_root_tree->log_mutex);
3144	btrfs_remove_all_log_ctxs(log_root_tree, index2, ret);
3145
3146	log_root_tree->log_transid_committed++;
3147	atomic_set(&log_root_tree->log_commit[index2], 0);
3148	mutex_unlock(&log_root_tree->log_mutex);
3149
3150	/*
3151	 * The barrier before waitqueue_active (in cond_wake_up) is needed so
3152	 * all the updates above are seen by the woken threads. It might not be
3153	 * necessary, but proving that seems to be hard.
3154	 */
3155	cond_wake_up(&log_root_tree->log_commit_wait[index2]);
3156out:
3157	mutex_lock(&root->log_mutex);
3158	btrfs_remove_all_log_ctxs(root, index1, ret);
3159	root->log_transid_committed++;
3160	atomic_set(&root->log_commit[index1], 0);
3161	mutex_unlock(&root->log_mutex);
3162
3163	/*
3164	 * The barrier before waitqueue_active (in cond_wake_up) is needed so
3165	 * all the updates above are seen by the woken threads. It might not be
3166	 * necessary, but proving that seems to be hard.
3167	 */
3168	cond_wake_up(&root->log_commit_wait[index1]);
3169	return ret;
3170}
3171
3172static void free_log_tree(struct btrfs_trans_handle *trans,
3173			  struct btrfs_root *log)
3174{
3175	int ret;
3176	struct walk_control wc = {
3177		.free = 1,
3178		.process_func = process_one_buffer
3179	};
3180
3181	if (log->node) {
3182		ret = walk_log_tree(trans, log, &wc);
3183		if (ret) {
3184			/*
3185			 * We weren't able to traverse the entire log tree, the
3186			 * typical scenario is getting an -EIO when reading an
3187			 * extent buffer of the tree, due to a previous writeback
3188			 * failure of it.
3189			 */
3190			set_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR,
3191				&log->fs_info->fs_state);
3192
3193			/*
3194			 * Some extent buffers of the log tree may still be dirty
3195			 * and not yet written back to storage, because we may
3196			 * have updates to a log tree without syncing a log tree,
3197			 * such as during rename and link operations. So flush
3198			 * them out and wait for their writeback to complete, so
3199			 * that we properly cleanup their state and pages.
3200			 */
3201			btrfs_write_marked_extents(log->fs_info,
3202						   &log->dirty_log_pages,
3203						   EXTENT_DIRTY | EXTENT_NEW);
3204			btrfs_wait_tree_log_extents(log,
3205						    EXTENT_DIRTY | EXTENT_NEW);
3206
3207			if (trans)
3208				btrfs_abort_transaction(trans, ret);
3209			else
3210				btrfs_handle_fs_error(log->fs_info, ret, NULL);
3211		}
3212	}
3213
3214	clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1,
3215			  EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT);
3216	extent_io_tree_release(&log->log_csum_range);
3217
3218	btrfs_put_root(log);
3219}
3220
3221/*
3222 * free all the extents used by the tree log.  This should be called
3223 * at commit time of the full transaction
3224 */
3225int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
3226{
3227	if (root->log_root) {
3228		free_log_tree(trans, root->log_root);
3229		root->log_root = NULL;
3230		clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
3231	}
3232	return 0;
3233}
3234
3235int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
3236			     struct btrfs_fs_info *fs_info)
3237{
3238	if (fs_info->log_root_tree) {
3239		free_log_tree(trans, fs_info->log_root_tree);
3240		fs_info->log_root_tree = NULL;
3241		clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &fs_info->tree_root->state);
3242	}
3243	return 0;
3244}
3245
3246/*
3247 * Check if an inode was logged in the current transaction. This correctly deals
3248 * with the case where the inode was logged but has a logged_trans of 0, which
3249 * happens if the inode is evicted and loaded again, as logged_trans is an in
3250 * memory only field (not persisted).
3251 *
3252 * Returns 1 if the inode was logged before in the transaction, 0 if it was not,
3253 * and < 0 on error.
3254 */
3255static int inode_logged(const struct btrfs_trans_handle *trans,
3256			struct btrfs_inode *inode,
3257			struct btrfs_path *path_in)
3258{
3259	struct btrfs_path *path = path_in;
3260	struct btrfs_key key;
3261	int ret;
3262
3263	if (inode->logged_trans == trans->transid)
3264		return 1;
3265
3266	/*
3267	 * If logged_trans is not 0, then we know the inode was not logged
3268	 * in this transaction, so we can return false right away.
3269	 */
3270	if (inode->logged_trans > 0)
3271		return 0;
3272
3273	/*
3274	 * If no log tree was created for this root in this transaction, then
3275	 * the inode can not have been logged in this transaction. In that case
3276	 * set logged_trans to anything greater than 0 and less than the current
3277	 * transaction's ID, to avoid the search below in a future call in case
3278	 * a log tree gets created after this.
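	 *
	 * For example (illustrative): with trans->transid == 100 we set
	 * logged_trans to 99, which is greater than 0 and not equal to 100,
	 * so the two checks above make future calls in this transaction
	 * return 0 without ever searching the log tree.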
3279	 */
3280	if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state)) {
3281		inode->logged_trans = trans->transid - 1;
3282		return 0;
3283	}
3284
3285	/*
3286	 * We have a log tree and the inode's logged_trans is 0. We can't tell
3287	 * for sure if the inode was logged before in this transaction by looking
3288	 * only at logged_trans. We could be pessimistic and assume it was, but
3289	 * that can lead to unnecessarily logging an inode during rename and link
3290	 * operations, and then further updating the log in followup rename and
3291	 * link operations, especially if it's a directory, which adds latency
3292	 * visible to applications doing a series of rename or link operations.
3293	 *
3294	 * A logged_trans of 0 here can mean several things:
3295	 *
3296	 * 1) The inode was never logged since the filesystem was mounted, and may
3297	 *    or may not have been evicted and loaded again;
3298	 *
3299	 * 2) The inode was logged in a previous transaction, then evicted and
3300	 *    then loaded again;
3301	 *
3302	 * 3) The inode was logged in the current transaction, then evicted and
3303	 *    then loaded again.
3304	 *
3305	 * For cases 1) and 2) we don't want to return true, but we need to detect
3306	 * case 3) and return true. So we do a search in the log root for the inode
3307	 * item.
3308	 */
3309	key.objectid = btrfs_ino(inode);
3310	key.type = BTRFS_INODE_ITEM_KEY;
3311	key.offset = 0;
3312
3313	if (!path) {
3314		path = btrfs_alloc_path();
3315		if (!path)
3316			return -ENOMEM;
3317	}
3318
3319	ret = btrfs_search_slot(NULL, inode->root->log_root, &key, path, 0, 0);
3320
3321	if (path_in)
3322		btrfs_release_path(path);
3323	else
3324		btrfs_free_path(path);
3325
3326	/*
3327	 * Logging an inode always results in logging its inode item. So if we
3328	 * did not find the item we know the inode was not logged for sure.
3329	 */
3330	if (ret < 0) {
3331		return ret;
3332	} else if (ret > 0) {
3333		/*
3334		 * Set logged_trans to a value greater than 0 and less than the
3335		 * current transaction to avoid doing the search in future calls.
3336		 */
3337		inode->logged_trans = trans->transid - 1;
3338		return 0;
3339	}
3340
3341	/*
3342	 * The inode was previously logged and then evicted, set logged_trans to
3343	 * the current transaction's ID, to avoid future tree searches as long as
3344	 * the inode is not evicted again.
3345	 */
3346	inode->logged_trans = trans->transid;
3347
3348	/*
3349	 * If it's a directory, then we must set last_dir_index_offset to the
3350	 * maximum possible value, so that the next attempt to log the inode does
3351	 * not skip checking if dir index keys found in modified subvolume tree
3352	 * leaves have been logged before, otherwise it would result in attempts
3353	 * to insert duplicate dir index keys in the log tree. This must be done
3354	 * because last_dir_index_offset is an in-memory only field, not persisted
3355	 * in the inode item or any other on-disk structure, so its value is lost
3356	 * once the inode is evicted.
3357	 */
3358	if (S_ISDIR(inode->vfs_inode.i_mode))
3359		inode->last_dir_index_offset = (u64)-1;
3360
3361	return 1;
3362}
3363
3364/*
3365 * Delete a directory entry from the log if it exists.
3366 *
3367 * Returns < 0 on error
3368 *           1 if the entry does not exist
3369 *           0 if the entry existed and was successfully deleted
3370 */
3371static int del_logged_dentry(struct btrfs_trans_handle *trans,
3372			     struct btrfs_root *log,
3373			     struct btrfs_path *path,
3374			     u64 dir_ino,
3375			     const struct fscrypt_str *name,
3376			     u64 index)
3377{
3378	struct btrfs_dir_item *di;
3379
3380	/*
3381	 * We only log dir index items of a directory, so we don't need to look
3382	 * for dir item keys.
3383	 */
3384	di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
3385					 index, name, -1);
3386	if (IS_ERR(di))
3387		return PTR_ERR(di);
3388	else if (!di)
3389		return 1;
3390
3391	/*
3392	 * We do not need to update the size field of the directory's
3393	 * inode item because on log replay we update the field to reflect
3394	 * all existing entries in the directory (see overwrite_item()).
3395	 */
3396	return btrfs_delete_one_dir_name(trans, log, path, di);
3397}
3398
3399/*
3400 * If both a file and directory are logged, and unlinks or renames are
3401 * mixed in, we have a few interesting corners:
3402 *
3403 * create file X in dir Y
3404 * link file X to X.link in dir Y
3405 * fsync file X
3406 * unlink file X but leave X.link
3407 * fsync dir Y
3408 *
3409 * After a crash we would expect only X.link to exist.  But file X
3410 * didn't get fsync'd again so the log has back refs for X and X.link.
3411 *
3412 * We solve this by removing directory entries and inode backrefs from the
3413 * log when a file that was logged in the current transaction is
3414 * unlinked.  Any later fsync will include the updated log entries, and
3415 * we'll be able to reconstruct the proper directory items from backrefs.
3416 *
3417 * This optimization allows us to avoid relogging the entire inode
3418 * or the entire directory.
3419 */
3420void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
3421				  struct btrfs_root *root,
3422				  const struct fscrypt_str *name,
3423				  struct btrfs_inode *dir, u64 index)
3424{
3425	struct btrfs_path *path;
3426	int ret;
3427
3428	ret = inode_logged(trans, dir, NULL);
3429	if (ret == 0)
3430		return;
3431	else if (ret < 0) {
3432		btrfs_set_log_full_commit(trans);
3433		return;
3434	}
3435
3436	ret = join_running_log_trans(root);
3437	if (ret)
3438		return;
3439
3440	mutex_lock(&dir->log_mutex);
3441
3442	path = btrfs_alloc_path();
3443	if (!path) {
3444		ret = -ENOMEM;
3445		goto out_unlock;
3446	}
3447
3448	ret = del_logged_dentry(trans, root->log_root, path, btrfs_ino(dir),
3449				name, index);
3450	btrfs_free_path(path);
3451out_unlock:
3452	mutex_unlock(&dir->log_mutex);
3453	if (ret < 0)
3454		btrfs_set_log_full_commit(trans);
3455	btrfs_end_log_trans(root);
3456}
3457
3458/* see comments for btrfs_del_dir_entries_in_log */
3459void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
3460				struct btrfs_root *root,
3461				const struct fscrypt_str *name,
3462				struct btrfs_inode *inode, u64 dirid)
3463{
3464	struct btrfs_root *log;
3465	u64 index;
3466	int ret;
3467
3468	ret = inode_logged(trans, inode, NULL);
3469	if (ret == 0)
3470		return;
3471	else if (ret < 0) {
3472		btrfs_set_log_full_commit(trans);
3473		return;
3474	}
3475
3476	ret = join_running_log_trans(root);
3477	if (ret)
3478		return;
3479	log = root->log_root;
3480	mutex_lock(&inode->log_mutex);
3481
3482	ret = btrfs_del_inode_ref(trans, log, name, btrfs_ino(inode),
3483				  dirid, &index);
3484	mutex_unlock(&inode->log_mutex);
3485	if (ret < 0 && ret != -ENOENT)
3486		btrfs_set_log_full_commit(trans);
3487	btrfs_end_log_trans(root);
3488}
3489
3490/*
3491 * creates a range item in the log for 'dirid'.  first_offset and
3492 * last_offset tell us which parts of the key space the log should
3493 * be considered authoritative for.
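 *
 * For example (illustrative): inserting a range item for 'dirid' with
 * first_offset 100 and last_offset 200 tells replay (via find_dir_range())
 * that index keys 100..200 are fully described by the log, so any
 * subvolume entry in that range missing from the log gets deleted.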
3494 */
3495static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
3496				       struct btrfs_root *log,
3497				       struct btrfs_path *path,
3498				       u64 dirid,
3499				       u64 first_offset, u64 last_offset)
3500{
3501	int ret;
3502	struct btrfs_key key;
3503	struct btrfs_dir_log_item *item;
3504
3505	key.objectid = dirid;
3506	key.offset = first_offset;
3507	key.type = BTRFS_DIR_LOG_INDEX_KEY;
3508	ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
3509	/*
3510	 * -EEXIST is fine and can happen sporadically when we are logging a
3511	 * directory and have concurrent insertions in the subvolume's tree for
3512	 * items from other inodes and that result in pushing off some dir items
3513	 * from one leaf to another in order to accommodate the new items.
3514	 * This results in logging the same dir index range key.
3515	 */
3516	if (ret && ret != -EEXIST)
3517		return ret;
3518
3519	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3520			      struct btrfs_dir_log_item);
3521	if (ret == -EEXIST) {
3522		const u64 curr_end = btrfs_dir_log_end(path->nodes[0], item);
3523
3524		/*
3525		 * btrfs_del_dir_entries_in_log() might have been called during
3526		 * an unlink between the initial insertion of this key and the
3527		 * current update, or we might be logging a single entry deletion
3528		 * during a rename, so set the new last_offset to the max value.
3529		 */
3530		last_offset = max(last_offset, curr_end);
3531	}
3532	btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
3533	btrfs_mark_buffer_dirty(trans, path->nodes[0]);
3534	btrfs_release_path(path);
3535	return 0;
3536}
3537
3538static int flush_dir_items_batch(struct btrfs_trans_handle *trans,
3539				 struct btrfs_inode *inode,
3540				 struct extent_buffer *src,
3541				 struct btrfs_path *dst_path,
3542				 int start_slot,
3543				 int count)
3544{
3545	struct btrfs_root *log = inode->root->log_root;
3546	char *ins_data = NULL;
3547	struct btrfs_item_batch batch;
3548	struct extent_buffer *dst;
3549	unsigned long src_offset;
3550	unsigned long dst_offset;
3551	u64 last_index;
3552	struct btrfs_key key;
3553	u32 item_size;
3554	int ret;
3555	int i;
3556
3557	ASSERT(count > 0);
3558	batch.nr = count;
3559
3560	if (count == 1) {
3561		btrfs_item_key_to_cpu(src, &key, start_slot);
3562		item_size = btrfs_item_size(src, start_slot);
3563		batch.keys = &key;
3564		batch.data_sizes = &item_size;
3565		batch.total_data_size = item_size;
3566	} else {
3567		struct btrfs_key *ins_keys;
3568		u32 *ins_sizes;
3569
3570		ins_data = kmalloc(count * sizeof(u32) +
3571				   count * sizeof(struct btrfs_key), GFP_NOFS);
3572		if (!ins_data)
3573			return -ENOMEM;
3574
3575		ins_sizes = (u32 *)ins_data;
3576		ins_keys = (struct btrfs_key *)(ins_data + count * sizeof(u32));
3577		batch.keys = ins_keys;
3578		batch.data_sizes = ins_sizes;
3579		batch.total_data_size = 0;
3580
3581		for (i = 0; i < count; i++) {
3582			const int slot = start_slot + i;
3583
3584			btrfs_item_key_to_cpu(src, &ins_keys[i], slot);
3585			ins_sizes[i] = btrfs_item_size(src, slot);
3586			batch.total_data_size += ins_sizes[i];
3587		}
3588	}
3589
3590	ret = btrfs_insert_empty_items(trans, log, dst_path, &batch);
3591	if (ret)
3592		goto out;
3593
3594	dst = dst_path->nodes[0];
3595	/*
3596	 * Copy all the items in bulk, in a single copy operation. Item data is
3597	 * organized such that it's placed at the end of a leaf and from right
3598	 * to left. For example, the data for the second item ends at an offset
3599	 * that matches the offset where the data for the first item starts, the
3600	 * data for the third item ends at an offset that matches the offset
3601	 * where the data of the second item starts, and so on.
3602	 * Therefore our source and destination start offsets for copy match the
3603	 * offsets of the last items (highest slots).
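	 *
	 * For example (hypothetical sizes): three items of 10, 20 and 30 bytes
	 * in a 4096 byte leaf have their data at [4086, 4096), [4066, 4086)
	 * and [4036, 4066).  Copying all three in one go starts at offset
	 * 4036, the data offset of the highest slot, and spans 60 bytes.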
3604	 */
3605	dst_offset = btrfs_item_ptr_offset(dst, dst_path->slots[0] + count - 1);
3606	src_offset = btrfs_item_ptr_offset(src, start_slot + count - 1);
3607	copy_extent_buffer(dst, src, dst_offset, src_offset, batch.total_data_size);
3608	btrfs_release_path(dst_path);
3609
3610	last_index = batch.keys[count - 1].offset;
3611	ASSERT(last_index > inode->last_dir_index_offset);
3612
3613	/*
3614	 * If for some unexpected reason the last item's index is not greater
3615	 * than the last index we logged, warn and force a transaction commit.
3616	 */
3617	if (WARN_ON(last_index <= inode->last_dir_index_offset))
3618		ret = BTRFS_LOG_FORCE_COMMIT;
3619	else
3620		inode->last_dir_index_offset = last_index;
3621
3622	if (btrfs_get_first_dir_index_to_log(inode) == 0)
3623		btrfs_set_first_dir_index_to_log(inode, batch.keys[0].offset);
3624out:
3625	kfree(ins_data);
3626
3627	return ret;
3628}
3629
3630static int process_dir_items_leaf(struct btrfs_trans_handle *trans,
3631				  struct btrfs_inode *inode,
3632				  struct btrfs_path *path,
3633				  struct btrfs_path *dst_path,
3634				  struct btrfs_log_ctx *ctx,
3635				  u64 *last_old_dentry_offset)
3636{
3637	struct btrfs_root *log = inode->root->log_root;
3638	struct extent_buffer *src;
3639	const int nritems = btrfs_header_nritems(path->nodes[0]);
3640	const u64 ino = btrfs_ino(inode);
3641	bool last_found = false;
3642	int batch_start = 0;
3643	int batch_size = 0;
3644	int i;
3645
3646	/*
3647	 * We need to clone the leaf, release the read lock on it, and use the
3648	 * clone before modifying the log tree. See the comment at copy_items()
3649	 * about why we need to do this.
3650	 */
3651	src = btrfs_clone_extent_buffer(path->nodes[0]);
3652	if (!src)
3653		return -ENOMEM;
3654
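	/*
	 * Make the path own the cloned leaf, so that releasing the path later
	 * also drops our reference on the clone.
	 */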
3655	i = path->slots[0];
3656	btrfs_release_path(path);
3657	path->nodes[0] = src;
3658	path->slots[0] = i;
3659
3660	for (; i < nritems; i++) {
3661		struct btrfs_dir_item *di;
3662		struct btrfs_key key;
3663		int ret;
3664
3665		btrfs_item_key_to_cpu(src, &key, i);
3666
3667		if (key.objectid != ino || key.type != BTRFS_DIR_INDEX_KEY) {
3668			last_found = true;
3669			break;
3670		}
3671
3672		di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
3673
3674		/*
3675		 * Skip ranges of items that consist only of dir item keys created
3676		 * in past transactions. However if we find a gap, we must log a
3677		 * dir index range item for that gap, so that index keys in that
3678		 * gap are deleted during log replay.
3679		 */
3680		if (btrfs_dir_transid(src, di) < trans->transid) {
3681			if (key.offset > *last_old_dentry_offset + 1) {
3682				ret = insert_dir_log_key(trans, log, dst_path,
3683						 ino, *last_old_dentry_offset + 1,
3684						 key.offset - 1);
3685				if (ret < 0)
3686					return ret;
3687			}
3688
3689			*last_old_dentry_offset = key.offset;
3690			continue;
3691		}
3692
3693		/* If we logged this dir index item before, we can skip it. */
3694		if (key.offset <= inode->last_dir_index_offset)
3695			continue;
3696
3697		/*
3698		 * We must make sure that when we log a directory entry, the
3699		 * corresponding inode, after log replay, has a matching link
3700		 * count. For example:
3701		 *
3702		 * touch foo
3703		 * mkdir mydir
3704		 * sync
3705		 * ln foo mydir/bar
3706		 * xfs_io -c "fsync" mydir
3707		 * <crash>
3708		 * <mount fs and log replay>
3709		 *
3710		 * Would result in a fsync log that, when replayed, leaves our file
3711		 * inode with a link count of 1 while two directory entries point to
3712		 * the same inode. After removing one of the names, it would not be
3713		 * possible to remove the other name, which always resulted in stale
3714		 * file handle errors, and it would not be possible to rmdir the
3715		 * parent directory, since its i_size could
3716		 * never be decremented to the value BTRFS_EMPTY_DIR_SIZE,
3717		 * resulting in -ENOTEMPTY errors.
3718		 */
3719		if (!ctx->log_new_dentries) {
3720			struct btrfs_key di_key;
3721
3722			btrfs_dir_item_key_to_cpu(src, di, &di_key);
3723			if (di_key.type != BTRFS_ROOT_ITEM_KEY)
3724				ctx->log_new_dentries = true;
3725		}
3726
3727		if (batch_size == 0)
3728			batch_start = i;
3729		batch_size++;
3730	}
3731
3732	if (batch_size > 0) {
3733		int ret;
3734
3735		ret = flush_dir_items_batch(trans, inode, src, dst_path,
3736					    batch_start, batch_size);
3737		if (ret < 0)
3738			return ret;
3739	}
3740
3741	return last_found ? 1 : 0;
3742}
3743
3744/*
3745 * log all the items included in the current transaction for a given
3746 * directory.  This also creates the range items in the log tree required
3747 * to replay anything deleted before the fsync
3748 */
3749static noinline int log_dir_items(struct btrfs_trans_handle *trans,
3750			  struct btrfs_inode *inode,
3751			  struct btrfs_path *path,
3752			  struct btrfs_path *dst_path,
3753			  struct btrfs_log_ctx *ctx,
3754			  u64 min_offset, u64 *last_offset_ret)
3755{
3756	struct btrfs_key min_key;
3757	struct btrfs_root *root = inode->root;
3758	struct btrfs_root *log = root->log_root;
3759	int ret;
3760	u64 last_old_dentry_offset = min_offset - 1;
3761	u64 last_offset = (u64)-1;
3762	u64 ino = btrfs_ino(inode);
3763
3764	min_key.objectid = ino;
3765	min_key.type = BTRFS_DIR_INDEX_KEY;
3766	min_key.offset = min_offset;
3767
3768	ret = btrfs_search_forward(root, &min_key, path, trans->transid);
3769
3770	/*
3771	 * we didn't find anything from this transaction, see if there
3772	 * is anything at all
3773	 */
3774	if (ret != 0 || min_key.objectid != ino ||
3775	    min_key.type != BTRFS_DIR_INDEX_KEY) {
3776		min_key.objectid = ino;
3777		min_key.type = BTRFS_DIR_INDEX_KEY;
3778		min_key.offset = (u64)-1;
3779		btrfs_release_path(path);
3780		ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
3781		if (ret < 0) {
3782			btrfs_release_path(path);
3783			return ret;
3784		}
3785		ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY);
3786
3787		/* if ret == 0 there are items for this type,
3788		 * create a range to tell us the last key of this type.
3789		 * otherwise, there are no items in this directory after
3790		 * *min_offset, and we create a range to indicate that.
3791		 */
3792		if (ret == 0) {
3793			struct btrfs_key tmp;
3794
3795			btrfs_item_key_to_cpu(path->nodes[0], &tmp,
3796					      path->slots[0]);
3797			if (tmp.type == BTRFS_DIR_INDEX_KEY)
3798				last_old_dentry_offset = tmp.offset;
3799		} else if (ret > 0) {
3800			ret = 0;
3801		}
3802
3803		goto done;
3804	}
3805
3806	/* go backward to find any previous key */
3807	ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY);
3808	if (ret == 0) {
3809		struct btrfs_key tmp;
3810
3811		btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
3812		/*
3813		 * The dir index key before the first one we found that needs to
3814		 * be logged might be in a previous leaf, and there might be a
3815		 * gap between these keys, meaning that we had deletions that
3816		 * happened. So the key range item we log (key type
3817		 * BTRFS_DIR_LOG_INDEX_KEY) must cover a range that starts at the
3818		 * previous key's offset plus 1, so that those deletes are replayed.
3819		 */
3820		if (tmp.type == BTRFS_DIR_INDEX_KEY)
3821			last_old_dentry_offset = tmp.offset;
3822	} else if (ret < 0) {
3823		goto done;
3824	}
3825
3826	btrfs_release_path(path);
3827
3828	/*
3829	 * Find the first key from this transaction again or the one we were at
3830	 * in the loop below in case we had to reschedule. We may be logging the
3831	 * directory without holding its VFS lock, which happens when logging new
3832	 * dentries (through log_new_dir_dentries()) or in some cases when we
3833	 * need to log the parent directory of an inode. This means a dir index
3834	 * key might be deleted from the inode's root, and therefore we may not
3835	 * find it anymore. If we can't find it, just move to the next key. We
3836	 * cannot bail out and ignore it, because if we do that we will simply
3837	 * not log dir index keys that come after the one that was just deleted
3838	 * and we can end up logging a dir index range that ends at (u64)-1
3839	 * (@last_offset is initialized to that), resulting in removing dir
3840	 * entries we should not remove at log replay time.
3841	 */
3842search:
3843	ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
3844	if (ret > 0) {
3845		ret = btrfs_next_item(root, path);
3846		if (ret > 0) {
3847			/* There are no more keys in the inode's root. */
3848			ret = 0;
3849			goto done;
3850		}
3851	}
3852	if (ret < 0)
3853		goto done;
3854
3855	/*
3856	 * we have a block from this transaction, log every item in it
3857	 * from our directory
3858	 */
3859	while (1) {
3860		ret = process_dir_items_leaf(trans, inode, path, dst_path, ctx,
3861					     &last_old_dentry_offset);
3862		if (ret != 0) {
3863			if (ret > 0)
3864				ret = 0;
3865			goto done;
3866		}
3867		path->slots[0] = btrfs_header_nritems(path->nodes[0]);
3868
3869		/*
3870		 * look ahead to the next item and see if it is also
3871		 * from this directory and from this transaction
3872		 */
3873		ret = btrfs_next_leaf(root, path);
3874		if (ret) {
3875			if (ret == 1) {
3876				last_offset = (u64)-1;
3877				ret = 0;
3878			}
3879			goto done;
3880		}
3881		btrfs_item_key_to_cpu(path->nodes[0], &min_key, path->slots[0]);
3882		if (min_key.objectid != ino || min_key.type != BTRFS_DIR_INDEX_KEY) {
3883			last_offset = (u64)-1;
3884			goto done;
3885		}
3886		if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
3887			/*
3888			 * The next leaf was not changed in the current transaction
3889			 * and has at least one dir index key.
3890			 * We check for the next key because there might have been
3891			 * one or more deletions between the last key we logged and
3892			 * that next key. So the key range item we log (key type
3893			 * BTRFS_DIR_LOG_INDEX_KEY) must end at the next key's
3894			 * offset minus 1, so that those deletes are replayed.
3895			 */
3896			last_offset = min_key.offset - 1;
3897			goto done;
3898		}
3899		if (need_resched()) {
3900			btrfs_release_path(path);
3901			cond_resched();
3902			goto search;
3903		}
3904	}
3905done:
3906	btrfs_release_path(path);
3907	btrfs_release_path(dst_path);
3908
3909	if (ret == 0) {
3910		*last_offset_ret = last_offset;
3911		/*
3912		 * In case the leaf was changed in the current transaction but
3913		 * all its dir items are from a past transaction, the last item
3914		 * in the leaf is a dir item and there's no gap between that last
3915		 * dir item and the first one on the next leaf (which did not
3916		 * change in the current transaction), then we don't need to log
3917		 * a range, as last_old_dentry_offset is equal to last_offset.
3918		 */
3919		ASSERT(last_old_dentry_offset <= last_offset);
3920		if (last_old_dentry_offset < last_offset)
3921			ret = insert_dir_log_key(trans, log, path, ino,
3922						 last_old_dentry_offset + 1,
3923						 last_offset);
3924	}
3925
3926	return ret;
3927}
3928
3929/*
3930 * If the inode was logged before and it was evicted, then its
3931 * last_dir_index_offset is (u64)-1, so we don't know the value of the last index
3932 * key offset. If that's the case, search for it and update the inode. This
3933 * is to avoid lookups in the log tree every time we try to insert a dir index
3934 * key from a leaf changed in the current transaction, and to allow us to always
3935 * do batch insertions of dir index keys.
3936 */
3937static int update_last_dir_index_offset(struct btrfs_inode *inode,
3938					struct btrfs_path *path,
3939					const struct btrfs_log_ctx *ctx)
3940{
3941	const u64 ino = btrfs_ino(inode);
3942	struct btrfs_key key;
3943	int ret;
3944
3945	lockdep_assert_held(&inode->log_mutex);
3946
3947	if (inode->last_dir_index_offset != (u64)-1)
3948		return 0;
3949
3950	if (!ctx->logged_before) {
3951		inode->last_dir_index_offset = BTRFS_DIR_START_INDEX - 1;
3952		return 0;
3953	}
3954
3955	key.objectid = ino;
3956	key.type = BTRFS_DIR_INDEX_KEY;
3957	key.offset = (u64)-1;
3958
3959	ret = btrfs_search_slot(NULL, inode->root->log_root, &key, path, 0, 0);
3960	/*
3961	 * An error happened or we actually have an index key with an offset
3962	 * value of (u64)-1. Bail out, we're done.
3963	 */
3964	if (ret <= 0)
3965		goto out;
3966
3967	ret = 0;
3968	inode->last_dir_index_offset = BTRFS_DIR_START_INDEX - 1;
3969
3970	/*
3971	 * No dir index items, bail out and leave last_dir_index_offset with
3972	 * the value right before the first valid index value.
3973	 */
3974	if (path->slots[0] == 0)
3975		goto out;
3976
3977	/*
3978	 * btrfs_search_slot() left us at one slot beyond the slot with the last
3979	 * index key, or beyond the last key of the directory that is not an
3980	 * index key. If we have an index key before, set last_dir_index_offset
3981	 * to its offset value, otherwise leave it with a value right before the
3982	 * first valid index value, as it means we have an empty directory.
3983	 */
3984	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
3985	if (key.objectid == ino && key.type == BTRFS_DIR_INDEX_KEY)
3986		inode->last_dir_index_offset = key.offset;
3987
3988out:
3989	btrfs_release_path(path);
3990
3991	return ret;
3992}
3993
3994/*
3995 * logging directories is very similar to logging inodes. We find all the items
3996 * from the current transaction and write them to the log.
3997 *
3998 * The recovery code scans the directory in the subvolume, and if it finds a
3999 * key in the range logged that is not present in the log tree, then it means
4000 * that dir entry was unlinked during the transaction.
4001 *
4002 * In order for that scan to work, we must include one key smaller than
4003 * the smallest logged by this transaction and one key larger than the largest
4004 * key logged by this transaction.
4005 */
4006static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
4007			  struct btrfs_inode *inode,
4008			  struct btrfs_path *path,
4009			  struct btrfs_path *dst_path,
4010			  struct btrfs_log_ctx *ctx)
4011{
4012	u64 min_key;
4013	u64 max_key;
4014	int ret;
4015
4016	ret = update_last_dir_index_offset(inode, path, ctx);
4017	if (ret)
4018		return ret;
4019
4020	min_key = BTRFS_DIR_START_INDEX;
4021	max_key = 0;
4022
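	/*
	 * Log dir index keys in ranges, starting at the first valid index,
	 * until log_dir_items() tells us the last logged range extends to the
	 * end of the key space ((u64)-1).
	 */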
4023	while (1) {
4024		ret = log_dir_items(trans, inode, path, dst_path,
4025				ctx, min_key, &max_key);
4026		if (ret)
4027			return ret;
4028		if (max_key == (u64)-1)
4029			break;
4030		min_key = max_key + 1;
4031	}
4032
4033	return 0;
4034}
4035
4036/*
4037 * a helper function to drop items from the log before we relog an
4038 * inode.  max_key_type indicates the highest item type to remove.
4039 * This cannot be run for file data extents because it does not
4040 * free the extents they point to.
4041 */
4042static int drop_inode_items(struct btrfs_trans_handle *trans,
4043				  struct btrfs_root *log,
4044				  struct btrfs_path *path,
4045				  struct btrfs_inode *inode,
4046				  int max_key_type)
4047{
4048	int ret;
4049	struct btrfs_key key;
4050	struct btrfs_key found_key;
4051	int start_slot;
4052
4053	key.objectid = btrfs_ino(inode);
4054	key.type = max_key_type;
4055	key.offset = (u64)-1;
4056
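	/*
	 * Delete the inode's items one leaf at a time: position the path at
	 * the last item with a type not above max_key_type, binary search for
	 * the first slot of this inode in that leaf, and delete the whole
	 * range at once. If that first slot is not slot 0, nothing is left in
	 * previous leaves and we are done.
	 */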
4057	while (1) {
4058		ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
4059		if (ret < 0) {
4060			break;
4061		} else if (ret > 0) {
4062			if (path->slots[0] == 0)
4063				break;
4064			path->slots[0]--;
4065		}
4066
4067		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
4068				      path->slots[0]);
4069
4070		if (found_key.objectid != key.objectid)
4071			break;
4072
4073		found_key.offset = 0;
4074		found_key.type = 0;
4075		ret = btrfs_bin_search(path->nodes[0], 0, &found_key, &start_slot);
4076		if (ret < 0)
4077			break;
4078
4079		ret = btrfs_del_items(trans, log, path, start_slot,
4080				      path->slots[0] - start_slot + 1);
4081		/*
4082		 * If start slot isn't 0 then we don't need to re-search, we've
4083		 * found the last guy with the objectid in this tree.
4084		 */
4085		if (ret || start_slot != 0)
4086			break;
4087		btrfs_release_path(path);
4088	}
4089	btrfs_release_path(path);
4090	if (ret > 0)
4091		ret = 0;
4092	return ret;
4093}
4094
4095static int truncate_inode_items(struct btrfs_trans_handle *trans,
4096				struct btrfs_root *log_root,
4097				struct btrfs_inode *inode,
4098				u64 new_size, u32 min_type)
4099{
4100	struct btrfs_truncate_control control = {
4101		.new_size = new_size,
4102		.ino = btrfs_ino(inode),
4103		.min_type = min_type,
4104		.skip_ref_updates = true,
4105	};
4106
4107	return btrfs_truncate_inode_items(trans, log_root, &control);
4108}
4109
4110static void fill_inode_item(struct btrfs_trans_handle *trans,
4111			    struct extent_buffer *leaf,
4112			    struct btrfs_inode_item *item,
4113			    struct inode *inode, int log_inode_only,
4114			    u64 logged_isize)
4115{
4116	struct btrfs_map_token token;
4117	u64 flags;
4118
4119	btrfs_init_map_token(&token, leaf);
4120
4121	if (log_inode_only) {
4122		/* set the generation to zero so the recovery code
4123		 * can tell the difference between a logging
4124		 * just to say 'this inode exists' and a logging
4125		 * to say 'update this inode with these values'
4126		 */
4127		btrfs_set_token_inode_generation(&token, item, 0);
4128		btrfs_set_token_inode_size(&token, item, logged_isize);
4129	} else {
4130		btrfs_set_token_inode_generation(&token, item,
4131						 BTRFS_I(inode)->generation);
4132		btrfs_set_token_inode_size(&token, item, inode->i_size);
4133	}
4134
4135	btrfs_set_token_inode_uid(&token, item, i_uid_read(inode));
4136	btrfs_set_token_inode_gid(&token, item, i_gid_read(inode));
4137	btrfs_set_token_inode_mode(&token, item, inode->i_mode);
4138	btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);
4139
4140	btrfs_set_token_timespec_sec(&token, &item->atime,
4141				     inode->i_atime.tv_sec);
4142	btrfs_set_token_timespec_nsec(&token, &item->atime,
4143				      inode->i_atime.tv_nsec);
4144
4145	btrfs_set_token_timespec_sec(&token, &item->mtime,
4146				     inode->i_mtime.tv_sec);
4147	btrfs_set_token_timespec_nsec(&token, &item->mtime,
4148				      inode->i_mtime.tv_nsec);
4149
4150	btrfs_set_token_timespec_sec(&token, &item->ctime,
4151				     inode_get_ctime(inode).tv_sec);
4152	btrfs_set_token_timespec_nsec(&token, &item->ctime,
4153				      inode_get_ctime(inode).tv_nsec);
4154
4155	/*
4156	 * We do not need to set the nbytes field, in fact during a fast fsync
4157	 * its value may not even be correct, since a fast fsync does not wait
4158	 * for ordered extent completion, which is where we update nbytes; it
4159	 * only waits for writeback to complete. During log replay as we find
4160	 * file extent items and replay them, we adjust the nbytes field of the
4161	 * inode item in subvolume tree as needed (see overwrite_item()).
4162	 */
4163
4164	btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
4165	btrfs_set_token_inode_transid(&token, item, trans->transid);
4166	btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
4167	flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
4168					  BTRFS_I(inode)->ro_flags);
4169	btrfs_set_token_inode_flags(&token, item, flags);
4170	btrfs_set_token_inode_block_group(&token, item, 0);
4171}
4172
4173static int log_inode_item(struct btrfs_trans_handle *trans,
4174			  struct btrfs_root *log, struct btrfs_path *path,
4175			  struct btrfs_inode *inode, bool inode_item_dropped)
4176{
4177	struct btrfs_inode_item *inode_item;
4178	int ret;
4179
4180	/*
4181	 * If we are doing a fast fsync and the inode was logged before in the
4182	 * current transaction, then we know the inode was previously logged and
4183	 * it exists in the log tree. For performance reasons, in this case use
4184	 * btrfs_search_slot() directly with ins_len set to 0 so that we never
4185	 * attempt a write lock on the leaf's parent, which adds unnecessary lock
4186	 * contention in case there are concurrent fsyncs for other inodes of the
4187	 * same subvolume. Using btrfs_insert_empty_item() when the inode item
4188	 * already exists can also result in unnecessarily splitting a leaf.
4189	 */
4190	if (!inode_item_dropped && inode->logged_trans == trans->transid) {
4191		ret = btrfs_search_slot(trans, log, &inode->location, path, 0, 1);
4192		ASSERT(ret <= 0);
4193		if (ret > 0)
4194			ret = -ENOENT;
4195	} else {
4196		/*
4197		 * This means it is the first fsync in the current transaction,
4198		 * so the inode item is not in the log and we need to insert it.
4199		 * We can never get -EEXIST because we are only called for a fast
4200		 * fsync and in case an inode eviction happens after the inode was
4201		 * logged before in the current transaction, when we load again
4202		 * the inode, we set BTRFS_INODE_NEEDS_FULL_SYNC on its runtime
4203		 * flags and set ->logged_trans to 0.
4204		 */
4205		ret = btrfs_insert_empty_item(trans, log, path, &inode->location,
4206					      sizeof(*inode_item));
4207		ASSERT(ret != -EEXIST);
4208	}
4209	if (ret)
4210		return ret;
4211	inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
4212				    struct btrfs_inode_item);
4213	fill_inode_item(trans, path->nodes[0], inode_item, &inode->vfs_inode,
4214			0, 0);
4215	btrfs_release_path(path);
4216	return 0;
4217}
4218
4219static int log_csums(struct btrfs_trans_handle *trans,
4220		     struct btrfs_inode *inode,
4221		     struct btrfs_root *log_root,
4222		     struct btrfs_ordered_sum *sums)
4223{
4224	const u64 lock_end = sums->logical + sums->len - 1;
4225	struct extent_state *cached_state = NULL;
4226	int ret;
4227
4228	/*
4229	 * If this inode was not used for reflink operations in the current
4230	 * transaction with new extents, then do the fast path, no need to
4231	 * worry about logging checksum items with overlapping ranges.
4232	 */
4233	if (inode->last_reflink_trans < trans->transid)
4234		return btrfs_csum_file_blocks(trans, log_root, sums);
4235
4236	/*
4237	 * Serialize logging for checksums. This is to avoid racing with the
4238	 * same checksum being logged by another task that is logging another
4239	 * file which happens to refer to the same extent as well. Such races
4240	 * can leave checksum items in the log with overlapping ranges.
4241	 */
4242	ret = lock_extent(&log_root->log_csum_range, sums->logical, lock_end,
4243			  &cached_state);
4244	if (ret)
4245		return ret;
4246	/*
4247	 * Due to extent cloning, we might have logged a csum item that covers a
4248	 * subrange of a cloned extent, and later we can end up logging a csum
4249	 * item for a larger subrange of the same extent or the entire range.
4250	 * This would leave csum items in the log tree that cover the same range
4251	 * and break the searches for checksums in the log tree, resulting in
4252	 * some checksums missing in the fs/subvolume tree. So just delete (or
4253	 * trim and adjust) any existing csum items in the log for this range.
4254	 */
4255	ret = btrfs_del_csums(trans, log_root, sums->logical, sums->len);
4256	if (!ret)
4257		ret = btrfs_csum_file_blocks(trans, log_root, sums);
4258
4259	unlock_extent(&log_root->log_csum_range, sums->logical, lock_end,
4260		      &cached_state);
4261
4262	return ret;
4263}
4264
4265static noinline int copy_items(struct btrfs_trans_handle *trans,
4266			       struct btrfs_inode *inode,
4267			       struct btrfs_path *dst_path,
4268			       struct btrfs_path *src_path,
4269			       int start_slot, int nr, int inode_only,
4270			       u64 logged_isize)
4271{
4272	struct btrfs_root *log = inode->root->log_root;
4273	struct btrfs_file_extent_item *extent;
4274	struct extent_buffer *src;
4275	int ret = 0;
4276	struct btrfs_key *ins_keys;
4277	u32 *ins_sizes;
4278	struct btrfs_item_batch batch;
4279	char *ins_data;
4280	int i;
4281	int dst_index;
4282	const bool skip_csum = (inode->flags & BTRFS_INODE_NODATASUM);
4283	const u64 i_size = i_size_read(&inode->vfs_inode);
4284
4285	/*
4286	 * To keep lockdep happy and avoid deadlocks, clone the source leaf and
4287	 * use the clone. This is because otherwise we would be changing the log
4288	 * tree, to insert items from the subvolume tree or insert csum items,
4289	 * while holding a read lock on a leaf from the subvolume tree, which
4290	 * creates a nasty lock dependency when COWing log tree nodes/leaves:
4291	 *
4292	 * 1) Modifying the log tree triggers an extent buffer allocation while
4293	 *    holding a write lock on a parent extent buffer from the log tree.
4294	 *    Allocating the pages for an extent buffer, or the extent buffer
4295	 *    struct, can trigger inode eviction and finally the inode eviction
4296	 *    will trigger a release/remove of a delayed node, which requires
4297	 *    taking the delayed node's mutex;
4298	 *
4299	 * 2) Allocating a metadata extent for a log tree can trigger the async
4300	 *    reclaim thread and make us wait for it to release enough space and
4301	 *    unblock our reservation ticket. The reclaim thread can start
4302	 *    flushing delayed items, and that in turn results in the need to
4303	 *    lock delayed node mutexes and in the need to write lock extent
4304	 *    buffers of a subvolume tree - all this while holding a write lock
4305	 *    on the parent extent buffer in the log tree.
4306	 *
4307	 * So one task in scenario 1) running in parallel with another task in
4308	 * scenario 2) could lead to a deadlock, one wanting to lock a delayed
4309	 * node mutex while having a read lock on a leaf from the subvolume,
4310	 * while the other is holding the delayed node's mutex and wants to
4311	 * write lock the same subvolume leaf for flushing delayed items.
4312	 */
4313	src = btrfs_clone_extent_buffer(src_path->nodes[0]);
4314	if (!src)
4315		return -ENOMEM;
4316
4317	i = src_path->slots[0];
4318	btrfs_release_path(src_path);
4319	src_path->nodes[0] = src;
4320	src_path->slots[0] = i;
4321
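	/*
	 * As in flush_dir_items_batch(), a single allocation backs both the
	 * item size array (at the start) and the key array (right after it).
	 */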
4322	ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
4323			   nr * sizeof(u32), GFP_NOFS);
4324	if (!ins_data)
4325		return -ENOMEM;
4326
4327	ins_sizes = (u32 *)ins_data;
4328	ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
4329	batch.keys = ins_keys;
4330	batch.data_sizes = ins_sizes;
4331	batch.total_data_size = 0;
4332	batch.nr = 0;
4333
4334	dst_index = 0;
4335	for (i = 0; i < nr; i++) {
4336		const int src_slot = start_slot + i;
4337		struct btrfs_root *csum_root;
4338		struct btrfs_ordered_sum *sums;
4339		struct btrfs_ordered_sum *sums_next;
4340		LIST_HEAD(ordered_sums);
4341		u64 disk_bytenr;
4342		u64 disk_num_bytes;
4343		u64 extent_offset;
4344		u64 extent_num_bytes;
4345		bool is_old_extent;
4346
4347		btrfs_item_key_to_cpu(src, &ins_keys[dst_index], src_slot);
4348
4349		if (ins_keys[dst_index].type != BTRFS_EXTENT_DATA_KEY)
4350			goto add_to_batch;
4351
4352		extent = btrfs_item_ptr(src, src_slot,
4353					struct btrfs_file_extent_item);
4354
4355		is_old_extent = (btrfs_file_extent_generation(src, extent) <
4356				 trans->transid);
4357
4358		/*
4359		 * Don't copy extents from past generations. That would make us
4360		 * log a lot more metadata for common cases like doing only a
4361		 * few random writes into a file and then fsync it for the first
4362		 * time or after the full sync flag is set on the inode. We can
4363		 * get leaves full of extent items, most of which are from past
4364		 * generations, so we can skip them - as long as the inode has
4365		 * not been the target of a reflink operation in this transaction,
4366		 * as in that case it might have had file extent items with old
4367		 * generations copied into it. We also must always log prealloc
4368		 * extents that start at or beyond eof, otherwise we would lose
4369		 * them on log replay.
4370		 */
4371		if (is_old_extent &&
4372		    ins_keys[dst_index].offset < i_size &&
4373		    inode->last_reflink_trans < trans->transid)
4374			continue;
4375
4376		if (skip_csum)
4377			goto add_to_batch;
4378
4379		/* Only regular extents have checksums. */
4380		if (btrfs_file_extent_type(src, extent) != BTRFS_FILE_EXTENT_REG)
4381			goto add_to_batch;
4382
4383		/*
4384		 * If it's an extent created in a past transaction, then its
4385		 * checksums are already accessible from the committed csum tree,
4386		 * no need to log them.
4387		 */
4388		if (is_old_extent)
4389			goto add_to_batch;
4390
4391		disk_bytenr = btrfs_file_extent_disk_bytenr(src, extent);
4392		/* If it's an explicit hole, there are no checksums. */
4393		if (disk_bytenr == 0)
4394			goto add_to_batch;
4395
4396		disk_num_bytes = btrfs_file_extent_disk_num_bytes(src, extent);
4397
4398		if (btrfs_file_extent_compression(src, extent)) {
4399			extent_offset = 0;
4400			extent_num_bytes = disk_num_bytes;
4401		} else {
4402			extent_offset = btrfs_file_extent_offset(src, extent);
4403			extent_num_bytes = btrfs_file_extent_num_bytes(src, extent);
4404		}
4405
4406		csum_root = btrfs_csum_root(trans->fs_info, disk_bytenr);
4407		disk_bytenr += extent_offset;
4408		ret = btrfs_lookup_csums_list(csum_root, disk_bytenr,
4409					      disk_bytenr + extent_num_bytes - 1,
4410					      &ordered_sums, 0, false);
4411		if (ret)
4412			goto out;
4413
4414		list_for_each_entry_safe(sums, sums_next, &ordered_sums, list) {
4415			if (!ret)
4416				ret = log_csums(trans, inode, log, sums);
4417			list_del(&sums->list);
4418			kfree(sums);
4419		}
4420		if (ret)
4421			goto out;
4422
4423add_to_batch:
4424		ins_sizes[dst_index] = btrfs_item_size(src, src_slot);
4425		batch.total_data_size += ins_sizes[dst_index];
4426		batch.nr++;
4427		dst_index++;
4428	}
4429
4430	/*
4431	 * We have a leaf full of old extent items that don't need to be logged,
4432	 * so we don't need to do anything.
4433	 */
4434	if (batch.nr == 0)
4435		goto out;
4436
4437	ret = btrfs_insert_empty_items(trans, log, dst_path, &batch);
4438	if (ret)
4439		goto out;
4440
4441	dst_index = 0;
4442	for (i = 0; i < nr; i++) {
4443		const int src_slot = start_slot + i;
4444		const int dst_slot = dst_path->slots[0] + dst_index;
4445		struct btrfs_key key;
4446		unsigned long src_offset;
4447		unsigned long dst_offset;
4448
4449		/*
4450		 * We're done, all the remaining items in the source leaf
4451		 * correspond to old file extent items.
4452		 */
4453		if (dst_index >= batch.nr)
4454			break;
4455
4456		btrfs_item_key_to_cpu(src, &key, src_slot);
4457
4458		if (key.type != BTRFS_EXTENT_DATA_KEY)
4459			goto copy_item;
4460
4461		extent = btrfs_item_ptr(src, src_slot,
4462					struct btrfs_file_extent_item);
4463
4464		/* See the comment in the previous loop, same logic. */
4465		if (btrfs_file_extent_generation(src, extent) < trans->transid &&
4466		    key.offset < i_size &&
4467		    inode->last_reflink_trans < trans->transid)
4468			continue;
4469
4470copy_item:
4471		dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], dst_slot);
4472		src_offset = btrfs_item_ptr_offset(src, src_slot);
4473
4474		if (key.type == BTRFS_INODE_ITEM_KEY) {
4475			struct btrfs_inode_item *inode_item;
4476
4477			inode_item = btrfs_item_ptr(dst_path->nodes[0], dst_slot,
4478						    struct btrfs_inode_item);
4479			fill_inode_item(trans, dst_path->nodes[0], inode_item,
4480					&inode->vfs_inode,
4481					inode_only == LOG_INODE_EXISTS,
4482					logged_isize);
4483		} else {
4484			copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
4485					   src_offset, ins_sizes[dst_index]);
4486		}
4487
4488		dst_index++;
4489	}
4490
4491	btrfs_mark_buffer_dirty(trans, dst_path->nodes[0]);
4492	btrfs_release_path(dst_path);
4493out:
4494	kfree(ins_data);
4495
4496	return ret;
4497}
4498
4499static int extent_cmp(void *priv, const struct list_head *a,
4500		      const struct list_head *b)
4501{
4502	const struct extent_map *em1, *em2;
4503
4504	em1 = list_entry(a, struct extent_map, list);
4505	em2 = list_entry(b, struct extent_map, list);
4506
4507	if (em1->start < em2->start)
4508		return -1;
4509	else if (em1->start > em2->start)
4510		return 1;
4511	return 0;
4512}
4513
4514static int log_extent_csums(struct btrfs_trans_handle *trans,
4515			    struct btrfs_inode *inode,
4516			    struct btrfs_root *log_root,
4517			    const struct extent_map *em,
4518			    struct btrfs_log_ctx *ctx)
4519{
4520	struct btrfs_ordered_extent *ordered;
4521	struct btrfs_root *csum_root;
4522	u64 csum_offset;
4523	u64 csum_len;
4524	u64 mod_start = em->mod_start;
4525	u64 mod_len = em->mod_len;
4526	LIST_HEAD(ordered_sums);
4527	int ret = 0;
4528
4529	if (inode->flags & BTRFS_INODE_NODATASUM ||
4530	    test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
4531	    em->block_start == EXTENT_MAP_HOLE)
4532		return 0;
4533
4534	list_for_each_entry(ordered, &ctx->ordered_extents, log_list) {
4535		const u64 ordered_end = ordered->file_offset + ordered->num_bytes;
4536		const u64 mod_end = mod_start + mod_len;
4537		struct btrfs_ordered_sum *sums;
4538
4539		if (mod_len == 0)
4540			break;
4541
4542		if (ordered_end <= mod_start)
4543			continue;
4544		if (mod_end <= ordered->file_offset)
4545			break;
4546
4547		/*
4548		 * We are going to copy all the csums on this ordered extent, so
4549		 * go ahead and adjust mod_start and mod_len in case this ordered
4550		 * extent has already been logged.
4551		 */
4552		if (ordered->file_offset > mod_start) {
4553			if (ordered_end >= mod_end)
4554				mod_len = ordered->file_offset - mod_start;
4555			/*
4556			 * If we have this case
4557			 *
4558			 * |--------- logged extent ---------|
4559			 *       |----- ordered extent ----|
4560			 *
4561			 * Just don't mess with mod_start and mod_len, we'll
4562			 * just end up logging more csums than we need and it
4563			 * will be ok.
4564			 */
4565		} else {
4566			if (ordered_end < mod_end) {
4567				mod_len = mod_end - ordered_end;
4568				mod_start = ordered_end;
4569			} else {
4570				mod_len = 0;
4571			}
4572		}
4573
4574		/*
4575		 * To keep us from looping for the above case of an ordered
4576		 * extent that falls inside of the logged extent.
4577		 */
4578		if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, &ordered->flags))
4579			continue;
4580
4581		list_for_each_entry(sums, &ordered->list, list) {
4582			ret = log_csums(trans, inode, log_root, sums);
4583			if (ret)
4584				return ret;
4585		}
4586	}
4587
4588	/* We're done, found all csums in the ordered extents. */
4589	if (mod_len == 0)
4590		return 0;
4591
4592	/* If we're compressed we have to save the entire range of csums. */
4593	if (em->compress_type) {
4594		csum_offset = 0;
4595		csum_len = max(em->block_len, em->orig_block_len);
4596	} else {
4597		csum_offset = mod_start - em->start;
4598		csum_len = mod_len;
4599	}
4600
4601	/* block start is already adjusted for the file extent offset. */
4602	csum_root = btrfs_csum_root(trans->fs_info, em->block_start);
4603	ret = btrfs_lookup_csums_list(csum_root, em->block_start + csum_offset,
4604				      em->block_start + csum_offset +
4605				      csum_len - 1, &ordered_sums, 0, false);
4606	if (ret)
4607		return ret;
4608
4609	while (!list_empty(&ordered_sums)) {
4610		struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
4611						   struct btrfs_ordered_sum,
4612						   list);
4613		if (!ret)
4614			ret = log_csums(trans, inode, log_root, sums);
4615		list_del(&sums->list);
4616		kfree(sums);
4617	}
4618
4619	return ret;
4620}
4621
4622static int log_one_extent(struct btrfs_trans_handle *trans,
4623			  struct btrfs_inode *inode,
4624			  const struct extent_map *em,
4625			  struct btrfs_path *path,
4626			  struct btrfs_log_ctx *ctx)
4627{
4628	struct btrfs_drop_extents_args drop_args = { 0 };
4629	struct btrfs_root *log = inode->root->log_root;
4630	struct btrfs_file_extent_item fi = { 0 };
4631	struct extent_buffer *leaf;
4632	struct btrfs_key key;
4633	u64 extent_offset = em->start - em->orig_start;
4634	u64 block_len;
4635	int ret;
4636
4637	btrfs_set_stack_file_extent_generation(&fi, trans->transid);
4638	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
4639		btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_PREALLOC);
4640	else
4641		btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_REG);
4642
4643	block_len = max(em->block_len, em->orig_block_len);
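	/*
	 * For compressed extents, em->block_start is the disk bytenr of the
	 * whole on-disk extent. For regular uncompressed extents it already
	 * has the file extent offset applied, so subtract it to get back the
	 * extent's disk bytenr. Holes keep the zeroed disk fields from the
	 * stack item initialization above.
	 */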
4644	if (em->compress_type != BTRFS_COMPRESS_NONE) {
4645		btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start);
4646		btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len);
4647	} else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
4648		btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start -
4649							extent_offset);
4650		btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len);
4651	}
4652
4653	btrfs_set_stack_file_extent_offset(&fi, extent_offset);
4654	btrfs_set_stack_file_extent_num_bytes(&fi, em->len);
4655	btrfs_set_stack_file_extent_ram_bytes(&fi, em->ram_bytes);
4656	btrfs_set_stack_file_extent_compression(&fi, em->compress_type);
4657
4658	ret = log_extent_csums(trans, inode, log, em, ctx);
4659	if (ret)
4660		return ret;
4661
4662	/*
4663	 * If this is the first time we are logging the inode in the current
4664	 * transaction, we can avoid btrfs_drop_extents(), which is expensive
4665	 * because it does a deletion search, which always acquires write locks
4666	 * for extent buffers at levels 2, 1 and 0. This not only wastes time
4667	 * but also adds significant contention in a log tree, since log trees
4668	 * are small, with a root at level 2 or 3 at most, due to their short
4669	 * life span.
4670	 */
4671	if (ctx->logged_before) {
4672		drop_args.path = path;
4673		drop_args.start = em->start;
4674		drop_args.end = em->start + em->len;
4675		drop_args.replace_extent = true;
4676		drop_args.extent_item_size = sizeof(fi);
4677		ret = btrfs_drop_extents(trans, log, inode, &drop_args);
4678		if (ret)
4679			return ret;
4680	}
4681
4682	if (!drop_args.extent_inserted) {
4683		key.objectid = btrfs_ino(inode);
4684		key.type = BTRFS_EXTENT_DATA_KEY;
4685		key.offset = em->start;
4686
4687		ret = btrfs_insert_empty_item(trans, log, path, &key,
4688					      sizeof(fi));
4689		if (ret)
4690			return ret;
4691	}
4692	leaf = path->nodes[0];
4693	write_extent_buffer(leaf, &fi,
4694			    btrfs_item_ptr_offset(leaf, path->slots[0]),
4695			    sizeof(fi));
4696	btrfs_mark_buffer_dirty(trans, leaf);
4697
4698	btrfs_release_path(path);
4699
4700	return ret;
4701}
4702
4703/*
4704 * Log all prealloc extents beyond the inode's i_size to make sure we do not
4705 * lose them after doing a full/fast fsync and replaying the log. We scan the
4706 * subvolume's root instead of iterating the inode's extent map tree because
4707 * otherwise we can log incorrect extent items based on extent map conversion.
4708 * That can happen due to the fact that extent maps are merged when they
4709 * are not in the extent map tree's list of modified extents.
4710 */
4711static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
4712				      struct btrfs_inode *inode,
4713				      struct btrfs_path *path)
4714{
4715	struct btrfs_root *root = inode->root;
4716	struct btrfs_key key;
4717	const u64 i_size = i_size_read(&inode->vfs_inode);
4718	const u64 ino = btrfs_ino(inode);
4719	struct btrfs_path *dst_path = NULL;
4720	bool dropped_extents = false;
4721	u64 truncate_offset = i_size;
4722	struct extent_buffer *leaf;
4723	int slot;
4724	int ins_nr = 0;
4725	int start_slot = 0;
4726	int ret;
4727
4728	if (!(inode->flags & BTRFS_INODE_PREALLOC))
4729		return 0;
4730
4731	key.objectid = ino;
4732	key.type = BTRFS_EXTENT_DATA_KEY;
4733	key.offset = i_size;
4734	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4735	if (ret < 0)
4736		goto out;
4737
4738	/*
4739	 * We must check if there is a prealloc extent that starts before the
4740	 * i_size and crosses the i_size boundary. This is to ensure later we
4741	 * truncate down to the end of that extent and not to the i_size, as
4742	 * otherwise we end up losing part of the prealloc extent after a log
4743	 * replay and with an implicit hole if there is another prealloc extent
4744	 * that starts at an offset beyond i_size.
4745	 */
4746	ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY);
4747	if (ret < 0)
4748		goto out;
4749
4750	if (ret == 0) {
4751		struct btrfs_file_extent_item *ei;
4752
4753		leaf = path->nodes[0];
4754		slot = path->slots[0];
4755		ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
4756
4757		if (btrfs_file_extent_type(leaf, ei) ==
4758		    BTRFS_FILE_EXTENT_PREALLOC) {
4759			u64 extent_end;
4760
4761			btrfs_item_key_to_cpu(leaf, &key, slot);
4762			extent_end = key.offset +
4763				btrfs_file_extent_num_bytes(leaf, ei);
4764
4765			if (extent_end > i_size)
4766				truncate_offset = extent_end;
4767		}
4768	} else {
4769		ret = 0;
4770	}
4771
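	/*
	 * Copy to the log tree, in batches, every file extent item with an
	 * offset at or beyond i_size (the prealloc extents past eof). Before
	 * the first copy, truncate the log at truncate_offset so that extent
	 * items logged by previous fsyncs don't result in duplicated keys.
	 */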
4772	while (true) {
4773		leaf = path->nodes[0];
4774		slot = path->slots[0];
4775
4776		if (slot >= btrfs_header_nritems(leaf)) {
4777			if (ins_nr > 0) {
4778				ret = copy_items(trans, inode, dst_path, path,
4779						 start_slot, ins_nr, 1, 0);
4780				if (ret < 0)
4781					goto out;
4782				ins_nr = 0;
4783			}
4784			ret = btrfs_next_leaf(root, path);
4785			if (ret < 0)
4786				goto out;
4787			if (ret > 0) {
4788				ret = 0;
4789				break;
4790			}
4791			continue;
4792		}
4793
4794		btrfs_item_key_to_cpu(leaf, &key, slot);
4795		if (key.objectid > ino)
4796			break;
4797		if (WARN_ON_ONCE(key.objectid < ino) ||
4798		    key.type < BTRFS_EXTENT_DATA_KEY ||
4799		    key.offset < i_size) {
4800			path->slots[0]++;
4801			continue;
4802		}
4803		if (!dropped_extents) {
4804			/*
4805			 * Avoid logging extent items logged in past fsync calls
4806			 * and leading to duplicate keys in the log tree.
4807			 */
4808			ret = truncate_inode_items(trans, root->log_root, inode,
4809						   truncate_offset,
4810						   BTRFS_EXTENT_DATA_KEY);
4811			if (ret)
4812				goto out;
4813			dropped_extents = true;
4814		}
4815		if (ins_nr == 0)
4816			start_slot = slot;
4817		ins_nr++;
4818		path->slots[0]++;
4819		if (!dst_path) {
4820			dst_path = btrfs_alloc_path();
4821			if (!dst_path) {
4822				ret = -ENOMEM;
4823				goto out;
4824			}
4825		}
4826	}
4827	if (ins_nr > 0)
4828		ret = copy_items(trans, inode, dst_path, path,
4829				 start_slot, ins_nr, 1, 0);
4830out:
4831	btrfs_release_path(path);
4832	btrfs_free_path(dst_path);
4833	return ret;
4834}
4835
4836static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
4837				     struct btrfs_inode *inode,
4838				     struct btrfs_path *path,
4839				     struct btrfs_log_ctx *ctx)
4840{
4841	struct btrfs_ordered_extent *ordered;
4842	struct btrfs_ordered_extent *tmp;
4843	struct extent_map *em, *n;
4844	LIST_HEAD(extents);
4845	struct extent_map_tree *tree = &inode->extent_tree;
4846	int ret = 0;
4847	int num = 0;
4848
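	/*
	 * First pass, under the extent map tree lock: move every extent map
	 * modified in the current transaction to a private list, pin it with
	 * an extra reference and mark it as being logged. Prealloc extents at
	 * or beyond eof are skipped here and logged later by
	 * btrfs_log_prealloc_extents().
	 */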
4849	write_lock(&tree->lock);
4850
4851	list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
4852		list_del_init(&em->list);
4853		/*
4854		 * Just an arbitrary number; this can be really CPU intensive
4855		 * once we start getting a lot of extents, and once we
4856		 * have a bunch of extents we just want to commit, since it will
4857		 * be faster.
4858		 */
4859		if (++num > 32768) {
4860			list_del_init(&tree->modified_extents);
4861			ret = -EFBIG;
4862			goto process;
4863		}
4864
4865		if (em->generation < trans->transid)
4866			continue;
4867
4868		/* We log prealloc extents beyond eof later. */
4869		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) &&
4870		    em->start >= i_size_read(&inode->vfs_inode))
4871			continue;
4872
4873		/* Need a ref to keep it from getting evicted from cache */
4874		refcount_inc(&em->refs);
4875		set_bit(EXTENT_FLAG_LOGGING, &em->flags);
4876		list_add_tail(&em->list, &extents);
4877		num++;
4878	}
4879
4880	list_sort(NULL, &extents, extent_cmp);
4881process:
4882	while (!list_empty(&extents)) {
4883		em = list_entry(extents.next, struct extent_map, list);
4884
4885		list_del_init(&em->list);
4886
4887		/*
4888		 * If we had an error we just need to delete everybody from our
4889		 * private list.
4890		 */
4891		if (ret) {
4892			clear_em_logging(tree, em);
4893			free_extent_map(em);
4894			continue;
4895		}
4896
4897		write_unlock(&tree->lock);
4898
4899		ret = log_one_extent(trans, inode, em, path, ctx);
4900		write_lock(&tree->lock);
4901		clear_em_logging(tree, em);
4902		free_extent_map(em);
4903	}
4904	WARN_ON(!list_empty(&extents));
4905	write_unlock(&tree->lock);
4906
4907	if (!ret)
4908		ret = btrfs_log_prealloc_extents(trans, inode, path);
4909	if (ret)
4910		return ret;
4911
4912	/*
4913	 * We have logged all extents successfully, now make sure the commit of
4914	 * the current transaction waits for the ordered extents to complete
4915	 * before it commits and wipes out the log trees, otherwise we would
4916	 * lose data if an ordered extent completes after the transaction
4917	 * commits and a power failure happens after the transaction commit.
4918	 */
4919	list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) {
4920		list_del_init(&ordered->log_list);
4921		set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags);
4922
4923		if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
4924			spin_lock_irq(&inode->ordered_tree.lock);
4925			if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
4926				set_bit(BTRFS_ORDERED_PENDING, &ordered->flags);
4927				atomic_inc(&trans->transaction->pending_ordered);
4928			}
4929			spin_unlock_irq(&inode->ordered_tree.lock);
4930		}
4931		btrfs_put_ordered_extent(ordered);
4932	}
4933
4934	return 0;
4935}
4936
4937static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode,
4938			     struct btrfs_path *path, u64 *size_ret)
4939{
4940	struct btrfs_key key;
4941	int ret;
4942
4943	key.objectid = btrfs_ino(inode);
4944	key.type = BTRFS_INODE_ITEM_KEY;
4945	key.offset = 0;
4946
4947	ret = btrfs_search_slot(NULL, log, &key, path, 0, 0);
4948	if (ret < 0) {
4949		return ret;
4950	} else if (ret > 0) {
4951		*size_ret = 0;
4952	} else {
4953		struct btrfs_inode_item *item;
4954
4955		item = btrfs_item_ptr(path->nodes[0], path->slots[0],
4956				      struct btrfs_inode_item);
4957		*size_ret = btrfs_inode_size(path->nodes[0], item);
4958		/*
4959		 * If the in-memory inode's i_size is smaller than the inode
4960		 * size stored in the btree, return the inode's i_size, so
4961		 * that we get a correct inode size after replaying the log
4962		 * when before a power failure we had a shrinking truncate
4963		 * followed by addition of a new name (rename / new hard link).
4964		 * Otherwise return the inode size from the btree, to avoid
4965		 * data loss when replaying a log due to previously doing a
4966		 * write that expands the inode's size and logging a new name
4967		 * immediately after.
4968		 */
4969		if (*size_ret > inode->vfs_inode.i_size)
4970			*size_ret = inode->vfs_inode.i_size;
4971	}
4972
4973	btrfs_release_path(path);
4974	return 0;
4975}
4976
4977/*
4978 * At the moment we always log all xattrs. This is to figure out at log replay
4979 * time which xattrs must have their deletion replayed. If a xattr is missing
4980 * in the log tree and exists in the fs/subvol tree, we delete it. This is
4981 * because if a xattr is deleted, the inode is fsynced and then a power failure
4982 * happens, the log is replayed the next time the fs is mounted and we want
4983 * the xattr to not exist anymore (same behaviour as other filesystems
4984 * with a journal, ext3/4, xfs, f2fs, etc).
4985 */
4986static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
4987				struct btrfs_inode *inode,
4988				struct btrfs_path *path,
4989				struct btrfs_path *dst_path)
4990{
4991	struct btrfs_root *root = inode->root;
4992	int ret;
4993	struct btrfs_key key;
4994	const u64 ino = btrfs_ino(inode);
4995	int ins_nr = 0;
4996	int start_slot = 0;
4997	bool found_xattrs = false;
4998
4999	if (test_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags))
5000		return 0;
5001
5002	key.objectid = ino;
5003	key.type = BTRFS_XATTR_ITEM_KEY;
5004	key.offset = 0;
5005
5006	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5007	if (ret < 0)
5008		return ret;
5009
5010	while (true) {
5011		int slot = path->slots[0];
5012		struct extent_buffer *leaf = path->nodes[0];
5013		int nritems = btrfs_header_nritems(leaf);
5014
5015		if (slot >= nritems) {
5016			if (ins_nr > 0) {
5017				ret = copy_items(trans, inode, dst_path, path,
5018						 start_slot, ins_nr, 1, 0);
5019				if (ret < 0)
5020					return ret;
5021				ins_nr = 0;
5022			}
5023			ret = btrfs_next_leaf(root, path);
5024			if (ret < 0)
5025				return ret;
5026			else if (ret > 0)
5027				break;
5028			continue;
5029		}
5030
5031		btrfs_item_key_to_cpu(leaf, &key, slot);
5032		if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY)
5033			break;
5034
5035		if (ins_nr == 0)
5036			start_slot = slot;
5037		ins_nr++;
5038		path->slots[0]++;
5039		found_xattrs = true;
5040		cond_resched();
5041	}
5042	if (ins_nr > 0) {
5043		ret = copy_items(trans, inode, dst_path, path,
5044				 start_slot, ins_nr, 1, 0);
5045		if (ret < 0)
5046			return ret;
5047	}
5048
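	/*
	 * No xattrs found, flag the inode so that future fsyncs can skip this
	 * search entirely (see the check at the top of this function).
	 */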
5049	if (!found_xattrs)
5050		set_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags);
5051
5052	return 0;
5053}
5054
5055/*
5056 * When using the NO_HOLES feature if we punched a hole that causes the
5057 * deletion of entire leafs or all the extent items of the first leaf (the one
5058 * that contains the inode item and references) we may end up not processing
5059 * any extents, because there are no leafs with a generation matching the
5060 * current transaction that have extent items for our inode. So we need to find
5061 * if any holes exist and then log them. We also need to log holes after any
5062 * truncate operation that changes the inode's size.
5063 */
5064static int btrfs_log_holes(struct btrfs_trans_handle *trans,
5065			   struct btrfs_inode *inode,
5066			   struct btrfs_path *path)
5067{
5068	struct btrfs_root *root = inode->root;
5069	struct btrfs_fs_info *fs_info = root->fs_info;
5070	struct btrfs_key key;
5071	const u64 ino = btrfs_ino(inode);
5072	const u64 i_size = i_size_read(&inode->vfs_inode);
5073	u64 prev_extent_end = 0;
5074	int ret;
5075
5076	if (!btrfs_fs_incompat(fs_info, NO_HOLES) || i_size == 0)
5077		return 0;
5078
5079	key.objectid = ino;
5080	key.type = BTRFS_EXTENT_DATA_KEY;
5081	key.offset = 0;
5082
5083	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5084	if (ret < 0)
5085		return ret;
5086
5087	while (true) {
5088		struct extent_buffer *leaf = path->nodes[0];
5089
5090		if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
5091			ret = btrfs_next_leaf(root, path);
5092			if (ret < 0)
5093				return ret;
5094			if (ret > 0) {
5095				ret = 0;
5096				break;
5097			}
5098			leaf = path->nodes[0];
5099		}
5100
5101		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
5102		if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
5103			break;
5104
5105		/* We have a hole, log it. */
5106		if (prev_extent_end < key.offset) {
5107			const u64 hole_len = key.offset - prev_extent_end;
5108
5109			/*
5110			 * Release the path to avoid deadlocks with other code
5111			 * paths that search the root while holding locks on
5112			 * leafs from the log root.
5113			 */
5114			btrfs_release_path(path);
5115			ret = btrfs_insert_hole_extent(trans, root->log_root,
5116						       ino, prev_extent_end,
5117						       hole_len);
5118			if (ret < 0)
5119				return ret;
5120
5121			/*
5122			 * Search for the same key again in the root. Since it's
5123			 * an extent item and we are holding the inode lock, the
5124			 * key must still exist. If it doesn't, just emit a warning
5125			 * and return an error to fall back to a transaction
5126			 * commit.
5127			 */
5128			ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5129			if (ret < 0)
5130				return ret;
5131			if (WARN_ON(ret > 0))
5132				return -ENOENT;
5133			leaf = path->nodes[0];
5134		}
5135
5136		prev_extent_end = btrfs_file_extent_end(path);
5137		path->slots[0]++;
5138		cond_resched();
5139	}
5140
5141	if (prev_extent_end < i_size) {
5142		u64 hole_len;
5143
5144		btrfs_release_path(path);
5145		hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize);
5146		ret = btrfs_insert_hole_extent(trans, root->log_root, ino,
5147					       prev_extent_end, hole_len);
5148		if (ret < 0)
5149			return ret;
5150	}
5151
5152	return 0;
5153}
5154
5155/*
5156 * When we are logging a new inode X, check if it doesn't have a reference that
5157 * matches the reference from some other inode Y created in a past transaction
5158 * and that was renamed in the current transaction. If we don't do this, then at
5159 * log replay time we can lose inode Y (and all its files if it's a directory):
5160 *
5161 * mkdir /mnt/x
5162 * echo "hello world" > /mnt/x/foobar
5163 * sync
5164 * mv /mnt/x /mnt/y
5165 * mkdir /mnt/x                 # or touch /mnt/x
5166 * xfs_io -c fsync /mnt/x
5167 * <power fail>
5168 * mount fs, trigger log replay
5169 *
5170 * After the log replay procedure, we would lose the first directory and all its
5171 * files (file foobar).
5172 * For the case where inode Y is not a directory we simply end up losing it:
5173 *
5174 * echo "123" > /mnt/foo
5175 * sync
5176 * mv /mnt/foo /mnt/bar
5177 * echo "abc" > /mnt/foo
5178 * xfs_io -c fsync /mnt/foo
5179 * <power fail>
5180 *
5181 * We also need this for cases where a snapshot entry is replaced by some other
5182 * entry (file or directory) otherwise we end up with an unreplayable log due to
5183 * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as
5184 * if it were a regular entry:
5185 *
5186 * mkdir /mnt/x
5187 * btrfs subvolume snapshot /mnt /mnt/x/snap
5188 * btrfs subvolume delete /mnt/x/snap
5189 * rmdir /mnt/x
5190 * mkdir /mnt/x
5191 * fsync /mnt/x or fsync some new file inside it
5192 * <power fail>
5193 *
5194 * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in
5195 * the same transaction.
5196 */
5197static int btrfs_check_ref_name_override(struct extent_buffer *eb,
5198					 const int slot,
5199					 const struct btrfs_key *key,
5200					 struct btrfs_inode *inode,
5201					 u64 *other_ino, u64 *other_parent)
5202{
5203	int ret;
5204	struct btrfs_path *search_path;
5205	char *name = NULL;
5206	u32 name_len = 0;
5207	u32 item_size = btrfs_item_size(eb, slot);
5208	u32 cur_offset = 0;
5209	unsigned long ptr = btrfs_item_ptr_offset(eb, slot);
5210
5211	search_path = btrfs_alloc_path();
5212	if (!search_path)
5213		return -ENOMEM;
5214	search_path->search_commit_root = 1;
5215	search_path->skip_locking = 1;
5216
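	/*
	 * Go over every name packed in this ref/extref item and check, in the
	 * commit root, whether the (parent dir, name) pair now resolves to a
	 * different inode.
	 */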
5217	while (cur_offset < item_size) {
5218		u64 parent;
5219		u32 this_name_len;
5220		u32 this_len;
5221		unsigned long name_ptr;
5222		struct btrfs_dir_item *di;
5223		struct fscrypt_str name_str;
5224
5225		if (key->type == BTRFS_INODE_REF_KEY) {
5226			struct btrfs_inode_ref *iref;
5227
5228			iref = (struct btrfs_inode_ref *)(ptr + cur_offset);
5229			parent = key->offset;
5230			this_name_len = btrfs_inode_ref_name_len(eb, iref);
5231			name_ptr = (unsigned long)(iref + 1);
5232			this_len = sizeof(*iref) + this_name_len;
5233		} else {
5234			struct btrfs_inode_extref *extref;
5235
5236			extref = (struct btrfs_inode_extref *)(ptr +
5237							       cur_offset);
5238			parent = btrfs_inode_extref_parent(eb, extref);
5239			this_name_len = btrfs_inode_extref_name_len(eb, extref);
5240			name_ptr = (unsigned long)&extref->name;
5241			this_len = sizeof(*extref) + this_name_len;
5242		}
5243
5244		if (this_name_len > name_len) {
5245			char *new_name;
5246
5247			new_name = krealloc(name, this_name_len, GFP_NOFS);
5248			if (!new_name) {
5249				ret = -ENOMEM;
5250				goto out;
5251			}
5252			name_len = this_name_len;
5253			name = new_name;
5254		}
5255
5256		read_extent_buffer(eb, name, name_ptr, this_name_len);
5257
5258		name_str.name = name;
5259		name_str.len = this_name_len;
5260		di = btrfs_lookup_dir_item(NULL, inode->root, search_path,
5261				parent, &name_str, 0);
5262		if (di && !IS_ERR(di)) {
5263			struct btrfs_key di_key;
5264
5265			btrfs_dir_item_key_to_cpu(search_path->nodes[0],
5266						  di, &di_key);
5267			if (di_key.type == BTRFS_INODE_ITEM_KEY) {
5268				if (di_key.objectid != key->objectid) {
5269					ret = 1;
5270					*other_ino = di_key.objectid;
5271					*other_parent = parent;
5272				} else {
5273					ret = 0;
5274				}
5275			} else {
5276				ret = -EAGAIN;
5277			}
5278			goto out;
5279		} else if (IS_ERR(di)) {
5280			ret = PTR_ERR(di);
5281			goto out;
5282		}
5283		btrfs_release_path(search_path);
5284
5285		cur_offset += this_len;
5286	}
5287	ret = 0;
5288out:
5289	btrfs_free_path(search_path);
5290	kfree(name);
5291	return ret;
5292}
5293
5294/*
5295 * Check if we need to log an inode. This is used in contexts where while
5296 * logging an inode we need to log another inode (either that it exists or in
5297 * full mode). This is used instead of btrfs_inode_in_log() because the latter
5298 * requires the inode to be in the log and have the log transaction committed,
5299 * while here we do not care if the log transaction was already committed - our
5300 * caller will commit the log later - and we want to avoid logging an inode
5301 * multiple times when multiple tasks have joined the same log transaction.
5302 */
5303static bool need_log_inode(const struct btrfs_trans_handle *trans,
5304			   struct btrfs_inode *inode)
5305{
5306	/*
5307	 * If a directory was not modified, no dentries added or removed, we can
5308	 * and should avoid logging it.
5309	 */
5310	if (S_ISDIR(inode->vfs_inode.i_mode) && inode->last_trans < trans->transid)
5311		return false;
5312
5313	/*
5314	 * If this inode does not have new/updated/deleted xattrs since the last
5315	 * time it was logged and is flagged as logged in the current transaction,
5316	 * we can skip logging it. As for new/deleted names, those are updated in
5317	 * the log by link/unlink/rename operations.
5318	 * In case the inode was logged and then evicted and reloaded, its
5319	 * logged_trans will be 0, in which case we have to fully log it since
5320	 * logged_trans is a transient field, not persisted.
5321	 */
5322	if (inode_logged(trans, inode, NULL) == 1 &&
5323	    !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags))
5324		return false;
5325
5326	return true;
5327}
5328
5329struct btrfs_dir_list {
5330	u64 ino;
5331	struct list_head list;
5332};
5333
5334/*
5335 * Log the inodes of the new dentries of a directory.
5336 * See process_dir_items_leaf() for details about why it is needed.
5337 * This is a recursive operation - if an existing dentry corresponds to a
5338 * directory, that directory's new entries are logged too (same behaviour as
5339 * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes
5340 * the dentries point to we do not acquire their VFS lock, otherwise lockdep
5341 * complains about the following circular lock dependency / possible deadlock:
5342 *
5343 *        CPU0                                        CPU1
5344 *        ----                                        ----
5345 * lock(&type->i_mutex_dir_key#3/2);
5346 *                                            lock(sb_internal#2);
5347 *                                            lock(&type->i_mutex_dir_key#3/2);
5348 * lock(&sb->s_type->i_mutex_key#14);
5349 *
5350 * Where sb_internal is the lock (a counter that works as a lock) acquired by
5351 * sb_start_intwrite() in btrfs_start_transaction().
5352 * Not acquiring the VFS lock of the inodes is still safe because:
5353 *
5354 * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible
5355 *    that while logging the inode new references (names) are added or removed
5356 *    from the inode, leaving the logged inode item with a link count that does
5357 *    not match the number of logged inode reference items. This is fine because
5358 *    at log replay time we compute the real number of links and correct the
5359 *    link count in the inode item (see replay_one_buffer() and
5360 *    link_to_fixup_dir());
5361 *
5362 * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that
5363 *    while logging the inode's items new index items (key type
5364 *    BTRFS_DIR_INDEX_KEY) are added to fs/subvol tree and the logged inode item
5365 *    has a size that doesn't match the sum of the lengths of all the logged
5366 *    names - this is ok, not a problem, because at log replay time we set the
5367 *    directory's i_size to the correct value (see replay_one_name() and
5368 *    overwrite_item()).
5369 */
5370static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
5371				struct btrfs_inode *start_inode,
5372				struct btrfs_log_ctx *ctx)
5373{
5374	struct btrfs_root *root = start_inode->root;
5375	struct btrfs_fs_info *fs_info = root->fs_info;
5376	struct btrfs_path *path;
5377	LIST_HEAD(dir_list);
5378	struct btrfs_dir_list *dir_elem;
5379	u64 ino = btrfs_ino(start_inode);
5380	struct btrfs_inode *curr_inode = start_inode;
5381	int ret = 0;
5382
5383	/*
5384	 * If we are logging a new name, as part of a link or rename operation,
5385	 * don't bother logging new dentries, as we just want to log the names
5386	 * of the inode and make sure any new parent directories exist in the log.
5387	 */
5388	if (ctx->logging_new_name)
5389		return 0;
5390
5391	path = btrfs_alloc_path();
5392	if (!path)
5393		return -ENOMEM;
5394
5395	/* Pairs with btrfs_add_delayed_iput below. */
5396	ihold(&curr_inode->vfs_inode);
5397
5398	while (true) {
5399		struct inode *vfs_inode;
5400		struct btrfs_key key;
5401		struct btrfs_key found_key;
5402		u64 next_index;
5403		bool continue_curr_inode = true;
5404		int iter_ret;
5405
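		/*
		 * Resume from the first dir index offset that was not yet
		 * logged for this directory.
		 */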
5406		key.objectid = ino;
5407		key.type = BTRFS_DIR_INDEX_KEY;
5408		key.offset = btrfs_get_first_dir_index_to_log(curr_inode);
5409		next_index = key.offset;
5410again:
5411		btrfs_for_each_slot(root->log_root, &key, &found_key, path, iter_ret) {
5412			struct extent_buffer *leaf = path->nodes[0];
5413			struct btrfs_dir_item *di;
5414			struct btrfs_key di_key;
5415			struct inode *di_inode;
5416			int log_mode = LOG_INODE_EXISTS;
5417			int type;
5418
5419			if (found_key.objectid != ino ||
5420			    found_key.type != BTRFS_DIR_INDEX_KEY) {
5421				continue_curr_inode = false;
5422				break;
5423			}
5424
5425			next_index = found_key.offset + 1;
5426
5427			di = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
5428			type = btrfs_dir_ftype(leaf, di);
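			/*
			 * Skip dentries created in past transactions - we only
			 * care about new ones.
			 */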
5429			if (btrfs_dir_transid(leaf, di) < trans->transid)
5430				continue;
5431			btrfs_dir_item_key_to_cpu(leaf, di, &di_key);
5432			if (di_key.type == BTRFS_ROOT_ITEM_KEY)
5433				continue;
5434
5435			btrfs_release_path(path);
5436			di_inode = btrfs_iget(fs_info->sb, di_key.objectid, root);
5437			if (IS_ERR(di_inode)) {
5438				ret = PTR_ERR(di_inode);
5439				goto out;
5440			}
5441
5442			if (!need_log_inode(trans, BTRFS_I(di_inode))) {
5443				btrfs_add_delayed_iput(BTRFS_I(di_inode));
5444				break;
5445			}
5446
5447			ctx->log_new_dentries = false;
5448			if (type == BTRFS_FT_DIR)
5449				log_mode = LOG_INODE_ALL;
5450			ret = btrfs_log_inode(trans, BTRFS_I(di_inode),
5451					      log_mode, ctx);
5452			btrfs_add_delayed_iput(BTRFS_I(di_inode));
5453			if (ret)
5454				goto out;
5455			if (ctx->log_new_dentries) {
5456				dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS);
5457				if (!dir_elem) {
5458					ret = -ENOMEM;
5459					goto out;
5460				}
5461				dir_elem->ino = di_key.objectid;
5462				list_add_tail(&dir_elem->list, &dir_list);
5463			}
5464			break;
5465		}
5466
5467		btrfs_release_path(path);
5468
5469		if (iter_ret < 0) {
5470			ret = iter_ret;
5471			goto out;
5472		} else if (iter_ret > 0) {
5473			continue_curr_inode = false;
5474		} else {
5475			key = found_key;
5476		}
5477
5478		if (continue_curr_inode && key.offset < (u64)-1) {
5479			key.offset++;
5480			goto again;
5481		}
5482
5483		btrfs_set_first_dir_index_to_log(curr_inode, next_index);
5484
5485		if (list_empty(&dir_list))
5486			break;
5487
5488		dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list, list);
5489		ino = dir_elem->ino;
5490		list_del(&dir_elem->list);
5491		kfree(dir_elem);
5492
5493		btrfs_add_delayed_iput(curr_inode);
5494		curr_inode = NULL;
5495
5496		vfs_inode = btrfs_iget(fs_info->sb, ino, root);
5497		if (IS_ERR(vfs_inode)) {
5498			ret = PTR_ERR(vfs_inode);
5499			break;
5500		}
5501		curr_inode = BTRFS_I(vfs_inode);
5502	}
5503out:
5504	btrfs_free_path(path);
5505	if (curr_inode)
5506		btrfs_add_delayed_iput(curr_inode);
5507
5508	if (ret) {
5509		struct btrfs_dir_list *next;
5510
5511		list_for_each_entry_safe(dir_elem, next, &dir_list, list)
5512			kfree(dir_elem);
5513	}
5514
5515	return ret;
5516}
5517
5518struct btrfs_ino_list {
5519	u64 ino;
5520	u64 parent;
5521	struct list_head list;
5522};
5523
5524static void free_conflicting_inodes(struct btrfs_log_ctx *ctx)
5525{
5526	struct btrfs_ino_list *curr;
5527	struct btrfs_ino_list *next;
5528
5529	list_for_each_entry_safe(curr, next, &ctx->conflict_inodes, list) {
5530		list_del(&curr->list);
5531		kfree(curr);
5532	}
5533}
5534
5535static int conflicting_inode_is_dir(struct btrfs_root *root, u64 ino,
5536				    struct btrfs_path *path)
5537{
5538	struct btrfs_key key;
5539	int ret;
5540
5541	key.objectid = ino;
5542	key.type = BTRFS_INODE_ITEM_KEY;
5543	key.offset = 0;
5544
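	/*
	 * The inode was deleted in the current transaction, so we can only
	 * find its inode item in the commit root. No locking is needed when
	 * searching the commit root.
	 */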
5545	path->search_commit_root = 1;
5546	path->skip_locking = 1;
5547
5548	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5549	if (WARN_ON_ONCE(ret > 0)) {
5550		/*
5551		 * We have previously found the inode through the commit root
5552		 * so this should not happen. If it does, just error out and
5553		 * fall back to a transaction commit.
5554		 */
5555		ret = -ENOENT;
5556	} else if (ret == 0) {
5557		struct btrfs_inode_item *item;
5558
5559		item = btrfs_item_ptr(path->nodes[0], path->slots[0],
5560				      struct btrfs_inode_item);
5561		if (S_ISDIR(btrfs_inode_mode(path->nodes[0], item)))
5562			ret = 1;
5563	}
5564
5565	btrfs_release_path(path);
5566	path->search_commit_root = 0;
5567	path->skip_locking = 0;
5568
5569	return ret;
5570}
5571
5572static int add_conflicting_inode(struct btrfs_trans_handle *trans,
5573				 struct btrfs_root *root,
5574				 struct btrfs_path *path,
5575				 u64 ino, u64 parent,
5576				 struct btrfs_log_ctx *ctx)
5577{
5578	struct btrfs_ino_list *ino_elem;
5579	struct inode *inode;
5580
5581	/*
5582	 * It's rare to have a lot of conflicting inodes; in practice it is not
5583	 * common to have more than 1 or 2. We don't want to collect too many,
5584	 * as we could end up logging too many inodes (even if only in
5585	 * LOG_INODE_EXISTS mode) and slow down other fsyncs or transaction
5586	 * commits.
5587	 */
5588	if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES)
5589		return BTRFS_LOG_FORCE_COMMIT;
5590
5591	inode = btrfs_iget(root->fs_info->sb, ino, root);
5592	/*
5593	 * If the other inode that had a conflicting dir entry was deleted in
5594	 * the current transaction then we either:
5595	 *
5596	 * 1) Log the parent directory (later after adding it to the list) if
5597	 *    the inode is a directory. This is because it may be a deleted
5598	 *    subvolume/snapshot or it may be a regular directory that had
5599	 *    deleted subvolumes/snapshots (or subdirectories that had them),
5600	 *    and at the moment we can't deal with dropping subvolumes/snapshots
5601	 *    during log replay. So we just log the parent, which will result in
5602	 *    a fallback to a transaction commit if we are dealing with those
5603	 *    cases (last_unlink_trans will match the current transaction);
5604	 *
5605	 * 2) Do nothing if it's not a directory. During log replay we simply
5606	 *    unlink the conflicting dentry from the parent directory and then
5607	 *    add the dentry for our inode. This way we can avoid logging the
5608	 *    parent directory (and maybe falling back to a transaction commit in
5609	 *    case it has a last_unlink_trans == trans->transid, due to moving
5610	 *    some inode from it to some other directory).
5611	 */
5612	if (IS_ERR(inode)) {
5613		int ret = PTR_ERR(inode);
5614
5615		if (ret != -ENOENT)
5616			return ret;
5617
5618		ret = conflicting_inode_is_dir(root, ino, path);
5619		/* Not a directory or we got an error. */
5620		if (ret <= 0)
5621			return ret;
5622
5623		/* Conflicting inode is a directory, so we'll log its parent. */
5624		ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
5625		if (!ino_elem)
5626			return -ENOMEM;
5627		ino_elem->ino = ino;
5628		ino_elem->parent = parent;
5629		list_add_tail(&ino_elem->list, &ctx->conflict_inodes);
5630		ctx->num_conflict_inodes++;
5631
5632		return 0;
5633	}
5634
5635	/*
5636	 * If the inode was already logged skip it - otherwise we can hit an
5637	 * infinite loop. Example:
5638	 *
5639	 * From the commit root (previous transaction) we have the following
5640	 * inodes:
5641	 *
5642	 * inode 257 a directory
5643	 * inode 258 with references "zz" and "zz_link" on inode 257
5644	 * inode 259 with reference "a" on inode 257
5645	 *
5646	 * And in the current (uncommitted) transaction we have:
5647	 *
5648	 * inode 257 a directory, unchanged
5649	 * inode 258 with references "a" and "a2" on inode 257
5650	 * inode 259 with reference "zz_link" on inode 257
5651	 * inode 261 with reference "zz" on inode 257
5652	 *
5653	 * When logging inode 261 the following infinite loop could
5654	 * happen if we don't skip already logged inodes:
5655	 *
5656	 * - we detect inode 258 as a conflicting inode, with inode 261
5657	 *   on reference "zz", and log it;
5658	 *
5659	 * - we detect inode 259 as a conflicting inode, with inode 258
5660	 *   on reference "a", and log it;
5661	 *
5662	 * - we detect inode 258 as a conflicting inode, with inode 259
5663	 *   on reference "zz_link", and log it - again! After this we
5664	 *   repeat the above steps forever.
5665	 *
5666	 * Here we can use need_log_inode() because we only need to log the
5667	 * inode in LOG_INODE_EXISTS mode and rename operations update the log,
5668	 * so that the log ends up with the new name and without the old name.
5669	 */
5670	if (!need_log_inode(trans, BTRFS_I(inode))) {
5671		btrfs_add_delayed_iput(BTRFS_I(inode));
5672		return 0;
5673	}
5674
5675	btrfs_add_delayed_iput(BTRFS_I(inode));
5676
5677	ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
5678	if (!ino_elem)
5679		return -ENOMEM;
5680	ino_elem->ino = ino;
5681	ino_elem->parent = parent;
5682	list_add_tail(&ino_elem->list, &ctx->conflict_inodes);
5683	ctx->num_conflict_inodes++;
5684
5685	return 0;
5686}
5687
5688static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
5689				  struct btrfs_root *root,
5690				  struct btrfs_log_ctx *ctx)
5691{
5692	struct btrfs_fs_info *fs_info = root->fs_info;
5693	int ret = 0;
5694
5695	/*
5696	 * Conflicting inodes are logged by the first call to btrfs_log_inode(),
5697	 * otherwise we could have unbounded recursion of btrfs_log_inode()
5698	 * calls. This check guarantees we can have only 1 level of recursion.
5699	 */
5700	if (ctx->logging_conflict_inodes)
5701		return 0;
5702
5703	ctx->logging_conflict_inodes = true;
5704
5705	/*
5706	 * New conflicting inodes may be found and added to the list while we
5707	 * are logging a conflicting inode, so keep iterating while the list is
5708	 * not empty.
5709	 */
5710	while (!list_empty(&ctx->conflict_inodes)) {
5711		struct btrfs_ino_list *curr;
5712		struct inode *inode;
5713		u64 ino;
5714		u64 parent;
5715
5716		curr = list_first_entry(&ctx->conflict_inodes,
5717					struct btrfs_ino_list, list);
5718		ino = curr->ino;
5719		parent = curr->parent;
5720		list_del(&curr->list);
5721		kfree(curr);
5722
5723		inode = btrfs_iget(fs_info->sb, ino, root);
5724		/*
5725		 * If the other inode that had a conflicting dir entry was
5726		 * deleted in the current transaction, we need to log its parent
5727		 * directory. See the comment at add_conflicting_inode().
5728		 */
5729		if (IS_ERR(inode)) {
5730			ret = PTR_ERR(inode);
5731			if (ret != -ENOENT)
5732				break;
5733
5734			inode = btrfs_iget(fs_info->sb, parent, root);
5735			if (IS_ERR(inode)) {
5736				ret = PTR_ERR(inode);
5737				break;
5738			}
5739
5740			/*
5741			 * Always log the directory: we cannot make this
5742			 * conditional on need_log_inode() because the directory
5743			 * might have been logged in LOG_INODE_EXISTS mode or
5744			 * the dir index of the conflicting inode is not in a
5745			 * dir index key range logged for the directory. So we
5746			 * must make sure the deletion is recorded.
5747			 */
5748			ret = btrfs_log_inode(trans, BTRFS_I(inode),
5749					      LOG_INODE_ALL, ctx);
5750			btrfs_add_delayed_iput(BTRFS_I(inode));
5751			if (ret)
5752				break;
5753			continue;
5754		}
5755
5756		/*
5757		 * Here we can use need_log_inode() because we only need to log
5758		 * the inode in LOG_INODE_EXISTS mode and rename operations
5759		 * update the log, so that the log ends up with the new name and
5760		 * without the old name.
5761		 *
5762		 * We did this check at add_conflicting_inode(), but here we do
5763		 * it again because if some other task logged the inode after
5764		 * that, we can avoid doing it again.
5765		 */
5766		if (!need_log_inode(trans, BTRFS_I(inode))) {
5767			btrfs_add_delayed_iput(BTRFS_I(inode));
5768			continue;
5769		}
5770
5771		/*
5772		 * We are safe logging the other inode without acquiring its
5773		 * lock as long as we log with the LOG_INODE_EXISTS mode. We
5774		 * are safe against concurrent renames of the other inode as
5775		 * well because during a rename we pin the log and update the
5776		 * log with the new name before we unpin it.
5777		 */
5778		ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_INODE_EXISTS, ctx);
5779		btrfs_add_delayed_iput(BTRFS_I(inode));
5780		if (ret)
5781			break;
5782	}
5783
5784	ctx->logging_conflict_inodes = false;
5785	if (ret)
5786		free_conflicting_inodes(ctx);
5787
5788	return ret;
5789}
5790
5791static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
5792				   struct btrfs_inode *inode,
5793				   struct btrfs_key *min_key,
5794				   const struct btrfs_key *max_key,
5795				   struct btrfs_path *path,
5796				   struct btrfs_path *dst_path,
5797				   const u64 logged_isize,
5798				   const int inode_only,
5799				   struct btrfs_log_ctx *ctx,
5800				   bool *need_log_inode_item)
5801{
5802	const u64 i_size = i_size_read(&inode->vfs_inode);
5803	struct btrfs_root *root = inode->root;
5804	int ins_start_slot = 0;
5805	int ins_nr = 0;
5806	int ret;
5807
5808	while (1) {
5809		ret = btrfs_search_forward(root, min_key, path, trans->transid);
5810		if (ret < 0)
5811			return ret;
5812		if (ret > 0) {
5813			ret = 0;
5814			break;
5815		}
5816again:
5817		/* Note, ins_nr might be > 0 here, cleanup outside the loop */
5818		if (min_key->objectid != max_key->objectid)
5819			break;
5820		if (min_key->type > max_key->type)
5821			break;
5822
5823		if (min_key->type == BTRFS_INODE_ITEM_KEY) {
5824			*need_log_inode_item = false;
5825		} else if (min_key->type == BTRFS_EXTENT_DATA_KEY &&
5826			   min_key->offset >= i_size) {
5827			/*
5828			 * Extents at and beyond eof are logged with
5829			 * btrfs_log_prealloc_extents().
5830			 * Only regular files have BTRFS_EXTENT_DATA_KEY keys,
5831			 * and no keys greater than that, so bail out.
5832			 */
5833			break;
5834		} else if ((min_key->type == BTRFS_INODE_REF_KEY ||
5835			    min_key->type == BTRFS_INODE_EXTREF_KEY) &&
5836			   (inode->generation == trans->transid ||
5837			    ctx->logging_conflict_inodes)) {
5838			u64 other_ino = 0;
5839			u64 other_parent = 0;
5840
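			/*
			 * Check if any name in this ref/extref item is already
			 * used by a different inode in the same parent
			 * directory. If so, track that conflicting inode so it
			 * gets logged too (see add_conflicting_inode()).
			 */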
5841			ret = btrfs_check_ref_name_override(path->nodes[0],
5842					path->slots[0], min_key, inode,
5843					&other_ino, &other_parent);
5844			if (ret < 0) {
5845				return ret;
5846			} else if (ret > 0 &&
5847				   other_ino != btrfs_ino(BTRFS_I(ctx->inode))) {
5848				if (ins_nr > 0) {
5849					ins_nr++;
5850				} else {
5851					ins_nr = 1;
5852					ins_start_slot = path->slots[0];
5853				}
5854				ret = copy_items(trans, inode, dst_path, path,
5855						 ins_start_slot, ins_nr,
5856						 inode_only, logged_isize);
5857				if (ret < 0)
5858					return ret;
5859				ins_nr = 0;
5860
5861				btrfs_release_path(path);
5862				ret = add_conflicting_inode(trans, root, path,
5863							    other_ino,
5864							    other_parent, ctx);
5865				if (ret)
5866					return ret;
5867				goto next_key;
5868			}
5869		} else if (min_key->type == BTRFS_XATTR_ITEM_KEY) {
5870			/* Skip xattrs, logged later with btrfs_log_all_xattrs() */
5871			if (ins_nr == 0)
5872				goto next_slot;
5873			ret = copy_items(trans, inode, dst_path, path,
5874					 ins_start_slot,
5875					 ins_nr, inode_only, logged_isize);
5876			if (ret < 0)
5877				return ret;
5878			ins_nr = 0;
5879			goto next_slot;
5880		}
5881
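		/*
		 * Batch contiguous leaf slots: extend the current batch if this
		 * slot immediately follows it, otherwise copy the current batch
		 * (if any) to the log and start a new one at this slot.
		 */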
5882		if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
5883			ins_nr++;
5884			goto next_slot;
5885		} else if (!ins_nr) {
5886			ins_start_slot = path->slots[0];
5887			ins_nr = 1;
5888			goto next_slot;
5889		}
5890
5891		ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
5892				 ins_nr, inode_only, logged_isize);
5893		if (ret < 0)
5894			return ret;
5895		ins_nr = 1;
5896		ins_start_slot = path->slots[0];
5897next_slot:
5898		path->slots[0]++;
5899		if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
5900			btrfs_item_key_to_cpu(path->nodes[0], min_key,
5901					      path->slots[0]);
5902			goto again;
5903		}
5904		if (ins_nr) {
5905			ret = copy_items(trans, inode, dst_path, path,
5906					 ins_start_slot, ins_nr, inode_only,
5907					 logged_isize);
5908			if (ret < 0)
5909				return ret;
5910			ins_nr = 0;
5911		}
5912		btrfs_release_path(path);
5913next_key:
5914		if (min_key->offset < (u64)-1) {
5915			min_key->offset++;
5916		} else if (min_key->type < max_key->type) {
5917			min_key->type++;
5918			min_key->offset = 0;
5919		} else {
5920			break;
5921		}
5922
5923		/*
5924		 * We may process many leaves full of items for our inode, so
5925		 * avoid monopolizing a cpu for too long by rescheduling while
5926		 * not holding locks on any tree.
5927		 */
5928		cond_resched();
5929	}
5930	if (ins_nr) {
5931		ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
5932				 ins_nr, inode_only, logged_isize);
5933		if (ret)
5934			return ret;
5935	}
5936
5937	if (inode_only == LOG_INODE_ALL && S_ISREG(inode->vfs_inode.i_mode)) {
5938		/*
5939		 * Release the path because otherwise we might attempt to double
5940		 * lock the same leaf with btrfs_log_prealloc_extents() below.
5941		 */
5942		btrfs_release_path(path);
5943		ret = btrfs_log_prealloc_extents(trans, inode, dst_path);
5944	}
5945
5946	return ret;
5947}
5948
5949static int insert_delayed_items_batch(struct btrfs_trans_handle *trans,
5950				      struct btrfs_root *log,
5951				      struct btrfs_path *path,
5952				      const struct btrfs_item_batch *batch,
5953				      const struct btrfs_delayed_item *first_item)
5954{
5955	const struct btrfs_delayed_item *curr = first_item;
5956	int ret;
5957
5958	ret = btrfs_insert_empty_items(trans, log, path, batch);
5959	if (ret)
5960		return ret;
5961
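	/* Copy the data of each delayed item into the slots we just reserved. */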
5962	for (int i = 0; i < batch->nr; i++) {
5963		char *data_ptr;
5964
5965		data_ptr = btrfs_item_ptr(path->nodes[0], path->slots[0], char);
5966		write_extent_buffer(path->nodes[0], &curr->data,
5967				    (unsigned long)data_ptr, curr->data_len);
5968		curr = list_next_entry(curr, log_list);
5969		path->slots[0]++;
5970	}
5971
5972	btrfs_release_path(path);
5973
5974	return 0;
5975}
5976
5977static int log_delayed_insertion_items(struct btrfs_trans_handle *trans,
5978				       struct btrfs_inode *inode,
5979				       struct btrfs_path *path,
5980				       const struct list_head *delayed_ins_list,
5981				       struct btrfs_log_ctx *ctx)
5982{
5983	/* 195 items (4095 bytes worth of keys and sizes) fit in a single 4K page. */
5984	const int max_batch_size = 195;
5985	const int leaf_data_size = BTRFS_LEAF_DATA_SIZE(trans->fs_info);
5986	const u64 ino = btrfs_ino(inode);
5987	struct btrfs_root *log = inode->root->log_root;
5988	struct btrfs_item_batch batch = {
5989		.nr = 0,
5990		.total_data_size = 0,
5991	};
5992	const struct btrfs_delayed_item *first = NULL;
5993	const struct btrfs_delayed_item *curr;
5994	char *ins_data;
5995	struct btrfs_key *ins_keys;
5996	u32 *ins_sizes;
5997	u64 curr_batch_size = 0;
5998	int batch_idx = 0;
5999	int ret;
6000
6001	/* We are adding dir index items to the log tree. */
6002	lockdep_assert_held(&inode->log_mutex);
6003
6004	/*
6005	 * We collect delayed items before copying index keys from the subvolume
6006	 * to the log tree. However just after we collected them, they may have
6007	 * been flushed (all of them or just some of them), and therefore we
6008	 * could have copied them from the subvolume tree to the log tree.
6009	 * So find the first delayed item that was not yet logged (they are
6010	 * sorted by index number).
6011	 */
6012	list_for_each_entry(curr, delayed_ins_list, log_list) {
6013		if (curr->index > inode->last_dir_index_offset) {
6014			first = curr;
6015			break;
6016		}
6017	}
6018
6019	/* Empty list or all delayed items were already logged. */
6020	if (!first)
6021		return 0;
6022
6023	ins_data = kmalloc(max_batch_size * sizeof(u32) +
6024			   max_batch_size * sizeof(struct btrfs_key), GFP_NOFS);
6025	if (!ins_data)
6026		return -ENOMEM;
6027	ins_sizes = (u32 *)ins_data;
6028	batch.data_sizes = ins_sizes;
6029	ins_keys = (struct btrfs_key *)(ins_data + max_batch_size * sizeof(u32));
6030	batch.keys = ins_keys;
6031
6032	curr = first;
6033	while (!list_entry_is_head(curr, delayed_ins_list, log_list)) {
6034		const u32 curr_size = curr->data_len + sizeof(struct btrfs_item);
6035
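		/*
		 * If adding this item would overflow a leaf or exceed the
		 * maximum batch size, insert the current batch before starting
		 * a new one.
		 */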
6036		if (curr_batch_size + curr_size > leaf_data_size ||
6037		    batch.nr == max_batch_size) {
6038			ret = insert_delayed_items_batch(trans, log, path,
6039							 &batch, first);
6040			if (ret)
6041				goto out;
6042			batch_idx = 0;
6043			batch.nr = 0;
6044			batch.total_data_size = 0;
6045			curr_batch_size = 0;
6046			first = curr;
6047		}
6048
6049		ins_sizes[batch_idx] = curr->data_len;
6050		ins_keys[batch_idx].objectid = ino;
6051		ins_keys[batch_idx].type = BTRFS_DIR_INDEX_KEY;
6052		ins_keys[batch_idx].offset = curr->index;
6053		curr_batch_size += curr_size;
6054		batch.total_data_size += curr->data_len;
6055		batch.nr++;
6056		batch_idx++;
6057		curr = list_next_entry(curr, log_list);
6058	}
6059
6060	ASSERT(batch.nr >= 1);
6061	ret = insert_delayed_items_batch(trans, log, path, &batch, first);
6062
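	/*
	 * Record the index of the last delayed item we logged, so that the
	 * check at the top of this function skips already logged items if we
	 * are called again in this transaction.
	 */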
6063	curr = list_last_entry(delayed_ins_list, struct btrfs_delayed_item,
6064			       log_list);
6065	inode->last_dir_index_offset = curr->index;
6066out:
6067	kfree(ins_data);
6068
6069	return ret;
6070}
6071
6072static int log_delayed_deletions_full(struct btrfs_trans_handle *trans,
6073				      struct btrfs_inode *inode,
6074				      struct btrfs_path *path,
6075				      const struct list_head *delayed_del_list,
6076				      struct btrfs_log_ctx *ctx)
6077{
6078	const u64 ino = btrfs_ino(inode);
6079	const struct btrfs_delayed_item *curr;
6080
6081	curr = list_first_entry(delayed_del_list, struct btrfs_delayed_item,
6082				log_list);
6083
6084	while (!list_entry_is_head(curr, delayed_del_list, log_list)) {
6085		u64 first_dir_index = curr->index;
6086		u64 last_dir_index;
6087		const struct btrfs_delayed_item *next;
6088		int ret;
6089
6090		/*
6091		 * Find a range of consecutive dir index items to delete. This way
6092		 * we log a single dir range item spanning several contiguous dir
6093		 * index items instead of logging one range item per dir index item.
6094		 */
6095		next = list_next_entry(curr, log_list);
6096		while (!list_entry_is_head(next, delayed_del_list, log_list)) {
6097			if (next->index != curr->index + 1)
6098				break;
6099			curr = next;
6100			next = list_next_entry(next, log_list);
6101		}
6102
6103		last_dir_index = curr->index;
6104		ASSERT(last_dir_index >= first_dir_index);
6105
6106		ret = insert_dir_log_key(trans, inode->root->log_root, path,
6107					 ino, first_dir_index, last_dir_index);
6108		if (ret)
6109			return ret;
6110		curr = list_next_entry(curr, log_list);
6111	}
6112
6113	return 0;
6114}
6115
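/*
 * Starting from the slot the caller positioned the path at, delete from the
 * log tree as many contiguous dir index items as match the following delayed
 * deletion items, and update *last_ret to the last delayed item covered.
 */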
6116static int batch_delete_dir_index_items(struct btrfs_trans_handle *trans,
6117					struct btrfs_inode *inode,
6118					struct btrfs_path *path,
6119					struct btrfs_log_ctx *ctx,
6120					const struct list_head *delayed_del_list,
6121					const struct btrfs_delayed_item *first,
6122					const struct btrfs_delayed_item **last_ret)
6123{
6124	const struct btrfs_delayed_item *next;
6125	struct extent_buffer *leaf = path->nodes[0];
6126	const int last_slot = btrfs_header_nritems(leaf) - 1;
6127	int slot = path->slots[0] + 1;
6128	const u64 ino = btrfs_ino(inode);
6129
6130	next = list_next_entry(first, log_list);
6131
6132	while (slot < last_slot &&
6133	       !list_entry_is_head(next, delayed_del_list, log_list)) {
6134		struct btrfs_key key;
6135
6136		btrfs_item_key_to_cpu(leaf, &key, slot);
6137		if (key.objectid != ino ||
6138		    key.type != BTRFS_DIR_INDEX_KEY ||
6139		    key.offset != next->index)
6140			break;
6141
6142		slot++;
6143		*last_ret = next;
6144		next = list_next_entry(next, log_list);
6145	}
6146
6147	return btrfs_del_items(trans, inode->root->log_root, path,
6148			       path->slots[0], slot - path->slots[0]);
6149}
6150
6151static int log_delayed_deletions_incremental(struct btrfs_trans_handle *trans,
6152					     struct btrfs_inode *inode,
6153					     struct btrfs_path *path,
6154					     const struct list_head *delayed_del_list,
6155					     struct btrfs_log_ctx *ctx)
6156{
6157	struct btrfs_root *log = inode->root->log_root;
6158	const struct btrfs_delayed_item *curr;
6159	u64 last_range_start = 0;
6160	u64 last_range_end = 0;
6161	struct btrfs_key key;
6162
6163	key.objectid = btrfs_ino(inode);
6164	key.type = BTRFS_DIR_INDEX_KEY;
6165	curr = list_first_entry(delayed_del_list, struct btrfs_delayed_item,
6166				log_list);
6167
6168	while (!list_entry_is_head(curr, delayed_del_list, log_list)) {
6169		const struct btrfs_delayed_item *last = curr;
6170		u64 first_dir_index = curr->index;
6171		u64 last_dir_index;
6172		bool deleted_items = false;
6173		int ret;
6174
6175		key.offset = curr->index;
6176		ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
6177		if (ret < 0) {
6178			return ret;
6179		} else if (ret == 0) {
6180			ret = batch_delete_dir_index_items(trans, inode, path, ctx,
6181							   delayed_del_list, curr,
6182							   &last);
6183			if (ret)
6184				return ret;
6185			deleted_items = true;
6186		}
6187
6188		btrfs_release_path(path);
6189
6190		/*
6191		 * If we deleted items from the leaf, it means we have a range
6192		 * item logging their range, so no need to add one or update an
6193		 * existing one. Otherwise we have to log a dir range item.
6194		 */
6195		if (deleted_items)
6196			goto next_batch;
6197
6198		last_dir_index = last->index;
6199		ASSERT(last_dir_index >= first_dir_index);
6200		/*
6201		 * If this range starts right after where the previous one ends,
6202		 * then we want to reuse the previous range item and change its
6203		 * end offset to the end of this range. This is just to minimize
6204		 * leaf space usage, by avoiding adding a new range item.
6205		 */
6206		if (last_range_end != 0 && first_dir_index == last_range_end + 1)
6207			first_dir_index = last_range_start;
6208
6209		ret = insert_dir_log_key(trans, log, path, key.objectid,
6210					 first_dir_index, last_dir_index);
6211		if (ret)
6212			return ret;
6213
6214		last_range_start = first_dir_index;
6215		last_range_end = last_dir_index;
6216next_batch:
6217		curr = list_next_entry(last, log_list);
6218	}
6219
6220	return 0;
6221}
6222
6223static int log_delayed_deletion_items(struct btrfs_trans_handle *trans,
6224				      struct btrfs_inode *inode,
6225				      struct btrfs_path *path,
6226				      const struct list_head *delayed_del_list,
6227				      struct btrfs_log_ctx *ctx)
6228{
6229	/*
6230	 * We are deleting dir index items from the log tree or adding range
6231	 * items to it.
6232	 */
6233	lockdep_assert_held(&inode->log_mutex);
6234
6235	if (list_empty(delayed_del_list))
6236		return 0;
6237
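	/*
	 * If the directory was logged before in the current transaction, the
	 * log tree may already contain dir index items for the indexes being
	 * deleted, so search for them and delete them (incremental mode).
	 * Otherwise it is enough to log range items covering the deleted
	 * indexes.
	 */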
6238	if (ctx->logged_before)
6239		return log_delayed_deletions_incremental(trans, inode, path,
6240							 delayed_del_list, ctx);
6241
6242	return log_delayed_deletions_full(trans, inode, path, delayed_del_list,
6243					  ctx);
6244}
6245
6246/*
6247 * Similar logic to log_new_dir_dentries(), but it iterates over the delayed
6248 * items instead of the subvolume tree.
6249 */
6250static int log_new_delayed_dentries(struct btrfs_trans_handle *trans,
6251				    struct btrfs_inode *inode,
6252				    const struct list_head *delayed_ins_list,
6253				    struct btrfs_log_ctx *ctx)
6254{
6255	const bool orig_log_new_dentries = ctx->log_new_dentries;
6256	struct btrfs_fs_info *fs_info = trans->fs_info;
6257	struct btrfs_delayed_item *item;
6258	int ret = 0;
6259
6260	/*
6261	 * No need for the log mutex, plus to avoid potential deadlocks or
6262	 * lockdep annotations due to nesting of delayed inode mutexes and log
6263	 * mutexes.
6264	 */
6265	lockdep_assert_not_held(&inode->log_mutex);
6266
6267	ASSERT(!ctx->logging_new_delayed_dentries);
6268	ctx->logging_new_delayed_dentries = true;
6269
6270	list_for_each_entry(item, delayed_ins_list, log_list) {
6271		struct btrfs_dir_item *dir_item;
6272		struct inode *di_inode;
6273		struct btrfs_key key;
6274		int log_mode = LOG_INODE_EXISTS;
6275
6276		dir_item = (struct btrfs_dir_item *)item->data;
6277		btrfs_disk_key_to_cpu(&key, &dir_item->location);
6278
6279		if (key.type == BTRFS_ROOT_ITEM_KEY)
6280			continue;
6281
6282		di_inode = btrfs_iget(fs_info->sb, key.objectid, inode->root);
6283		if (IS_ERR(di_inode)) {
6284			ret = PTR_ERR(di_inode);
6285			break;
6286		}
6287
6288		if (!need_log_inode(trans, BTRFS_I(di_inode))) {
6289			btrfs_add_delayed_iput(BTRFS_I(di_inode));
6290			continue;
6291		}
6292
6293		if (btrfs_stack_dir_ftype(dir_item) == BTRFS_FT_DIR)
6294			log_mode = LOG_INODE_ALL;
6295
6296		ctx->log_new_dentries = false;
6297		ret = btrfs_log_inode(trans, BTRFS_I(di_inode), log_mode, ctx);
6298
6299		if (!ret && ctx->log_new_dentries)
6300			ret = log_new_dir_dentries(trans, BTRFS_I(di_inode), ctx);
6301
6302		btrfs_add_delayed_iput(BTRFS_I(di_inode));
6303
6304		if (ret)
6305			break;
6306	}
6307
6308	ctx->log_new_dentries = orig_log_new_dentries;
6309	ctx->logging_new_delayed_dentries = false;
6310
6311	return ret;
6312}
6313
6314/* log a single inode in the tree log.
6315 * At least one parent directory for this inode must exist in the tree
6316 * or be logged already.
6317 *
6318 * Any items from this inode changed by the current transaction are copied
6319 * to the log tree.  An extra reference is taken on any extents in this
6320 * file, allowing us to avoid a whole pile of corner cases around logging
6321 * blocks that have been removed from the tree.
6322 *
6323 * See LOG_INODE_ALL and related defines for a description of what inode_only
6324 * does.
6325 *
6326 * This handles both files and directories.
6327 */
6328static int btrfs_log_inode(struct btrfs_trans_handle *trans,
6329			   struct btrfs_inode *inode,
6330			   int inode_only,
6331			   struct btrfs_log_ctx *ctx)
6332{
6333	struct btrfs_path *path;
6334	struct btrfs_path *dst_path;
6335	struct btrfs_key min_key;
6336	struct btrfs_key max_key;
6337	struct btrfs_root *log = inode->root->log_root;
6338	int ret;
6339	bool fast_search = false;
6340	u64 ino = btrfs_ino(inode);
6341	struct extent_map_tree *em_tree = &inode->extent_tree;
6342	u64 logged_isize = 0;
6343	bool need_log_inode_item = true;
6344	bool xattrs_logged = false;
6345	bool inode_item_dropped = true;
6346	bool full_dir_logging = false;
6347	LIST_HEAD(delayed_ins_list);
6348	LIST_HEAD(delayed_del_list);
6349
6350	path = btrfs_alloc_path();
6351	if (!path)
6352		return -ENOMEM;
6353	dst_path = btrfs_alloc_path();
6354	if (!dst_path) {
6355		btrfs_free_path(path);
6356		return -ENOMEM;
6357	}
6358
6359	min_key.objectid = ino;
6360	min_key.type = BTRFS_INODE_ITEM_KEY;
6361	min_key.offset = 0;
6362
6363	max_key.objectid = ino;
6364
6365
6366	/* today the code can only do partial logging of directories */
6367	if (S_ISDIR(inode->vfs_inode.i_mode) ||
6368	    (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
6369		       &inode->runtime_flags) &&
6370	     inode_only >= LOG_INODE_EXISTS))
6371		max_key.type = BTRFS_XATTR_ITEM_KEY;
6372	else
6373		max_key.type = (u8)-1;
6374	max_key.offset = (u64)-1;
6375
6376	if (S_ISDIR(inode->vfs_inode.i_mode) && inode_only == LOG_INODE_ALL)
6377		full_dir_logging = true;
6378
6379	/*
6380	 * If we are logging a directory while we are logging dentries of the
6381	 * delayed items of some other inode, then we need to flush the delayed
6382	 * items of this directory and not log the delayed items directly. This
6383	 * is to prevent more than one level of recursion into btrfs_log_inode()
6384	 * by having something like this:
6385	 *
6386	 *     $ mkdir -p a/b/c/d/e/f/g/h/...
6387	 *     $ xfs_io -c "fsync" a
6388	 *
6389	 * Where all directories in the path did not exist before and are
6390	 * created in the current transaction.
6391	 * So in such a case we directly log the delayed items of the main
6392	 * directory ("a") without flushing them first, while for each of its
6393	 * subdirectories we flush their delayed items before logging them.
6394	 * This prevents a potential unbounded recursion like this:
6395	 *
6396	 * btrfs_log_inode()
6397	 *   log_new_delayed_dentries()
6398	 *      btrfs_log_inode()
6399	 *        log_new_delayed_dentries()
6400	 *          btrfs_log_inode()
6401	 *            log_new_delayed_dentries()
6402	 *              (...)
6403	 *
6404	 * We have thresholds for the maximum number of delayed items to have in
6405	 * memory, and once they are hit, the items are flushed asynchronously.
6406	 * However the limit is quite high, so let's prevent deep levels of
6407	 * recursion from happening by limiting the maximum depth to 1.
6408	 */
6409	if (full_dir_logging && ctx->logging_new_delayed_dentries) {
6410		ret = btrfs_commit_inode_delayed_items(trans, inode);
6411		if (ret)
6412			goto out;
6413	}
6414
6415	mutex_lock(&inode->log_mutex);
6416
6417	/*
6418	 * For symlinks, we must always log their content, which is stored in an
6419	 * inline extent, otherwise we could end up with an empty symlink after
6420	 * log replay, which is invalid on linux (symlink(2) returns -ENOENT if
6421	 * one attempts to create an empty symlink).
6422	 * We don't need to worry about flushing delalloc, because we create the
6423	 * inline extent when the symlink is created (we never have delalloc for
6424	 * symlinks).
6425	 */
6426	if (S_ISLNK(inode->vfs_inode.i_mode))
6427		inode_only = LOG_INODE_ALL;
6428
6429	/*
6430	 * Before logging the inode item, cache the value returned by
6431	 * inode_logged(), because later on we need to know if the inode was
6432	 * previously logged in this transaction.
6433	 */
6434	ret = inode_logged(trans, inode, path);
6435	if (ret < 0)
6436		goto out_unlock;
6437	ctx->logged_before = (ret == 1);
6438	ret = 0;
6439
6440	/*
6441	 * This is for cases where logging a directory could result in losing a
6442	 * file after replaying the log. For example, if we move a file from a
6443	 * directory A to a directory B, then fsync directory A, we have no way
6444	 * to know the file was moved from A to B, so logging just A would
6445	 * result in losing the file after a log replay.
6446	 */
6447	if (full_dir_logging && inode->last_unlink_trans >= trans->transid) {
6448		ret = BTRFS_LOG_FORCE_COMMIT;
6449		goto out_unlock;
6450	}
6451
6452	/*
6453	 * A brute force approach to making sure we get the most up to date
6454	 * copies of everything.
6455	 */
6456	if (S_ISDIR(inode->vfs_inode.i_mode)) {
6457		clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags);
6458		if (ctx->logged_before)
6459			ret = drop_inode_items(trans, log, path, inode,
6460					       BTRFS_XATTR_ITEM_KEY);
6461	} else {
6462		if (inode_only == LOG_INODE_EXISTS && ctx->logged_before) {
6463			/*
6464			 * Make sure the new inode item we write to the log has
6465			 * the same isize as the current one (if it exists).
6466			 * This is necessary to prevent data loss after log
6467			 * replay, and also to prevent doing a wrong expanding
6468			 * truncate - e.g. create a file, write 4K into offset
6469			 * 0, fsync, write 4K into offset 4096, add hard link,
6470			 * fsync some other file (to sync log), power fail - if
6471			 * we use the inode's current i_size, after log replay
6472			 * we get an 8KiB file, with the last 4KiB extent as a hole
6473			 * (zeroes), as if an expanding truncate happened,
6474			 * instead of getting a file of 4KiB only.
6475			 */
6476			ret = logged_inode_size(log, inode, path, &logged_isize);
6477			if (ret)
6478				goto out_unlock;
6479		}
6480		if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
6481			     &inode->runtime_flags)) {
6482			if (inode_only == LOG_INODE_EXISTS) {
6483				max_key.type = BTRFS_XATTR_ITEM_KEY;
6484				if (ctx->logged_before)
6485					ret = drop_inode_items(trans, log, path,
6486							       inode, max_key.type);
6487			} else {
6488				clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
6489					  &inode->runtime_flags);
6490				clear_bit(BTRFS_INODE_COPY_EVERYTHING,
6491					  &inode->runtime_flags);
6492				if (ctx->logged_before)
6493					ret = truncate_inode_items(trans, log,
6494								   inode, 0, 0);
6495			}
6496		} else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
6497					      &inode->runtime_flags) ||
6498			   inode_only == LOG_INODE_EXISTS) {
6499			if (inode_only == LOG_INODE_ALL)
6500				fast_search = true;
6501			max_key.type = BTRFS_XATTR_ITEM_KEY;
6502			if (ctx->logged_before)
6503				ret = drop_inode_items(trans, log, path, inode,
6504						       max_key.type);
6505		} else {
6506			if (inode_only == LOG_INODE_ALL)
6507				fast_search = true;
6508			inode_item_dropped = false;
6509			goto log_extents;
6510		}
6511
6512	}
6513	if (ret)
6514		goto out_unlock;
6515
6516	/*
6517	 * If we are logging a directory in full mode, collect the delayed items
6518	 * before iterating the subvolume tree, so that we don't miss any new
6519	 * dir index items in case they get flushed while or right after we are
6520	 * iterating the subvolume tree.
6521	 */
6522	if (full_dir_logging && !ctx->logging_new_delayed_dentries)
6523		btrfs_log_get_delayed_items(inode, &delayed_ins_list,
6524					    &delayed_del_list);
6525
6526	ret = copy_inode_items_to_log(trans, inode, &min_key, &max_key,
6527				      path, dst_path, logged_isize,
6528				      inode_only, ctx,
6529				      &need_log_inode_item);
6530	if (ret)
6531		goto out_unlock;
6532
6533	btrfs_release_path(path);
6534	btrfs_release_path(dst_path);
6535	ret = btrfs_log_all_xattrs(trans, inode, path, dst_path);
6536	if (ret)
6537		goto out_unlock;
6538	xattrs_logged = true;
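	/*
	 * If we copied the inode's extent items above (slow, full fsync case),
	 * also make sure any file holes are represented in the log. This
	 * matters when the NO_HOLES feature is enabled, as holes then have no
	 * extent items in the subvolume tree.
	 */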
6539	if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
6540		btrfs_release_path(path);
6541		btrfs_release_path(dst_path);
6542		ret = btrfs_log_holes(trans, inode, path);
6543		if (ret)
6544			goto out_unlock;
6545	}
6546log_extents:
6547	btrfs_release_path(path);
6548	btrfs_release_path(dst_path);
6549	if (need_log_inode_item) {
6550		ret = log_inode_item(trans, log, dst_path, inode, inode_item_dropped);
6551		if (ret)
6552			goto out_unlock;
6553		/*
6554		 * If we are doing a fast fsync and the inode was logged before
6555		 * in this transaction, we don't need to log the xattrs because
6556		 * they were logged before. If xattrs were added, changed or
6557		 * deleted since the last time we logged the inode, then we have
6558		 * already logged them because the inode had the runtime flag
6559		 * BTRFS_INODE_COPY_EVERYTHING set.
6560		 */
6561		if (!xattrs_logged && inode->logged_trans < trans->transid) {
6562			ret = btrfs_log_all_xattrs(trans, inode, path, dst_path);
6563			if (ret)
6564				goto out_unlock;
6565			btrfs_release_path(path);
6566		}
6567	}
6568	if (fast_search) {
6569		ret = btrfs_log_changed_extents(trans, inode, dst_path, ctx);
6570		if (ret)
6571			goto out_unlock;
6572	} else if (inode_only == LOG_INODE_ALL) {
6573		struct extent_map *em, *n;
6574
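		/*
		 * We copied all the extent items from the subvolume tree above,
		 * so the list of modified extents is no longer needed.
		 */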
6575		write_lock(&em_tree->lock);
6576		list_for_each_entry_safe(em, n, &em_tree->modified_extents, list)
6577			list_del_init(&em->list);
6578		write_unlock(&em_tree->lock);
6579	}
6580
6581	if (full_dir_logging) {
6582		ret = log_directory_changes(trans, inode, path, dst_path, ctx);
6583		if (ret)
6584			goto out_unlock;
6585		ret = log_delayed_insertion_items(trans, inode, path,
6586						  &delayed_ins_list, ctx);
6587		if (ret)
6588			goto out_unlock;
6589		ret = log_delayed_deletion_items(trans, inode, path,
6590						 &delayed_del_list, ctx);
6591		if (ret)
6592			goto out_unlock;
6593	}
6594
6595	spin_lock(&inode->lock);
6596	inode->logged_trans = trans->transid;
6597	/*
6598	 * Don't update last_log_commit if we logged that an inode exists.
6599	 * We do this for three reasons:
6600	 *
6601	 * 1) We might have had buffered writes to this inode that were
6602	 *    flushed and had their ordered extents completed in this
6603	 *    transaction, but we did not previously log the inode with
6604	 *    LOG_INODE_ALL. Later the inode was evicted and after that
6605	 *    it was loaded again and this LOG_INODE_EXISTS log operation
6606	 *    happened. We must make sure that if an explicit fsync against
6607	 *    the inode is performed later, it logs the new extents, an
6608	 *    updated inode item, etc, and syncs the log. The same logic
6609	 *    applies to direct IO writes instead of buffered writes.
6610	 *
6611	 * 2) When we log the inode with LOG_INODE_EXISTS, its inode item
6612	 *    is logged with an i_size of 0 or whatever value was logged
6613	 *    before. If later the i_size of the inode is increased by a
6614	 *    truncate operation, the log is synced through an fsync of
6615	 *    some other inode and then finally an explicit fsync against
6616	 *    this inode is made, we must make sure this fsync logs the
6617	 *    inode with the new i_size, the hole between old i_size and
6618	 *    the new i_size, and syncs the log.
6619	 *
6620	 * 3) If we are logging that an ancestor inode exists as part of
6621	 *    logging a new name from a link or rename operation, don't update
6622	 *    its last_log_commit - otherwise if an explicit fsync is made
6623	 *    against an ancestor, the fsync considers the inode in the log
6624	 *    and doesn't sync the log, resulting in the ancestor missing after
6625	 *    a power failure unless the log was synced as part of an fsync
6626	 *    against any other unrelated inode.
6627	 */
6628	if (inode_only != LOG_INODE_EXISTS)
6629		inode->last_log_commit = inode->last_sub_trans;
6630	spin_unlock(&inode->lock);
6631
6632	/*
6633	 * Reset the last_reflink_trans so that the next fsync does not need to
6634	 * go through the slower path when logging extents and their checksums.
6635	 */
6636	if (inode_only == LOG_INODE_ALL)
6637		inode->last_reflink_trans = 0;
6638
6639out_unlock:
6640	mutex_unlock(&inode->log_mutex);
6641out:
6642	btrfs_free_path(path);
6643	btrfs_free_path(dst_path);
6644
6645	if (ret)
6646		free_conflicting_inodes(ctx);
6647	else
6648		ret = log_conflicting_inodes(trans, inode->root, ctx);
6649
6650	if (full_dir_logging && !ctx->logging_new_delayed_dentries) {
6651		if (!ret)
6652			ret = log_new_delayed_dentries(trans, inode,
6653						       &delayed_ins_list, ctx);
6654
6655		btrfs_log_put_delayed_items(inode, &delayed_ins_list,
6656					    &delayed_del_list);
6657	}
6658
6659	return ret;
6660}
6661
6662static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
6663				 struct btrfs_inode *inode,
6664				 struct btrfs_log_ctx *ctx)
6665{
6666	struct btrfs_fs_info *fs_info = trans->fs_info;
6667	int ret;
6668	struct btrfs_path *path;
6669	struct btrfs_key key;
6670	struct btrfs_root *root = inode->root;
6671	const u64 ino = btrfs_ino(inode);
6672
6673	path = btrfs_alloc_path();
6674	if (!path)
6675		return -ENOMEM;
6676	path->skip_locking = 1;
6677	path->search_commit_root = 1;
6678
6679	key.objectid = ino;
6680	key.type = BTRFS_INODE_REF_KEY;
6681	key.offset = 0;
6682	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
6683	if (ret < 0)
6684		goto out;
6685
6686	while (true) {
6687		struct extent_buffer *leaf = path->nodes[0];
6688		int slot = path->slots[0];
6689		u32 cur_offset = 0;
6690		u32 item_size;
6691		unsigned long ptr;
6692
6693		if (slot >= btrfs_header_nritems(leaf)) {
6694			ret = btrfs_next_leaf(root, path);
6695			if (ret < 0)
6696				goto out;
6697			else if (ret > 0)
6698				break;
6699			continue;
6700		}
6701
6702		btrfs_item_key_to_cpu(leaf, &key, slot);
6703		/* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */
6704		if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY)
6705			break;
6706
6707		item_size = btrfs_item_size(leaf, slot);
6708		ptr = btrfs_item_ptr_offset(leaf, slot);
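		/*
		 * Iterate over all the names (refs or extrefs) packed in this
		 * item and fully log each parent directory they point to.
		 */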
6709		while (cur_offset < item_size) {
6710			struct btrfs_key inode_key;
6711			struct inode *dir_inode;
6712
6713			inode_key.type = BTRFS_INODE_ITEM_KEY;
6714			inode_key.offset = 0;
6715
6716			if (key.type == BTRFS_INODE_EXTREF_KEY) {
6717				struct btrfs_inode_extref *extref;
6718
6719				extref = (struct btrfs_inode_extref *)
6720					(ptr + cur_offset);
6721				inode_key.objectid = btrfs_inode_extref_parent(
6722					leaf, extref);
6723				cur_offset += sizeof(*extref);
6724				cur_offset += btrfs_inode_extref_name_len(leaf,
6725					extref);
6726			} else {
6727				inode_key.objectid = key.offset;
6728				cur_offset = item_size;
6729			}
6730
6731			dir_inode = btrfs_iget(fs_info->sb, inode_key.objectid,
6732					       root);
6733			/*
6734			 * If the parent inode was deleted, return an error to
6735			 * fall back to a transaction commit. This is to prevent
6736			 * getting an inode that was moved from one parent A to
6737			 * a parent B, got its former parent A deleted and then
6738			 * it got fsync'ed, from existing at both parents after
6739			 * a log replay (and the old parent still existing).
6740			 * Example:
6741			 *
6742			 * mkdir /mnt/A
6743			 * mkdir /mnt/B
6744			 * touch /mnt/B/bar
6745			 * sync
6746			 * mv /mnt/B/bar /mnt/A/bar
6747			 * mv -T /mnt/A /mnt/B
6748			 * fsync /mnt/B/bar
6749			 * <power fail>
6750			 *
6751			 * If we ignore the old parent B which got deleted,
6752			 * after a log replay we would have file bar linked
6753			 * at both parents and the old parent B would still
6754			 * exist.
6755			 */
6756			if (IS_ERR(dir_inode)) {
6757				ret = PTR_ERR(dir_inode);
6758				goto out;
6759			}
6760
6761			if (!need_log_inode(trans, BTRFS_I(dir_inode))) {
6762				btrfs_add_delayed_iput(BTRFS_I(dir_inode));
6763				continue;
6764			}
6765
6766			ctx->log_new_dentries = false;
6767			ret = btrfs_log_inode(trans, BTRFS_I(dir_inode),
6768					      LOG_INODE_ALL, ctx);
6769			if (!ret && ctx->log_new_dentries)
6770				ret = log_new_dir_dentries(trans,
6771						   BTRFS_I(dir_inode), ctx);
6772			btrfs_add_delayed_iput(BTRFS_I(dir_inode));
6773			if (ret)
6774				goto out;
6775		}
6776		path->slots[0]++;
6777	}
6778	ret = 0;
6779out:
6780	btrfs_free_path(path);
6781	return ret;
6782}
6783
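/*
 * Starting from the INODE_REF key the path is positioned at, walk up the
 * ancestor chain and log, in LOG_INODE_EXISTS mode, every ancestor directory
 * created in the current transaction, until the subvolume's root directory is
 * reached.
 */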
6784static int log_new_ancestors(struct btrfs_trans_handle *trans,
6785			     struct btrfs_root *root,
6786			     struct btrfs_path *path,
6787			     struct btrfs_log_ctx *ctx)
6788{
6789	struct btrfs_key found_key;
6790
6791	btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
6792
6793	while (true) {
6794		struct btrfs_fs_info *fs_info = root->fs_info;
6795		struct extent_buffer *leaf;
6796		int slot;
6797		struct btrfs_key search_key;
6798		struct inode *inode;
6799		u64 ino;
6800		int ret = 0;
6801
6802		btrfs_release_path(path);
6803
6804		ino = found_key.offset;
6805
6806		search_key.objectid = found_key.offset;
6807		search_key.type = BTRFS_INODE_ITEM_KEY;
6808		search_key.offset = 0;
6809		inode = btrfs_iget(fs_info->sb, ino, root);
6810		if (IS_ERR(inode))
6811			return PTR_ERR(inode);
6812
6813		if (BTRFS_I(inode)->generation >= trans->transid &&
6814		    need_log_inode(trans, BTRFS_I(inode)))
6815			ret = btrfs_log_inode(trans, BTRFS_I(inode),
6816					      LOG_INODE_EXISTS, ctx);
6817		btrfs_add_delayed_iput(BTRFS_I(inode));
6818		if (ret)
6819			return ret;
6820
6821		if (search_key.objectid == BTRFS_FIRST_FREE_OBJECTID)
6822			break;
6823
6824		search_key.type = BTRFS_INODE_REF_KEY;
6825		ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
6826		if (ret < 0)
6827			return ret;
6828
6829		leaf = path->nodes[0];
6830		slot = path->slots[0];
6831		if (slot >= btrfs_header_nritems(leaf)) {
6832			ret = btrfs_next_leaf(root, path);
6833			if (ret < 0)
6834				return ret;
6835			else if (ret > 0)
6836				return -ENOENT;
6837			leaf = path->nodes[0];
6838			slot = path->slots[0];
6839		}
6840
6841		btrfs_item_key_to_cpu(leaf, &found_key, slot);
6842		if (found_key.objectid != search_key.objectid ||
6843		    found_key.type != BTRFS_INODE_REF_KEY)
6844			return -ENOENT;
6845	}
6846	return 0;
6847}
6848
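/*
 * Fast path used when the inode has a single hard link: walk up the dentry
 * chain and log, in LOG_INODE_EXISTS mode, every ancestor directory created
 * in the current transaction.
 */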
6849static int log_new_ancestors_fast(struct btrfs_trans_handle *trans,
6850				  struct btrfs_inode *inode,
6851				  struct dentry *parent,
6852				  struct btrfs_log_ctx *ctx)
6853{
6854	struct btrfs_root *root = inode->root;
6855	struct dentry *old_parent = NULL;
6856	struct super_block *sb = inode->vfs_inode.i_sb;
6857	int ret = 0;
6858
6859	while (true) {
6860		if (!parent || d_really_is_negative(parent) ||
6861		    sb != parent->d_sb)
6862			break;
6863
6864		inode = BTRFS_I(d_inode(parent));
6865		if (root != inode->root)
6866			break;
6867
6868		if (inode->generation >= trans->transid &&
6869		    need_log_inode(trans, inode)) {
6870			ret = btrfs_log_inode(trans, inode,
6871					      LOG_INODE_EXISTS, ctx);
6872			if (ret)
6873				break;
6874		}
6875		if (IS_ROOT(parent))
6876			break;
6877
6878		parent = dget_parent(parent);
6879		dput(old_parent);
6880		old_parent = parent;
6881	}
6882	dput(old_parent);
6883
6884	return ret;
6885}
6886
6887static int log_all_new_ancestors(struct btrfs_trans_handle *trans,
6888				 struct btrfs_inode *inode,
6889				 struct dentry *parent,
6890				 struct btrfs_log_ctx *ctx)
6891{
6892	struct btrfs_root *root = inode->root;
6893	const u64 ino = btrfs_ino(inode);
6894	struct btrfs_path *path;
6895	struct btrfs_key search_key;
6896	int ret;
6897
6898	/*
6899	 * For a single hard link case, go through a fast path that does not
6900	 * need to iterate the fs/subvolume tree.
6901	 */
6902	if (inode->vfs_inode.i_nlink < 2)
6903		return log_new_ancestors_fast(trans, inode, parent, ctx);
6904
6905	path = btrfs_alloc_path();
6906	if (!path)
6907		return -ENOMEM;
6908
6909	search_key.objectid = ino;
6910	search_key.type = BTRFS_INODE_REF_KEY;
6911	search_key.offset = 0;
6912again:
6913	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
6914	if (ret < 0)
6915		goto out;
6916	if (ret == 0)
6917		path->slots[0]++;
6918
6919	while (true) {
6920		struct extent_buffer *leaf = path->nodes[0];
6921		int slot = path->slots[0];
6922		struct btrfs_key found_key;
6923
6924		if (slot >= btrfs_header_nritems(leaf)) {
6925			ret = btrfs_next_leaf(root, path);
6926			if (ret < 0)
6927				goto out;
6928			else if (ret > 0)
6929				break;
6930			continue;
6931		}
6932
6933		btrfs_item_key_to_cpu(leaf, &found_key, slot);
6934		if (found_key.objectid != ino ||
6935		    found_key.type > BTRFS_INODE_EXTREF_KEY)
6936			break;
6937
6938		/*
6939		 * Don't deal with extended references because they are rare
6940		 * cases and too complex to deal with (we would need to keep
6941		 * track of which subitem we are processing for each item in
6942		 * this loop, etc). So just return some error to fall back to
6943		 * a transaction commit.
6944		 */
6945		if (found_key.type == BTRFS_INODE_EXTREF_KEY) {
6946			ret = -EMLINK;
6947			goto out;
6948		}
6949
6950		/*
6951		 * Logging ancestors needs to do more searches on the fs/subvol
6952		 * tree, so it releases the path as needed to avoid deadlocks.
6953		 * Keep track of the last inode ref key and resume from that key
6954		 * after logging all new ancestors for the current hard link.
6955		 */
6956		memcpy(&search_key, &found_key, sizeof(search_key));
6957
6958		ret = log_new_ancestors(trans, root, path, ctx);
6959		if (ret)
6960			goto out;
6961		btrfs_release_path(path);
6962		goto again;
6963	}
6964	ret = 0;
6965out:
6966	btrfs_free_path(path);
6967	return ret;
6968}
6969
6970/*
6971 * Helper function around btrfs_log_inode() to make sure newly created
6972 * parent directories also end up in the log. Only a minimal amount of
6973 * logging (the inode item and back references) is done for any parent
6974 * directories that are newer than the last committed transaction.
6975 */
6976static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
6977				  struct btrfs_inode *inode,
6978				  struct dentry *parent,
6979				  int inode_only,
6980				  struct btrfs_log_ctx *ctx)
6981{
6982	struct btrfs_root *root = inode->root;
6983	struct btrfs_fs_info *fs_info = root->fs_info;
6984	int ret = 0;
6985	bool log_dentries = false;
6986
6987	if (btrfs_test_opt(fs_info, NOTREELOG)) {
6988		ret = BTRFS_LOG_FORCE_COMMIT;
6989		goto end_no_trans;
6990	}
6991
6992	if (btrfs_root_refs(&root->root_item) == 0) {
6993		ret = BTRFS_LOG_FORCE_COMMIT;
6994		goto end_no_trans;
6995	}
6996
6997	/*
6998	 * Skip already logged inodes or inodes corresponding to tmpfiles
6999	 * (since logging them is pointless, a link count of 0 means they
7000	 * will never be accessible).
7001	 */
7002	if ((btrfs_inode_in_log(inode, trans->transid) &&
7003	     list_empty(&ctx->ordered_extents)) ||
7004	    inode->vfs_inode.i_nlink == 0) {
7005		ret = BTRFS_NO_LOG_SYNC;
7006		goto end_no_trans;
7007	}
7008
7009	ret = start_log_trans(trans, root, ctx);
7010	if (ret)
7011		goto end_no_trans;
7012
7013	ret = btrfs_log_inode(trans, inode, inode_only, ctx);
7014	if (ret)
7015		goto end_trans;
7016
7017	/*
7018	 * For a regular file, if its inode is already on disk, we don't
7019	 * have to worry about the parents at all.  This is because
7020	 * we can use the last_unlink_trans field to record renames
7021	 * and other fun in this file.
7022	 */
7023	if (S_ISREG(inode->vfs_inode.i_mode) &&
7024	    inode->generation < trans->transid &&
7025	    inode->last_unlink_trans < trans->transid) {
7026		ret = 0;
7027		goto end_trans;
7028	}
7029
7030	if (S_ISDIR(inode->vfs_inode.i_mode) && ctx->log_new_dentries)
7031		log_dentries = true;
7032
7033	/*
7034	 * On unlink we must make sure all our current and old parent directory
7035	 * inodes are fully logged. This is to prevent leaving dangling
7036	 * directory index entries in directories that were our parents but are
7037	 * not anymore. Not doing this results in the old parent directory being
7038	 * impossible to delete after log replay (rmdir will always fail with
7039	 * error -ENOTEMPTY).
7040	 *
7041	 * Example 1:
7042	 *
7043	 * mkdir testdir
7044	 * touch testdir/foo
7045	 * ln testdir/foo testdir/bar
7046	 * sync
7047	 * unlink testdir/bar
7048	 * xfs_io -c fsync testdir/foo
7049	 * <power failure>
7050	 * mount fs, triggers log replay
7051	 *
7052	 * If we don't log the parent directory (testdir), after log replay the
7053	 * directory still has an entry pointing to the file inode using the bar
7054	 * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and
7055	 * the file inode has a link count of 1.
7056	 *
7057	 * Example 2:
7058	 *
7059	 * mkdir testdir
7060	 * touch foo
7061	 * ln foo testdir/foo2
7062	 * ln foo testdir/foo3
7063	 * sync
7064	 * unlink testdir/foo3
7065	 * xfs_io -c fsync foo
7066	 * <power failure>
7067	 * mount fs, triggers log replay
7068	 *
7069	 * As in the first example, after log replay the parent directory
7070	 * testdir still has an entry pointing to the file inode using the foo3
7071	 * name, but the file inode does not have a matching BTRFS_INODE_REF_KEY item
7072	 * and has a link count of 2.
7073	 */
7074	if (inode->last_unlink_trans >= trans->transid) {
7075		ret = btrfs_log_all_parents(trans, inode, ctx);
7076		if (ret)
7077			goto end_trans;
7078	}
7079
7080	ret = log_all_new_ancestors(trans, inode, parent, ctx);
7081	if (ret)
7082		goto end_trans;
7083
7084	if (log_dentries)
7085		ret = log_new_dir_dentries(trans, inode, ctx);
7086	else
7087		ret = 0;
7088end_trans:
7089	if (ret < 0) {
7090		btrfs_set_log_full_commit(trans);
7091		ret = BTRFS_LOG_FORCE_COMMIT;
7092	}
7093
7094	if (ret)
7095		btrfs_remove_log_ctx(root, ctx);
7096	btrfs_end_log_trans(root);
7097end_no_trans:
7098	return ret;
7099}
7100
7101/*
7102 * it is not safe to log the dentry if the chunk root has added new
7103 * chunks.  This returns 0 if the dentry was logged, BTRFS_NO_LOG_SYNC if
7104 * there is no need to log or sync it, or BTRFS_LOG_FORCE_COMMIT, in which
7105 * case you must commit the transaction to safely get your data on disk.
7106 */
7107int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
7108			  struct dentry *dentry,
7109			  struct btrfs_log_ctx *ctx)
7110{
7111	struct dentry *parent = dget_parent(dentry);
7112	int ret;
7113
7114	ret = btrfs_log_inode_parent(trans, BTRFS_I(d_inode(dentry)), parent,
7115				     LOG_INODE_ALL, ctx);
7116	dput(parent);
7117
7118	return ret;
7119}
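
/*
 * A rough sketch (not taken verbatim from the fsync code) of how a caller
 * might act on the return values of btrfs_log_dentry_safe(); it assumes the
 * caller has already started a transaction handle 'trans', holds the dentry,
 * knows the inode's 'root' and has initialized 'ctx' with btrfs_init_log_ctx():
 *
 *	ret = btrfs_log_dentry_safe(trans, dentry, &ctx);
 *	if (ret == BTRFS_NO_LOG_SYNC) {
 *		// nothing was logged, just end the transaction
 *		ret = btrfs_end_transaction(trans);
 *	} else if (ret == 0) {
 *		// the dentry was logged, write out the log tree
 *		ret = btrfs_sync_log(trans, root, &ctx);
 *		if (ret)
 *			ret = btrfs_commit_transaction(trans);
 *		else
 *			ret = btrfs_end_transaction(trans);
 *	} else {
 *		// BTRFS_LOG_FORCE_COMMIT, fall back to a full commit
 *		ret = btrfs_commit_transaction(trans);
 *	}
 */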
7120
7121/*
7122 * should be called during mount to recover and replay any log trees
7123 * from the FS
7124 */
7125int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
7126{
7127	int ret;
7128	struct btrfs_path *path;
7129	struct btrfs_trans_handle *trans;
7130	struct btrfs_key key;
7131	struct btrfs_key found_key;
7132	struct btrfs_root *log;
7133	struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
7134	struct walk_control wc = {
7135		.process_func = process_one_buffer,
7136		.stage = LOG_WALK_PIN_ONLY,
7137	};
7138
7139	path = btrfs_alloc_path();
7140	if (!path)
7141		return -ENOMEM;
7142
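	/* Let the rest of the filesystem know that log replay is in progress. */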
7143	set_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
7144
7145	trans = btrfs_start_transaction(fs_info->tree_root, 0);
7146	if (IS_ERR(trans)) {
7147		ret = PTR_ERR(trans);
7148		goto error;
7149	}
7150
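	/*
	 * The first walk only pins the blocks of the tree of log roots; the
	 * blocks of each individual log tree get pinned in the first pass of
	 * the loop below, so nothing we replay from gets reallocated.
	 */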
7151	wc.trans = trans;
7152	wc.pin = 1;
7153
7154	ret = walk_log_tree(trans, log_root_tree, &wc);
7155	if (ret) {
7156		btrfs_abort_transaction(trans, ret);
7157		goto error;
7158	}
7159
7160again:
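	/*
	 * Log root items all have objectid BTRFS_TREE_LOG_OBJECTID and their
	 * key offset is the id of the subvolume root they belong to, so walk
	 * them from the highest offset downwards.
	 */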
7161	key.objectid = BTRFS_TREE_LOG_OBJECTID;
7162	key.offset = (u64)-1;
7163	key.type = BTRFS_ROOT_ITEM_KEY;
7164
7165	while (1) {
7166		ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
7167
7168		if (ret < 0) {
7169			btrfs_abort_transaction(trans, ret);
7170			goto error;
7171		}
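		/*
		 * No exact match was found; step back one slot so we land on
		 * the closest smaller key, which is the next log root item to
		 * process (if any is left).
		 */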
7172		if (ret > 0) {
7173			if (path->slots[0] == 0)
7174				break;
7175			path->slots[0]--;
7176		}
7177		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
7178				      path->slots[0]);
7179		btrfs_release_path(path);
7180		if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
7181			break;
7182
7183		log = btrfs_read_tree_root(log_root_tree, &found_key);
7184		if (IS_ERR(log)) {
7185			ret = PTR_ERR(log);
7186			btrfs_abort_transaction(trans, ret);
7187			goto error;
7188		}
7189
7190		wc.replay_dest = btrfs_get_fs_root(fs_info, found_key.offset,
7191						   true);
7192		if (IS_ERR(wc.replay_dest)) {
7193			ret = PTR_ERR(wc.replay_dest);
7194
7195			/*
7196			 * We didn't find the subvol, likely because it was
7197			 * deleted.  This is ok, simply skip this log and go to
7198			 * the next one.
7199			 *
7200			 * We need to exclude the root because we can't have
7201			 * other log replays overwriting this log as we'll read
7202			 * it back in a few more times.  This will keep our
7203			 * block from being modified, and we'll just bail for
7204			 * each subsequent pass.
7205			 */
7206			if (ret == -ENOENT)
7207				ret = btrfs_pin_extent_for_log_replay(trans,
7208							log->node->start,
7209							log->node->len);
7210			btrfs_put_root(log);
7211
7212			if (!ret)
7213				goto next;
7214			btrfs_abort_transaction(trans, ret);
7215			goto error;
7216		}
7217
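		/* Attach the log tree to the subvolume root so the replay code can find it. */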
7218		wc.replay_dest->log_root = log;
7219		ret = btrfs_record_root_in_trans(trans, wc.replay_dest);
7220		if (ret)
7221			/* The loop needs to continue due to the root refs */
7222			btrfs_abort_transaction(trans, ret);
7223		else
7224			ret = walk_log_tree(trans, log, &wc);
7225
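		/*
		 * On the last pass, fix up the link counts of any inodes that
		 * were linked into the fixup dir while replaying unlinks and
		 * renames.
		 */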
7226		if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
7227			ret = fixup_inode_link_counts(trans, wc.replay_dest,
7228						      path);
7229			if (ret)
7230				btrfs_abort_transaction(trans, ret);
7231		}
7232
7233		if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
7234			struct btrfs_root *root = wc.replay_dest;
7235
7236			btrfs_release_path(path);
7237
7238			/*
7239			 * We have just replayed everything, and the highest
7240			 * objectid of fs roots probably has changed in case
7241			 * some inode_item's got replayed.
7242			 *
7243			 * root->objectid_mutex is not acquired as log replay
7244			 * could only happen during mount.
7245			 */
7246			ret = btrfs_init_root_free_objectid(root);
7247			if (ret)
7248				btrfs_abort_transaction(trans, ret);
7249		}
7250
7251		wc.replay_dest->log_root = NULL;
7252		btrfs_put_root(wc.replay_dest);
7253		btrfs_put_root(log);
7254
7255		if (ret)
7256			goto error;
7257next:
7258		if (found_key.offset == 0)
7259			break;
7260		key.offset = found_key.offset - 1;
7261	}
7262	btrfs_release_path(path);
7263
7264	/* step one is to pin it all, step two is to replay just inodes */
7265	if (wc.pin) {
7266		wc.pin = 0;
7267		wc.process_func = replay_one_buffer;
7268		wc.stage = LOG_WALK_REPLAY_INODES;
7269		goto again;
7270	}
7271	/* step three is to replay everything */
7272	if (wc.stage < LOG_WALK_REPLAY_ALL) {
7273		wc.stage++;
7274		goto again;
7275	}
7276
7277	btrfs_free_path(path);
7278
7279	/* step 4: commit the transaction, which also unpins the blocks */
7280	ret = btrfs_commit_transaction(trans);
7281	if (ret)
7282		return ret;
7283
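	/*
	 * Log replay finished: clear the recovering flag and drop our
	 * reference to the tree of log roots.
	 */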
7284	log_root_tree->log_root = NULL;
7285	clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
7286	btrfs_put_root(log_root_tree);
7287
7288	return 0;
7289error:
7290	if (wc.trans)
7291		btrfs_end_transaction(wc.trans);
7292	clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
7293	btrfs_free_path(path);
7294	return ret;
7295}
7296
7297/*
7298 * there are some corner cases where we want to force a full
7299 * commit instead of allowing a directory to be logged.
7300 *
7301 * They revolve around files that were unlinked from the directory, and
7302 * this function updates the parent directory so that a full commit is
7303 * properly done if it is fsync'd later after the unlinks are done.
7304 *
7305 * Must be called before the unlink operations (updates to the subvolume tree,
7306 * inodes, etc) are done.
7307 */
7308void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
7309			     struct btrfs_inode *dir, struct btrfs_inode *inode,
7310			     bool for_rename)
7311{
7312	/*
7313	 * when we're logging a file, if it hasn't been renamed
7314	 * or unlinked, and its inode is fully committed on disk,
7315	 * we don't have to worry about walking up the directory chain
7316	 * to log its parents.
7317	 *
7318	 * So, we use the last_unlink_trans field to put this transid
7319	 * into the file.  When the file is logged we check it and
7320	 * don't log the parents if the file is fully on disk.
7321	 */
7322	mutex_lock(&inode->log_mutex);
7323	inode->last_unlink_trans = trans->transid;
7324	mutex_unlock(&inode->log_mutex);
7325
7326	if (!for_rename)
7327		return;
7328
7329	/*
7330	 * If this directory was already logged, any new names will be logged
7331	 * with btrfs_log_new_name() and old names will be deleted from the log
7332	 * tree with btrfs_del_dir_entries_in_log() or with
7333	 * btrfs_del_inode_ref_in_log().
7334	 */
7335	if (inode_logged(trans, dir, NULL) == 1)
7336		return;
7337
7338	/*
7339	 * If the inode we're about to unlink was logged before, the log will be
7340	 * properly updated with the new name with btrfs_log_new_name() and the
7341	 * old name removed with btrfs_del_dir_entries_in_log() or with
7342	 * btrfs_del_inode_ref_in_log().
7343	 */
7344	if (inode_logged(trans, inode, NULL) == 1)
7345		return;
7346
7347	/*
7348	 * when renaming files across directories, if the directory
7349	 * we're unlinking from gets fsync'd later on, there's
7350	 * no way to find the destination directory later and fsync it
7351	 * properly.  So, we have to be conservative and force commits
7352	 * so the new name gets discovered.
7353	 */
7354	mutex_lock(&dir->log_mutex);
7355	dir->last_unlink_trans = trans->transid;
7356	mutex_unlock(&dir->log_mutex);
7357}
7358
7359/*
7360 * Make sure that if someone attempts to fsync the parent directory of a deleted
7361 * snapshot, it ends up triggering a transaction commit. This is to guarantee
7362 * that after replaying the log tree of the parent directory's root we will not
7363 * see the snapshot anymore and at log replay time we will not see any log tree
7364 * corresponding to the deleted snapshot's root, which could lead to replaying
7365 * it after replaying the log tree of the parent directory (which would replay
7366 * the snapshot delete operation).
7367 *
7368 * Must be called before the actual snapshot destroy operation (updates to the
7369 * parent root and tree of tree roots trees, etc) are done.
7370 */
7371void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
7372				   struct btrfs_inode *dir)
7373{
7374	mutex_lock(&dir->log_mutex);
7375	dir->last_unlink_trans = trans->transid;
7376	mutex_unlock(&dir->log_mutex);
7377}
7378
7379/*
7380 * Update the log after adding a new name for an inode.
7381 *
7382 * @trans:              Transaction handle.
7383 * @old_dentry:         The dentry associated with the old name and the old
7384 *                      parent directory.
7385 * @old_dir:            The inode of the previous parent directory for the case
7386 *                      of a rename. For a link operation, it must be NULL.
7387 * @old_dir_index:      The index number associated with the old name, meaningful
7388 *                      only for rename operations (when @old_dir is not NULL).
7389 *                      Ignored for link operations.
7390 * @parent:             The dentry associated with the directory under which the
7391 *                      new name is located.
7392 *
7393 * Call this after adding a new name for an inode, as a result of a link or
7394 * rename operation, and it will properly update the log to reflect the new name.
7395 */
7396void btrfs_log_new_name(struct btrfs_trans_handle *trans,
7397			struct dentry *old_dentry, struct btrfs_inode *old_dir,
7398			u64 old_dir_index, struct dentry *parent)
7399{
7400	struct btrfs_inode *inode = BTRFS_I(d_inode(old_dentry));
7401	struct btrfs_root *root = inode->root;
7402	struct btrfs_log_ctx ctx;
7403	bool log_pinned = false;
7404	int ret;
7405
7406	/*
7407	 * this will force the logging code to walk the dentry chain
7408	 * up for the file
7409	 */
7410	if (!S_ISDIR(inode->vfs_inode.i_mode))
7411		inode->last_unlink_trans = trans->transid;
7412
7413	/*
7414	 * if this inode hasn't been logged and the directory we're renaming it
7415	 * from hasn't been logged, we don't need to log it
7416	 */
7417	ret = inode_logged(trans, inode, NULL);
7418	if (ret < 0) {
7419		goto out;
7420	} else if (ret == 0) {
7421		if (!old_dir)
7422			return;
7423		/*
7424		 * If the inode was not logged and we are doing a rename (old_dir is not
7425		 * NULL), check if old_dir was logged - if it was not we can return and
7426		 * do nothing.
7427		 */
7428		ret = inode_logged(trans, old_dir, NULL);
7429		if (ret < 0)
7430			goto out;
7431		else if (ret == 0)
7432			return;
7433	}
7434	ret = 0;
7435
7436	/*
7437	 * If we are doing a rename (old_dir is not NULL) from a directory that
7438	 * was previously logged, make sure that on log replay we get the old
7439	 * dir entry deleted. This is needed because we will also log the new
7440	 * name of the renamed inode, so we need to make sure that after log
7441	 * replay we don't end up with both the new and old dir entries existing.
7442	 */
7443	if (old_dir && old_dir->logged_trans == trans->transid) {
7444		struct btrfs_root *log = old_dir->root->log_root;
7445		struct btrfs_path *path;
7446		struct fscrypt_name fname;
7447
7448		ASSERT(old_dir_index >= BTRFS_DIR_START_INDEX);
7449
7450		ret = fscrypt_setup_filename(&old_dir->vfs_inode,
7451					     &old_dentry->d_name, 0, &fname);
7452		if (ret)
7453			goto out;
7454		/*
7455		 * We have two inodes to update in the log, the old directory and
7456		 * the inode that got renamed, so we must pin the log to prevent
7457		 * anyone from syncing the log until we have updated both inodes
7458		 * in the log.
7459		 */
7460		ret = join_running_log_trans(root);
7461		/*
7462		 * At least one of the inodes was logged before, so this should
7463		 * not fail, but if it does, it's not serious, just bail out and
7464		 * mark the log for a full commit.
7465		 */
7466		if (WARN_ON_ONCE(ret < 0)) {
7467			fscrypt_free_filename(&fname);
7468			goto out;
7469		}
7470
7471		log_pinned = true;
7472
7473		path = btrfs_alloc_path();
7474		if (!path) {
7475			ret = -ENOMEM;
7476			fscrypt_free_filename(&fname);
7477			goto out;
7478		}
7479
7480		/*
7481		 * Another concurrent task might be logging the old directory,
7482		 * as that can be triggered when logging another inode that had or
7483		 * still has a dentry in the old directory. We lock the old
7484		 * directory's log_mutex to ensure the deletion of the old
7485		 * name is persisted, because during directory logging we
7486		 * delete all BTRFS_DIR_LOG_INDEX_KEY keys and the deletion of
7487		 * the old name's dir index item is in the delayed items, so
7488		 * it could be missed by an in progress directory logging.
7489		 */
7490		mutex_lock(&old_dir->log_mutex);
7491		ret = del_logged_dentry(trans, log, path, btrfs_ino(old_dir),
7492					&fname.disk_name, old_dir_index);
7493		if (ret > 0) {
7494			/*
7495			 * The dentry does not exist in the log, so record its
7496			 * deletion.
7497			 */
7498			btrfs_release_path(path);
7499			ret = insert_dir_log_key(trans, log, path,
7500						 btrfs_ino(old_dir),
7501						 old_dir_index, old_dir_index);
7502		}
7503		mutex_unlock(&old_dir->log_mutex);
7504
7505		btrfs_free_path(path);
7506		fscrypt_free_filename(&fname);
7507		if (ret < 0)
7508			goto out;
7509	}
7510
7511	btrfs_init_log_ctx(&ctx, &inode->vfs_inode);
7512	ctx.logging_new_name = true;
7513	/*
7514	 * We don't care about the return value. If we fail to log the new name
7515	 * then we know the next attempt to sync the log will fallback to a full
7516	 * transaction commit (due to a call to btrfs_set_log_full_commit()), so
7517	 * we don't need to worry about getting a log committed that has an
7518	 * inconsistent state after a rename operation.
7519	 */
7520	btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx);
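	/* When logging a new name we never process conflicting inodes, so the list must be empty. */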
7521	ASSERT(list_empty(&ctx.conflict_inodes));
7522out:
7523	/*
7524	 * If an error happened mark the log for a full commit because it's not
7525	 * consistent and up to date or we couldn't find out if one of the
7526	 * inodes was logged before in this transaction. Do it before unpinning
7527	 * the log, to avoid any races with someone else trying to commit it.
7528	 */
7529	if (ret < 0)
7530		btrfs_set_log_full_commit(trans);
7531	if (log_pinned)
7532		btrfs_end_log_trans(root);
7533}
7534
7535