// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2008 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/list_sort.h>
#include <linux/iversion.h>
#include "misc.h"
#include "ctree.h"
#include "tree-log.h"
#include "disk-io.h"
#include "locking.h"
#include "print-tree.h"
#include "backref.h"
#include "compression.h"
#include "qgroup.h"
#include "inode-map.h"
#include "block-group.h"
#include "space-info.h"

/*
 * magic values for the inode_only field in btrfs_log_inode:
 *
 * LOG_INODE_ALL means to log everything
 * LOG_INODE_EXISTS means to log just enough to recreate the inode
 * during log replay
 * LOG_OTHER_INODE and LOG_OTHER_INODE_ALL are used when logging an inode
 * other than the one being fsynced (a conflicting inode found while
 * logging); the _ALL variant logs all of its items
 */
enum {
	LOG_INODE_ALL,
	LOG_INODE_EXISTS,
	LOG_OTHER_INODE,
	LOG_OTHER_INODE_ALL,
};

/*
 * directory trouble cases
 *
 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
 * log, we must force a full commit before doing an fsync of the directory
 * where the unlink was done.
 * ---> record transid of last unlink/rename per directory
 *
 * mkdir foo/some_dir
 * normal commit
 * rename foo/some_dir foo2/some_dir
 * mkdir foo/some_dir
 * fsync foo/some_dir/some_file
 *
 * The fsync above will unlink the original some_dir without recording
 * it in its new location (foo2).  After a crash, some_dir will be gone
 * unless the fsync of some_file forces a full commit
 *
 * 2) we must log any new names for any file or dir that is in the fsync
 * log. ---> check inode while renaming/linking.
 *
 * 2a) we must log any new names for any file or dir during rename
 * when the directory they are being removed from was logged.
 * ---> check inode and old parent dir during rename
 *
 *  2a is actually the more important variant.  Without the extra logging
 *  a crash might unlink the old name without recreating the new one
 *
 * 3) after a crash, we must go through any directories with a link count
 * of zero and redo the rm -rf
 *
 * mkdir f1/foo
 * normal commit
 * rm -rf f1/foo
 * fsync(f1)
 *
 * The directory f1/foo was fully removed from the FS, but fsync was never
 * called on foo, only on its parent dir (f1).  After a crash the rm -rf must
 * be replayed.  This must be able to recurse down the entire
 * directory tree.  The inode link count fixup code takes care of the
 * ugly details.
 */

/*
 * stages for the tree walking.  The first
 * stage (0) only pins down the blocks we find,
 * the second stage (1) makes sure that all the inodes
 * we find in the log are created in the subvolume.
 *
 * The next stage replays directory index items, and the last stage deals
 * with directories, links, extents and all the other fun semantics.
 */
enum {
	LOG_WALK_PIN_ONLY,
	LOG_WALK_REPLAY_INODES,
	LOG_WALK_REPLAY_DIR_INDEX,
	LOG_WALK_REPLAY_ALL,
};
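
/*
 * Illustrative sketch, not part of this file's logic: mount-time recovery
 * drives the tree walk once per stage, roughly like the loop in
 * btrfs_recover_log_trees():
 *
 *	wc.stage = LOG_WALK_PIN_ONLY;
 *	walk_log_tree(trans, log, &wc);
 *	while (wc.stage < LOG_WALK_REPLAY_ALL) {
 *		wc.stage++;
 *		walk_log_tree(trans, log, &wc);
 *	}
 */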

static int btrfs_log_inode(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root, struct btrfs_inode *inode,
			   int inode_only,
			   struct btrfs_log_ctx *ctx);
static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct btrfs_path *path, u64 objectid);
static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct btrfs_root *log,
				       struct btrfs_path *path,
				       u64 dirid, int del_all);

/*
 * tree logging is a special write ahead log used to make sure that
 * fsyncs and O_SYNCs can happen without doing full tree commits.
 *
 * Full tree commits are expensive because they require commonly
 * modified blocks to be recowed, creating many dirty pages in the
 * extent tree and a 4x-6x higher write load than ext3.
 *
 * Instead of doing a tree commit on every fsync, we use the
 * key ranges and transaction ids to find items for a given file or directory
 * that have changed in this transaction.  Those items are copied into
 * a special tree (one per subvolume root), that tree is written to disk
 * and then the fsync is considered complete.
 *
 * After a crash, items are copied out of the log-tree back into the
 * subvolume tree.  Any file data extents found are recorded in the extent
 * allocation tree, and the log-tree is freed.
 *
 * The log tree is read three times: once to pin down all the extents it is
 * using in RAM, once to create all the inodes logged in the tree
 * and once to do all the other items.
 */
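
/*
 * Hedged sketch of the overall flow this enables (the names below are real
 * entry points in this file, but the call chain is abbreviated):
 *
 *	fsync(file)
 *	  -> start_log_trans()		join/create the per-root log tree
 *	  -> btrfs_log_inode()		copy changed items into the log tree
 *	  -> btrfs_sync_log()		write the log tree out
 *
 * On a crash, mount-time recovery walks the log trees and copies the items
 * back into the subvolume trees (the LOG_WALK_* stages above).
 */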

/*
 * start a sub transaction and set up the log tree; this increments
 * the log tree writer count to make the people syncing the tree
 * wait for us to finish
 */
static int start_log_trans(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct btrfs_log_ctx *ctx)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret = 0;

	mutex_lock(&root->log_mutex);

	if (root->log_root) {
		if (btrfs_need_log_full_commit(trans)) {
			ret = -EAGAIN;
			goto out;
		}

		if (!root->log_start_pid) {
			clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
			root->log_start_pid = current->pid;
		} else if (root->log_start_pid != current->pid) {
			set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
		}
	} else {
		mutex_lock(&fs_info->tree_log_mutex);
		if (!fs_info->log_root_tree)
			ret = btrfs_init_log_root_tree(trans, fs_info);
		mutex_unlock(&fs_info->tree_log_mutex);
		if (ret)
			goto out;

		ret = btrfs_add_log_tree(trans, root);
		if (ret)
			goto out;

		set_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
		clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
		root->log_start_pid = current->pid;
	}

	atomic_inc(&root->log_batch);
	atomic_inc(&root->log_writers);
	if (ctx && !ctx->logging_new_name) {
		int index = root->log_transid % 2;
		list_add_tail(&ctx->list, &root->log_ctxs[index]);
		ctx->log_transid = root->log_transid;
	}

out:
	mutex_unlock(&root->log_mutex);
	return ret;
}
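
/*
 * Illustrative usage (assumption, condensed from the fsync path): every
 * successful start_log_trans() must be paired with btrfs_end_log_trans():
 *
 *	ret = start_log_trans(trans, root, ctx);
 *	if (ret)
 *		return ret;
 *	ret = btrfs_log_inode(trans, root, inode, inode_only, ctx);
 *	btrfs_end_log_trans(root);
 */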

/*
 * returns 0 if there was a log transaction running and we were able
 * to join, or returns -ENOENT if there was no transaction
 * in progress
 */
static int join_running_log_trans(struct btrfs_root *root)
{
	int ret = -ENOENT;

	if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state))
		return ret;

	mutex_lock(&root->log_mutex);
	if (root->log_root) {
		ret = 0;
		atomic_inc(&root->log_writers);
	}
	mutex_unlock(&root->log_mutex);
	return ret;
}

/*
 * This either makes the current running log transaction wait
 * until you call btrfs_end_log_trans() or it makes any future
 * log transactions wait until you call btrfs_end_log_trans()
 */
void btrfs_pin_log_trans(struct btrfs_root *root)
{
	atomic_inc(&root->log_writers);
}

/*
 * indicate we're done making changes to the log tree
 * and wake up anyone waiting to do a sync
 */
void btrfs_end_log_trans(struct btrfs_root *root)
{
	if (atomic_dec_and_test(&root->log_writers)) {
		/* atomic_dec_and_test implies a barrier */
		cond_wake_up_nomb(&root->log_writer_wait);
	}
}

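/*
 * Small helpers used while syncing the log: extent buffers for the log tree
 * live in the btree inode's page cache, so writing one back and waiting for
 * it is just writeback over its byte range in that mapping.
 */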
static int btrfs_write_tree_block(struct extent_buffer *buf)
{
	return filemap_fdatawrite_range(buf->pages[0]->mapping, buf->start,
					buf->start + buf->len - 1);
}

static void btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
{
	filemap_fdatawait_range(buf->pages[0]->mapping,
			        buf->start, buf->start + buf->len - 1);
}

/*
 * the walk control struct is used to pass state down the chain when
 * processing the log tree.  The stage field tells us which part
 * of the log tree processing we are currently doing.  The others
 * are state fields used for that specific part
 */
struct walk_control {
	/* should we free the extent on disk when done?  This is used
	 * at transaction commit time while freeing a log tree
	 */
	int free;

	/* should we write out the extent buffer?  This is used
	 * while flushing the log tree to disk during a sync
	 */
	int write;

	/* should we wait for the extent buffer io to finish?  Also used
	 * while flushing the log tree to disk for a sync
	 */
	int wait;

	/* pin only walk, we record which extents on disk belong to the
	 * log trees
	 */
	int pin;

	/* what stage of the replay code we're currently in */
	int stage;

	/*
	 * Ignore any items from the inode currently being processed. Needs
	 * to be set every time we find a BTRFS_INODE_ITEM_KEY and we are in
	 * the LOG_WALK_REPLAY_INODES stage.
	 */
	bool ignore_cur_inode;

	/* the root we are currently replaying */
	struct btrfs_root *replay_dest;

	/* the trans handle for the current replay */
	struct btrfs_trans_handle *trans;

	/* the function that gets used to process blocks we find in the
	 * tree.  Note the extent_buffer might not be up to date when it is
	 * passed in, and it must be checked or read if you need the data
	 * inside it
	 */
	int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
			    struct walk_control *wc, u64 gen, int level);
};

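/*
 * The process_func implementations are chosen by the caller of
 * walk_log_tree(): process_one_buffer() below for the commit-time
 * free/write walks, and replay_one_buffer() (further down in this file)
 * for log replay.
 */
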
/*
 * process_func used to pin down extents, write them or wait on them
 */
static int process_one_buffer(struct btrfs_root *log,
			      struct extent_buffer *eb,
			      struct walk_control *wc, u64 gen, int level)
{
	struct btrfs_fs_info *fs_info = log->fs_info;
	int ret = 0;

	/*
	 * If this fs is mixed then we need to be able to process the leaves to
	 * pin down any logged extents, so we have to read the block.
	 */
	if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
		ret = btrfs_read_buffer(eb, gen, level, NULL);
		if (ret)
			return ret;
	}

	if (wc->pin)
		ret = btrfs_pin_extent_for_log_replay(wc->trans, eb->start,
						      eb->len);

	if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
		if (wc->pin && btrfs_header_level(eb) == 0)
			ret = btrfs_exclude_logged_extents(eb);
		if (wc->write)
			btrfs_write_tree_block(eb);
		if (wc->wait)
			btrfs_wait_tree_block_writeback(eb);
	}
	return ret;
}

/*
 * Item overwrite used by replay and tree logging.  eb, slot and key all refer
 * to the src data we are copying out.
 *
 * root is the tree we are copying into, and path is a scratch
 * path for use in this function (it should be released on entry and
 * will be released on exit).
 *
 * If the key is already in the destination tree the existing item is
 * overwritten.  If the existing item isn't big enough, it is extended.
 * If it is too large, it is truncated.
 *
 * If the key isn't in the destination yet, a new item is inserted.
 */
static noinline int overwrite_item(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct btrfs_path *path,
				   struct extent_buffer *eb, int slot,
				   struct btrfs_key *key)
{
	int ret;
	u32 item_size;
	u64 saved_i_size = 0;
	int save_old_i_size = 0;
	unsigned long src_ptr;
	unsigned long dst_ptr;
	int overwrite_root = 0;
	bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;

	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
		overwrite_root = 1;

	item_size = btrfs_item_size_nr(eb, slot);
	src_ptr = btrfs_item_ptr_offset(eb, slot);

	/* look for the key in the destination tree */
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret < 0)
		return ret;

	if (ret == 0) {
		char *src_copy;
		char *dst_copy;
		u32 dst_size = btrfs_item_size_nr(path->nodes[0],
						  path->slots[0]);
		if (dst_size != item_size)
			goto insert;

		if (item_size == 0) {
			btrfs_release_path(path);
			return 0;
		}
		dst_copy = kmalloc(item_size, GFP_NOFS);
		src_copy = kmalloc(item_size, GFP_NOFS);
		if (!dst_copy || !src_copy) {
			btrfs_release_path(path);
			kfree(dst_copy);
			kfree(src_copy);
			return -ENOMEM;
		}

		read_extent_buffer(eb, src_copy, src_ptr, item_size);

		dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
		read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
				   item_size);
		ret = memcmp(dst_copy, src_copy, item_size);

		kfree(dst_copy);
		kfree(src_copy);
		/*
		 * they have the same contents, just return, this saves
		 * us from cowing blocks in the destination tree and doing
		 * extra writes that may not have been done by a previous
		 * sync
		 */
		if (ret == 0) {
			btrfs_release_path(path);
			return 0;
		}

		/*
		 * We need to load the old nbytes into the inode so when we
		 * replay the extents we've logged we get the right nbytes.
		 */
		if (inode_item) {
			struct btrfs_inode_item *item;
			u64 nbytes;
			u32 mode;

			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
					      struct btrfs_inode_item);
			nbytes = btrfs_inode_nbytes(path->nodes[0], item);
			item = btrfs_item_ptr(eb, slot,
					      struct btrfs_inode_item);
			btrfs_set_inode_nbytes(eb, item, nbytes);

			/*
			 * If this is a directory we need to reset the i_size to
			 * 0 so that we can set it up properly when replaying
			 * the rest of the items in this log.
			 */
			mode = btrfs_inode_mode(eb, item);
			if (S_ISDIR(mode))
				btrfs_set_inode_size(eb, item, 0);
		}
	} else if (inode_item) {
		struct btrfs_inode_item *item;
		u32 mode;

		/*
		 * New inode, set nbytes to 0 so that the nbytes comes out
		 * properly when we replay the extents.
		 */
		item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
		btrfs_set_inode_nbytes(eb, item, 0);

		/*
		 * If this is a directory we need to reset the i_size to 0 so
		 * that we can set it up properly when replaying the rest of
		 * the items in this log.
		 */
		mode = btrfs_inode_mode(eb, item);
		if (S_ISDIR(mode))
			btrfs_set_inode_size(eb, item, 0);
	}
insert:
	btrfs_release_path(path);
	/* try to insert the key into the destination tree */
	path->skip_release_on_error = 1;
	ret = btrfs_insert_empty_item(trans, root, path,
				      key, item_size);
	path->skip_release_on_error = 0;

	/* make sure any existing item is the correct size */
	if (ret == -EEXIST || ret == -EOVERFLOW) {
		u32 found_size;
		found_size = btrfs_item_size_nr(path->nodes[0],
						path->slots[0]);
		if (found_size > item_size)
			btrfs_truncate_item(path, item_size, 1);
		else if (found_size < item_size)
			btrfs_extend_item(path, item_size - found_size);
	} else if (ret) {
		return ret;
	}
	dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
					path->slots[0]);

	/* don't overwrite an existing inode if the generation number
	 * was logged as zero.  This is done when the tree logging code
	 * is just logging an inode to make sure it exists after recovery.
	 *
	 * Also, don't overwrite i_size on directories during replay.
	 * log replay inserts and removes directory items based on the
	 * state of the tree found in the subvolume, and i_size is modified
	 * as it goes
	 */
	if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
		struct btrfs_inode_item *src_item;
		struct btrfs_inode_item *dst_item;

		src_item = (struct btrfs_inode_item *)src_ptr;
		dst_item = (struct btrfs_inode_item *)dst_ptr;

		if (btrfs_inode_generation(eb, src_item) == 0) {
			struct extent_buffer *dst_eb = path->nodes[0];
			const u64 ino_size = btrfs_inode_size(eb, src_item);

			/*
			 * For regular files an ino_size == 0 is used only when
			 * logging that an inode exists, as part of a directory
			 * fsync, and the inode wasn't fsynced before. In this
			 * case don't set the size of the inode in the fs/subvol
			 * tree, otherwise we would be throwing valid data away.
			 */
			if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
			    S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
			    ino_size != 0)
				btrfs_set_inode_size(dst_eb, dst_item, ino_size);
			goto no_copy;
		}

		if (overwrite_root &&
		    S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
		    S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
			save_old_i_size = 1;
			saved_i_size = btrfs_inode_size(path->nodes[0],
							dst_item);
		}
	}

	copy_extent_buffer(path->nodes[0], eb, dst_ptr,
			   src_ptr, item_size);

	if (save_old_i_size) {
		struct btrfs_inode_item *dst_item;
		dst_item = (struct btrfs_inode_item *)dst_ptr;
		btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
	}

	/* make sure the generation is filled in */
	if (key->type == BTRFS_INODE_ITEM_KEY) {
		struct btrfs_inode_item *dst_item;
		dst_item = (struct btrfs_inode_item *)dst_ptr;
		if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
			btrfs_set_inode_generation(path->nodes[0], dst_item,
						   trans->transid);
		}
	}
no_copy:
	btrfs_mark_buffer_dirty(path->nodes[0]);
	btrfs_release_path(path);
	return 0;
}

/*
 * simple helper to read an inode off the disk from a given root.
 * This can only be called for subvolume roots and not for the log.
 * Returns NULL on any failure; callers map that to -EIO or -ENOENT.
 */
static noinline struct inode *read_one_inode(struct btrfs_root *root,
					     u64 objectid)
{
	struct inode *inode;

	inode = btrfs_iget(root->fs_info->sb, objectid, root);
	if (IS_ERR(inode))
		inode = NULL;
	return inode;
}

/* replays a single extent in 'eb' at 'slot' with 'key' into the
 * subvolume 'root'.  path is released on entry and should be released
 * on exit.
 *
 * extents in the log tree have not been allocated out of the extent
 * tree yet.  So, this completes the allocation, taking a reference
 * as required if the extent already exists or creating a new extent
 * if it isn't in the extent allocation tree yet.
 *
 * The extent is inserted into the file, dropping any existing extents
 * from the file that overlap the new one.
 */
static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      struct extent_buffer *eb, int slot,
				      struct btrfs_key *key)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int found_type;
	u64 extent_end;
	u64 start = key->offset;
	u64 nbytes = 0;
	struct btrfs_file_extent_item *item;
	struct inode *inode = NULL;
	unsigned long size;
	int ret = 0;

	item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
	found_type = btrfs_file_extent_type(eb, item);

	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		nbytes = btrfs_file_extent_num_bytes(eb, item);
		extent_end = start + nbytes;

		/*
		 * We don't add to the inode's nbytes if we are prealloc or a
		 * hole.
		 */
		if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
			nbytes = 0;
	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		size = btrfs_file_extent_ram_bytes(eb, item);
		nbytes = btrfs_file_extent_ram_bytes(eb, item);
		extent_end = ALIGN(start + size,
				   fs_info->sectorsize);
	} else {
		ret = 0;
		goto out;
	}

	inode = read_one_inode(root, key->objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	/*
	 * first check to see if we already have this extent in the
	 * file.  This must be done before the btrfs_drop_extents run
	 * so we don't try to drop this extent.
	 */
	ret = btrfs_lookup_file_extent(trans, root, path,
			btrfs_ino(BTRFS_I(inode)), start, 0);

	if (ret == 0 &&
	    (found_type == BTRFS_FILE_EXTENT_REG ||
	     found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
		struct btrfs_file_extent_item cmp1;
		struct btrfs_file_extent_item cmp2;
		struct btrfs_file_extent_item *existing;
		struct extent_buffer *leaf;

		leaf = path->nodes[0];
		existing = btrfs_item_ptr(leaf, path->slots[0],
					  struct btrfs_file_extent_item);

		read_extent_buffer(eb, &cmp1, (unsigned long)item,
				   sizeof(cmp1));
		read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
				   sizeof(cmp2));

		/*
		 * we already have a pointer to this exact extent,
		 * we don't have to do anything
		 */
		if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
			btrfs_release_path(path);
			goto out;
		}
	}
	btrfs_release_path(path);

	/* drop any overlapping extents */
	ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1);
	if (ret)
		goto out;

	if (found_type == BTRFS_FILE_EXTENT_REG ||
	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		u64 offset;
		unsigned long dest_offset;
		struct btrfs_key ins;

		if (btrfs_file_extent_disk_bytenr(eb, item) == 0 &&
		    btrfs_fs_incompat(fs_info, NO_HOLES))
			goto update_inode;

		ret = btrfs_insert_empty_item(trans, root, path, key,
					      sizeof(*item));
		if (ret)
			goto out;
		dest_offset = btrfs_item_ptr_offset(path->nodes[0],
						    path->slots[0]);
		copy_extent_buffer(path->nodes[0], eb, dest_offset,
				(unsigned long)item,  sizeof(*item));

		ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
		ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
		ins.type = BTRFS_EXTENT_ITEM_KEY;
		offset = key->offset - btrfs_file_extent_offset(eb, item);

		/*
		 * Manually record dirty extent, as here we did a shallow
		 * file extent item copy and skipped the normal backref update,
		 * modifying the extent tree all by ourselves.
		 * So we need to manually record the dirty extent for qgroup,
		 * as the owner of the file extent changed from the log tree
		 * (doesn't affect qgroup) to the fs/file tree (affects qgroup)
		 */
		ret = btrfs_qgroup_trace_extent(trans,
				btrfs_file_extent_disk_bytenr(eb, item),
				btrfs_file_extent_disk_num_bytes(eb, item),
				GFP_NOFS);
		if (ret < 0)
			goto out;

		if (ins.objectid > 0) {
			struct btrfs_ref ref = { 0 };
			u64 csum_start;
			u64 csum_end;
			LIST_HEAD(ordered_sums);

			/*
			 * is this extent already allocated in the extent
			 * allocation tree?  If so, just add a reference
			 */
			ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
						ins.offset);
			if (ret < 0) {
				goto out;
			} else if (ret == 0) {
				btrfs_init_generic_ref(&ref,
						BTRFS_ADD_DELAYED_REF,
						ins.objectid, ins.offset, 0);
				btrfs_init_data_ref(&ref,
						root->root_key.objectid,
						key->objectid, offset);
				ret = btrfs_inc_extent_ref(trans, &ref);
				if (ret)
					goto out;
			} else {
				/*
				 * insert the extent pointer in the extent
				 * allocation tree
				 */
				ret = btrfs_alloc_logged_file_extent(trans,
						root->root_key.objectid,
						key->objectid, offset, &ins);
				if (ret)
					goto out;
			}
			btrfs_release_path(path);

			if (btrfs_file_extent_compression(eb, item)) {
				csum_start = ins.objectid;
				csum_end = csum_start + ins.offset;
			} else {
				csum_start = ins.objectid +
					btrfs_file_extent_offset(eb, item);
				csum_end = csum_start +
					btrfs_file_extent_num_bytes(eb, item);
			}

			ret = btrfs_lookup_csums_range(root->log_root,
						csum_start, csum_end - 1,
						&ordered_sums, 0);
			if (ret)
				goto out;
			/*
			 * Now delete all existing csums in the csum root that
			 * cover our range. We do this because we can have an
			 * extent that is completely referenced by one file
			 * extent item and partially referenced by another
			 * file extent item (like after using the clone or
			 * extent_same ioctls). In this case if we end up doing
			 * the replay of the one that partially references the
			 * extent first, and we do not do the csum deletion
			 * below, we can get 2 csum items in the csum tree that
			 * overlap each other. For example, imagine our log has
			 * the two following file extent items:
			 *
			 * key (257 EXTENT_DATA 409600)
			 *     extent data disk byte 12845056 nr 102400
			 *     extent data offset 20480 nr 20480 ram 102400
			 *
			 * key (257 EXTENT_DATA 819200)
			 *     extent data disk byte 12845056 nr 102400
			 *     extent data offset 0 nr 102400 ram 102400
			 *
			 * Where the second one fully references the 100K extent
			 * that starts at disk byte 12845056, and the log tree
			 * has a single csum item that covers the entire range
			 * of the extent:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
			 *
			 * After the first file extent item is replayed, the
			 * csum tree gets the following csum item:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
			 *
			 * Which covers the 20K sub-range starting at offset 20K
			 * of our extent. Now when we replay the second file
			 * extent item, if we do not delete existing csum items
			 * that cover any of its blocks, we end up getting two
			 * csum items in our csum tree that overlap each other:
			 *
			 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
			 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
			 *
			 * Which is a problem, because after this anyone trying
			 * to look up the checksum of any block of our
			 * extent starting at an offset of 40K or higher, will
			 * end up looking at the second csum item only, which
			 * does not contain the checksum for any block starting
			 * at offset 40K or higher of our extent.
			 */
			while (!list_empty(&ordered_sums)) {
				struct btrfs_ordered_sum *sums;
				sums = list_entry(ordered_sums.next,
						struct btrfs_ordered_sum,
						list);
				if (!ret)
					ret = btrfs_del_csums(trans,
							      fs_info->csum_root,
							      sums->bytenr,
							      sums->len);
				if (!ret)
					ret = btrfs_csum_file_blocks(trans,
						fs_info->csum_root, sums);
				list_del(&sums->list);
				kfree(sums);
			}
			if (ret)
				goto out;
		} else {
			btrfs_release_path(path);
		}
	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
		/* inline extents are easy, we just overwrite them */
		ret = overwrite_item(trans, root, path, eb, slot, key);
		if (ret)
			goto out;
	}

	ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start,
						extent_end - start);
	if (ret)
		goto out;

	inode_add_bytes(inode, nbytes);
update_inode:
	ret = btrfs_update_inode(trans, root, inode);
out:
	if (inode)
		iput(inode);
	return ret;
}

/*
 * when cleaning up conflicts between the directory names in the
 * subvolume, directory names in the log and directory names in the
 * inode back references, we may have to unlink inodes from directories.
 *
 * This is a helper function to do the unlink of a specific directory
 * item
 */
static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      struct btrfs_inode *dir,
				      struct btrfs_dir_item *di)
{
	struct inode *inode;
	char *name;
	int name_len;
	struct extent_buffer *leaf;
	struct btrfs_key location;
	int ret;

	leaf = path->nodes[0];

	btrfs_dir_item_key_to_cpu(leaf, di, &location);
	name_len = btrfs_dir_name_len(leaf, di);
	name = kmalloc(name_len, GFP_NOFS);
	if (!name)
		return -ENOMEM;

	read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
	btrfs_release_path(path);

	inode = read_one_inode(root, location.objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	ret = link_to_fixup_dir(trans, root, path, location.objectid);
	if (ret)
		goto out;

	ret = btrfs_unlink_inode(trans, root, dir, BTRFS_I(inode), name,
			name_len);
	if (ret)
		goto out;
	else
		ret = btrfs_run_delayed_items(trans);
out:
	kfree(name);
	iput(inode);
	return ret;
}

/*
 * See if a given name and sequence number found in an inode back reference are
 * already in a directory and correctly point to this inode.
 *
 * Returns: < 0 on error, 0 if the directory entry does not exist and 1 if it
 * exists.
 */
static noinline int inode_in_dir(struct btrfs_root *root,
				 struct btrfs_path *path,
				 u64 dirid, u64 objectid, u64 index,
				 const char *name, int name_len)
{
	struct btrfs_dir_item *di;
	struct btrfs_key location;
	int ret = 0;

	di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
					 index, name, name_len, 0);
	if (IS_ERR(di)) {
		if (PTR_ERR(di) != -ENOENT)
			ret = PTR_ERR(di);
		goto out;
	} else if (di) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
		if (location.objectid != objectid)
			goto out;
	} else {
		goto out;
	}

	btrfs_release_path(path);
	di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
	if (IS_ERR(di)) {
		ret = PTR_ERR(di);
		goto out;
	} else if (di) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
		if (location.objectid == objectid)
			ret = 1;
	}
out:
	btrfs_release_path(path);
	return ret;
}

/*
 * helper function to check a log tree for a named back reference in
 * an inode.  This is used to decide if a back reference that is
 * found in the subvolume conflicts with what we find in the log.
 *
 * inode backreferences may have multiple refs in a single item,
 * during replay we process one reference at a time, and we don't
 * want to delete valid links to a file from the subvolume if that
 * link is also in the log.
 *
 * Returns 1 if the name is in the log, 0 if it is not, and < 0 on error.
 */
static noinline int backref_in_log(struct btrfs_root *log,
				   struct btrfs_key *key,
				   u64 ref_objectid,
				   const char *name, int namelen)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
	if (ret < 0) {
		goto out;
	} else if (ret == 1) {
		ret = 0;
		goto out;
	}

	if (key->type == BTRFS_INODE_EXTREF_KEY)
		ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
						       path->slots[0],
						       ref_objectid,
						       name, namelen);
	else
		ret = !!btrfs_find_name_in_backref(path->nodes[0],
						   path->slots[0],
						   name, namelen);
out:
	btrfs_free_path(path);
	return ret;
}

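/*
 * Resolve conflicts between a ref found in the log and the existing state of
 * the subvolume: any name on @inode or in @dir that would collide with the
 * logged (name, index) pair, and is not itself present in the log, is
 * unlinked. Returns 1 if the ref being replayed points at the root directory
 * (nothing to do), 0 on success and a negative errno on failure.
 */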
static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_path *path,
				  struct btrfs_root *log_root,
				  struct btrfs_inode *dir,
				  struct btrfs_inode *inode,
				  u64 inode_objectid, u64 parent_objectid,
				  u64 ref_index, char *name, int namelen,
				  int *search_done)
{
	int ret;
	char *victim_name;
	int victim_name_len;
	struct extent_buffer *leaf;
	struct btrfs_dir_item *di;
	struct btrfs_key search_key;
	struct btrfs_inode_extref *extref;

again:
	/* Search old style refs */
	search_key.objectid = inode_objectid;
	search_key.type = BTRFS_INODE_REF_KEY;
	search_key.offset = parent_objectid;
	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
	if (ret == 0) {
		struct btrfs_inode_ref *victim_ref;
		unsigned long ptr;
		unsigned long ptr_end;

		leaf = path->nodes[0];

		/* are we trying to overwrite a back ref for the root directory?
		 * if so, just jump out, we're done
		 */
		if (search_key.objectid == search_key.offset)
			return 1;

		/* check all the names in this back reference to see
		 * if they are in the log.  If so, we allow them to stay;
		 * otherwise they must be unlinked as a conflict
		 */
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
		while (ptr < ptr_end) {
			victim_ref = (struct btrfs_inode_ref *)ptr;
			victim_name_len = btrfs_inode_ref_name_len(leaf,
								   victim_ref);
			victim_name = kmalloc(victim_name_len, GFP_NOFS);
			if (!victim_name)
				return -ENOMEM;

			read_extent_buffer(leaf, victim_name,
					   (unsigned long)(victim_ref + 1),
					   victim_name_len);

			ret = backref_in_log(log_root, &search_key,
					     parent_objectid, victim_name,
					     victim_name_len);
			if (ret < 0) {
				kfree(victim_name);
				return ret;
			} else if (!ret) {
				inc_nlink(&inode->vfs_inode);
				btrfs_release_path(path);

				ret = btrfs_unlink_inode(trans, root, dir, inode,
						victim_name, victim_name_len);
				kfree(victim_name);
				if (ret)
					return ret;
				ret = btrfs_run_delayed_items(trans);
				if (ret)
					return ret;
				*search_done = 1;
				goto again;
			}
			kfree(victim_name);

			ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
		}

		/*
		 * NOTE: we have searched the root tree and checked the
		 * corresponding ref, so it does not need to be checked again.
		 */
		*search_done = 1;
	}
	btrfs_release_path(path);

	/* Same search but for extended refs */
	extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
					   inode_objectid, parent_objectid, 0,
					   0);
	if (IS_ERR(extref)) {
		return PTR_ERR(extref);
	} else if (extref) {
		u32 item_size;
		u32 cur_offset = 0;
		unsigned long base;
		struct inode *victim_parent;

		leaf = path->nodes[0];

		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		base = btrfs_item_ptr_offset(leaf, path->slots[0]);

		while (cur_offset < item_size) {
			extref = (struct btrfs_inode_extref *)(base + cur_offset);

			victim_name_len = btrfs_inode_extref_name_len(leaf, extref);

			if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
				goto next;

			victim_name = kmalloc(victim_name_len, GFP_NOFS);
			if (!victim_name)
				return -ENOMEM;
			read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name,
					   victim_name_len);

			search_key.objectid = inode_objectid;
			search_key.type = BTRFS_INODE_EXTREF_KEY;
			search_key.offset = btrfs_extref_hash(parent_objectid,
							      victim_name,
							      victim_name_len);
			ret = backref_in_log(log_root, &search_key,
					     parent_objectid, victim_name,
					     victim_name_len);
			if (ret < 0) {
				kfree(victim_name);
				return ret;
			} else if (!ret) {
				ret = -ENOENT;
				victim_parent = read_one_inode(root,
						parent_objectid);
				if (victim_parent) {
					inc_nlink(&inode->vfs_inode);
					btrfs_release_path(path);

					ret = btrfs_unlink_inode(trans, root,
							BTRFS_I(victim_parent),
							inode,
							victim_name,
							victim_name_len);
					if (!ret)
						ret = btrfs_run_delayed_items(
								  trans);
				}
				iput(victim_parent);
				kfree(victim_name);
				if (ret)
					return ret;
				*search_done = 1;
				goto again;
			}
			kfree(victim_name);
next:
			cur_offset += victim_name_len + sizeof(*extref);
		}
		*search_done = 1;
	}
	btrfs_release_path(path);

	/* look for a conflicting sequence number */
	di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
					 ref_index, name, namelen, 0);
	if (IS_ERR(di)) {
		if (PTR_ERR(di) != -ENOENT)
			return PTR_ERR(di);
	} else if (di) {
		ret = drop_one_dir_item(trans, root, path, dir, di);
		if (ret)
			return ret;
	}
	btrfs_release_path(path);

	/* look for a conflicting name */
	di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
				   name, namelen, 0);
	if (IS_ERR(di)) {
		return PTR_ERR(di);
	} else if (di) {
		ret = drop_one_dir_item(trans, root, path, dir, di);
		if (ret)
			return ret;
	}
	btrfs_release_path(path);

	return 0;
}

static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
			     u32 *namelen, char **name, u64 *index,
			     u64 *parent_objectid)
{
	struct btrfs_inode_extref *extref;

	extref = (struct btrfs_inode_extref *)ref_ptr;

	*namelen = btrfs_inode_extref_name_len(eb, extref);
	*name = kmalloc(*namelen, GFP_NOFS);
	if (*name == NULL)
		return -ENOMEM;

	read_extent_buffer(eb, *name, (unsigned long)&extref->name,
			   *namelen);

	if (index)
		*index = btrfs_inode_extref_index(eb, extref);
	if (parent_objectid)
		*parent_objectid = btrfs_inode_extref_parent(eb, extref);

	return 0;
}

static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
			  u32 *namelen, char **name, u64 *index)
{
	struct btrfs_inode_ref *ref;

	ref = (struct btrfs_inode_ref *)ref_ptr;

	*namelen = btrfs_inode_ref_name_len(eb, ref);
	*name = kmalloc(*namelen, GFP_NOFS);
	if (*name == NULL)
		return -ENOMEM;

	read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);

	if (index)
		*index = btrfs_inode_ref_index(eb, ref);

	return 0;
}

/*
 * Take an inode reference item from the log tree and iterate all names from the
 * inode reference item in the subvolume tree with the same key (if it exists).
 * For any name that is not in the inode reference item from the log tree, do a
 * proper unlink of that name (that is, remove its entry from the inode
 * reference item and both dir index keys).
 */
static int unlink_old_inode_refs(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct btrfs_inode *inode,
				 struct extent_buffer *log_eb,
				 int log_slot,
				 struct btrfs_key *key)
{
	int ret;
	unsigned long ref_ptr;
	unsigned long ref_end;
	struct extent_buffer *eb;

again:
	btrfs_release_path(path);
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret > 0) {
		ret = 0;
		goto out;
	}
	if (ret < 0)
		goto out;

	eb = path->nodes[0];
	ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
	ref_end = ref_ptr + btrfs_item_size_nr(eb, path->slots[0]);
	while (ref_ptr < ref_end) {
		char *name = NULL;
		int namelen;
		u64 parent_id;

		if (key->type == BTRFS_INODE_EXTREF_KEY) {
			ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
						NULL, &parent_id);
		} else {
			parent_id = key->offset;
			ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
					     NULL);
		}
		if (ret)
			goto out;

		if (key->type == BTRFS_INODE_EXTREF_KEY)
			ret = !!btrfs_find_name_in_ext_backref(log_eb, log_slot,
							       parent_id, name,
							       namelen);
		else
			ret = !!btrfs_find_name_in_backref(log_eb, log_slot,
							   name, namelen);

		if (!ret) {
			struct inode *dir;

			btrfs_release_path(path);
			dir = read_one_inode(root, parent_id);
			if (!dir) {
				ret = -ENOENT;
				kfree(name);
				goto out;
			}
			ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
						 inode, name, namelen);
			kfree(name);
			iput(dir);
			/*
			 * Whenever we need to check if a name exists or not, we
			 * check the subvolume tree. So after an unlink we must
			 * run delayed items, so that future checks for a name
			 * during log replay see that the name does not exist
			 * anymore.
			 */
			if (!ret)
				ret = btrfs_run_delayed_items(trans);
			if (ret)
				goto out;
			goto again;
		}

		kfree(name);
		ref_ptr += namelen;
		if (key->type == BTRFS_INODE_EXTREF_KEY)
			ref_ptr += sizeof(struct btrfs_inode_extref);
		else
			ref_ptr += sizeof(struct btrfs_inode_ref);
	}
	ret = 0;
 out:
	btrfs_release_path(path);
	return ret;
}

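/*
 * Check the subvolume tree for an inode ref (or extref) connecting @inode to
 * @dir under the given name. Returns 1 if the ref exists, 0 if it does not
 * and a negative errno on error.
 */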
static int btrfs_inode_ref_exists(struct inode *inode, struct inode *dir,
				  const u8 ref_type, const char *name,
				  const int namelen)
{
	struct btrfs_key key;
	struct btrfs_path *path;
	const u64 parent_id = btrfs_ino(BTRFS_I(dir));
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = btrfs_ino(BTRFS_I(inode));
	key.type = ref_type;
	if (key.type == BTRFS_INODE_REF_KEY)
		key.offset = parent_id;
	else
		key.offset = btrfs_extref_hash(parent_id, name, namelen);

	ret = btrfs_search_slot(NULL, BTRFS_I(inode)->root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = 0;
		goto out;
	}
	if (key.type == BTRFS_INODE_EXTREF_KEY)
		ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
				path->slots[0], parent_id, name, namelen);
	else
		ret = !!btrfs_find_name_in_backref(path->nodes[0], path->slots[0],
						   name, namelen);

out:
	btrfs_free_path(path);
	return ret;
}

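/*
 * Add a link to @inode in @dir during replay. If @dir already has a dentry
 * with the same name pointing to a different inode (one that is in the log
 * but not yet processed, since it has a higher inode number), that dentry is
 * unlinked first so btrfs_add_link() cannot fail with -EEXIST.
 */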
static int add_link(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		    struct inode *dir, struct inode *inode, const char *name,
		    int namelen, u64 ref_index)
{
	struct btrfs_dir_item *dir_item;
	struct btrfs_key key;
	struct btrfs_path *path;
	struct inode *other_inode = NULL;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	dir_item = btrfs_lookup_dir_item(NULL, root, path,
					 btrfs_ino(BTRFS_I(dir)),
					 name, namelen, 0);
	if (!dir_item) {
		btrfs_release_path(path);
		goto add_link;
	} else if (IS_ERR(dir_item)) {
		ret = PTR_ERR(dir_item);
		goto out;
	}

	/*
	 * Our inode's dentry collides with the dentry of another inode which is
	 * in the log but not yet processed since it has a higher inode number.
	 * So delete that other dentry.
	 */
	btrfs_dir_item_key_to_cpu(path->nodes[0], dir_item, &key);
	btrfs_release_path(path);
	other_inode = read_one_inode(root, key.objectid);
	if (!other_inode) {
		ret = -ENOENT;
		goto out;
	}
	ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), BTRFS_I(other_inode),
				 name, namelen);
	if (ret)
		goto out;
	/*
	 * If we dropped the link count to 0, bump it so that later the iput()
	 * on the inode will not free it. We will fixup the link count later.
	 */
	if (other_inode->i_nlink == 0)
		inc_nlink(other_inode);

	ret = btrfs_run_delayed_items(trans);
	if (ret)
		goto out;
add_link:
	ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
			     name, namelen, 0, ref_index);
out:
	iput(other_inode);
	btrfs_free_path(path);

	return ret;
}

/*
 * replay one inode back reference item found in the log tree.
 * eb, slot and key refer to the buffer and key found in the log tree.
 * root is the destination we are replaying into, and path is for temp
 * use by this function.  (it should be released on return).
 */
static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_root *log,
				  struct btrfs_path *path,
				  struct extent_buffer *eb, int slot,
				  struct btrfs_key *key)
{
	struct inode *dir = NULL;
	struct inode *inode = NULL;
	unsigned long ref_ptr;
	unsigned long ref_end;
	char *name = NULL;
	int namelen;
	int ret;
	int search_done = 0;
	int log_ref_ver = 0;
	u64 parent_objectid;
	u64 inode_objectid;
	u64 ref_index = 0;
	int ref_struct_size;

	ref_ptr = btrfs_item_ptr_offset(eb, slot);
	ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);

	if (key->type == BTRFS_INODE_EXTREF_KEY) {
		struct btrfs_inode_extref *r;

		ref_struct_size = sizeof(struct btrfs_inode_extref);
		log_ref_ver = 1;
		r = (struct btrfs_inode_extref *)ref_ptr;
		parent_objectid = btrfs_inode_extref_parent(eb, r);
	} else {
		ref_struct_size = sizeof(struct btrfs_inode_ref);
		parent_objectid = key->offset;
	}
	inode_objectid = key->objectid;

	/*
	 * it is possible that we didn't log all the parent directories
	 * for a given inode.  If we don't find the dir, just don't
	 * copy the back ref in.  The link count fixup code will take
	 * care of the rest
	 */
	dir = read_one_inode(root, parent_objectid);
	if (!dir) {
		ret = -ENOENT;
		goto out;
	}

	inode = read_one_inode(root, inode_objectid);
	if (!inode) {
		ret = -EIO;
		goto out;
	}

	while (ref_ptr < ref_end) {
		if (log_ref_ver) {
			ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
						&ref_index, &parent_objectid);
			/*
			 * parent object can change from one array
			 * item to another.
			 */
			if (!dir)
				dir = read_one_inode(root, parent_objectid);
			if (!dir) {
				ret = -ENOENT;
				goto out;
			}
		} else {
			ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
					     &ref_index);
		}
		if (ret)
			goto out;

		ret = inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)),
				   btrfs_ino(BTRFS_I(inode)), ref_index,
				   name, namelen);
		if (ret < 0) {
			goto out;
		} else if (ret == 0) {
			/*
			 * look for a conflicting back reference in the
			 * metadata. if we find one we have to unlink that name
			 * of the file before we add our new link.  Later on, we
			 * overwrite any existing back reference, and we don't
			 * want to create dangling pointers in the directory.
			 */

			if (!search_done) {
				ret = __add_inode_ref(trans, root, path, log,
						      BTRFS_I(dir),
						      BTRFS_I(inode),
						      inode_objectid,
						      parent_objectid,
						      ref_index, name, namelen,
						      &search_done);
				if (ret) {
					if (ret == 1)
						ret = 0;
					goto out;
				}
			}

			/*
			 * If a reference item already exists for this inode
			 * with the same parent and name, but different index,
			 * drop it and the corresponding directory index entries
			 * from the parent before adding the new reference item
			 * and dir index entries, otherwise we would fail with
			 * -EEXIST returned from btrfs_add_link() below.
			 */
			ret = btrfs_inode_ref_exists(inode, dir, key->type,
						     name, namelen);
			if (ret > 0) {
				ret = btrfs_unlink_inode(trans, root,
							 BTRFS_I(dir),
							 BTRFS_I(inode),
							 name, namelen);
				/*
				 * If we dropped the link count to 0, bump it so
				 * that later the iput() on the inode will not
				 * free it. We will fixup the link count later.
				 */
				if (!ret && inode->i_nlink == 0)
					inc_nlink(inode);
				/*
				 * Whenever we need to check if a name exists or
				 * not, we check the subvolume tree. So after an
				 * unlink we must run delayed items, so that future
				 * checks for a name during log replay see that the
				 * name does not exist anymore.
				 */
				if (!ret)
					ret = btrfs_run_delayed_items(trans);
			}
			if (ret < 0)
				goto out;

			/* insert our name */
			ret = add_link(trans, root, dir, inode, name, namelen,
				       ref_index);
			if (ret)
				goto out;

			btrfs_update_inode(trans, root, inode);
		}
		/* Else, ret == 1, we already have a perfect match, we're done. */

		ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
		kfree(name);
		name = NULL;
		if (log_ref_ver) {
			iput(dir);
			dir = NULL;
		}
	}

	/*
	 * Before we overwrite the inode reference item in the subvolume tree
	 * with the item from the log tree, we must unlink all names from the
	 * parent directory that are in the subvolume's tree inode reference
	 * item, otherwise we end up with an inconsistent subvolume tree where
	 * dir index entries exist for a name but there is no inode reference
	 * item with the same name.
	 */
	ret = unlink_old_inode_refs(trans, root, path, BTRFS_I(inode), eb, slot,
				    key);
	if (ret)
		goto out;

	/* finally write the back reference in the inode */
	ret = overwrite_item(trans, root, path, eb, slot, key);
out:
	btrfs_release_path(path);
	kfree(name);
	iput(dir);
	iput(inode);
	return ret;
}

static int insert_orphan_item(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root, u64 ino)
{
	int ret;

	ret = btrfs_insert_orphan_item(trans, root, ino);
	if (ret == -EEXIST)
		ret = 0;

	return ret;
}

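/*
 * Count the names stored in all extended ref items of @inode. Returns the
 * number of names found or a negative errno (a missing item, -ENOENT, simply
 * ends the count).
 */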
static int count_inode_extrefs(struct btrfs_root *root,
		struct btrfs_inode *inode, struct btrfs_path *path)
{
	int ret = 0;
	int name_len;
	unsigned int nlink = 0;
	u32 item_size;
	u32 cur_offset = 0;
	u64 inode_objectid = btrfs_ino(inode);
	u64 offset = 0;
	unsigned long ptr;
	struct btrfs_inode_extref *extref;
	struct extent_buffer *leaf;

	while (1) {
		ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
					    &extref, &offset);
		if (ret)
			break;

		leaf = path->nodes[0];
		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		cur_offset = 0;

		while (cur_offset < item_size) {
			extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
			name_len = btrfs_inode_extref_name_len(leaf, extref);

			nlink++;

			cur_offset += name_len + sizeof(*extref);
		}

		offset++;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);

	if (ret < 0 && ret != -ENOENT)
		return ret;
	return nlink;
}

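/*
 * Count the names stored in all old-style BTRFS_INODE_REF_KEY items of
 * @inode, walking the key space from offset (u64)-1 down to 0.
 */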
static int count_inode_refs(struct btrfs_root *root,
			struct btrfs_inode *inode, struct btrfs_path *path)
{
	int ret;
	struct btrfs_key key;
	unsigned int nlink = 0;
	unsigned long ptr;
	unsigned long ptr_end;
	int name_len;
	u64 ino = btrfs_ino(inode);

	key.objectid = ino;
	key.type = BTRFS_INODE_REF_KEY;
	key.offset = (u64)-1;

	while (1) {
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0)
			break;
		if (ret > 0) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}
process_slot:
		btrfs_item_key_to_cpu(path->nodes[0], &key,
				      path->slots[0]);
		if (key.objectid != ino ||
		    key.type != BTRFS_INODE_REF_KEY)
			break;
		ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
		ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
						   path->slots[0]);
		while (ptr < ptr_end) {
			struct btrfs_inode_ref *ref;

			ref = (struct btrfs_inode_ref *)ptr;
			name_len = btrfs_inode_ref_name_len(path->nodes[0],
							    ref);
			ptr = (unsigned long)(ref + 1) + name_len;
			nlink++;
		}

		if (key.offset == 0)
			break;
		if (path->slots[0] > 0) {
			path->slots[0]--;
			goto process_slot;
		}
		key.offset--;
		btrfs_release_path(path);
	}
	btrfs_release_path(path);

	return nlink;
}

/*
 * There are a few corners where the link count of the file can't
 * be properly maintained during replay.  So, instead of adding
 * lots of complexity to the log code, we just scan the backrefs
 * for any file that has been through replay.
 *
 * The scan will update the link count on the inode to reflect the
 * number of back refs found.  If it goes down to zero, the iput
 * will free the inode.
 */
static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
					   struct btrfs_root *root,
					   struct inode *inode)
{
	struct btrfs_path *path;
	int ret;
	u64 nlink = 0;
	u64 ino = btrfs_ino(BTRFS_I(inode));

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = count_inode_refs(root, BTRFS_I(inode), path);
	if (ret < 0)
		goto out;

	nlink = ret;

	ret = count_inode_extrefs(root, BTRFS_I(inode), path);
	if (ret < 0)
		goto out;

	nlink += ret;

	ret = 0;

	if (nlink != inode->i_nlink) {
		set_nlink(inode, nlink);
		btrfs_update_inode(trans, root, inode);
	}
	BTRFS_I(inode)->index_cnt = (u64)-1;

	if (inode->i_nlink == 0) {
		if (S_ISDIR(inode->i_mode)) {
			ret = replay_dir_deletes(trans, root, NULL, path,
						 ino, 1);
			if (ret)
				goto out;
		}
		ret = insert_orphan_item(trans, root, ino);
	}

out:
	btrfs_free_path(path);
	return ret;
}

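/*
 * Walk all BTRFS_TREE_LOG_FIXUP_OBJECTID orphan items created during replay
 * and fix the link count of each referenced inode, deleting the fixup item
 * once the inode has been processed.
 */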
static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
					    struct btrfs_root *root,
					    struct btrfs_path *path)
{
	int ret;
	struct btrfs_key key;
	struct inode *inode;

	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
	key.type = BTRFS_ORPHAN_ITEM_KEY;
	key.offset = (u64)-1;
	while (1) {
		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
		if (ret < 0)
			break;

		if (ret == 1) {
			ret = 0;
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}

		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
		if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
		    key.type != BTRFS_ORPHAN_ITEM_KEY)
			break;

		ret = btrfs_del_item(trans, root, path);
		if (ret)
			break;

		btrfs_release_path(path);
		inode = read_one_inode(root, key.offset);
		if (!inode) {
			ret = -EIO;
			break;
		}

		ret = fixup_inode_link_count(trans, root, inode);
		iput(inode);
		if (ret)
			break;

		/*
		 * fixup on a directory may create new entries,
		 * make sure we always look for the highest possible
		 * offset
		 */
		key.offset = (u64)-1;
	}
	btrfs_release_path(path);
	return ret;
}
1831
1832
1833/*
1834 * record a given inode in the fixup dir so we can check its link
1835 * count when replay is done.  The link count is incremented here
1836 * so the inode won't go away until we check it
1837 */
1838static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
1839				      struct btrfs_root *root,
1840				      struct btrfs_path *path,
1841				      u64 objectid)
1842{
1843	struct btrfs_key key;
1844	int ret = 0;
1845	struct inode *inode;
1846
1847	inode = read_one_inode(root, objectid);
1848	if (!inode)
1849		return -EIO;
1850
1851	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1852	key.type = BTRFS_ORPHAN_ITEM_KEY;
1853	key.offset = objectid;
1854
1855	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1856
1857	btrfs_release_path(path);
1858	if (ret == 0) {
1859		if (!inode->i_nlink)
1860			set_nlink(inode, 1);
1861		else
1862			inc_nlink(inode);
1863		ret = btrfs_update_inode(trans, root, inode);
1864	} else if (ret == -EEXIST) {
1865		ret = 0;
1866	}
1867	iput(inode);
1868
1869	return ret;
1870}
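
/*
 * The fixup record inserted above is keyed as follows (illustrative
 * layout, derived from the code above):
 *
 *	(BTRFS_TREE_LOG_FIXUP_OBJECTID, BTRFS_ORPHAN_ITEM_KEY, <inode objectid>)
 *
 * with an empty item body; fixup_inode_link_counts() later deletes these
 * records one by one as it verifies each inode's link count.
 */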
1871
1872/*
1873 * when replaying the log for a directory, we only insert names
1874 * for inodes that actually exist.  This means an fsync on a directory
1875 * does not implicitly fsync all the new files in it
1876 */
1877static noinline int insert_one_name(struct btrfs_trans_handle *trans,
1878				    struct btrfs_root *root,
1879				    u64 dirid, u64 index,
1880				    char *name, int name_len,
1881				    struct btrfs_key *location)
1882{
1883	struct inode *inode;
1884	struct inode *dir;
1885	int ret;
1886
1887	inode = read_one_inode(root, location->objectid);
1888	if (!inode)
1889		return -ENOENT;
1890
1891	dir = read_one_inode(root, dirid);
1892	if (!dir) {
1893		iput(inode);
1894		return -EIO;
1895	}
1896
1897	ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
1898			name_len, 1, index);
1899
1900	/* FIXME, put inode into FIXUP list */
1901
1902	iput(inode);
1903	iput(dir);
1904	return ret;
1905}
1906
1907/*
1908 * take a single entry in a log directory item and replay it into
1909 * the subvolume.
1910 *
1911 * if a conflicting item exists in the subdirectory already,
1912 * the inode it points to is unlinked and put into the link count
1913 * fix up tree.
1914 *
1915 * If a name from the log points to a file or directory that does
1916 * not exist in the FS, it is skipped.  fsyncs on directories
1917 * do not force down inodes inside that directory, just changes to the
1918 * names or unlinks in a directory.
1919 *
1920 * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
1921 * non-existing inode) and 1 if the name was replayed.
1922 */
1923static noinline int replay_one_name(struct btrfs_trans_handle *trans,
1924				    struct btrfs_root *root,
1925				    struct btrfs_path *path,
1926				    struct extent_buffer *eb,
1927				    struct btrfs_dir_item *di,
1928				    struct btrfs_key *key)
1929{
1930	char *name;
1931	int name_len;
1932	struct btrfs_dir_item *dst_di;
1933	struct btrfs_key found_key;
1934	struct btrfs_key log_key;
1935	struct inode *dir;
1936	u8 log_type;
1937	bool exists;
1938	int ret;
1939	bool update_size = (key->type == BTRFS_DIR_INDEX_KEY);
1940	bool name_added = false;
1941
1942	dir = read_one_inode(root, key->objectid);
1943	if (!dir)
1944		return -EIO;
1945
1946	name_len = btrfs_dir_name_len(eb, di);
1947	name = kmalloc(name_len, GFP_NOFS);
1948	if (!name) {
1949		ret = -ENOMEM;
1950		goto out;
1951	}
1952
1953	log_type = btrfs_dir_type(eb, di);
1954	read_extent_buffer(eb, name, (unsigned long)(di + 1),
1955		   name_len);
1956
1957	btrfs_dir_item_key_to_cpu(eb, di, &log_key);
1958	ret = btrfs_lookup_inode(trans, root, path, &log_key, 0);
1959	btrfs_release_path(path);
1960	if (ret < 0)
1961		goto out;
1962	exists = (ret == 0);
1963	ret = 0;
1964
1965	if (key->type == BTRFS_DIR_ITEM_KEY) {
1966		dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
1967				       name, name_len, 1);
1968	} else if (key->type == BTRFS_DIR_INDEX_KEY) {
1969		dst_di = btrfs_lookup_dir_index_item(trans, root, path,
1970						     key->objectid,
1971						     key->offset, name,
1972						     name_len, 1);
1973	} else {
1974		/* Corruption */
1975		ret = -EINVAL;
1976		goto out;
1977	}
1978
1979	if (dst_di == ERR_PTR(-ENOENT))
1980		dst_di = NULL;
1981
1982	if (IS_ERR(dst_di)) {
1983		ret = PTR_ERR(dst_di);
1984		goto out;
1985	} else if (!dst_di) {
1986		/* we need a sequence number to insert, so we only
1987		 * do inserts for the BTRFS_DIR_INDEX_KEY types
1988		 */
1989		if (key->type != BTRFS_DIR_INDEX_KEY)
1990			goto out;
1991		goto insert;
1992	}
1993
1994	btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
1995	/* the existing item matches the logged item */
1996	if (found_key.objectid == log_key.objectid &&
1997	    found_key.type == log_key.type &&
1998	    found_key.offset == log_key.offset &&
1999	    btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
2000		update_size = false;
2001		goto out;
2002	}
2003
2004	/*
2005	 * don't drop the conflicting directory entry if the inode
2006	 * for the new entry doesn't exist
2007	 */
2008	if (!exists)
2009		goto out;
2010
2011	ret = drop_one_dir_item(trans, root, path, BTRFS_I(dir), dst_di);
2012	if (ret)
2013		goto out;
2014
2015	if (key->type == BTRFS_DIR_INDEX_KEY)
2016		goto insert;
2017out:
2018	btrfs_release_path(path);
2019	if (!ret && update_size) {
2020		btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2);
2021		ret = btrfs_update_inode(trans, root, dir);
2022	}
2023	kfree(name);
2024	iput(dir);
2025	if (!ret && name_added)
2026		ret = 1;
2027	return ret;
2028
2029insert:
2030	/*
2031	 * Check if the inode reference exists in the log for the given name,
2032	 * inode and parent inode
2033	 */
2034	found_key.objectid = log_key.objectid;
2035	found_key.type = BTRFS_INODE_REF_KEY;
2036	found_key.offset = key->objectid;
2037	ret = backref_in_log(root->log_root, &found_key, 0, name, name_len);
2038	if (ret < 0) {
2039		goto out;
2040	} else if (ret) {
2041		/* The dentry will be added later. */
2042		ret = 0;
2043		update_size = false;
2044		goto out;
2045	}
2046
2047	found_key.objectid = log_key.objectid;
2048	found_key.type = BTRFS_INODE_EXTREF_KEY;
2049	found_key.offset = key->objectid;
2050	ret = backref_in_log(root->log_root, &found_key, key->objectid, name,
2051			     name_len);
2052	if (ret < 0) {
2053		goto out;
2054	} else if (ret) {
2055		/* The dentry will be added later. */
2056		ret = 0;
2057		update_size = false;
2058		goto out;
2059	}
2060	btrfs_release_path(path);
2061	ret = insert_one_name(trans, root, key->objectid, key->offset,
2062			      name, name_len, &log_key);
2063	if (ret && ret != -ENOENT && ret != -EEXIST)
2064		goto out;
2065	if (!ret)
2066		name_added = true;
2067	update_size = false;
2068	ret = 0;
2069	goto out;
2070}
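
/*
 * Example of the conflict handling above (inode numbers are made up):
 * if the subvolume already has "name" -> inode 258 but the log records
 * "name" -> inode 260, the old entry is removed via drop_one_dir_item()
 * (which parks inode 258 in the fixup dir) and the logged name is then
 * inserted for inode 260, provided inode 260 exists in the subvolume.
 */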
2071
2072/*
2073 * find all the names in a directory item and reconcile them into
2074 * the subvolume.  Only BTRFS_DIR_ITEM_KEY types will have more than
2075 * one name in a directory item, but the same code gets used for
2076 * both directory key types.
2077 */
2078static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
2079					struct btrfs_root *root,
2080					struct btrfs_path *path,
2081					struct extent_buffer *eb, int slot,
2082					struct btrfs_key *key)
2083{
2084	int ret = 0;
2085	u32 item_size = btrfs_item_size_nr(eb, slot);
2086	struct btrfs_dir_item *di;
2087	int name_len;
2088	unsigned long ptr;
2089	unsigned long ptr_end;
2090	struct btrfs_path *fixup_path = NULL;
2091
2092	ptr = btrfs_item_ptr_offset(eb, slot);
2093	ptr_end = ptr + item_size;
2094	while (ptr < ptr_end) {
2095		di = (struct btrfs_dir_item *)ptr;
2096		name_len = btrfs_dir_name_len(eb, di);
2097		ret = replay_one_name(trans, root, path, eb, di, key);
2098		if (ret < 0)
2099			break;
2100		ptr = (unsigned long)(di + 1);
2101		ptr += name_len;
2102
2103		/*
2104		 * If this entry refers to a non-directory (directories cannot
2105		 * have a link count > 1) and it was added in a transaction that
2106		 * was not committed, make sure we fix up the link count of the
2107		 * inode the entry points to. Otherwise something like the
2108		 * following would result in a directory pointing to an inode
2109		 * with a wrong link count that does not account for this dir
2110		 * entry:
2111		 *
2112		 * mkdir testdir
2113		 * touch testdir/foo
2114		 * touch testdir/bar
2115		 * sync
2116		 *
2117		 * ln testdir/bar testdir/bar_link
2118		 * ln testdir/foo testdir/foo_link
2119		 * xfs_io -c "fsync" testdir/bar
2120		 *
2121		 * <power failure>
2122		 *
2123		 * mount fs, log replay happens
2124		 *
2125		 * File foo would remain with a link count of 1 even though it
2126		 * has two entries pointing to it in the directory testdir. This
2127		 * would make it impossible to ever delete the parent directory,
2128		 * as it would result in stale dentries that can never be deleted.
2129		 */
2130		if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) {
2131			struct btrfs_key di_key;
2132
2133			if (!fixup_path) {
2134				fixup_path = btrfs_alloc_path();
2135				if (!fixup_path) {
2136					ret = -ENOMEM;
2137					break;
2138				}
2139			}
2140
2141			btrfs_dir_item_key_to_cpu(eb, di, &di_key);
2142			ret = link_to_fixup_dir(trans, root, fixup_path,
2143						di_key.objectid);
2144			if (ret)
2145				break;
2146		}
2147		ret = 0;
2148	}
2149	btrfs_free_path(fixup_path);
2150	return ret;
2151}
2152
2153/*
2154 * directory replay has two parts.  There are the standard directory
2155 * items in the log copied from the subvolume, and range items
2156 * created in the log while the subvolume was logged.
2157 *
2158 * The range items tell us which parts of the key space the log
2159 * is authoritative for.  During replay, if a key in the subvolume
2160 * directory is in a logged range item, but not actually in the log,
2161 * that means it was deleted from the directory before the fsync
2162 * and should be removed.
2163 */
2164static noinline int find_dir_range(struct btrfs_root *root,
2165				   struct btrfs_path *path,
2166				   u64 dirid, int key_type,
2167				   u64 *start_ret, u64 *end_ret)
2168{
2169	struct btrfs_key key;
2170	u64 found_end;
2171	struct btrfs_dir_log_item *item;
2172	int ret;
2173	int nritems;
2174
2175	if (*start_ret == (u64)-1)
2176		return 1;
2177
2178	key.objectid = dirid;
2179	key.type = key_type;
2180	key.offset = *start_ret;
2181
2182	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2183	if (ret < 0)
2184		goto out;
2185	if (ret > 0) {
2186		if (path->slots[0] == 0)
2187			goto out;
2188		path->slots[0]--;
2189	}
2190	if (ret != 0)
2191		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2192
2193	if (key.type != key_type || key.objectid != dirid) {
2194		ret = 1;
2195		goto next;
2196	}
2197	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2198			      struct btrfs_dir_log_item);
2199	found_end = btrfs_dir_log_end(path->nodes[0], item);
2200
2201	if (*start_ret >= key.offset && *start_ret <= found_end) {
2202		ret = 0;
2203		*start_ret = key.offset;
2204		*end_ret = found_end;
2205		goto out;
2206	}
2207	ret = 1;
2208next:
2209	/* check the next slot in the tree to see if it is a valid item */
2210	nritems = btrfs_header_nritems(path->nodes[0]);
2211	path->slots[0]++;
2212	if (path->slots[0] >= nritems) {
2213		ret = btrfs_next_leaf(root, path);
2214		if (ret)
2215			goto out;
2216	}
2217
2218	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2219
2220	if (key.type != key_type || key.objectid != dirid) {
2221		ret = 1;
2222		goto out;
2223	}
2224	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2225			      struct btrfs_dir_log_item);
2226	found_end = btrfs_dir_log_end(path->nodes[0], item);
2227	*start_ret = key.offset;
2228	*end_ret = found_end;
2229	ret = 0;
2230out:
2231	btrfs_release_path(path);
2232	return ret;
2233}
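
/*
 * Example (values are illustrative): if the log holds a dir range item
 * for dirid 256 covering offsets [0, 100], then a call with *start_ret
 * == 50 returns 0 with *start_ret = 0 and *end_ret = 100; a caller
 * asking about offset 150 gets the next range instead, or 1 if no
 * further range exists.
 */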
2234
2235/*
2236 * this looks for a given directory item in the log.  If the directory
2237 * item is not in the log, the item is removed and the inode it points
2238 * to is unlinked
2239 */
2240static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
2241				      struct btrfs_root *root,
2242				      struct btrfs_root *log,
2243				      struct btrfs_path *path,
2244				      struct btrfs_path *log_path,
2245				      struct inode *dir,
2246				      struct btrfs_key *dir_key)
2247{
2248	int ret;
2249	struct extent_buffer *eb;
2250	int slot;
2251	u32 item_size;
2252	struct btrfs_dir_item *di;
2253	struct btrfs_dir_item *log_di;
2254	int name_len;
2255	unsigned long ptr;
2256	unsigned long ptr_end;
2257	char *name;
2258	struct inode *inode;
2259	struct btrfs_key location;
2260
2261again:
2262	eb = path->nodes[0];
2263	slot = path->slots[0];
2264	item_size = btrfs_item_size_nr(eb, slot);
2265	ptr = btrfs_item_ptr_offset(eb, slot);
2266	ptr_end = ptr + item_size;
2267	while (ptr < ptr_end) {
2268		di = (struct btrfs_dir_item *)ptr;
2269		name_len = btrfs_dir_name_len(eb, di);
2270		name = kmalloc(name_len, GFP_NOFS);
2271		if (!name) {
2272			ret = -ENOMEM;
2273			goto out;
2274		}
2275		read_extent_buffer(eb, name, (unsigned long)(di + 1),
2276				  name_len);
2277		log_di = NULL;
2278		if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) {
2279			log_di = btrfs_lookup_dir_item(trans, log, log_path,
2280						       dir_key->objectid,
2281						       name, name_len, 0);
2282		} else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) {
2283			log_di = btrfs_lookup_dir_index_item(trans, log,
2284						     log_path,
2285						     dir_key->objectid,
2286						     dir_key->offset,
2287						     name, name_len, 0);
2288		}
2289		if (!log_di || log_di == ERR_PTR(-ENOENT)) {
2290			btrfs_dir_item_key_to_cpu(eb, di, &location);
2291			btrfs_release_path(path);
2292			btrfs_release_path(log_path);
2293			inode = read_one_inode(root, location.objectid);
2294			if (!inode) {
2295				kfree(name);
2296				return -EIO;
2297			}
2298
2299			ret = link_to_fixup_dir(trans, root,
2300						path, location.objectid);
2301			if (ret) {
2302				kfree(name);
2303				iput(inode);
2304				goto out;
2305			}
2306
2307			inc_nlink(inode);
2308			ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
2309					BTRFS_I(inode), name, name_len);
2310			if (!ret)
2311				ret = btrfs_run_delayed_items(trans);
2312			kfree(name);
2313			iput(inode);
2314			if (ret)
2315				goto out;
2316
2317			/* there might still be more names under this key,
2318			 * so check and repeat if required
2319			 */
2320			ret = btrfs_search_slot(NULL, root, dir_key, path,
2321						0, 0);
2322			if (ret == 0)
2323				goto again;
2324			ret = 0;
2325			goto out;
2326		} else if (IS_ERR(log_di)) {
2327			kfree(name);
2328			return PTR_ERR(log_di);
2329		}
2330		btrfs_release_path(log_path);
2331		kfree(name);
2332
2333		ptr = (unsigned long)(di + 1);
2334		ptr += name_len;
2335	}
2336	ret = 0;
2337out:
2338	btrfs_release_path(path);
2339	btrfs_release_path(log_path);
2340	return ret;
2341}
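
/*
 * Example (names are illustrative): suppose the subvolume has dentries
 * "a" and "b" under a key range the log is authoritative for, but only
 * "a" appears in the log.  The lookup for "b" fails, so "b" is unlinked
 * here and its inode is parked in the fixup dir so that its link count
 * gets verified once replay finishes.
 */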
2342
2343static int replay_xattr_deletes(struct btrfs_trans_handle *trans,
2344			      struct btrfs_root *root,
2345			      struct btrfs_root *log,
2346			      struct btrfs_path *path,
2347			      const u64 ino)
2348{
2349	struct btrfs_key search_key;
2350	struct btrfs_path *log_path;
2351	int i;
2352	int nritems;
2353	int ret;
2354
2355	log_path = btrfs_alloc_path();
2356	if (!log_path)
2357		return -ENOMEM;
2358
2359	search_key.objectid = ino;
2360	search_key.type = BTRFS_XATTR_ITEM_KEY;
2361	search_key.offset = 0;
2362again:
2363	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
2364	if (ret < 0)
2365		goto out;
2366process_leaf:
2367	nritems = btrfs_header_nritems(path->nodes[0]);
2368	for (i = path->slots[0]; i < nritems; i++) {
2369		struct btrfs_key key;
2370		struct btrfs_dir_item *di;
2371		struct btrfs_dir_item *log_di;
2372		u32 total_size;
2373		u32 cur;
2374
2375		btrfs_item_key_to_cpu(path->nodes[0], &key, i);
2376		if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) {
2377			ret = 0;
2378			goto out;
2379		}
2380
2381		di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item);
2382		total_size = btrfs_item_size_nr(path->nodes[0], i);
2383		cur = 0;
2384		while (cur < total_size) {
2385			u16 name_len = btrfs_dir_name_len(path->nodes[0], di);
2386			u16 data_len = btrfs_dir_data_len(path->nodes[0], di);
2387			u32 this_len = sizeof(*di) + name_len + data_len;
2388			char *name;
2389
2390			name = kmalloc(name_len, GFP_NOFS);
2391			if (!name) {
2392				ret = -ENOMEM;
2393				goto out;
2394			}
2395			read_extent_buffer(path->nodes[0], name,
2396					   (unsigned long)(di + 1), name_len);
2397
2398			log_di = btrfs_lookup_xattr(NULL, log, log_path, ino,
2399						    name, name_len, 0);
2400			btrfs_release_path(log_path);
2401			if (!log_di) {
2402				/* Doesn't exist in log tree, so delete it. */
2403				btrfs_release_path(path);
2404				di = btrfs_lookup_xattr(trans, root, path, ino,
2405							name, name_len, -1);
2406				kfree(name);
2407				if (IS_ERR(di)) {
2408					ret = PTR_ERR(di);
2409					goto out;
2410				}
2411				ASSERT(di);
2412				ret = btrfs_delete_one_dir_name(trans, root,
2413								path, di);
2414				if (ret)
2415					goto out;
2416				btrfs_release_path(path);
2417				search_key = key;
2418				goto again;
2419			}
2420			kfree(name);
2421			if (IS_ERR(log_di)) {
2422				ret = PTR_ERR(log_di);
2423				goto out;
2424			}
2425			cur += this_len;
2426			di = (struct btrfs_dir_item *)((char *)di + this_len);
2427		}
2428	}
2429	ret = btrfs_next_leaf(root, path);
2430	if (ret > 0)
2431		ret = 0;
2432	else if (ret == 0)
2433		goto process_leaf;
2434out:
2435	btrfs_free_path(log_path);
2436	btrfs_release_path(path);
2437	return ret;
2438}
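
/*
 * Illustrative sequence (example only) that depends on the xattr
 * deletion replay above:
 *
 *	setfattr -n user.a -v 1 foo
 *	sync
 *	setfattr -x user.a foo
 *	xfs_io -c "fsync" foo
 *	<power failure>
 *
 * After the crash the subvolume still has user.a but the log does not,
 * so replay_xattr_deletes() removes it, matching the pre-crash state.
 */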
2439
2440
2441/*
2442 * deletion replay happens before we copy any new directory items
2443 * out of the log or out of backreferences from inodes.  It
2444 * scans the log to find ranges of keys that the log is authoritative for,
2445 * and then scans the directory to find items in those ranges that are
2446 * not present in the log.
2447 *
2448 * Anything we don't find in the log is unlinked and removed from the
2449 * directory.
2450 */
2451static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
2452				       struct btrfs_root *root,
2453				       struct btrfs_root *log,
2454				       struct btrfs_path *path,
2455				       u64 dirid, int del_all)
2456{
2457	u64 range_start;
2458	u64 range_end;
2459	int key_type = BTRFS_DIR_LOG_ITEM_KEY;
2460	int ret = 0;
2461	struct btrfs_key dir_key;
2462	struct btrfs_key found_key;
2463	struct btrfs_path *log_path;
2464	struct inode *dir;
2465
2466	dir_key.objectid = dirid;
2467	dir_key.type = BTRFS_DIR_ITEM_KEY;
2468	log_path = btrfs_alloc_path();
2469	if (!log_path)
2470		return -ENOMEM;
2471
2472	dir = read_one_inode(root, dirid);
2473	/* it isn't an error if the inode isn't there; that can happen
2474	 * because we replay the deletes before we copy in the inode item
2475	 * from the log
2476	 */
2477	if (!dir) {
2478		btrfs_free_path(log_path);
2479		return 0;
2480	}
2481again:
2482	range_start = 0;
2483	range_end = 0;
2484	while (1) {
2485		if (del_all)
2486			range_end = (u64)-1;
2487		else {
2488			ret = find_dir_range(log, path, dirid, key_type,
2489					     &range_start, &range_end);
2490			if (ret < 0)
2491				goto out;
2492			else if (ret > 0)
2493				break;
2494		}
2495
2496		dir_key.offset = range_start;
2497		while (1) {
2498			int nritems;
2499			ret = btrfs_search_slot(NULL, root, &dir_key, path,
2500						0, 0);
2501			if (ret < 0)
2502				goto out;
2503
2504			nritems = btrfs_header_nritems(path->nodes[0]);
2505			if (path->slots[0] >= nritems) {
2506				ret = btrfs_next_leaf(root, path);
2507				if (ret == 1)
2508					break;
2509				else if (ret < 0)
2510					goto out;
2511			}
2512			btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2513					      path->slots[0]);
2514			if (found_key.objectid != dirid ||
2515			    found_key.type != dir_key.type)
2516				goto next_type;
2517
2518			if (found_key.offset > range_end)
2519				break;
2520
2521			ret = check_item_in_log(trans, root, log, path,
2522						log_path, dir,
2523						&found_key);
2524			if (ret)
2525				goto out;
2526			if (found_key.offset == (u64)-1)
2527				break;
2528			dir_key.offset = found_key.offset + 1;
2529		}
2530		btrfs_release_path(path);
2531		if (range_end == (u64)-1)
2532			break;
2533		range_start = range_end + 1;
2534	}
2535
2536next_type:
2537	ret = 0;
2538	if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
2539		key_type = BTRFS_DIR_LOG_INDEX_KEY;
2540		dir_key.type = BTRFS_DIR_INDEX_KEY;
2541		btrfs_release_path(path);
2542		goto again;
2543	}
2544out:
2545	btrfs_release_path(path);
2546	btrfs_free_path(log_path);
2547	iput(dir);
2548	return ret;
2549}
2550
2551/*
2552 * the process_func used to replay items from the log tree.  This
2553 * gets called in two different stages.  The first stage just looks
2554 * for inodes and makes sure they are all copied into the subvolume.
2555 *
2556 * The second stage copies all the other item types from the log into
2557 * the subvolume.  The two stage approach is slower, but gets rid of
2558 * lots of complexity around inodes referencing other inodes that exist
2559 * only in the log (references come from either directory items or inode
2560 * back refs).
2561 */
2562static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
2563			     struct walk_control *wc, u64 gen, int level)
2564{
2565	int nritems;
2566	struct btrfs_path *path;
2567	struct btrfs_root *root = wc->replay_dest;
2568	struct btrfs_key key;
2569	int i;
2570	int ret;
2571
2572	ret = btrfs_read_buffer(eb, gen, level, NULL);
2573	if (ret)
2574		return ret;
2575
2576	level = btrfs_header_level(eb);
2577
2578	if (level != 0)
2579		return 0;
2580
2581	path = btrfs_alloc_path();
2582	if (!path)
2583		return -ENOMEM;
2584
2585	nritems = btrfs_header_nritems(eb);
2586	for (i = 0; i < nritems; i++) {
2587		btrfs_item_key_to_cpu(eb, &key, i);
2588
2589		/* inode keys are done during the first stage */
2590		if (key.type == BTRFS_INODE_ITEM_KEY &&
2591		    wc->stage == LOG_WALK_REPLAY_INODES) {
2592			struct btrfs_inode_item *inode_item;
2593			u32 mode;
2594
2595			inode_item = btrfs_item_ptr(eb, i,
2596					    struct btrfs_inode_item);
2597			/*
2598			 * If we have a tmpfile (O_TMPFILE) that got fsync'ed
2599			 * and never got linked before the fsync, skip it, as
2600			 * replaying it is pointless since it would be deleted
2601			 * later. We skip logging tmpfiles, but it's always
2602			 * possible we are replaying a log created with a kernel
2603			 * that used to log tmpfiles.
2604			 */
2605			if (btrfs_inode_nlink(eb, inode_item) == 0) {
2606				wc->ignore_cur_inode = true;
2607				continue;
2608			} else {
2609				wc->ignore_cur_inode = false;
2610			}
2611			ret = replay_xattr_deletes(wc->trans, root, log,
2612						   path, key.objectid);
2613			if (ret)
2614				break;
2615			mode = btrfs_inode_mode(eb, inode_item);
2616			if (S_ISDIR(mode)) {
2617				ret = replay_dir_deletes(wc->trans,
2618					 root, log, path, key.objectid, 0);
2619				if (ret)
2620					break;
2621			}
2622			ret = overwrite_item(wc->trans, root, path,
2623					     eb, i, &key);
2624			if (ret)
2625				break;
2626
2627			/*
2628			 * Before replaying extents, truncate the inode to its
2629			 * size. We need to do it now and not after log replay
2630			 * because before an fsync we can have prealloc extents
2631			 * added beyond the inode's i_size. If we did it after,
2632			 * through orphan cleanup for example, we would drop
2633			 * those prealloc extents just after replaying them.
2634			 */
2635			if (S_ISREG(mode)) {
2636				struct inode *inode;
2637				u64 from;
2638
2639				inode = read_one_inode(root, key.objectid);
2640				if (!inode) {
2641					ret = -EIO;
2642					break;
2643				}
2644				from = ALIGN(i_size_read(inode),
2645					     root->fs_info->sectorsize);
2646				ret = btrfs_drop_extents(wc->trans, root, inode,
2647							 from, (u64)-1, 1);
2648				if (!ret) {
2649					/* Update the inode's nbytes. */
2650					ret = btrfs_update_inode(wc->trans,
2651								 root, inode);
2652				}
2653				iput(inode);
2654				if (ret)
2655					break;
2656			}
2657
2658			ret = link_to_fixup_dir(wc->trans, root,
2659						path, key.objectid);
2660			if (ret)
2661				break;
2662		}
2663
2664		if (wc->ignore_cur_inode)
2665			continue;
2666
2667		if (key.type == BTRFS_DIR_INDEX_KEY &&
2668		    wc->stage == LOG_WALK_REPLAY_DIR_INDEX) {
2669			ret = replay_one_dir_item(wc->trans, root, path,
2670						  eb, i, &key);
2671			if (ret)
2672				break;
2673		}
2674
2675		if (wc->stage < LOG_WALK_REPLAY_ALL)
2676			continue;
2677
2678		/* these keys are simply copied */
2679		if (key.type == BTRFS_XATTR_ITEM_KEY) {
2680			ret = overwrite_item(wc->trans, root, path,
2681					     eb, i, &key);
2682			if (ret)
2683				break;
2684		} else if (key.type == BTRFS_INODE_REF_KEY ||
2685			   key.type == BTRFS_INODE_EXTREF_KEY) {
2686			ret = add_inode_ref(wc->trans, root, log, path,
2687					    eb, i, &key);
2688			if (ret && ret != -ENOENT)
2689				break;
2690			ret = 0;
2691		} else if (key.type == BTRFS_EXTENT_DATA_KEY) {
2692			ret = replay_one_extent(wc->trans, root, path,
2693						eb, i, &key);
2694			if (ret)
2695				break;
2696		} else if (key.type == BTRFS_DIR_ITEM_KEY) {
2697			ret = replay_one_dir_item(wc->trans, root, path,
2698						  eb, i, &key);
2699			if (ret)
2700				break;
2701		}
2702	}
2703	btrfs_free_path(path);
2704	return ret;
2705}
2706
2707/*
2708 * Correctly adjust the reserved bytes occupied by a log tree extent buffer
2709 */
2710static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start)
2711{
2712	struct btrfs_block_group *cache;
2713
2714	cache = btrfs_lookup_block_group(fs_info, start);
2715	if (!cache) {
2716		btrfs_err(fs_info, "unable to find block group for %llu", start);
2717		return;
2718	}
2719
2720	spin_lock(&cache->space_info->lock);
2721	spin_lock(&cache->lock);
2722	cache->reserved -= fs_info->nodesize;
2723	cache->space_info->bytes_reserved -= fs_info->nodesize;
2724	spin_unlock(&cache->lock);
2725	spin_unlock(&cache->space_info->lock);
2726
2727	btrfs_put_block_group(cache);
2728}
2729
2730static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
2731				   struct btrfs_root *root,
2732				   struct btrfs_path *path, int *level,
2733				   struct walk_control *wc)
2734{
2735	struct btrfs_fs_info *fs_info = root->fs_info;
2736	u64 bytenr;
2737	u64 ptr_gen;
2738	struct extent_buffer *next;
2739	struct extent_buffer *cur;
2740	u32 blocksize;
2741	int ret = 0;
2742
2743	while (*level > 0) {
2744		struct btrfs_key first_key;
2745
2746		cur = path->nodes[*level];
2747
2748		WARN_ON(btrfs_header_level(cur) != *level);
2749
2750		if (path->slots[*level] >=
2751		    btrfs_header_nritems(cur))
2752			break;
2753
2754		bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
2755		ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
2756		btrfs_node_key_to_cpu(cur, &first_key, path->slots[*level]);
2757		blocksize = fs_info->nodesize;
2758
2759		next = btrfs_find_create_tree_block(fs_info, bytenr);
2760		if (IS_ERR(next))
2761			return PTR_ERR(next);
2762
2763		if (*level == 1) {
2764			ret = wc->process_func(root, next, wc, ptr_gen,
2765					       *level - 1);
2766			if (ret) {
2767				free_extent_buffer(next);
2768				return ret;
2769			}
2770
2771			path->slots[*level]++;
2772			if (wc->free) {
2773				ret = btrfs_read_buffer(next, ptr_gen,
2774							*level - 1, &first_key);
2775				if (ret) {
2776					free_extent_buffer(next);
2777					return ret;
2778				}
2779
2780				if (trans) {
2781					btrfs_tree_lock(next);
2782					btrfs_set_lock_blocking_write(next);
2783					btrfs_clean_tree_block(next);
2784					btrfs_wait_tree_block_writeback(next);
2785					btrfs_tree_unlock(next);
2786					ret = btrfs_pin_reserved_extent(trans,
2787							bytenr, blocksize);
2788					if (ret) {
2789						free_extent_buffer(next);
2790						return ret;
2791					}
2792				} else {
2793					if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
2794						clear_extent_buffer_dirty(next);
2795					unaccount_log_buffer(fs_info, bytenr);
2796				}
2797			}
2798			free_extent_buffer(next);
2799			continue;
2800		}
2801		ret = btrfs_read_buffer(next, ptr_gen, *level - 1, &first_key);
2802		if (ret) {
2803			free_extent_buffer(next);
2804			return ret;
2805		}
2806
2807		if (path->nodes[*level-1])
2808			free_extent_buffer(path->nodes[*level-1]);
2809		path->nodes[*level-1] = next;
2810		*level = btrfs_header_level(next);
2811		path->slots[*level] = 0;
2812		cond_resched();
2813	}
2814	path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);
2815
2816	cond_resched();
2817	return 0;
2818}
2819
2820static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
2821				 struct btrfs_root *root,
2822				 struct btrfs_path *path, int *level,
2823				 struct walk_control *wc)
2824{
2825	struct btrfs_fs_info *fs_info = root->fs_info;
2826	int i;
2827	int slot;
2828	int ret;
2829
2830	for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
2831		slot = path->slots[i];
2832		if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
2833			path->slots[i]++;
2834			*level = i;
2835			WARN_ON(*level == 0);
2836			return 0;
2837		} else {
2838			ret = wc->process_func(root, path->nodes[*level], wc,
2839				 btrfs_header_generation(path->nodes[*level]),
2840				 *level);
2841			if (ret)
2842				return ret;
2843
2844			if (wc->free) {
2845				struct extent_buffer *next;
2846
2847				next = path->nodes[*level];
2848
2849				if (trans) {
2850					btrfs_tree_lock(next);
2851					btrfs_set_lock_blocking_write(next);
2852					btrfs_clean_tree_block(next);
2853					btrfs_wait_tree_block_writeback(next);
2854					btrfs_tree_unlock(next);
2855					ret = btrfs_pin_reserved_extent(trans,
2856						     path->nodes[*level]->start,
2857						     path->nodes[*level]->len);
2858					if (ret)
2859						return ret;
2860				} else {
2861					if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
2862						clear_extent_buffer_dirty(next);
2863
2864					unaccount_log_buffer(fs_info,
2865						path->nodes[*level]->start);
2866				}
2867			}
2868			free_extent_buffer(path->nodes[*level]);
2869			path->nodes[*level] = NULL;
2870			*level = i + 1;
2871		}
2872	}
2873	return 1;
2874}
2875
2876/*
2877 * drop the reference count on the tree rooted at 'log'.  This traverses
2878 * the tree freeing any blocks that have a ref count of zero after being
2879 * decremented.
2880 */
2881static int walk_log_tree(struct btrfs_trans_handle *trans,
2882			 struct btrfs_root *log, struct walk_control *wc)
2883{
2884	struct btrfs_fs_info *fs_info = log->fs_info;
2885	int ret = 0;
2886	int wret;
2887	int level;
2888	struct btrfs_path *path;
2889	int orig_level;
2890
2891	path = btrfs_alloc_path();
2892	if (!path)
2893		return -ENOMEM;
2894
2895	level = btrfs_header_level(log->node);
2896	orig_level = level;
2897	path->nodes[level] = log->node;
2898	atomic_inc(&log->node->refs);
2899	path->slots[level] = 0;
2900
2901	while (1) {
2902		wret = walk_down_log_tree(trans, log, path, &level, wc);
2903		if (wret > 0)
2904			break;
2905		if (wret < 0) {
2906			ret = wret;
2907			goto out;
2908		}
2909
2910		wret = walk_up_log_tree(trans, log, path, &level, wc);
2911		if (wret > 0)
2912			break;
2913		if (wret < 0) {
2914			ret = wret;
2915			goto out;
2916		}
2917	}
2918
2919	/* was the root node processed? if not, catch it here */
2920	if (path->nodes[orig_level]) {
2921		ret = wc->process_func(log, path->nodes[orig_level], wc,
2922			 btrfs_header_generation(path->nodes[orig_level]),
2923			 orig_level);
2924		if (ret)
2925			goto out;
2926		if (wc->free) {
2927			struct extent_buffer *next;
2928
2929			next = path->nodes[orig_level];
2930
2931			if (trans) {
2932				btrfs_tree_lock(next);
2933				btrfs_set_lock_blocking_write(next);
2934				btrfs_clean_tree_block(next);
2935				btrfs_wait_tree_block_writeback(next);
2936				btrfs_tree_unlock(next);
2937				ret = btrfs_pin_reserved_extent(trans,
2938						next->start, next->len);
2939				if (ret)
2940					goto out;
2941			} else {
2942				if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
2943					clear_extent_buffer_dirty(next);
2944				unaccount_log_buffer(fs_info, next->start);
2945			}
2946		}
2947	}
2948
2949out:
2950	btrfs_free_path(path);
2951	return ret;
2952}
2953
2954/*
2955 * helper function to update the item for a given subvolumes log root
2956 * in the tree of log roots
2957 */
2958static int update_log_root(struct btrfs_trans_handle *trans,
2959			   struct btrfs_root *log,
2960			   struct btrfs_root_item *root_item)
2961{
2962	struct btrfs_fs_info *fs_info = log->fs_info;
2963	int ret;
2964
2965	if (log->log_transid == 1) {
2966		/* insert root item on the first sync */
2967		ret = btrfs_insert_root(trans, fs_info->log_root_tree,
2968				&log->root_key, root_item);
2969	} else {
2970		ret = btrfs_update_root(trans, fs_info->log_root_tree,
2971				&log->root_key, root_item);
2972	}
2973	return ret;
2974}
2975
2976static void wait_log_commit(struct btrfs_root *root, int transid)
2977{
2978	DEFINE_WAIT(wait);
2979	int index = transid % 2;
2980
2981	/*
2982	 * we only allow two pending log transactions at a time,
2983	 * so we know that if ours is more than 2 older than the
2984	 * current transaction, we're done
2985	 */
2986	for (;;) {
2987		prepare_to_wait(&root->log_commit_wait[index],
2988				&wait, TASK_UNINTERRUPTIBLE);
2989
2990		if (!(root->log_transid_committed < transid &&
2991		      atomic_read(&root->log_commit[index])))
2992			break;
2993
2994		mutex_unlock(&root->log_mutex);
2995		schedule();
2996		mutex_lock(&root->log_mutex);
2997	}
2998	finish_wait(&root->log_commit_wait[index], &wait);
2999}
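
/*
 * Example (transids are illustrative): a commit of log transid 7 uses
 * slot 7 % 2 == 1, while slot 0 may still belong to transid 6.  The loop
 * above therefore rechecks both log_transid_committed and the slot's
 * log_commit flag after every wakeup, since the slot is recycled once
 * transid 8 starts.
 */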
3000
3001static void wait_for_writer(struct btrfs_root *root)
3002{
3003	DEFINE_WAIT(wait);
3004
3005	for (;;) {
3006		prepare_to_wait(&root->log_writer_wait, &wait,
3007				TASK_UNINTERRUPTIBLE);
3008		if (!atomic_read(&root->log_writers))
3009			break;
3010
3011		mutex_unlock(&root->log_mutex);
3012		schedule();
3013		mutex_lock(&root->log_mutex);
3014	}
3015	finish_wait(&root->log_writer_wait, &wait);
3016}
3017
3018static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
3019					struct btrfs_log_ctx *ctx)
3020{
3021	if (!ctx)
3022		return;
3023
3024	mutex_lock(&root->log_mutex);
3025	list_del_init(&ctx->list);
3026	mutex_unlock(&root->log_mutex);
3027}
3028
3029/*
3030 * Invoked in log mutex context, unless the caller is sure no other task
3031 * can access the list.
3032 */
3033static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
3034					     int index, int error)
3035{
3036	struct btrfs_log_ctx *ctx;
3037	struct btrfs_log_ctx *safe;
3038
3039	list_for_each_entry_safe(ctx, safe, &root->log_ctxs[index], list) {
3040		list_del_init(&ctx->list);
3041		ctx->log_ret = error;
3042	}
3043
3044	INIT_LIST_HEAD(&root->log_ctxs[index]);
3045}
3046
3047/*
3048 * btrfs_sync_log sends a given tree log down to the disk and
3049 * updates the super blocks to record it.  When this call is done,
3050 * you know that any inodes previously logged are safely on disk only
3051 * if it returns 0.
3052 *
3053 * Any other return value means you need to call btrfs_commit_transaction.
3054 * Some of the edge cases for fsyncing directories that have had unlinks
3055 * or renames done in the past mean that sometimes the only safe
3056 * fsync is to commit the whole FS.  When btrfs_sync_log returns -EAGAIN,
3057 * that has happened.
3058 */
3059int btrfs_sync_log(struct btrfs_trans_handle *trans,
3060		   struct btrfs_root *root, struct btrfs_log_ctx *ctx)
3061{
3062	int index1;
3063	int index2;
3064	int mark;
3065	int ret;
3066	struct btrfs_fs_info *fs_info = root->fs_info;
3067	struct btrfs_root *log = root->log_root;
3068	struct btrfs_root *log_root_tree = fs_info->log_root_tree;
3069	struct btrfs_root_item new_root_item;
3070	int log_transid = 0;
3071	struct btrfs_log_ctx root_log_ctx;
3072	struct blk_plug plug;
3073
3074	mutex_lock(&root->log_mutex);
3075	log_transid = ctx->log_transid;
3076	if (root->log_transid_committed >= log_transid) {
3077		mutex_unlock(&root->log_mutex);
3078		return ctx->log_ret;
3079	}
3080
3081	index1 = log_transid % 2;
3082	if (atomic_read(&root->log_commit[index1])) {
3083		wait_log_commit(root, log_transid);
3084		mutex_unlock(&root->log_mutex);
3085		return ctx->log_ret;
3086	}
3087	ASSERT(log_transid == root->log_transid);
3088	atomic_set(&root->log_commit[index1], 1);
3089
3090	/* wait for previous tree log sync to complete */
3091	if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
3092		wait_log_commit(root, log_transid - 1);
3093
3094	while (1) {
3095		int batch = atomic_read(&root->log_batch);
3096		/* when we're on an ssd, just kick the log commit out */
3097		if (!btrfs_test_opt(fs_info, SSD) &&
3098		    test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) {
3099			mutex_unlock(&root->log_mutex);
3100			schedule_timeout_uninterruptible(1);
3101			mutex_lock(&root->log_mutex);
3102		}
3103		wait_for_writer(root);
3104		if (batch == atomic_read(&root->log_batch))
3105			break;
3106	}
3107
3108	/* bail out if we need to do a full commit */
3109	if (btrfs_need_log_full_commit(trans)) {
3110		ret = -EAGAIN;
3111		mutex_unlock(&root->log_mutex);
3112		goto out;
3113	}
3114
3115	if (log_transid % 2 == 0)
3116		mark = EXTENT_DIRTY;
3117	else
3118		mark = EXTENT_NEW;
3119
3120	/* we start IO on all the marked extents here, but we don't actually
3121	 * wait for them until later.
3122	 */
3123	blk_start_plug(&plug);
3124	ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark);
3125	if (ret) {
3126		blk_finish_plug(&plug);
3127		btrfs_abort_transaction(trans, ret);
3128		btrfs_set_log_full_commit(trans);
3129		mutex_unlock(&root->log_mutex);
3130		goto out;
3131	}
3132
3133	/*
3134	 * We _must_ update under the root->log_mutex in order to make sure we
3135	 * have a consistent view of the log root we are trying to commit at
3136	 * this moment.
3137	 *
3138	 * We _must_ copy this into a local copy, because we are not holding the
3139	 * log_root_tree->log_mutex yet.  This is important because when we
3140	 * commit the log_root_tree we must have a consistent view of the
3141	 * log_root_tree when we update the super block to point at the
3142	 * log_root_tree bytenr.  If we update the log_root_tree here we'll race
3143	 * with the commit and possibly point at the new block which we may not
3144	 * have written out.
3145	 */
3146	btrfs_set_root_node(&log->root_item, log->node);
3147	memcpy(&new_root_item, &log->root_item, sizeof(new_root_item));
3148
3149	root->log_transid++;
3150	log->log_transid = root->log_transid;
3151	root->log_start_pid = 0;
3152	/*
3153	 * IO has been started and blocks of the log tree have the WRITTEN flag
3154	 * set in their headers. New modifications of the log will be written
3155	 * to new positions, so it's safe to allow log writers to go in.
3156	 */
3157	mutex_unlock(&root->log_mutex);
3158
3159	btrfs_init_log_ctx(&root_log_ctx, NULL);
3160
3161	mutex_lock(&log_root_tree->log_mutex);
3162
3163	index2 = log_root_tree->log_transid % 2;
3164	list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
3165	root_log_ctx.log_transid = log_root_tree->log_transid;
3166
3167	/*
3168	 * Now we are safe to update the log_root_tree because we're under the
3169	 * log_mutex, and we're a current writer so we're holding the commit
3170	 * open until we drop the log_mutex.
3171	 */
3172	ret = update_log_root(trans, log, &new_root_item);
3173	if (ret) {
3174		if (!list_empty(&root_log_ctx.list))
3175			list_del_init(&root_log_ctx.list);
3176
3177		blk_finish_plug(&plug);
3178		btrfs_set_log_full_commit(trans);
3179
3180		if (ret != -ENOSPC) {
3181			btrfs_abort_transaction(trans, ret);
3182			mutex_unlock(&log_root_tree->log_mutex);
3183			goto out;
3184		}
3185		btrfs_wait_tree_log_extents(log, mark);
3186		mutex_unlock(&log_root_tree->log_mutex);
3187		ret = -EAGAIN;
3188		goto out;
3189	}
3190
3191	if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) {
3192		blk_finish_plug(&plug);
3193		list_del_init(&root_log_ctx.list);
3194		mutex_unlock(&log_root_tree->log_mutex);
3195		ret = root_log_ctx.log_ret;
3196		goto out;
3197	}
3198
3199	index2 = root_log_ctx.log_transid % 2;
3200	if (atomic_read(&log_root_tree->log_commit[index2])) {
3201		blk_finish_plug(&plug);
3202		ret = btrfs_wait_tree_log_extents(log, mark);
3203		wait_log_commit(log_root_tree,
3204				root_log_ctx.log_transid);
3205		mutex_unlock(&log_root_tree->log_mutex);
3206		if (!ret)
3207			ret = root_log_ctx.log_ret;
3208		goto out;
3209	}
3210	ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid);
3211	atomic_set(&log_root_tree->log_commit[index2], 1);
3212
3213	if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
3214		wait_log_commit(log_root_tree,
3215				root_log_ctx.log_transid - 1);
3216	}
3217
3218	/*
3219	 * now that we've moved on to the tree of log tree roots,
3220	 * check the full commit flag again
3221	 */
3222	if (btrfs_need_log_full_commit(trans)) {
3223		blk_finish_plug(&plug);
3224		btrfs_wait_tree_log_extents(log, mark);
3225		mutex_unlock(&log_root_tree->log_mutex);
3226		ret = -EAGAIN;
3227		goto out_wake_log_root;
3228	}
3229
3230	ret = btrfs_write_marked_extents(fs_info,
3231					 &log_root_tree->dirty_log_pages,
3232					 EXTENT_DIRTY | EXTENT_NEW);
3233	blk_finish_plug(&plug);
3234	if (ret) {
3235		btrfs_set_log_full_commit(trans);
3236		btrfs_abort_transaction(trans, ret);
3237		mutex_unlock(&log_root_tree->log_mutex);
3238		goto out_wake_log_root;
3239	}
3240	ret = btrfs_wait_tree_log_extents(log, mark);
3241	if (!ret)
3242		ret = btrfs_wait_tree_log_extents(log_root_tree,
3243						  EXTENT_NEW | EXTENT_DIRTY);
3244	if (ret) {
3245		btrfs_set_log_full_commit(trans);
3246		mutex_unlock(&log_root_tree->log_mutex);
3247		goto out_wake_log_root;
3248	}
3249
3250	btrfs_set_super_log_root(fs_info->super_for_commit,
3251				 log_root_tree->node->start);
3252	btrfs_set_super_log_root_level(fs_info->super_for_commit,
3253				       btrfs_header_level(log_root_tree->node));
3254
3255	log_root_tree->log_transid++;
3256	mutex_unlock(&log_root_tree->log_mutex);
3257
3258	/*
3259	 * Nobody else is going to jump in and write the ctree
3260	 * super here because the log_commit atomic below is protecting
3261	 * us.  We must be called with a transaction handle pinning
3262	 * the running transaction open, so a full commit can't hop
3263	 * in and cause problems either.
3264	 */
3265	ret = write_all_supers(fs_info, 1);
3266	if (ret) {
3267		btrfs_set_log_full_commit(trans);
3268		btrfs_abort_transaction(trans, ret);
3269		goto out_wake_log_root;
3270	}
3271
3272	mutex_lock(&root->log_mutex);
3273	if (root->last_log_commit < log_transid)
3274		root->last_log_commit = log_transid;
3275	mutex_unlock(&root->log_mutex);
3276
3277out_wake_log_root:
3278	mutex_lock(&log_root_tree->log_mutex);
3279	btrfs_remove_all_log_ctxs(log_root_tree, index2, ret);
3280
3281	log_root_tree->log_transid_committed++;
3282	atomic_set(&log_root_tree->log_commit[index2], 0);
3283	mutex_unlock(&log_root_tree->log_mutex);
3284
3285	/*
3286	 * The barrier before waitqueue_active (in cond_wake_up) is needed so
3287	 * all the updates above are seen by the woken threads. It might not be
3288	 * necessary, but proving that seems to be hard.
3289	 */
3290	cond_wake_up(&log_root_tree->log_commit_wait[index2]);
3291out:
3292	mutex_lock(&root->log_mutex);
3293	btrfs_remove_all_log_ctxs(root, index1, ret);
3294	root->log_transid_committed++;
3295	atomic_set(&root->log_commit[index1], 0);
3296	mutex_unlock(&root->log_mutex);
3297
3298	/*
3299	 * The barrier before waitqueue_active (in cond_wake_up) is needed so
3300	 * all the updates above are seen by the woken threads. It might not be
3301	 * necessary, but proving that seems to be hard.
3302	 */
3303	cond_wake_up(&root->log_commit_wait[index1]);
3304	return ret;
3305}
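
/*
 * Caller-side sketch (simplified; the real fsync path takes more into
 * account) of how the return value is consumed:
 *
 *	ret = btrfs_sync_log(trans, root, &ctx);
 *	if (ret == 0)
 *		ret = btrfs_end_transaction(trans);
 *	else
 *		ret = btrfs_commit_transaction(trans);
 *
 * i.e. a zero return means the log alone made the fsync durable, while
 * anything else falls back to committing the whole transaction.
 */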
3306
3307static void free_log_tree(struct btrfs_trans_handle *trans,
3308			  struct btrfs_root *log)
3309{
3310	int ret;
3311	struct walk_control wc = {
3312		.free = 1,
3313		.process_func = process_one_buffer
3314	};
3315
3316	ret = walk_log_tree(trans, log, &wc);
3317	if (ret) {
3318		if (trans)
3319			btrfs_abort_transaction(trans, ret);
3320		else
3321			btrfs_handle_fs_error(log->fs_info, ret, NULL);
3322	}
3323
3324	clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1,
3325			  EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT);
3326	extent_io_tree_release(&log->log_csum_range);
3327	btrfs_put_root(log);
3328}
3329
3330/*
3331 * free all the extents used by the tree log.  This should be called
3332 * at commit time of the full transaction
3333 */
3334int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
3335{
3336	if (root->log_root) {
3337		free_log_tree(trans, root->log_root);
3338		root->log_root = NULL;
3339		clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
3340	}
3341	return 0;
3342}
3343
3344int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
3345			     struct btrfs_fs_info *fs_info)
3346{
3347	if (fs_info->log_root_tree) {
3348		free_log_tree(trans, fs_info->log_root_tree);
3349		fs_info->log_root_tree = NULL;
3350	}
3351	return 0;
3352}
3353
3354/*
3355 * Check if an inode was logged in the current transaction. We can't always rely
3356 * on an inode's logged_trans value, because it's an in-memory only field and
3357 * therefore not persisted. This means that its value is lost if the inode gets
3358 * evicted and loaded again from disk (in which case it has a value of 0, and
3359 * certainly it is smaller than any possible transaction ID). When that happens
3360 * the full_sync flag is set in the inode's runtime flags, so in that case we
3361 * assume eviction happened and ignore the logged_trans value, assuming the
3362 * worst case, that the inode was logged before in the current transaction.
3363 */
3364static bool inode_logged(struct btrfs_trans_handle *trans,
3365			 struct btrfs_inode *inode)
3366{
3367	if (inode->logged_trans == trans->transid)
3368		return true;
3369
3370	if (inode->last_trans == trans->transid &&
3371	    test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) &&
3372	    !test_bit(BTRFS_FS_LOG_RECOVERING, &trans->fs_info->flags))
3373		return true;
3374
3375	return false;
3376}
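
/*
 * Example (transid values are illustrative): an inode fsynced in
 * transaction 100 and then evicted while that transaction is still open
 * comes back with logged_trans == 0, but with last_trans == 100 and the
 * full_sync flag set, so the check above still reports it as possibly
 * logged instead of risking a missed log update.
 */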
3377
3378/*
3379 * If both a file and directory are logged, and unlinks or renames are
3380 * mixed in, we have a few interesting corners:
3381 *
3382 * create file X in dir Y
3383 * link file X to X.link in dir Y
3384 * fsync file X
3385 * unlink file X but leave X.link
3386 * fsync dir Y
3387 *
3388 * After a crash we would expect only X.link to exist.  But file X
3389 * didn't get fsync'd again so the log has back refs for X and X.link.
3390 *
3391 * We solve this by removing directory entries and inode backrefs from the
3392 * log when a file that was logged in the current transaction is
3393 * unlinked.  Any later fsync will include the updated log entries, and
3394 * we'll be able to reconstruct the proper directory items from backrefs.
3395 *
3396 * This optimization allows us to avoid relogging the entire inode
3397 * or the entire directory.
3398 */
3399int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
3400				 struct btrfs_root *root,
3401				 const char *name, int name_len,
3402				 struct btrfs_inode *dir, u64 index)
3403{
3404	struct btrfs_root *log;
3405	struct btrfs_dir_item *di;
3406	struct btrfs_path *path;
3407	int ret;
3408	int err = 0;
3409	int bytes_del = 0;
3410	u64 dir_ino = btrfs_ino(dir);
3411
3412	if (!inode_logged(trans, dir))
3413		return 0;
3414
3415	ret = join_running_log_trans(root);
3416	if (ret)
3417		return 0;
3418
3419	mutex_lock(&dir->log_mutex);
3420
3421	log = root->log_root;
3422	path = btrfs_alloc_path();
3423	if (!path) {
3424		err = -ENOMEM;
3425		goto out_unlock;
3426	}
3427
3428	di = btrfs_lookup_dir_item(trans, log, path, dir_ino,
3429				   name, name_len, -1);
3430	if (IS_ERR(di)) {
3431		err = PTR_ERR(di);
3432		goto fail;
3433	}
3434	if (di) {
3435		ret = btrfs_delete_one_dir_name(trans, log, path, di);
3436		bytes_del += name_len;
3437		if (ret) {
3438			err = ret;
3439			goto fail;
3440		}
3441	}
3442	btrfs_release_path(path);
3443	di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
3444					 index, name, name_len, -1);
3445	if (IS_ERR(di)) {
3446		err = PTR_ERR(di);
3447		goto fail;
3448	}
3449	if (di) {
3450		ret = btrfs_delete_one_dir_name(trans, log, path, di);
3451		bytes_del += name_len;
3452		if (ret) {
3453			err = ret;
3454			goto fail;
3455		}
3456	}
3457
3458	/* update the directory size in the log to reflect the names
3459	 * we have removed
3460	 */
3461	if (bytes_del) {
3462		struct btrfs_key key;
3463
3464		key.objectid = dir_ino;
3465		key.offset = 0;
3466		key.type = BTRFS_INODE_ITEM_KEY;
3467		btrfs_release_path(path);
3468
3469		ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
3470		if (ret < 0) {
3471			err = ret;
3472			goto fail;
3473		}
3474		if (ret == 0) {
3475			struct btrfs_inode_item *item;
3476			u64 i_size;
3477
3478			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3479					      struct btrfs_inode_item);
3480			i_size = btrfs_inode_size(path->nodes[0], item);
3481			if (i_size > bytes_del)
3482				i_size -= bytes_del;
3483			else
3484				i_size = 0;
3485			btrfs_set_inode_size(path->nodes[0], item, i_size);
3486			btrfs_mark_buffer_dirty(path->nodes[0]);
3487		} else
3488			ret = 0;
3489		btrfs_release_path(path);
3490	}
3491fail:
3492	btrfs_free_path(path);
3493out_unlock:
3494	mutex_unlock(&dir->log_mutex);
3495	if (err == -ENOSPC) {
3496		btrfs_set_log_full_commit(trans);
3497		err = 0;
3498	} else if (err < 0 && err != -ENOENT) {
3499		/* ENOENT can be returned if the entry hasn't been fsynced yet */
3500		btrfs_abort_transaction(trans, err);
3501	}
3502
3503	btrfs_end_log_trans(root);
3504
3505	return err;
3506}
3507
3508/* see comments for btrfs_del_dir_entries_in_log */
3509int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
3510			       struct btrfs_root *root,
3511			       const char *name, int name_len,
3512			       struct btrfs_inode *inode, u64 dirid)
3513{
3514	struct btrfs_root *log;
3515	u64 index;
3516	int ret;
3517
3518	if (!inode_logged(trans, inode))
3519		return 0;
3520
3521	ret = join_running_log_trans(root);
3522	if (ret)
3523		return 0;
3524	log = root->log_root;
3525	mutex_lock(&inode->log_mutex);
3526
3527	ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode),
3528				  dirid, &index);
3529	mutex_unlock(&inode->log_mutex);
3530	if (ret == -ENOSPC) {
3531		btrfs_set_log_full_commit(trans);
3532		ret = 0;
3533	} else if (ret < 0 && ret != -ENOENT)
3534		btrfs_abort_transaction(trans, ret);
3535	btrfs_end_log_trans(root);
3536
3537	return ret;
3538}
3539
3540/*
3541 * creates a range item in the log for 'dirid'.  first_offset and
3542 * last_offset tell us which parts of the key space the log should
3543 * be considered authoritative for.
3544 */
3545static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
3546				       struct btrfs_root *log,
3547				       struct btrfs_path *path,
3548				       int key_type, u64 dirid,
3549				       u64 first_offset, u64 last_offset)
3550{
3551	int ret;
3552	struct btrfs_key key;
3553	struct btrfs_dir_log_item *item;
3554
3555	key.objectid = dirid;
3556	key.offset = first_offset;
3557	if (key_type == BTRFS_DIR_ITEM_KEY)
3558		key.type = BTRFS_DIR_LOG_ITEM_KEY;
3559	else
3560		key.type = BTRFS_DIR_LOG_INDEX_KEY;
3561	ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
3562	if (ret)
3563		return ret;
3564
3565	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3566			      struct btrfs_dir_log_item);
3567	btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
3568	btrfs_mark_buffer_dirty(path->nodes[0]);
3569	btrfs_release_path(path);
3570	return 0;
3571}
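
/*
 * For example (offsets are illustrative), logging dir index items for
 * dirid 256 that span offsets [2, 10] inserts the key
 * (256, BTRFS_DIR_LOG_INDEX_KEY, 2) with dir_log_end set to 10, which is
 * exactly the shape find_dir_range() matches against during replay.
 */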
3572
3573/*
3574 * log all the items included in the current transaction for a given
3575 * directory.  This also creates the range items in the log tree required
3576 * to replay anything deleted before the fsync
3577 */
3578static noinline int log_dir_items(struct btrfs_trans_handle *trans,
3579			  struct btrfs_root *root, struct btrfs_inode *inode,
3580			  struct btrfs_path *path,
3581			  struct btrfs_path *dst_path, int key_type,
3582			  struct btrfs_log_ctx *ctx,
3583			  u64 min_offset, u64 *last_offset_ret)
3584{
3585	struct btrfs_key min_key;
3586	struct btrfs_root *log = root->log_root;
3587	struct extent_buffer *src;
3588	int err = 0;
3589	int ret;
3590	int i;
3591	int nritems;
3592	u64 first_offset = min_offset;
3593	u64 last_offset = (u64)-1;
3594	u64 ino = btrfs_ino(inode);
3595
3598	min_key.objectid = ino;
3599	min_key.type = key_type;
3600	min_key.offset = min_offset;
3601
3602	ret = btrfs_search_forward(root, &min_key, path, trans->transid);
3603
3604	/*
3605	 * we didn't find anything from this transaction, see if there
3606	 * is anything at all
3607	 */
3608	if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) {
3609		min_key.objectid = ino;
3610		min_key.type = key_type;
3611		min_key.offset = (u64)-1;
3612		btrfs_release_path(path);
3613		ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
3614		if (ret < 0) {
3615			btrfs_release_path(path);
3616			return ret;
3617		}
3618		ret = btrfs_previous_item(root, path, ino, key_type);
3619
3620		/* if ret == 0 there are items for this type,
3621		 * create a range to tell us the last key of this type.
3622		 * otherwise, there are no items in this directory after
3623		 * *min_offset, and we create a range to indicate that.
3624		 */
3625		if (ret == 0) {
3626			struct btrfs_key tmp;
3627			btrfs_item_key_to_cpu(path->nodes[0], &tmp,
3628					      path->slots[0]);
3629			if (key_type == tmp.type)
3630				first_offset = max(min_offset, tmp.offset) + 1;
3631		}
3632		goto done;
3633	}
3634
3635	/* go backward to find any previous key */
3636	ret = btrfs_previous_item(root, path, ino, key_type);
3637	if (ret == 0) {
3638		struct btrfs_key tmp;
3639		btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
3640		if (key_type == tmp.type) {
3641			first_offset = tmp.offset;
3642			ret = overwrite_item(trans, log, dst_path,
3643					     path->nodes[0], path->slots[0],
3644					     &tmp);
3645			if (ret) {
3646				err = ret;
3647				goto done;
3648			}
3649		}
3650	}
3651	btrfs_release_path(path);
3652
3653	/*
3654	 * Find the first key from this transaction again.  See the note for
3655	 * log_new_dir_dentries: if we're logging a directory recursively we
3656	 * won't be holding its i_mutex, which means we can modify the directory
3657	 * while we're logging it.  If we remove an entry between our first
3658	 * search and this search we'll not find the key again and can just
3659	 * bail.
3660	 */
3661search:
3662	ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
3663	if (ret != 0)
3664		goto done;
3665
3666	/*
3667	 * we have a block from this transaction, log every item in it
3668	 * from our directory
3669	 */
3670	while (1) {
3671		struct btrfs_key tmp;
3672		src = path->nodes[0];
3673		nritems = btrfs_header_nritems(src);
3674		for (i = path->slots[0]; i < nritems; i++) {
3675			struct btrfs_dir_item *di;
3676
3677			btrfs_item_key_to_cpu(src, &min_key, i);
3678
3679			if (min_key.objectid != ino || min_key.type != key_type)
3680				goto done;
3681
3682			if (need_resched()) {
3683				btrfs_release_path(path);
3684				cond_resched();
3685				goto search;
3686			}
3687
3688			ret = overwrite_item(trans, log, dst_path, src, i,
3689					     &min_key);
3690			if (ret) {
3691				err = ret;
3692				goto done;
3693			}
3694
3695			/*
3696			 * We must make sure that when we log a directory entry,
3697			 * the corresponding inode, after log replay, has a
3698			 * matching link count. For example:
3699			 *
3700			 * touch foo
3701			 * mkdir mydir
3702			 * sync
3703			 * ln foo mydir/bar
3704			 * xfs_io -c "fsync" mydir
3705			 * <crash>
3706			 * <mount fs and log replay>
3707			 *
3708			 * Would result in a fsync log that when replayed, our
3709			 * file inode would have a link count of 1, but we get
3710			 * two directory entries pointing to the same inode.
3711			 * After removing one of the names, it would not be
3712			 * possible to remove the other name, which resulted
3713			 * always in stale file handle errors, and would not
3714			 * be possible to rmdir the parent directory, since
3715			 * its i_size could never decrement to the value
3716			 * BTRFS_EMPTY_DIR_SIZE, resulting in -ENOTEMPTY errors.
3717			 */
3718			di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
3719			btrfs_dir_item_key_to_cpu(src, di, &tmp);
3720			if (ctx &&
3721			    (btrfs_dir_transid(src, di) == trans->transid ||
3722			     btrfs_dir_type(src, di) == BTRFS_FT_DIR) &&
3723			    tmp.type != BTRFS_ROOT_ITEM_KEY)
3724				ctx->log_new_dentries = true;
3725		}
3726		path->slots[0] = nritems;
3727
3728		/*
3729		 * look ahead to the next item and see if it is also
3730		 * from this directory and from this transaction
3731		 */
3732		ret = btrfs_next_leaf(root, path);
3733		if (ret) {
3734			if (ret == 1)
3735				last_offset = (u64)-1;
3736			else
3737				err = ret;
3738			goto done;
3739		}
3740		btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
3741		if (tmp.objectid != ino || tmp.type != key_type) {
3742			last_offset = (u64)-1;
3743			goto done;
3744		}
3745		if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
3746			ret = overwrite_item(trans, log, dst_path,
3747					     path->nodes[0], path->slots[0],
3748					     &tmp);
3749			if (ret)
3750				err = ret;
3751			else
3752				last_offset = tmp.offset;
3753			goto done;
3754		}
3755	}
3756done:
3757	btrfs_release_path(path);
3758	btrfs_release_path(dst_path);
3759
3760	if (err == 0) {
3761		*last_offset_ret = last_offset;
3762		/*
3763		 * insert the log range keys to indicate where the log
3764		 * is valid
3765		 */
3766		ret = insert_dir_log_key(trans, log, path, key_type,
3767					 ino, first_offset, last_offset);
3768		if (ret)
3769			err = ret;
3770	}
3771	return err;
3772}
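
/*
 * A simplified illustration of the range keys inserted above (offsets are
 * illustrative). Suppose that in the current transaction the directory got
 * index items at offsets 10, 12 and 20 logged, while offset 15 exists only
 * in the subvolume tree:
 *
 *   log tree:        DIR_LOG key [first_offset, last_offset] + 10, 12, 20
 *   subvolume tree:  10, 12, 15, 20
 *
 * On replay, any subvolume key inside the logged range that is missing from
 * the log (offset 15 here) is treated as unlinked during the transaction
 * and removed; keys outside the range are left untouched.
 */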
3773
3774/*
3775 * logging directories is very similar to logging inodes. We find all the items
3776 * from the current transaction and write them to the log.
3777 *
3778 * The recovery code scans the directory in the subvolume, and if it finds a
3779 * key in the range logged that is not present in the log tree, then it means
3780 * that dir entry was unlinked during the transaction.
3781 *
3782 * In order for that scan to work, we must include one key smaller than
3783 * the smallest key logged by this transaction and one key larger than the
3784 * largest key logged by this transaction.
3785 */
3786static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
3787			  struct btrfs_root *root, struct btrfs_inode *inode,
3788			  struct btrfs_path *path,
3789			  struct btrfs_path *dst_path,
3790			  struct btrfs_log_ctx *ctx)
3791{
3792	u64 min_key;
3793	u64 max_key;
3794	int ret;
3795	int key_type = BTRFS_DIR_ITEM_KEY;
3796
3797again:
3798	min_key = 0;
3799	max_key = 0;
3800	while (1) {
3801		ret = log_dir_items(trans, root, inode, path, dst_path, key_type,
3802				ctx, min_key, &max_key);
3803		if (ret)
3804			return ret;
3805		if (max_key == (u64)-1)
3806			break;
3807		min_key = max_key + 1;
3808	}
3809
3810	if (key_type == BTRFS_DIR_ITEM_KEY) {
3811		key_type = BTRFS_DIR_INDEX_KEY;
3812		goto again;
3813	}
3814	return 0;
3815}
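
/*
 * As a simplified illustration of the loop above: for each of the two key
 * types (BTRFS_DIR_ITEM_KEY, the hash-ordered entries, and
 * BTRFS_DIR_INDEX_KEY, the insertion-ordered entries) we repeatedly call
 * log_dir_items() with a sliding window, starting at min_key = 0 and
 * resuming at max_key + 1, until log_dir_items() reports a max_key of
 * (u64)-1, meaning there is nothing left of that key type for this
 * directory.
 */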
3816
3817/*
3818 * a helper function to drop items from the log before we relog an
3819 * inode.  max_key_type indicates the highest item type to remove.
3820 * This cannot be run for file data extents because it does not
3821 * free the extents they point to.
3822 */
3823static int drop_objectid_items(struct btrfs_trans_handle *trans,
3824				  struct btrfs_root *log,
3825				  struct btrfs_path *path,
3826				  u64 objectid, int max_key_type)
3827{
3828	int ret;
3829	struct btrfs_key key;
3830	struct btrfs_key found_key;
3831	int start_slot;
3832
3833	key.objectid = objectid;
3834	key.type = max_key_type;
3835	key.offset = (u64)-1;
3836
3837	while (1) {
3838		ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
3839		BUG_ON(ret == 0); /* Logic error */
3840		if (ret < 0)
3841			break;
3842
3843		if (path->slots[0] == 0)
3844			break;
3845
3846		path->slots[0]--;
3847		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
3848				      path->slots[0]);
3849
3850		if (found_key.objectid != objectid)
3851			break;
3852
3853		found_key.offset = 0;
3854		found_key.type = 0;
3855		ret = btrfs_bin_search(path->nodes[0], &found_key, &start_slot);
3856		if (ret < 0)
3857			break;
3858
3859		ret = btrfs_del_items(trans, log, path, start_slot,
3860				      path->slots[0] - start_slot + 1);
3861		/*
3862		 * If the start slot isn't 0 then we don't need to re-search,
3863		 * since we've found the last item with this objectid in the tree.
3864		 */
3865		if (ret || start_slot != 0)
3866			break;
3867		btrfs_release_path(path);
3868	}
3869	btrfs_release_path(path);
3870	if (ret > 0)
3871		ret = 0;
3872	return ret;
3873}
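
/*
 * A rough sketch of the deletion above, assuming the log tree holds these
 * items for objectid 257 and max_key_type is BTRFS_XATTR_ITEM_KEY:
 *
 *   (257 INODE_ITEM 0) (257 INODE_REF 256) (257 XATTR_ITEM hash)
 *   (257 EXTENT_DATA 0) ...
 *
 * The search positions the path just past the last key <= (257 XATTR_ITEM
 * (u64)-1), the bin search for (257, 0, 0) finds the first slot with
 * objectid 257 in that leaf, and everything between the two slots goes in
 * one btrfs_del_items() call. EXTENT_DATA sorts after XATTR_ITEM, so file
 * extent items survive - which is why this helper must not be used for
 * them (it does not free the extents they point to).
 */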
3874
3875static void fill_inode_item(struct btrfs_trans_handle *trans,
3876			    struct extent_buffer *leaf,
3877			    struct btrfs_inode_item *item,
3878			    struct inode *inode, int log_inode_only,
3879			    u64 logged_isize)
3880{
3881	struct btrfs_map_token token;
3882
3883	btrfs_init_map_token(&token, leaf);
3884
3885	if (log_inode_only) {
3886		/* set the generation to zero so the recovery code
3887		 * can tell the difference between a log entry made
3888		 * just to say 'this inode exists' and one made to
3889		 * say 'update this inode with these values'
3890		 */
3891		btrfs_set_token_inode_generation(&token, item, 0);
3892		btrfs_set_token_inode_size(&token, item, logged_isize);
3893	} else {
3894		btrfs_set_token_inode_generation(&token, item,
3895						 BTRFS_I(inode)->generation);
3896		btrfs_set_token_inode_size(&token, item, inode->i_size);
3897	}
3898
3899	btrfs_set_token_inode_uid(&token, item, i_uid_read(inode));
3900	btrfs_set_token_inode_gid(&token, item, i_gid_read(inode));
3901	btrfs_set_token_inode_mode(&token, item, inode->i_mode);
3902	btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);
3903
3904	btrfs_set_token_timespec_sec(&token, &item->atime,
3905				     inode->i_atime.tv_sec);
3906	btrfs_set_token_timespec_nsec(&token, &item->atime,
3907				      inode->i_atime.tv_nsec);
3908
3909	btrfs_set_token_timespec_sec(&token, &item->mtime,
3910				     inode->i_mtime.tv_sec);
3911	btrfs_set_token_timespec_nsec(&token, &item->mtime,
3912				      inode->i_mtime.tv_nsec);
3913
3914	btrfs_set_token_timespec_sec(&token, &item->ctime,
3915				     inode->i_ctime.tv_sec);
3916	btrfs_set_token_timespec_nsec(&token, &item->ctime,
3917				      inode->i_ctime.tv_nsec);
3918
3919	btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode));
3920
3921	btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
3922	btrfs_set_token_inode_transid(&token, item, trans->transid);
3923	btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
3924	btrfs_set_token_inode_flags(&token, item, BTRFS_I(inode)->flags);
3925	btrfs_set_token_inode_block_group(&token, item, 0);
3926}
3927
3928static int log_inode_item(struct btrfs_trans_handle *trans,
3929			  struct btrfs_root *log, struct btrfs_path *path,
3930			  struct btrfs_inode *inode)
3931{
3932	struct btrfs_inode_item *inode_item;
3933	int ret;
3934
3935	ret = btrfs_insert_empty_item(trans, log, path,
3936				      &inode->location, sizeof(*inode_item));
3937	if (ret && ret != -EEXIST)
3938		return ret;
3939	inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3940				    struct btrfs_inode_item);
3941	fill_inode_item(trans, path->nodes[0], inode_item, &inode->vfs_inode,
3942			0, 0);
3943	btrfs_release_path(path);
3944	return 0;
3945}
3946
3947static int log_csums(struct btrfs_trans_handle *trans,
3948		     struct btrfs_inode *inode,
3949		     struct btrfs_root *log_root,
3950		     struct btrfs_ordered_sum *sums)
3951{
3952	const u64 lock_end = sums->bytenr + sums->len - 1;
3953	struct extent_state *cached_state = NULL;
3954	int ret;
3955
3956	/*
3957	 * If this inode was not used for reflink operations in the current
3958	 * transaction with new extents, then take the fast path; there is no
3959	 * need to worry about logging checksum items with overlapping ranges.
3960	 */
3961	if (inode->last_reflink_trans < trans->transid)
3962		return btrfs_csum_file_blocks(trans, log_root, sums);
3963
3964	/*
3965	 * Serialize logging for checksums. This is to avoid racing with the
3966	 * same checksum being logged by another task that is logging another
3967	 * file which happens to refer to the same extent as well. Such races
3968	 * can leave checksum items in the log with overlapping ranges.
3969	 */
3970	ret = lock_extent_bits(&log_root->log_csum_range, sums->bytenr,
3971			       lock_end, &cached_state);
3972	if (ret)
3973		return ret;
3974	/*
3975	 * Due to extent cloning, we might have logged a csum item that covers a
3976	 * subrange of a cloned extent, and later we can end up logging a csum
3977	 * item for a larger subrange of the same extent or the entire range.
3978	 * This would leave csum items in the log tree that cover the same range
3979	 * and break the searches for checksums in the log tree, resulting in
3980	 * some checksums missing in the fs/subvolume tree. So just delete (or
3981	 * trim and adjust) any existing csum items in the log for this range.
3982	 */
3983	ret = btrfs_del_csums(trans, log_root, sums->bytenr, sums->len);
3984	if (!ret)
3985		ret = btrfs_csum_file_blocks(trans, log_root, sums);
3986
3987	unlock_extent_cached(&log_root->log_csum_range, sums->bytenr, lock_end,
3988			     &cached_state);
3989
3990	return ret;
3991}
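
/*
 * A sketch of the overlap scenario the locking above guards against (paths
 * and sizes illustrative):
 *
 * $ xfs_io -f -c "pwrite 0 64K" /mnt/foo
 * $ cp --reflink=always /mnt/foo /mnt/bar   # foo and bar share one extent
 * $ xfs_io -c "fsync" /mnt/foo &
 * $ xfs_io -c "fsync" /mnt/bar
 *
 * Both fsyncs may try to log checksums for (subranges of) the same shared
 * extent. Serializing on the log root's log_csum_range, and deleting or
 * trimming any previously logged csums for the range first, prevents the
 * log tree from ending up with overlapping csum items.
 */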
3992
3993static noinline int copy_items(struct btrfs_trans_handle *trans,
3994			       struct btrfs_inode *inode,
3995			       struct btrfs_path *dst_path,
3996			       struct btrfs_path *src_path,
3997			       int start_slot, int nr, int inode_only,
3998			       u64 logged_isize)
3999{
4000	struct btrfs_fs_info *fs_info = trans->fs_info;
4001	unsigned long src_offset;
4002	unsigned long dst_offset;
4003	struct btrfs_root *log = inode->root->log_root;
4004	struct btrfs_file_extent_item *extent;
4005	struct btrfs_inode_item *inode_item;
4006	struct extent_buffer *src = src_path->nodes[0];
4007	int ret;
4008	struct btrfs_key *ins_keys;
4009	u32 *ins_sizes;
4010	char *ins_data;
4011	int i;
4012	struct list_head ordered_sums;
4013	int skip_csum = inode->flags & BTRFS_INODE_NODATASUM;
4014
4015	INIT_LIST_HEAD(&ordered_sums);
4016
4017	ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
4018			   nr * sizeof(u32), GFP_NOFS);
4019	if (!ins_data)
4020		return -ENOMEM;
4021
4022	ins_sizes = (u32 *)ins_data;
4023	ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
4024
4025	for (i = 0; i < nr; i++) {
4026		ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot);
4027		btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot);
4028	}
4029	ret = btrfs_insert_empty_items(trans, log, dst_path,
4030				       ins_keys, ins_sizes, nr);
4031	if (ret) {
4032		kfree(ins_data);
4033		return ret;
4034	}
4035
4036	for (i = 0; i < nr; i++, dst_path->slots[0]++) {
4037		dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
4038						   dst_path->slots[0]);
4039
4040		src_offset = btrfs_item_ptr_offset(src, start_slot + i);
4041
4042		if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
4043			inode_item = btrfs_item_ptr(dst_path->nodes[0],
4044						    dst_path->slots[0],
4045						    struct btrfs_inode_item);
4046			fill_inode_item(trans, dst_path->nodes[0], inode_item,
4047					&inode->vfs_inode,
4048					inode_only == LOG_INODE_EXISTS,
4049					logged_isize);
4050		} else {
4051			copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
4052					   src_offset, ins_sizes[i]);
4053		}
4054
4055		/* take a reference on file data extents so that truncates
4056		 * or deletes of this inode don't have to relog the inode
4057		 * again
4058		 */
4059		if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY &&
4060		    !skip_csum) {
4061			int found_type;
4062			extent = btrfs_item_ptr(src, start_slot + i,
4063						struct btrfs_file_extent_item);
4064
4065			if (btrfs_file_extent_generation(src, extent) < trans->transid)
4066				continue;
4067
4068			found_type = btrfs_file_extent_type(src, extent);
4069			if (found_type == BTRFS_FILE_EXTENT_REG) {
4070				u64 ds, dl, cs, cl;
4071				ds = btrfs_file_extent_disk_bytenr(src,
4072								extent);
4073				/* ds == 0 is a hole */
4074				if (ds == 0)
4075					continue;
4076
4077				dl = btrfs_file_extent_disk_num_bytes(src,
4078								extent);
4079				cs = btrfs_file_extent_offset(src, extent);
4080				cl = btrfs_file_extent_num_bytes(src,
4081								extent);
4082				if (btrfs_file_extent_compression(src,
4083								  extent)) {
4084					cs = 0;
4085					cl = dl;
4086				}
4087
4088				ret = btrfs_lookup_csums_range(
4089						fs_info->csum_root,
4090						ds + cs, ds + cs + cl - 1,
4091						&ordered_sums, 0);
4092				if (ret)
4093					break;
4094			}
4095		}
4096	}
4097
4098	btrfs_mark_buffer_dirty(dst_path->nodes[0]);
4099	btrfs_release_path(dst_path);
4100	kfree(ins_data);
4101
4102	/*
4103	 * we have to do this after the loop above, since inserting the csum
4104	 * items would modify the log tree while we are still copying into it.
4105	 */
4106	while (!list_empty(&ordered_sums)) {
4107		struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
4108						   struct btrfs_ordered_sum,
4109						   list);
4110		if (!ret)
4111			ret = log_csums(trans, inode, log, sums);
4112		list_del(&sums->list);
4113		kfree(sums);
4114	}
4115
4116	return ret;
4117}
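
/*
 * A worked example of the csum range computed above for a regular,
 * compressed extent (numbers are illustrative):
 *
 *   disk_bytenr (ds)     = 1M     start of the extent on disk
 *   disk_num_bytes (dl)  = 16K    compressed size on disk
 *   extent offset (cs)   = 4K     logical offset into the extent
 *   num_bytes (cl)       = 32K    uncompressed length referenced
 *
 * Checksums cover on-disk bytes, and a compressed extent can only be read
 * back whole, so cs/cl are reset to 0/dl and the lookup spans
 * [1M, 1M + 16K - 1]. For an uncompressed extent the lookup would instead
 * span [ds + cs, ds + cs + cl - 1].
 */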
4118
4119static int extent_cmp(void *priv, const struct list_head *a,
4120		      const struct list_head *b)
4121{
4122	struct extent_map *em1, *em2;
4123
4124	em1 = list_entry(a, struct extent_map, list);
4125	em2 = list_entry(b, struct extent_map, list);
4126
4127	if (em1->start < em2->start)
4128		return -1;
4129	else if (em1->start > em2->start)
4130		return 1;
4131	return 0;
4132}
4133
4134static int log_extent_csums(struct btrfs_trans_handle *trans,
4135			    struct btrfs_inode *inode,
4136			    struct btrfs_root *log_root,
4137			    const struct extent_map *em,
4138			    struct btrfs_log_ctx *ctx)
4139{
4140	struct btrfs_ordered_extent *ordered;
4141	u64 csum_offset;
4142	u64 csum_len;
4143	u64 mod_start = em->mod_start;
4144	u64 mod_len = em->mod_len;
4145	LIST_HEAD(ordered_sums);
4146	int ret = 0;
4147
4148	if (inode->flags & BTRFS_INODE_NODATASUM ||
4149	    test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
4150	    em->block_start == EXTENT_MAP_HOLE)
4151		return 0;
4152
4153	list_for_each_entry(ordered, &ctx->ordered_extents, log_list) {
4154		const u64 ordered_end = ordered->file_offset + ordered->num_bytes;
4155		const u64 mod_end = mod_start + mod_len;
4156		struct btrfs_ordered_sum *sums;
4157
4158		if (mod_len == 0)
4159			break;
4160
4161		if (ordered_end <= mod_start)
4162			continue;
4163		if (mod_end <= ordered->file_offset)
4164			break;
4165
4166		/*
4167		 * We are going to copy all the csums on this ordered extent, so
4168		 * go ahead and adjust mod_start and mod_len in case this ordered
4169		 * extent has already been logged.
4170		 */
4171		if (ordered->file_offset > mod_start) {
4172			if (ordered_end >= mod_end)
4173				mod_len = ordered->file_offset - mod_start;
4174			/*
4175			 * If we have this case
4176			 *
4177			 * |--------- logged extent ---------|
4178			 *       |----- ordered extent ----|
4179			 *
4180			 * Just leave mod_start and mod_len alone; we'll
4181			 * end up logging more csums than we need, but
4182			 * that is harmless.
4183			 */
4184		} else {
4185			if (ordered_end < mod_end) {
4186				mod_len = mod_end - ordered_end;
4187				mod_start = ordered_end;
4188			} else {
4189				mod_len = 0;
4190			}
4191		}
4192
4193		/*
4194		 * This keeps us from looping in the above case of an ordered
4195		 * extent that falls inside of the logged extent.
4196		 */
4197		if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, &ordered->flags))
4198			continue;
4199
4200		list_for_each_entry(sums, &ordered->list, list) {
4201			ret = log_csums(trans, inode, log_root, sums);
4202			if (ret)
4203				return ret;
4204		}
4205	}
4206
4207	/* We're done, found all csums in the ordered extents. */
4208	if (mod_len == 0)
4209		return 0;
4210
4211	/* If the extent is compressed we have to save the entire range of csums. */
4212	if (em->compress_type) {
4213		csum_offset = 0;
4214		csum_len = max(em->block_len, em->orig_block_len);
4215	} else {
4216		csum_offset = mod_start - em->start;
4217		csum_len = mod_len;
4218	}
4219
4220	/* block start is already adjusted for the file extent offset. */
4221	ret = btrfs_lookup_csums_range(trans->fs_info->csum_root,
4222				       em->block_start + csum_offset,
4223				       em->block_start + csum_offset +
4224				       csum_len - 1, &ordered_sums, 0);
4225	if (ret)
4226		return ret;
4227
4228	while (!list_empty(&ordered_sums)) {
4229		struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
4230						   struct btrfs_ordered_sum,
4231						   list);
4232		if (!ret)
4233			ret = log_csums(trans, inode, log_root, sums);
4234		list_del(&sums->list);
4235		kfree(sums);
4236	}
4237
4238	return ret;
4239}
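
/*
 * A worked example for the mod range adjustment above (offsets are
 * illustrative). Say the extent map covers mod_start = 0, mod_len = 64K
 * and one ordered extent in the ctx list spans file range [0, 16K):
 *
 *   ordered->file_offset (0) is not past mod_start (0) and ordered_end
 *   (16K) < mod_end (64K), so the csums of that ordered extent are copied
 *   directly and the remaining range shrinks to mod_start = 16K,
 *   mod_len = 48K.
 *
 * Whatever is still uncovered after walking all ordered extents is then
 * resolved with a csum tree lookup over the matching disk byte range.
 */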
4240
4241static int log_one_extent(struct btrfs_trans_handle *trans,
4242			  struct btrfs_inode *inode, struct btrfs_root *root,
4243			  const struct extent_map *em,
4244			  struct btrfs_path *path,
4245			  struct btrfs_log_ctx *ctx)
4246{
4247	struct btrfs_root *log = root->log_root;
4248	struct btrfs_file_extent_item *fi;
4249	struct extent_buffer *leaf;
4250	struct btrfs_map_token token;
4251	struct btrfs_key key;
4252	u64 extent_offset = em->start - em->orig_start;
4253	u64 block_len;
4254	int ret;
4255	int extent_inserted = 0;
4256
4257	ret = log_extent_csums(trans, inode, log, em, ctx);
4258	if (ret)
4259		return ret;
4260
4261	ret = __btrfs_drop_extents(trans, log, inode, path, em->start,
4262				   em->start + em->len, NULL, 0, 1,
4263				   sizeof(*fi), &extent_inserted);
4264	if (ret)
4265		return ret;
4266
4267	if (!extent_inserted) {
4268		key.objectid = btrfs_ino(inode);
4269		key.type = BTRFS_EXTENT_DATA_KEY;
4270		key.offset = em->start;
4271
4272		ret = btrfs_insert_empty_item(trans, log, path, &key,
4273					      sizeof(*fi));
4274		if (ret)
4275			return ret;
4276	}
4277	leaf = path->nodes[0];
4278	btrfs_init_map_token(&token, leaf);
4279	fi = btrfs_item_ptr(leaf, path->slots[0],
4280			    struct btrfs_file_extent_item);
4281
4282	btrfs_set_token_file_extent_generation(&token, fi, trans->transid);
4283	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
4284		btrfs_set_token_file_extent_type(&token, fi,
4285						 BTRFS_FILE_EXTENT_PREALLOC);
4286	else
4287		btrfs_set_token_file_extent_type(&token, fi,
4288						 BTRFS_FILE_EXTENT_REG);
4289
4290	block_len = max(em->block_len, em->orig_block_len);
4291	if (em->compress_type != BTRFS_COMPRESS_NONE) {
4292		btrfs_set_token_file_extent_disk_bytenr(&token, fi,
4293							em->block_start);
4294		btrfs_set_token_file_extent_disk_num_bytes(&token, fi, block_len);
4295	} else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
4296		btrfs_set_token_file_extent_disk_bytenr(&token, fi,
4297							em->block_start -
4298							extent_offset);
4299		btrfs_set_token_file_extent_disk_num_bytes(&token, fi, block_len);
4300	} else {
4301		btrfs_set_token_file_extent_disk_bytenr(&token, fi, 0);
4302		btrfs_set_token_file_extent_disk_num_bytes(&token, fi, 0);
4303	}
4304
4305	btrfs_set_token_file_extent_offset(&token, fi, extent_offset);
4306	btrfs_set_token_file_extent_num_bytes(&token, fi, em->len);
4307	btrfs_set_token_file_extent_ram_bytes(&token, fi, em->ram_bytes);
4308	btrfs_set_token_file_extent_compression(&token, fi, em->compress_type);
4309	btrfs_set_token_file_extent_encryption(&token, fi, 0);
4310	btrfs_set_token_file_extent_other_encoding(&token, fi, 0);
4311	btrfs_mark_buffer_dirty(leaf);
4312
4313	btrfs_release_path(path);
4314
4315	return ret;
4316}
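
/*
 * A small worked example for the bytenr math above (values illustrative):
 * an extent was allocated for file offset 0 (em->orig_start = 0) at disk
 * address 1M, and a later partial overwrite left a map with
 * em->start = 8K and em->block_start = 1M + 8K. Then extent_offset = 8K,
 * and for the non-compressed case the file extent item's disk_bytenr is
 * written as block_start - extent_offset = 1M, the start of the original
 * on-disk extent, while the item's offset field records that the first 8K
 * of it are skipped.
 */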
4317
4318/*
4319 * Log all prealloc extents beyond the inode's i_size to make sure we do not
4320 * lose them after doing a full/fast fsync and replaying the log. We scan the
4321 * subvolume's root instead of iterating the inode's extent map tree because
4322 * otherwise we can log incorrect extent items based on extent map conversion.
4323 * That can happen because extent maps are merged when they are not in
4324 * the extent map tree's list of modified extents.
4325 */
4326static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
4327				      struct btrfs_inode *inode,
4328				      struct btrfs_path *path)
4329{
4330	struct btrfs_root *root = inode->root;
4331	struct btrfs_key key;
4332	const u64 i_size = i_size_read(&inode->vfs_inode);
4333	const u64 ino = btrfs_ino(inode);
4334	struct btrfs_path *dst_path = NULL;
4335	bool dropped_extents = false;
4336	u64 truncate_offset = i_size;
4337	struct extent_buffer *leaf;
4338	int slot;
4339	int ins_nr = 0;
4340	int start_slot = 0;
4341	int ret;
4342
4343	if (!(inode->flags & BTRFS_INODE_PREALLOC))
4344		return 0;
4345
4346	key.objectid = ino;
4347	key.type = BTRFS_EXTENT_DATA_KEY;
4348	key.offset = i_size;
4349	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4350	if (ret < 0)
4351		goto out;
4352
4353	/*
4354	 * We must check if there is a prealloc extent that starts before the
4355	 * i_size and crosses the i_size boundary. This is to ensure later we
4356	 * truncate down to the end of that extent and not to the i_size, as
4357	 * otherwise we end up losing part of the prealloc extent after a log
4358	 * replay and with an implicit hole if there is another prealloc extent
4359	 * that starts at an offset beyond i_size.
4360	 */
4361	ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY);
4362	if (ret < 0)
4363		goto out;
4364
4365	if (ret == 0) {
4366		struct btrfs_file_extent_item *ei;
4367
4368		leaf = path->nodes[0];
4369		slot = path->slots[0];
4370		ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
4371
4372		if (btrfs_file_extent_type(leaf, ei) ==
4373		    BTRFS_FILE_EXTENT_PREALLOC) {
4374			u64 extent_end;
4375
4376			btrfs_item_key_to_cpu(leaf, &key, slot);
4377			extent_end = key.offset +
4378				btrfs_file_extent_num_bytes(leaf, ei);
4379
4380			if (extent_end > i_size)
4381				truncate_offset = extent_end;
4382		}
4383	} else {
4384		ret = 0;
4385	}
4386
4387	while (true) {
4388		leaf = path->nodes[0];
4389		slot = path->slots[0];
4390
4391		if (slot >= btrfs_header_nritems(leaf)) {
4392			if (ins_nr > 0) {
4393				ret = copy_items(trans, inode, dst_path, path,
4394						 start_slot, ins_nr, 1, 0);
4395				if (ret < 0)
4396					goto out;
4397				ins_nr = 0;
4398			}
4399			ret = btrfs_next_leaf(root, path);
4400			if (ret < 0)
4401				goto out;
4402			if (ret > 0) {
4403				ret = 0;
4404				break;
4405			}
4406			continue;
4407		}
4408
4409		btrfs_item_key_to_cpu(leaf, &key, slot);
4410		if (key.objectid > ino)
4411			break;
4412		if (WARN_ON_ONCE(key.objectid < ino) ||
4413		    key.type < BTRFS_EXTENT_DATA_KEY ||
4414		    key.offset < i_size) {
4415			path->slots[0]++;
4416			continue;
4417		}
4418		if (!dropped_extents) {
4419			/*
4420			 * Avoid logging extent items that were logged in past fsync
4421			 * calls, which would lead to duplicate keys in the log tree.
4422			 */
4423			do {
4424				ret = btrfs_truncate_inode_items(trans,
4425							 root->log_root,
4426							 &inode->vfs_inode,
4427							 truncate_offset,
4428							 BTRFS_EXTENT_DATA_KEY);
4429			} while (ret == -EAGAIN);
4430			if (ret)
4431				goto out;
4432			dropped_extents = true;
4433		}
4434		if (ins_nr == 0)
4435			start_slot = slot;
4436		ins_nr++;
4437		path->slots[0]++;
4438		if (!dst_path) {
4439			dst_path = btrfs_alloc_path();
4440			if (!dst_path) {
4441				ret = -ENOMEM;
4442				goto out;
4443			}
4444		}
4445	}
4446	if (ins_nr > 0)
4447		ret = copy_items(trans, inode, dst_path, path,
4448				 start_slot, ins_nr, 1, 0);
4449out:
4450	btrfs_release_path(path);
4451	btrfs_free_path(dst_path);
4452	return ret;
4453}
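
/*
 * A sketch of the situation handled above (commands illustrative):
 *
 * $ xfs_io -f -c "pwrite 0 4K" /mnt/foo
 * $ xfs_io -c "falloc -k 4K 1M" /mnt/foo    # prealloc beyond i_size, kept
 * $ xfs_io -c "fsync" /mnt/foo
 * <power fail>
 *
 * Without logging the prealloc extents past i_size (4K here), replay would
 * drop the preallocated 1M range. The truncate_offset adjustment also
 * covers a prealloc extent that starts before i_size and crosses it, so
 * replay truncates to that extent's end rather than to i_size.
 */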
4454
4455static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
4456				     struct btrfs_root *root,
4457				     struct btrfs_inode *inode,
4458				     struct btrfs_path *path,
4459				     struct btrfs_log_ctx *ctx)
4460{
4461	struct btrfs_ordered_extent *ordered;
4462	struct btrfs_ordered_extent *tmp;
4463	struct extent_map *em, *n;
4464	struct list_head extents;
4465	struct extent_map_tree *tree = &inode->extent_tree;
4466	u64 test_gen;
4467	int ret = 0;
4468	int num = 0;
4469
4470	INIT_LIST_HEAD(&extents);
4471
4472	write_lock(&tree->lock);
4473	test_gen = root->fs_info->last_trans_committed;
4474
4475	list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
4476		list_del_init(&em->list);
4477		/*
4478		 * Just an arbitrary cut-off: this can be really CPU intensive
4479		 * once we start getting a lot of extents, and once we have
4480		 * that many extents we just want to commit, since it will
4481		 * be faster.
4482		 */
4483		if (++num > 32768) {
4484			list_del_init(&tree->modified_extents);
4485			ret = -EFBIG;
4486			goto process;
4487		}
4488
4489		if (em->generation <= test_gen)
4490			continue;
4491
4492		/* We log prealloc extents beyond eof later. */
4493		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) &&
4494		    em->start >= i_size_read(&inode->vfs_inode))
4495			continue;
4496
4497		/* Need a ref to keep it from getting evicted from cache */
4498		refcount_inc(&em->refs);
4499		set_bit(EXTENT_FLAG_LOGGING, &em->flags);
4500		list_add_tail(&em->list, &extents);
4501		num++;
4502	}
4503
4504	list_sort(NULL, &extents, extent_cmp);
4505process:
4506	while (!list_empty(&extents)) {
4507		em = list_entry(extents.next, struct extent_map, list);
4508
4509		list_del_init(&em->list);
4510
4511		/*
4512		 * If we had an error we just need to drop every extent map
4513		 * from our private list.
4514		 */
4515		if (ret) {
4516			clear_em_logging(tree, em);
4517			free_extent_map(em);
4518			continue;
4519		}
4520
4521		write_unlock(&tree->lock);
4522
4523		ret = log_one_extent(trans, inode, root, em, path, ctx);
4524		write_lock(&tree->lock);
4525		clear_em_logging(tree, em);
4526		free_extent_map(em);
4527	}
4528	WARN_ON(!list_empty(&extents));
4529	write_unlock(&tree->lock);
4530
4531	btrfs_release_path(path);
4532	if (!ret)
4533		ret = btrfs_log_prealloc_extents(trans, inode, path);
4534	if (ret)
4535		return ret;
4536
4537	/*
4538	 * We have logged all extents successfully, now make sure the commit of
4539	 * the current transaction waits for the ordered extents to complete
4540	 * before it commits and wipes out the log trees, otherwise we would
4541	 * lose data if an ordered extent completes after the transaction
4542	 * commits and a power failure happens after the transaction commit.
4543	 */
4544	list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) {
4545		list_del_init(&ordered->log_list);
4546		set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags);
4547
4548		if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
4549			spin_lock_irq(&inode->ordered_tree.lock);
4550			if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
4551				set_bit(BTRFS_ORDERED_PENDING, &ordered->flags);
4552				atomic_inc(&trans->transaction->pending_ordered);
4553			}
4554			spin_unlock_irq(&inode->ordered_tree.lock);
4555		}
4556		btrfs_put_ordered_extent(ordered);
4557	}
4558
4559	return 0;
4560}
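
/*
 * For example, a file that received tens of thousands of small random
 * writes since the last transaction commit can accumulate more than 32768
 * modified extent maps; the function above then gives up with -EFBIG,
 * which the fsync path ends up treating as a request to fall back to a
 * full transaction commit - cheaper than logging that many extents one
 * by one.
 */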
4561
4562static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode,
4563			     struct btrfs_path *path, u64 *size_ret)
4564{
4565	struct btrfs_key key;
4566	int ret;
4567
4568	key.objectid = btrfs_ino(inode);
4569	key.type = BTRFS_INODE_ITEM_KEY;
4570	key.offset = 0;
4571
4572	ret = btrfs_search_slot(NULL, log, &key, path, 0, 0);
4573	if (ret < 0) {
4574		return ret;
4575	} else if (ret > 0) {
4576		*size_ret = 0;
4577	} else {
4578		struct btrfs_inode_item *item;
4579
4580		item = btrfs_item_ptr(path->nodes[0], path->slots[0],
4581				      struct btrfs_inode_item);
4582		*size_ret = btrfs_inode_size(path->nodes[0], item);
4583		/*
4584		 * If the in-memory inode's i_size is smaller than the inode
4585		 * size stored in the btree, return the inode's i_size, so
4586		 * that we get a correct inode size after replaying the log
4587		 * when before a power failure we had a shrinking truncate
4588		 * followed by addition of a new name (rename / new hard link).
4589		 * Otherwise return the inode size from the btree, to avoid
4590		 * data loss when replaying a log due to previously doing a
4591		 * write that expands the inode's size and logging a new name
4592		 * immediately after.
4593		 */
4594		if (*size_ret > inode->vfs_inode.i_size)
4595			*size_ret = inode->vfs_inode.i_size;
4596	}
4597
4598	btrfs_release_path(path);
4599	return 0;
4600}
4601
4602/*
4603 * At the moment we always log all xattrs. This is to figure out at log replay
4604 * time which xattrs must have their deletion replayed. If an xattr is missing
4605 * in the log tree and exists in the fs/subvol tree, we delete it. This is
4606 * because, if an xattr is deleted and the inode is then fsynced, we want
4607 * the xattr to not exist anymore after a power failure causes the log to
4608 * be replayed on the next mount (same behaviour as other filesystems with
4609 * a journal, such as ext3/4, xfs and f2fs).
4610 */
4611static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
4612				struct btrfs_root *root,
4613				struct btrfs_inode *inode,
4614				struct btrfs_path *path,
4615				struct btrfs_path *dst_path)
4616{
4617	int ret;
4618	struct btrfs_key key;
4619	const u64 ino = btrfs_ino(inode);
4620	int ins_nr = 0;
4621	int start_slot = 0;
4622	bool found_xattrs = false;
4623
4624	if (test_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags))
4625		return 0;
4626
4627	key.objectid = ino;
4628	key.type = BTRFS_XATTR_ITEM_KEY;
4629	key.offset = 0;
4630
4631	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4632	if (ret < 0)
4633		return ret;
4634
4635	while (true) {
4636		int slot = path->slots[0];
4637		struct extent_buffer *leaf = path->nodes[0];
4638		int nritems = btrfs_header_nritems(leaf);
4639
4640		if (slot >= nritems) {
4641			if (ins_nr > 0) {
4642				ret = copy_items(trans, inode, dst_path, path,
4643						 start_slot, ins_nr, 1, 0);
4644				if (ret < 0)
4645					return ret;
4646				ins_nr = 0;
4647			}
4648			ret = btrfs_next_leaf(root, path);
4649			if (ret < 0)
4650				return ret;
4651			else if (ret > 0)
4652				break;
4653			continue;
4654		}
4655
4656		btrfs_item_key_to_cpu(leaf, &key, slot);
4657		if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY)
4658			break;
4659
4660		if (ins_nr == 0)
4661			start_slot = slot;
4662		ins_nr++;
4663		path->slots[0]++;
4664		found_xattrs = true;
4665		cond_resched();
4666	}
4667	if (ins_nr > 0) {
4668		ret = copy_items(trans, inode, dst_path, path,
4669				 start_slot, ins_nr, 1, 0);
4670		if (ret < 0)
4671			return ret;
4672	}
4673
4674	if (!found_xattrs)
4675		set_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags);
4676
4677	return 0;
4678}
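
/*
 * A sketch of why all xattrs are logged, not just changed ones (commands
 * illustrative):
 *
 * $ setfattr -n user.foo -v bar /mnt/file
 * $ sync                                    # xattr reaches the subvol tree
 * $ setfattr -x user.foo /mnt/file          # delete the xattr
 * $ xfs_io -c "fsync" /mnt/file
 * <power fail>
 *
 * On replay, user.foo exists in the fs/subvol tree but is missing from the
 * logged xattr set, so its deletion is replayed and the xattr stays gone,
 * matching the behaviour of the journaled filesystems mentioned above.
 */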
4679
4680/*
4681 * When using the NO_HOLES feature, if we punched a hole that causes the
4682 * deletion of entire leafs or all the extent items of the first leaf (the one
4683 * that contains the inode item and references) we may end up not processing
4684 * any extents, because there are no leafs with a generation matching the
4685 * current transaction that have extent items for our inode. So we need to find
4686 * if any holes exist and then log them. We also need to log holes after any
4687 * truncate operation that changes the inode's size.
4688 */
4689static int btrfs_log_holes(struct btrfs_trans_handle *trans,
4690			   struct btrfs_root *root,
4691			   struct btrfs_inode *inode,
4692			   struct btrfs_path *path)
4693{
4694	struct btrfs_fs_info *fs_info = root->fs_info;
4695	struct btrfs_key key;
4696	const u64 ino = btrfs_ino(inode);
4697	const u64 i_size = i_size_read(&inode->vfs_inode);
4698	u64 prev_extent_end = 0;
4699	int ret;
4700
4701	if (!btrfs_fs_incompat(fs_info, NO_HOLES) || i_size == 0)
4702		return 0;
4703
4704	key.objectid = ino;
4705	key.type = BTRFS_EXTENT_DATA_KEY;
4706	key.offset = 0;
4707
4708	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4709	if (ret < 0)
4710		return ret;
4711
4712	while (true) {
4713		struct extent_buffer *leaf = path->nodes[0];
4714
4715		if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
4716			ret = btrfs_next_leaf(root, path);
4717			if (ret < 0)
4718				return ret;
4719			if (ret > 0) {
4720				ret = 0;
4721				break;
4722			}
4723			leaf = path->nodes[0];
4724		}
4725
4726		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4727		if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
4728			break;
4729
4730		/* We have a hole, log it. */
4731		if (prev_extent_end < key.offset) {
4732			const u64 hole_len = key.offset - prev_extent_end;
4733
4734			/*
4735			 * Release the path to avoid deadlocks with other code
4736			 * paths that search the root while holding locks on
4737			 * leafs from the log root.
4738			 */
4739			btrfs_release_path(path);
4740			ret = btrfs_insert_file_extent(trans, root->log_root,
4741						       ino, prev_extent_end, 0,
4742						       0, hole_len, 0, hole_len,
4743						       0, 0, 0);
4744			if (ret < 0)
4745				return ret;
4746
4747			/*
4748			 * Search for the same key again in the root. Since it's
4749			 * an extent item and we are holding the inode lock, the
4750			 * key must still exist. If it doesn't, just emit a
4751			 * warning and return an error to fall back to a
4752			 * transaction commit.
4753			 */
4754			ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4755			if (ret < 0)
4756				return ret;
4757			if (WARN_ON(ret > 0))
4758				return -ENOENT;
4759			leaf = path->nodes[0];
4760		}
4761
4762		prev_extent_end = btrfs_file_extent_end(path);
4763		path->slots[0]++;
4764		cond_resched();
4765	}
4766
4767	if (prev_extent_end < i_size) {
4768		u64 hole_len;
4769
4770		btrfs_release_path(path);
4771		hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize);
4772		ret = btrfs_insert_file_extent(trans, root->log_root,
4773					       ino, prev_extent_end, 0, 0,
4774					       hole_len, 0, hole_len,
4775					       0, 0, 0);
4776		if (ret < 0)
4777			return ret;
4778	}
4779
4780	return 0;
4781}
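
/*
 * A sketch of the NO_HOLES case handled above (commands illustrative):
 *
 * $ xfs_io -f -c "pwrite 0 256K" /mnt/foo
 * $ sync
 * $ xfs_io -c "fpunch 64K 128K" /mnt/foo    # punch a hole in the middle
 * $ xfs_io -c "fsync" /mnt/foo
 * <power fail>
 *
 * With NO_HOLES there are no explicit hole extent items, and the punch may
 * have deleted every extent item with a generation matching this
 * transaction, so the loop above walks the remaining extent items and
 * inserts a zero disk_bytenr file extent item into the log for each gap
 * (here [64K, 192K)), plus one for any gap between the last extent and
 * i_size.
 */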
4782
4783/*
4784 * When we are logging a new inode X, check whether it has a reference that
4785 * matches a reference from some other inode Y created in a past transaction
4786 * and that was renamed in the current transaction. If we don't do this, then at
4787 * log replay time we can lose inode Y (and all its files if it's a directory):
4788 *
4789 * mkdir /mnt/x
4790 * echo "hello world" > /mnt/x/foobar
4791 * sync
4792 * mv /mnt/x /mnt/y
4793 * mkdir /mnt/x                 # or touch /mnt/x
4794 * xfs_io -c fsync /mnt/x
4795 * <power fail>
4796 * mount fs, trigger log replay
4797 *
4798 * After the log replay procedure, we would lose the first directory and all its
4799 * files (file foobar).
4800 * For the case where inode Y is not a directory we simply end up losing it:
4801 *
4802 * echo "123" > /mnt/foo
4803 * sync
4804 * mv /mnt/foo /mnt/bar
4805 * echo "abc" > /mnt/foo
4806 * xfs_io -c fsync /mnt/foo
4807 * <power fail>
4808 *
4809 * We also need this for cases where a snapshot entry is replaced by some other
4810 * entry (file or directory) otherwise we end up with an unreplayable log due to
4811 * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as
4812 * if it were a regular entry:
4813 *
4814 * mkdir /mnt/x
4815 * btrfs subvolume snapshot /mnt /mnt/x/snap
4816 * btrfs subvolume delete /mnt/x/snap
4817 * rmdir /mnt/x
4818 * mkdir /mnt/x
4819 * fsync /mnt/x or fsync some new file inside it
4820 * <power fail>
4821 *
4822 * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in
4823 * the same transaction.
4824 */
4825static int btrfs_check_ref_name_override(struct extent_buffer *eb,
4826					 const int slot,
4827					 const struct btrfs_key *key,
4828					 struct btrfs_inode *inode,
4829					 u64 *other_ino, u64 *other_parent)
4830{
4831	int ret;
4832	struct btrfs_path *search_path;
4833	char *name = NULL;
4834	u32 name_len = 0;
4835	u32 item_size = btrfs_item_size_nr(eb, slot);
4836	u32 cur_offset = 0;
4837	unsigned long ptr = btrfs_item_ptr_offset(eb, slot);
4838
4839	search_path = btrfs_alloc_path();
4840	if (!search_path)
4841		return -ENOMEM;
4842	search_path->search_commit_root = 1;
4843	search_path->skip_locking = 1;
4844
4845	while (cur_offset < item_size) {
4846		u64 parent;
4847		u32 this_name_len;
4848		u32 this_len;
4849		unsigned long name_ptr;
4850		struct btrfs_dir_item *di;
4851
4852		if (key->type == BTRFS_INODE_REF_KEY) {
4853			struct btrfs_inode_ref *iref;
4854
4855			iref = (struct btrfs_inode_ref *)(ptr + cur_offset);
4856			parent = key->offset;
4857			this_name_len = btrfs_inode_ref_name_len(eb, iref);
4858			name_ptr = (unsigned long)(iref + 1);
4859			this_len = sizeof(*iref) + this_name_len;
4860		} else {
4861			struct btrfs_inode_extref *extref;
4862
4863			extref = (struct btrfs_inode_extref *)(ptr +
4864							       cur_offset);
4865			parent = btrfs_inode_extref_parent(eb, extref);
4866			this_name_len = btrfs_inode_extref_name_len(eb, extref);
4867			name_ptr = (unsigned long)&extref->name;
4868			this_len = sizeof(*extref) + this_name_len;
4869		}
4870
4871		if (this_name_len > name_len) {
4872			char *new_name;
4873
4874			new_name = krealloc(name, this_name_len, GFP_NOFS);
4875			if (!new_name) {
4876				ret = -ENOMEM;
4877				goto out;
4878			}
4879			name_len = this_name_len;
4880			name = new_name;
4881		}
4882
4883		read_extent_buffer(eb, name, name_ptr, this_name_len);
4884		di = btrfs_lookup_dir_item(NULL, inode->root, search_path,
4885				parent, name, this_name_len, 0);
4886		if (di && !IS_ERR(di)) {
4887			struct btrfs_key di_key;
4888
4889			btrfs_dir_item_key_to_cpu(search_path->nodes[0],
4890						  di, &di_key);
4891			if (di_key.type == BTRFS_INODE_ITEM_KEY) {
4892				if (di_key.objectid != key->objectid) {
4893					ret = 1;
4894					*other_ino = di_key.objectid;
4895					*other_parent = parent;
4896				} else {
4897					ret = 0;
4898				}
4899			} else {
4900				ret = -EAGAIN;
4901			}
4902			goto out;
4903		} else if (IS_ERR(di)) {
4904			ret = PTR_ERR(di);
4905			goto out;
4906		}
4907		btrfs_release_path(search_path);
4908
4909		cur_offset += this_len;
4910	}
4911	ret = 0;
4912out:
4913	btrfs_free_path(search_path);
4914	kfree(name);
4915	return ret;
4916}
4917
4918struct btrfs_ino_list {
4919	u64 ino;
4920	u64 parent;
4921	struct list_head list;
4922};
4923
4924static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
4925				  struct btrfs_root *root,
4926				  struct btrfs_path *path,
4927				  struct btrfs_log_ctx *ctx,
4928				  u64 ino, u64 parent)
4929{
4930	struct btrfs_ino_list *ino_elem;
4931	LIST_HEAD(inode_list);
4932	int ret = 0;
4933
4934	ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
4935	if (!ino_elem)
4936		return -ENOMEM;
4937	ino_elem->ino = ino;
4938	ino_elem->parent = parent;
4939	list_add_tail(&ino_elem->list, &inode_list);
4940
4941	while (!list_empty(&inode_list)) {
4942		struct btrfs_fs_info *fs_info = root->fs_info;
4943		struct btrfs_key key;
4944		struct inode *inode;
4945
4946		ino_elem = list_first_entry(&inode_list, struct btrfs_ino_list,
4947					    list);
4948		ino = ino_elem->ino;
4949		parent = ino_elem->parent;
4950		list_del(&ino_elem->list);
4951		kfree(ino_elem);
4952		if (ret)
4953			continue;
4954
4955		btrfs_release_path(path);
4956
4957		inode = btrfs_iget(fs_info->sb, ino, root);
4958		/*
4959		 * If the other inode that had a conflicting dir entry was
4960		 * deleted in the current transaction, we need to log its parent
4961		 * directory.
4962		 */
4963		if (IS_ERR(inode)) {
4964			ret = PTR_ERR(inode);
4965			if (ret == -ENOENT) {
4966				inode = btrfs_iget(fs_info->sb, parent, root);
4967				if (IS_ERR(inode)) {
4968					ret = PTR_ERR(inode);
4969				} else {
4970					ret = btrfs_log_inode(trans, root,
4971						      BTRFS_I(inode),
4972						      LOG_OTHER_INODE_ALL,
4973						      ctx);
4974					btrfs_add_delayed_iput(inode);
4975				}
4976			}
4977			continue;
4978		}
4979		/*
4980		 * If the inode was already logged skip it - otherwise we can
4981		 * hit an infinite loop. Example:
4982		 *
4983		 * From the commit root (previous transaction) we have the
4984		 * following inodes:
4985		 *
4986		 * inode 257 a directory
4987		 * inode 258 with references "zz" and "zz_link" on inode 257
4988		 * inode 259 with reference "a" on inode 257
4989		 *
4990		 * And in the current (uncommitted) transaction we have:
4991		 *
4992		 * inode 257 a directory, unchanged
4993		 * inode 258 with references "a" and "a2" on inode 257
4994		 * inode 259 with reference "zz_link" on inode 257
4995		 * inode 261 with reference "zz" on inode 257
4996		 *
4997		 * When logging inode 261 the following infinite loop could
4998		 * happen if we don't skip already logged inodes:
4999		 *
5000		 * - we detect inode 258 as a conflicting inode, with inode 261
5001		 *   on reference "zz", and log it;
5002		 *
5003		 * - we detect inode 259 as a conflicting inode, with inode 258
5004		 *   on reference "a", and log it;
5005		 *
5006		 * - we detect inode 258 as a conflicting inode, with inode 259
5007		 *   on reference "zz_link", and log it - again! After this we
5008		 *   repeat the above steps forever.
5009		 */
5010		spin_lock(&BTRFS_I(inode)->lock);
5011		/*
5012		 * Check the inode's logged_trans only instead of
5013		 * btrfs_inode_in_log(). This is because the last_log_commit of
5014		 * the inode is not updated when we only log that it exists and
5015		 * it has the full sync bit set (see btrfs_log_inode()).
5016		 */
5017		if (BTRFS_I(inode)->logged_trans == trans->transid) {
5018			spin_unlock(&BTRFS_I(inode)->lock);
5019			btrfs_add_delayed_iput(inode);
5020			continue;
5021		}
5022		spin_unlock(&BTRFS_I(inode)->lock);
5023		/*
5024		 * We are safe logging the other inode without acquiring its
5025		 * lock as long as we log with the LOG_INODE_EXISTS mode. We
5026		 * are safe against concurrent renames of the other inode as
5027		 * well because during a rename we pin the log and update the
5028		 * log with the new name before we unpin it.
5029		 */
5030		ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
5031				      LOG_OTHER_INODE, ctx);
5032		if (ret) {
5033			btrfs_add_delayed_iput(inode);
5034			continue;
5035		}
5036
5037		key.objectid = ino;
5038		key.type = BTRFS_INODE_REF_KEY;
5039		key.offset = 0;
5040		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5041		if (ret < 0) {
5042			btrfs_add_delayed_iput(inode);
5043			continue;
5044		}
5045
5046		while (true) {
5047			struct extent_buffer *leaf = path->nodes[0];
5048			int slot = path->slots[0];
5049			u64 other_ino = 0;
5050			u64 other_parent = 0;
5051
5052			if (slot >= btrfs_header_nritems(leaf)) {
5053				ret = btrfs_next_leaf(root, path);
5054				if (ret < 0) {
5055					break;
5056				} else if (ret > 0) {
5057					ret = 0;
5058					break;
5059				}
5060				continue;
5061			}
5062
5063			btrfs_item_key_to_cpu(leaf, &key, slot);
5064			if (key.objectid != ino ||
5065			    (key.type != BTRFS_INODE_REF_KEY &&
5066			     key.type != BTRFS_INODE_EXTREF_KEY)) {
5067				ret = 0;
5068				break;
5069			}
5070
5071			ret = btrfs_check_ref_name_override(leaf, slot, &key,
5072					BTRFS_I(inode), &other_ino,
5073					&other_parent);
5074			if (ret < 0)
5075				break;
5076			if (ret > 0) {
5077				ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
5078				if (!ino_elem) {
5079					ret = -ENOMEM;
5080					break;
5081				}
5082				ino_elem->ino = other_ino;
5083				ino_elem->parent = other_parent;
5084				list_add_tail(&ino_elem->list, &inode_list);
5085				ret = 0;
5086			}
5087			path->slots[0]++;
5088		}
5089		btrfs_add_delayed_iput(inode);
5090	}
5091
5092	return ret;
5093}
5094
5095static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
5096				   struct btrfs_inode *inode,
5097				   struct btrfs_key *min_key,
5098				   const struct btrfs_key *max_key,
5099				   struct btrfs_path *path,
5100				   struct btrfs_path *dst_path,
5101				   const u64 logged_isize,
5102				   const bool recursive_logging,
5103				   const int inode_only,
5104				   struct btrfs_log_ctx *ctx,
5105				   bool *need_log_inode_item)
5106{
5107	const u64 i_size = i_size_read(&inode->vfs_inode);
5108	struct btrfs_root *root = inode->root;
5109	int ins_start_slot = 0;
5110	int ins_nr = 0;
5111	int ret;
5112
5113	while (1) {
5114		ret = btrfs_search_forward(root, min_key, path, trans->transid);
5115		if (ret < 0)
5116			return ret;
5117		if (ret > 0) {
5118			ret = 0;
5119			break;
5120		}
5121again:
5122		/* Note, ins_nr might be > 0 here, cleanup outside the loop */
5123		if (min_key->objectid != max_key->objectid)
5124			break;
5125		if (min_key->type > max_key->type)
5126			break;
5127
5128		if (min_key->type == BTRFS_INODE_ITEM_KEY) {
5129			*need_log_inode_item = false;
5130		} else if (min_key->type == BTRFS_EXTENT_DATA_KEY &&
5131			   min_key->offset >= i_size) {
5132			/*
5133			 * Extents at and beyond eof are logged with
5134			 * btrfs_log_prealloc_extents().
5135			 * Only regular files have BTRFS_EXTENT_DATA_KEY keys,
5136			 * and no keys greater than that, so bail out.
5137			 */
5138			break;
5139		} else if ((min_key->type == BTRFS_INODE_REF_KEY ||
5140			    min_key->type == BTRFS_INODE_EXTREF_KEY) &&
5141			   inode->generation == trans->transid &&
5142			   !recursive_logging) {
5143			u64 other_ino = 0;
5144			u64 other_parent = 0;
5145
5146			ret = btrfs_check_ref_name_override(path->nodes[0],
5147					path->slots[0], min_key, inode,
5148					&other_ino, &other_parent);
5149			if (ret < 0) {
5150				return ret;
5151			} else if (ret > 0 && ctx &&
5152				   other_ino != btrfs_ino(BTRFS_I(ctx->inode))) {
5153				if (ins_nr > 0) {
5154					ins_nr++;
5155				} else {
5156					ins_nr = 1;
5157					ins_start_slot = path->slots[0];
5158				}
5159				ret = copy_items(trans, inode, dst_path, path,
5160						 ins_start_slot, ins_nr,
5161						 inode_only, logged_isize);
5162				if (ret < 0)
5163					return ret;
5164				ins_nr = 0;
5165
5166				ret = log_conflicting_inodes(trans, root, path,
5167						ctx, other_ino, other_parent);
5168				if (ret)
5169					return ret;
5170				btrfs_release_path(path);
5171				goto next_key;
5172			}
5173		} else if (min_key->type == BTRFS_XATTR_ITEM_KEY) {
5174			/* Skip xattrs, logged later with btrfs_log_all_xattrs() */
5175			if (ins_nr == 0)
5176				goto next_slot;
5177			ret = copy_items(trans, inode, dst_path, path,
5178					 ins_start_slot,
5179					 ins_nr, inode_only, logged_isize);
5180			if (ret < 0)
5181				return ret;
5182			ins_nr = 0;
5183			goto next_slot;
5184		}
5185
5186		if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
5187			ins_nr++;
5188			goto next_slot;
5189		} else if (!ins_nr) {
5190			ins_start_slot = path->slots[0];
5191			ins_nr = 1;
5192			goto next_slot;
5193		}
5194
5195		ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
5196				 ins_nr, inode_only, logged_isize);
5197		if (ret < 0)
5198			return ret;
5199		ins_nr = 1;
5200		ins_start_slot = path->slots[0];
5201next_slot:
5202		path->slots[0]++;
5203		if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
5204			btrfs_item_key_to_cpu(path->nodes[0], min_key,
5205					      path->slots[0]);
5206			goto again;
5207		}
5208		if (ins_nr) {
5209			ret = copy_items(trans, inode, dst_path, path,
5210					 ins_start_slot, ins_nr, inode_only,
5211					 logged_isize);
5212			if (ret < 0)
5213				return ret;
5214			ins_nr = 0;
5215		}
5216		btrfs_release_path(path);
5217next_key:
5218		if (min_key->offset < (u64)-1) {
5219			min_key->offset++;
5220		} else if (min_key->type < max_key->type) {
5221			min_key->type++;
5222			min_key->offset = 0;
5223		} else {
5224			break;
5225		}
5226	}
5227	if (ins_nr) {
5228		ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
5229				 ins_nr, inode_only, logged_isize);
5230		if (ret)
5231			return ret;
5232	}
5233
5234	if (inode_only == LOG_INODE_ALL && S_ISREG(inode->vfs_inode.i_mode)) {
5235		/*
5236		 * Release the path because otherwise we might attempt to double
5237		 * lock the same leaf with btrfs_log_prealloc_extents() below.
5238		 */
5239		btrfs_release_path(path);
5240		ret = btrfs_log_prealloc_extents(trans, inode, dst_path);
5241	}
5242
5243	return ret;
5244}
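
/*
 * An illustration of the batching done above, assuming a leaf where slots
 * 3-5 hold changed items for our inode and slot 6 holds an xattr: slots
 * 3-5 are accumulated via ins_start_slot/ins_nr and copied to the log with
 * a single copy_items() call, the xattr is skipped (all xattrs are logged
 * separately by btrfs_log_all_xattrs()), and batching restarts at the next
 * qualifying slot.
 */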
5245
5246/* log a single inode in the tree log.
5247 * At least one parent directory for this inode must exist in the tree
5248 * or be logged already.
5249 *
5250 * Any items from this inode changed by the current transaction are copied
5251 * to the log tree.  An extra reference is taken on any extents in this
5252 * file, allowing us to avoid a whole pile of corner cases around logging
5253 * blocks that have been removed from the tree.
5254 *
5255 * See LOG_INODE_ALL and related defines for a description of what inode_only
5256 * does.
5257 *
5258 * This handles both files and directories.
5259 */
5260static int btrfs_log_inode(struct btrfs_trans_handle *trans,
5261			   struct btrfs_root *root, struct btrfs_inode *inode,
5262			   int inode_only,
5263			   struct btrfs_log_ctx *ctx)
5264{
5265	struct btrfs_path *path;
5266	struct btrfs_path *dst_path;
5267	struct btrfs_key min_key;
5268	struct btrfs_key max_key;
5269	struct btrfs_root *log = root->log_root;
5270	int err = 0;
5271	int ret = 0;
5272	bool fast_search = false;
5273	u64 ino = btrfs_ino(inode);
5274	struct extent_map_tree *em_tree = &inode->extent_tree;
5275	u64 logged_isize = 0;
5276	bool need_log_inode_item = true;
5277	bool xattrs_logged = false;
5278	bool recursive_logging = false;
5279
5280	path = btrfs_alloc_path();
5281	if (!path)
5282		return -ENOMEM;
5283	dst_path = btrfs_alloc_path();
5284	if (!dst_path) {
5285		btrfs_free_path(path);
5286		return -ENOMEM;
5287	}
5288
5289	min_key.objectid = ino;
5290	min_key.type = BTRFS_INODE_ITEM_KEY;
5291	min_key.offset = 0;
5292
5293	max_key.objectid = ino;
5294
5295
5296	/* today the code can only do partial logging of directories */
5297	if (S_ISDIR(inode->vfs_inode.i_mode) ||
5298	    (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
5299		       &inode->runtime_flags) &&
5300	     inode_only >= LOG_INODE_EXISTS))
5301		max_key.type = BTRFS_XATTR_ITEM_KEY;
5302	else
5303		max_key.type = (u8)-1;
5304	max_key.offset = (u64)-1;
5305
5306	/*
5307	 * Only run delayed items if we are a directory. We want to make sure
5308	 * all directory indexes hit the fs/subvolume tree so we can find them
5309	 * and figure out which index ranges have to be logged.
5310	 *
5311	 * Otherwise commit the delayed inode only if the full sync flag is set,
5312	 * as we want to make sure an up to date version is in the subvolume
5313	 * tree so copy_inode_items_to_log() / copy_items() can find it and copy
5314	 * it to the log tree. For a non full sync, we always log the inode item
5315	 * based on the in-memory struct btrfs_inode which is always up to date.
5316	 */
5317	if (S_ISDIR(inode->vfs_inode.i_mode))
5318		ret = btrfs_commit_inode_delayed_items(trans, inode);
5319	else if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags))
5320		ret = btrfs_commit_inode_delayed_inode(inode);
5321
5322	if (ret) {
5323		btrfs_free_path(path);
5324		btrfs_free_path(dst_path);
5325		return ret;
5326	}
5327
5328	if (inode_only == LOG_OTHER_INODE || inode_only == LOG_OTHER_INODE_ALL) {
5329		recursive_logging = true;
5330		if (inode_only == LOG_OTHER_INODE)
5331			inode_only = LOG_INODE_EXISTS;
5332		else
5333			inode_only = LOG_INODE_ALL;
5334		mutex_lock_nested(&inode->log_mutex, SINGLE_DEPTH_NESTING);
5335	} else {
5336		mutex_lock(&inode->log_mutex);
5337	}
5338
5339	/*
5340	 * For symlinks, we must always log their content, which is stored in an
5341	 * inline extent, otherwise we could end up with an empty symlink after
5342	 * log replay, which is invalid on linux (symlink(2) returns -ENOENT if
5343	 * one attempts to create an empty symlink).
5344	 * We don't need to worry about flushing delalloc, because we create
5345	 * the inline extent when the symlink is created (we never have delalloc
5346	 * for symlinks).
5347	 */
5348	if (S_ISLNK(inode->vfs_inode.i_mode))
5349		inode_only = LOG_INODE_ALL;
5350
5351	/*
5352	 * a brute force approach to making sure we get the most up to date
5353	 * copies of everything.
5354	 */
5355	if (S_ISDIR(inode->vfs_inode.i_mode)) {
5356		int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
5357
5358		if (inode_only == LOG_INODE_EXISTS)
5359			max_key_type = BTRFS_XATTR_ITEM_KEY;
5360		ret = drop_objectid_items(trans, log, path, ino, max_key_type);
5361	} else {
5362		if (inode_only == LOG_INODE_EXISTS) {
5363			/*
5364			 * Make sure the new inode item we write to the log has
5365			 * the same isize as the current one (if it exists).
5366			 * This is necessary to prevent data loss after log
5367			 * replay, and also to prevent doing a wrong expanding
5368			 * truncate - e.g. create file, write 4K into offset
5369			 * 0, fsync, write 4K into offset 4096, add hard link,
5370			 * fsync some other file (to sync log), power fail - if
5371			 * we use the inode's current i_size, after log replay
5372			 * we get an 8Kb file, with the last 4Kb extent as a hole
5373			 * (zeroes), as if an expanding truncate happened,
5374			 * instead of getting a file of 4Kb only.
5375			 */
5376			err = logged_inode_size(log, inode, path, &logged_isize);
5377			if (err)
5378				goto out_unlock;
5379		}
5380		if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
5381			     &inode->runtime_flags)) {
5382			if (inode_only == LOG_INODE_EXISTS) {
5383				max_key.type = BTRFS_XATTR_ITEM_KEY;
5384				ret = drop_objectid_items(trans, log, path, ino,
5385							  max_key.type);
5386			} else {
5387				clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
5388					  &inode->runtime_flags);
5389				clear_bit(BTRFS_INODE_COPY_EVERYTHING,
5390					  &inode->runtime_flags);
5391				while (1) {
5392					ret = btrfs_truncate_inode_items(trans,
5393						log, &inode->vfs_inode, 0, 0);
5394					if (ret != -EAGAIN)
5395						break;
5396				}
5397			}
5398		} else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
5399					      &inode->runtime_flags) ||
5400			   inode_only == LOG_INODE_EXISTS) {
5401			if (inode_only == LOG_INODE_ALL)
5402				fast_search = true;
5403			max_key.type = BTRFS_XATTR_ITEM_KEY;
5404			ret = drop_objectid_items(trans, log, path, ino,
5405						  max_key.type);
5406		} else {
5407			if (inode_only == LOG_INODE_ALL)
5408				fast_search = true;
5409			goto log_extents;
5410		}
5411
5412	}
5413	if (ret) {
5414		err = ret;
5415		goto out_unlock;
5416	}
5417
5418	err = copy_inode_items_to_log(trans, inode, &min_key, &max_key,
5419				      path, dst_path, logged_isize,
5420				      recursive_logging, inode_only, ctx,
5421				      &need_log_inode_item);
5422	if (err)
5423		goto out_unlock;
5424
5425	btrfs_release_path(path);
5426	btrfs_release_path(dst_path);
5427	err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path);
5428	if (err)
5429		goto out_unlock;
5430	xattrs_logged = true;
5431	if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
5432		btrfs_release_path(path);
5433		btrfs_release_path(dst_path);
5434		err = btrfs_log_holes(trans, root, inode, path);
5435		if (err)
5436			goto out_unlock;
5437	}
5438log_extents:
5439	btrfs_release_path(path);
5440	btrfs_release_path(dst_path);
5441	if (need_log_inode_item) {
5442		err = log_inode_item(trans, log, dst_path, inode);
5443		if (!err && !xattrs_logged) {
5444			err = btrfs_log_all_xattrs(trans, root, inode, path,
5445						   dst_path);
5446			btrfs_release_path(path);
5447		}
5448		if (err)
5449			goto out_unlock;
5450	}
5451	if (fast_search) {
5452		ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
5453						ctx);
5454		if (ret) {
5455			err = ret;
5456			goto out_unlock;
5457		}
5458	} else if (inode_only == LOG_INODE_ALL) {
5459		struct extent_map *em, *n;
5460
5461		write_lock(&em_tree->lock);
5462		list_for_each_entry_safe(em, n, &em_tree->modified_extents, list)
5463			list_del_init(&em->list);
5464		write_unlock(&em_tree->lock);
5465	}
5466
5467	if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->vfs_inode.i_mode)) {
5468		ret = log_directory_changes(trans, root, inode, path, dst_path,
5469					ctx);
5470		if (ret) {
5471			err = ret;
5472			goto out_unlock;
5473		}
5474	}
5475
5476	/*
5477	 * If we are logging that an ancestor inode exists as part of logging a
5478	 * new name from a link or rename operation, don't mark the inode as
5479	 * logged - otherwise if an explicit fsync is made against an ancestor,
5480	 * the fsync considers the inode in the log and doesn't sync the log,
5481	 * resulting in the ancestor missing after a power failure unless the
5482	 * log was synced as part of an fsync against any other unrelated inode.
5483	 * So keep it simple for this case and just don't flag the ancestors as
5484	 * logged.
5485	 */
5486	if (!ctx ||
5487	    !(S_ISDIR(inode->vfs_inode.i_mode) && ctx->logging_new_name &&
5488	      &inode->vfs_inode != ctx->inode)) {
5489		spin_lock(&inode->lock);
5490		inode->logged_trans = trans->transid;
5491		/*
5492		 * Don't update last_log_commit if we logged that an inode exists
5493		 * after it was loaded to memory (full_sync bit set).
5494		 * This is to prevent data loss when we do a write to the inode,
5495		 * then the inode gets evicted after all delalloc was flushed,
5496		 * then we log that it exists (due to a rename for example) and then
5497		 * fsync it. This last fsync would do nothing (not logging the
5498		 * extents previously written).
5499		 */
5500		if (inode_only != LOG_INODE_EXISTS ||
5501		    !test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags))
5502			inode->last_log_commit = inode->last_sub_trans;
5503		spin_unlock(&inode->lock);
5504	}
5505out_unlock:
5506	mutex_unlock(&inode->log_mutex);
5507
5508	btrfs_free_path(path);
5509	btrfs_free_path(dst_path);
5510	return err;
5511}
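
/*
 * A recap of the ordering in the tail of btrfs_log_inode() above (a summary
 * of the code, not new rules): inode items are copied to the log first, then
 * xattrs, then hole items when doing a slow full sync, then the inode item
 * and the changed extents for the fast path, then directory entries for
 * directories, and only at the very end is the inode marked as logged
 * (logged_trans/last_log_commit), with log_mutex held across the whole
 * sequence.
 */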
5512
5513/*
5514 * Check if we must fall back to a transaction commit when logging an inode.
5515 * This must be called after logging the inode and is used only in the context
5516 * when fsyncing an inode requires logging some other inode - in which
5517 * case we can't lock the i_mutex of each other inode we need to log as that
5518 * can lead to deadlocks with concurrent fsync against other inodes (as we can
5519 * log inodes up or down in the hierarchy) or rename operations for example. So
5520 * we take the log_mutex of the inode after we have logged it and then check for
5521 * its last_unlink_trans value - this is safe because any task setting
5522 * last_unlink_trans must take the log_mutex and it must do this before it does
5523 * the actual unlink operation, so if we do this check before a concurrent task
5524 * sets last_unlink_trans it means we've logged a consistent version/state of
5525 * all the inode items, otherwise we are not sure and must do a transaction
5526 * commit (the concurrent task might have only updated last_unlink_trans before
5527 * we logged the inode or it might have also done the unlink).
5528 */
5529static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans,
5530					  struct btrfs_inode *inode)
5531{
5532	struct btrfs_fs_info *fs_info = inode->root->fs_info;
5533	bool ret = false;
5534
5535	mutex_lock(&inode->log_mutex);
5536	if (inode->last_unlink_trans > fs_info->last_trans_committed) {
5537		/*
5538		 * Make sure any commits to the log are forced to be full
5539		 * commits.
5540		 */
5541		btrfs_set_log_full_commit(trans);
5542		ret = true;
5543	}
5544	mutex_unlock(&inode->log_mutex);
5545
5546	return ret;
5547}
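
/*
 * A sketch of the race the log_mutex check above guards against, in the style
 * of the CPU0/CPU1 diagrams used elsewhere in this file (task A fsyncs inode
 * X, task B unlinks one of X's names):
 *
 *        Task A (fsync)                      Task B (unlink)
 *        --------------                      ---------------
 * btrfs_log_inode(X)
 *                                     mutex_lock(&X->log_mutex)
 *                                     X->last_unlink_trans = trans->transid
 *                                     mutex_unlock(&X->log_mutex)
 * btrfs_must_commit_transaction(X)
 *   sees the updated last_unlink_trans
 *   and forces a full commit
 *
 * If task B instead loses the race for the log_mutex, task A logged a state
 * from before the unlink, which is consistent on its own.
 */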
5548
5549/*
5550 * follow the dentry parent pointers up the chain and see if any
5551 * of the directories in it require a full commit before they can
5552 * be logged.  Returns zero if nothing special needs to be done or 1 if
5553 * a full commit is required.
5554 */
5555static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
5556					       struct btrfs_inode *inode,
5557					       struct dentry *parent,
5558					       struct super_block *sb,
5559					       u64 last_committed)
5560{
5561	int ret = 0;
5562	struct dentry *old_parent = NULL;
5563
5564	/*
5565	 * for regular files, if the inode is already on disk, we don't
5566	 * have to worry about the parents at all.  This is because
5567	 * we can use the last_unlink_trans field to record renames
5568	 * and other fun in this file.
5569	 */
5570	if (S_ISREG(inode->vfs_inode.i_mode) &&
5571	    inode->generation <= last_committed &&
5572	    inode->last_unlink_trans <= last_committed)
5573		goto out;
5574
5575	if (!S_ISDIR(inode->vfs_inode.i_mode)) {
5576		if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
5577			goto out;
5578		inode = BTRFS_I(d_inode(parent));
5579	}
5580
5581	while (1) {
5582		if (btrfs_must_commit_transaction(trans, inode)) {
5583			ret = 1;
5584			break;
5585		}
5586
5587		if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
5588			break;
5589
5590		if (IS_ROOT(parent)) {
5591			inode = BTRFS_I(d_inode(parent));
5592			if (btrfs_must_commit_transaction(trans, inode))
5593				ret = 1;
5594			break;
5595		}
5596
5597		parent = dget_parent(parent);
5598		dput(old_parent);
5599		old_parent = parent;
5600		inode = BTRFS_I(d_inode(parent));
5601
5602	}
5603	dput(old_parent);
5604out:
5605	return ret;
5606}
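
/*
 * Example walk (a sketch): an fsync of /a/b/c/file first checks the file's
 * own generation and last_unlink_trans (the regular file fast path above),
 * then walks c -> b -> a -> root, calling btrfs_must_commit_transaction() on
 * each directory inode and returning 1 as soon as any of them recorded an
 * unlink or rename after the last transaction commit.
 */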
5607
5608struct btrfs_dir_list {
5609	u64 ino;
5610	struct list_head list;
5611};
5612
5613/*
5614 * Log the inodes of the new dentries of a directory. See log_dir_items() for
5615 * details about why it is needed.
5616 * This is a recursive operation - if an existing dentry corresponds to a
5617 * directory, that directory's new entries are logged too (same behaviour as
5618 * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes
5619 * the dentries point to, we do not lock their i_mutex, otherwise lockdep
5620 * complains about the following circular lock dependency / possible deadlock:
5621 *
5622 *        CPU0                                        CPU1
5623 *        ----                                        ----
5624 * lock(&type->i_mutex_dir_key#3/2);
5625 *                                            lock(sb_internal#2);
5626 *                                            lock(&type->i_mutex_dir_key#3/2);
5627 * lock(&sb->s_type->i_mutex_key#14);
5628 *
5629 * Where sb_internal is the lock (a counter that works as a lock) acquired by
5630 * sb_start_intwrite() in btrfs_start_transaction().
5631 * Not locking i_mutex of the inodes is still safe because:
5632 *
5633 * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible
5634 *    that while logging the inode new references (names) are added or removed
5635 *    from the inode, leaving the logged inode item with a link count that does
5636 *    not match the number of logged inode reference items. This is fine because
5637 *    at log replay time we compute the real number of links and correct the
5638 *    link count in the inode item (see replay_one_buffer() and
5639 *    link_to_fixup_dir());
5640 *
5641 * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that
5642 *    while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and
5643 *    BTRFS_DIR_INDEX_KEY are added to fs/subvol tree and the logged inode item
5644 *    has a size that doesn't match the sum of the lengths of all the logged
5645 *    names. This does not result in a problem because if a dir_item key is
5646 *    logged but its matching dir_index key is not logged, at log replay time we
5647 *    don't use it to replay the respective name (see replay_one_name()). On the
5648 *    other hand if only the dir_index key ends up being logged, the respective
5649 *    name is added to the fs/subvol tree with both the dir_item and dir_index
5650 *    keys created (see replay_one_name()).
5651 *    The directory's inode item with a wrong i_size is not a problem either,
5652 *    since we don't use it at log replay time to set the i_size in the inode
5653 *    item of the fs/subvol tree (see overwrite_item()).
5654 */
5655static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
5656				struct btrfs_root *root,
5657				struct btrfs_inode *start_inode,
5658				struct btrfs_log_ctx *ctx)
5659{
5660	struct btrfs_fs_info *fs_info = root->fs_info;
5661	struct btrfs_root *log = root->log_root;
5662	struct btrfs_path *path;
5663	LIST_HEAD(dir_list);
5664	struct btrfs_dir_list *dir_elem;
5665	int ret = 0;
5666
5667	path = btrfs_alloc_path();
5668	if (!path)
5669		return -ENOMEM;
5670
5671	dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS);
5672	if (!dir_elem) {
5673		btrfs_free_path(path);
5674		return -ENOMEM;
5675	}
5676	dir_elem->ino = btrfs_ino(start_inode);
5677	list_add_tail(&dir_elem->list, &dir_list);
5678
5679	while (!list_empty(&dir_list)) {
5680		struct extent_buffer *leaf;
5681		struct btrfs_key min_key;
5682		int nritems;
5683		int i;
5684
5685		dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list,
5686					    list);
5687		if (ret)
5688			goto next_dir_inode;
5689
5690		min_key.objectid = dir_elem->ino;
5691		min_key.type = BTRFS_DIR_ITEM_KEY;
5692		min_key.offset = 0;
5693again:
5694		btrfs_release_path(path);
5695		ret = btrfs_search_forward(log, &min_key, path, trans->transid);
5696		if (ret < 0) {
5697			goto next_dir_inode;
5698		} else if (ret > 0) {
5699			ret = 0;
5700			goto next_dir_inode;
5701		}
5702
5703process_leaf:
5704		leaf = path->nodes[0];
5705		nritems = btrfs_header_nritems(leaf);
5706		for (i = path->slots[0]; i < nritems; i++) {
5707			struct btrfs_dir_item *di;
5708			struct btrfs_key di_key;
5709			struct inode *di_inode;
5710			struct btrfs_dir_list *new_dir_elem;
5711			int log_mode = LOG_INODE_EXISTS;
5712			int type;
5713
5714			btrfs_item_key_to_cpu(leaf, &min_key, i);
5715			if (min_key.objectid != dir_elem->ino ||
5716			    min_key.type != BTRFS_DIR_ITEM_KEY)
5717				goto next_dir_inode;
5718
5719			di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item);
5720			type = btrfs_dir_type(leaf, di);
5721			if (btrfs_dir_transid(leaf, di) < trans->transid &&
5722			    type != BTRFS_FT_DIR)
5723				continue;
5724			btrfs_dir_item_key_to_cpu(leaf, di, &di_key);
5725			if (di_key.type == BTRFS_ROOT_ITEM_KEY)
5726				continue;
5727
5728			btrfs_release_path(path);
5729			di_inode = btrfs_iget(fs_info->sb, di_key.objectid, root);
5730			if (IS_ERR(di_inode)) {
5731				ret = PTR_ERR(di_inode);
5732				goto next_dir_inode;
5733			}
5734
5735			if (btrfs_inode_in_log(BTRFS_I(di_inode), trans->transid)) {
5736				btrfs_add_delayed_iput(di_inode);
5737				break;
5738			}
5739
5740			ctx->log_new_dentries = false;
5741			if (type == BTRFS_FT_DIR)
5742				log_mode = LOG_INODE_ALL;
5743			ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode),
5744					      log_mode, ctx);
5745			if (!ret &&
5746			    btrfs_must_commit_transaction(trans, BTRFS_I(di_inode)))
5747				ret = 1;
5748			btrfs_add_delayed_iput(di_inode);
5749			if (ret)
5750				goto next_dir_inode;
5751			if (ctx->log_new_dentries) {
5752				new_dir_elem = kmalloc(sizeof(*new_dir_elem),
5753						       GFP_NOFS);
5754				if (!new_dir_elem) {
5755					ret = -ENOMEM;
5756					goto next_dir_inode;
5757				}
5758				new_dir_elem->ino = di_key.objectid;
5759				list_add_tail(&new_dir_elem->list, &dir_list);
5760			}
5761			break;
5762		}
5763		if (i == nritems) {
5764			ret = btrfs_next_leaf(log, path);
5765			if (ret < 0) {
5766				goto next_dir_inode;
5767			} else if (ret > 0) {
5768				ret = 0;
5769				goto next_dir_inode;
5770			}
5771			goto process_leaf;
5772		}
5773		if (min_key.offset < (u64)-1) {
5774			min_key.offset++;
5775			goto again;
5776		}
5777next_dir_inode:
5778		list_del(&dir_elem->list);
5779		kfree(dir_elem);
5780	}
5781
5782	btrfs_free_path(path);
5783	return ret;
5784}
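
/*
 * A sketch of the traversal above, assuming this tree was created under "dir"
 * since the last time "dir" was logged:
 *
 * dir/                 <- start_inode, its dentries are scanned first
 * dir/a                   regular file, logged with LOG_INODE_EXISTS
 * dir/sub/                directory, logged with LOG_INODE_ALL and queued
 * dir/sub/b               processed later, when "sub" is taken off dir_list
 *
 * Since new directories are appended to the tail of dir_list and processed
 * from its head, the walk is breadth-first and never recurses on the kernel
 * stack.
 */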
5785
5786static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
5787				 struct btrfs_inode *inode,
5788				 struct btrfs_log_ctx *ctx)
5789{
5790	struct btrfs_fs_info *fs_info = trans->fs_info;
5791	int ret;
5792	struct btrfs_path *path;
5793	struct btrfs_key key;
5794	struct btrfs_root *root = inode->root;
5795	const u64 ino = btrfs_ino(inode);
5796
5797	path = btrfs_alloc_path();
5798	if (!path)
5799		return -ENOMEM;
5800	path->skip_locking = 1;
5801	path->search_commit_root = 1;
5802
5803	key.objectid = ino;
5804	key.type = BTRFS_INODE_REF_KEY;
5805	key.offset = 0;
5806	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5807	if (ret < 0)
5808		goto out;
5809
5810	while (true) {
5811		struct extent_buffer *leaf = path->nodes[0];
5812		int slot = path->slots[0];
5813		u32 cur_offset = 0;
5814		u32 item_size;
5815		unsigned long ptr;
5816
5817		if (slot >= btrfs_header_nritems(leaf)) {
5818			ret = btrfs_next_leaf(root, path);
5819			if (ret < 0)
5820				goto out;
5821			else if (ret > 0)
5822				break;
5823			continue;
5824		}
5825
5826		btrfs_item_key_to_cpu(leaf, &key, slot);
5827		/* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */
5828		if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY)
5829			break;
5830
5831		item_size = btrfs_item_size_nr(leaf, slot);
5832		ptr = btrfs_item_ptr_offset(leaf, slot);
5833		while (cur_offset < item_size) {
5834			struct btrfs_key inode_key;
5835			struct inode *dir_inode;
5836
5837			inode_key.type = BTRFS_INODE_ITEM_KEY;
5838			inode_key.offset = 0;
5839
5840			if (key.type == BTRFS_INODE_EXTREF_KEY) {
5841				struct btrfs_inode_extref *extref;
5842
5843				extref = (struct btrfs_inode_extref *)
5844					(ptr + cur_offset);
5845				inode_key.objectid = btrfs_inode_extref_parent(
5846					leaf, extref);
5847				cur_offset += sizeof(*extref);
5848				cur_offset += btrfs_inode_extref_name_len(leaf,
5849					extref);
5850			} else {
5851				inode_key.objectid = key.offset;
5852				cur_offset = item_size;
5853			}
5854
5855			dir_inode = btrfs_iget(fs_info->sb, inode_key.objectid,
5856					       root);
5857			/*
5858			 * If the parent inode was deleted, return an error to
5859			 * fall back to a transaction commit. This prevents an
5860			 * inode that was moved from a parent A to a parent B,
5861			 * whose former parent A was then deleted and which was
5862			 * then fsync'ed, from being linked at both parents
5863			 * after a log replay (with the old parent still existing).
5864			 * Example:
5865			 *
5866			 * mkdir /mnt/A
5867			 * mkdir /mnt/B
5868			 * touch /mnt/B/bar
5869			 * sync
5870			 * mv /mnt/B/bar /mnt/A/bar
5871			 * mv -T /mnt/A /mnt/B
5872			 * fsync /mnt/B/bar
5873			 * <power fail>
5874			 *
5875			 * If we ignore the old parent B which got deleted,
5876			 * after a log replay we would have file bar linked
5877			 * at both parents and the old parent B would still
5878			 * exist.
5879			 */
5880			if (IS_ERR(dir_inode)) {
5881				ret = PTR_ERR(dir_inode);
5882				goto out;
5883			}
5884
5885			if (ctx)
5886				ctx->log_new_dentries = false;
5887			ret = btrfs_log_inode(trans, root, BTRFS_I(dir_inode),
5888					      LOG_INODE_ALL, ctx);
5889			if (!ret &&
5890			    btrfs_must_commit_transaction(trans, BTRFS_I(dir_inode)))
5891				ret = 1;
5892			if (!ret && ctx && ctx->log_new_dentries)
5893				ret = log_new_dir_dentries(trans, root,
5894						   BTRFS_I(dir_inode), ctx);
5895			btrfs_add_delayed_iput(dir_inode);
5896			if (ret)
5897				goto out;
5898		}
5899		path->slots[0]++;
5900	}
5901	ret = 0;
5902out:
5903	btrfs_free_path(path);
5904	return ret;
5905}
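
/*
 * The loop above visits every name of the inode by decoding its ref items.
 * For an inode with two hard links in different directories the fs tree has,
 * as a sketch:
 *
 *   (ino INODE_REF parent_A)   -> log parent_A with LOG_INODE_ALL
 *   (ino INODE_REF parent_B)   -> log parent_B with LOG_INODE_ALL
 *
 * Extended refs (BTRFS_INODE_EXTREF_KEY) store the parent objectid in the
 * item body rather than in the key offset, hence the two decoding branches.
 */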
5906
5907static int log_new_ancestors(struct btrfs_trans_handle *trans,
5908			     struct btrfs_root *root,
5909			     struct btrfs_path *path,
5910			     struct btrfs_log_ctx *ctx)
5911{
5912	struct btrfs_key found_key;
5913
5914	btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
5915
5916	while (true) {
5917		struct btrfs_fs_info *fs_info = root->fs_info;
5918		const u64 last_committed = fs_info->last_trans_committed;
5919		struct extent_buffer *leaf = path->nodes[0];
5920		int slot = path->slots[0];
5921		struct btrfs_key search_key;
5922		struct inode *inode;
5923		u64 ino;
5924		int ret = 0;
5925
5926		btrfs_release_path(path);
5927
5928		ino = found_key.offset;
5929
5930		search_key.objectid = found_key.offset;
5931		search_key.type = BTRFS_INODE_ITEM_KEY;
5932		search_key.offset = 0;
5933		inode = btrfs_iget(fs_info->sb, ino, root);
5934		if (IS_ERR(inode))
5935			return PTR_ERR(inode);
5936
5937		if (BTRFS_I(inode)->generation > last_committed)
5938			ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
5939					      LOG_INODE_EXISTS, ctx);
5940		btrfs_add_delayed_iput(inode);
5941		if (ret)
5942			return ret;
5943
5944		if (search_key.objectid == BTRFS_FIRST_FREE_OBJECTID)
5945			break;
5946
5947		search_key.type = BTRFS_INODE_REF_KEY;
5948		ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
5949		if (ret < 0)
5950			return ret;
5951
5952		leaf = path->nodes[0];
5953		slot = path->slots[0];
5954		if (slot >= btrfs_header_nritems(leaf)) {
5955			ret = btrfs_next_leaf(root, path);
5956			if (ret < 0)
5957				return ret;
5958			else if (ret > 0)
5959				return -ENOENT;
5960			leaf = path->nodes[0];
5961			slot = path->slots[0];
5962		}
5963
5964		btrfs_item_key_to_cpu(leaf, &found_key, slot);
5965		if (found_key.objectid != search_key.objectid ||
5966		    found_key.type != BTRFS_INODE_REF_KEY)
5967			return -ENOENT;
5968	}
5969	return 0;
5970}
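
/*
 * One iteration of the loop above, sketched for a path /d1/d2/file where the
 * starting key is the (file INODE_REF d2) item: log d2 if it was created
 * after the last committed transaction, then search for (d2 INODE_REF d1) to
 * find d2's parent, and repeat until the parent is the subvolume root
 * (BTRFS_FIRST_FREE_OBJECTID).
 */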
5971
5972static int log_new_ancestors_fast(struct btrfs_trans_handle *trans,
5973				  struct btrfs_inode *inode,
5974				  struct dentry *parent,
5975				  struct btrfs_log_ctx *ctx)
5976{
5977	struct btrfs_root *root = inode->root;
5978	struct btrfs_fs_info *fs_info = root->fs_info;
5979	struct dentry *old_parent = NULL;
5980	struct super_block *sb = inode->vfs_inode.i_sb;
5981	int ret = 0;
5982
5983	while (true) {
5984		if (!parent || d_really_is_negative(parent) ||
5985		    sb != parent->d_sb)
5986			break;
5987
5988		inode = BTRFS_I(d_inode(parent));
5989		if (root != inode->root)
5990			break;
5991
5992		if (inode->generation > fs_info->last_trans_committed) {
5993			ret = btrfs_log_inode(trans, root, inode,
5994					      LOG_INODE_EXISTS, ctx);
5995			if (ret)
5996				break;
5997		}
5998		if (IS_ROOT(parent))
5999			break;
6000
6001		parent = dget_parent(parent);
6002		dput(old_parent);
6003		old_parent = parent;
6004	}
6005	dput(old_parent);
6006
6007	return ret;
6008}
6009
6010static int log_all_new_ancestors(struct btrfs_trans_handle *trans,
6011				 struct btrfs_inode *inode,
6012				 struct dentry *parent,
6013				 struct btrfs_log_ctx *ctx)
6014{
6015	struct btrfs_root *root = inode->root;
6016	const u64 ino = btrfs_ino(inode);
6017	struct btrfs_path *path;
6018	struct btrfs_key search_key;
6019	int ret;
6020
6021	/*
6022	 * For a single hard link case, go through a fast path that does not
6023	 * need to iterate the fs/subvolume tree.
6024	 */
6025	if (inode->vfs_inode.i_nlink < 2)
6026		return log_new_ancestors_fast(trans, inode, parent, ctx);
6027
6028	path = btrfs_alloc_path();
6029	if (!path)
6030		return -ENOMEM;
6031
6032	search_key.objectid = ino;
6033	search_key.type = BTRFS_INODE_REF_KEY;
6034	search_key.offset = 0;
6035again:
6036	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
6037	if (ret < 0)
6038		goto out;
6039	if (ret == 0)
6040		path->slots[0]++;
6041
6042	while (true) {
6043		struct extent_buffer *leaf = path->nodes[0];
6044		int slot = path->slots[0];
6045		struct btrfs_key found_key;
6046
6047		if (slot >= btrfs_header_nritems(leaf)) {
6048			ret = btrfs_next_leaf(root, path);
6049			if (ret < 0)
6050				goto out;
6051			else if (ret > 0)
6052				break;
6053			continue;
6054		}
6055
6056		btrfs_item_key_to_cpu(leaf, &found_key, slot);
6057		if (found_key.objectid != ino ||
6058		    found_key.type > BTRFS_INODE_EXTREF_KEY)
6059			break;
6060
6061		/*
6062		 * Don't deal with extended references because they are rare
6063		 * cases and too complex to handle (we would need to keep
6064		 * track of which subitem we are processing for each item in
6065		 * this loop, etc). So just return some error to fall back to
6066		 * a transaction commit.
6067		 */
6068		if (found_key.type == BTRFS_INODE_EXTREF_KEY) {
6069			ret = -EMLINK;
6070			goto out;
6071		}
6072
6073		/*
6074		 * Logging ancestors needs to do more searches on the fs/subvol
6075		 * tree, so it releases the path as needed to avoid deadlocks.
6076		 * Keep track of the last inode ref key and resume from that key
6077		 * after logging all new ancestors for the current hard link.
6078		 */
6079		memcpy(&search_key, &found_key, sizeof(search_key));
6080
6081		ret = log_new_ancestors(trans, root, path, ctx);
6082		if (ret)
6083			goto out;
6084		btrfs_release_path(path);
6085		goto again;
6086	}
6087	ret = 0;
6088out:
6089	btrfs_free_path(path);
6090	return ret;
6091}
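
/*
 * Example: with a single hard link the ancestors are all reachable through
 * the dentry chain, so log_new_ancestors_fast() suffices. With multiple hard
 * links, say /a/x and /b/x for the same inode, the dentry passed in only
 * reveals one parent, so the loop above iterates every BTRFS_INODE_REF_KEY
 * item of the inode and logs the new ancestors of each link.
 */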
6092
6093/*
6094 * helper function around btrfs_log_inode to make sure newly created
6095 * parent directories also end up in the log.  A minimal logging, of just
6096 * the inode item and backrefs, is done for any parent directory that is
6097 * newer than the last committed transaction
6098 */
6099static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
6100				  struct btrfs_inode *inode,
6101				  struct dentry *parent,
6102				  int inode_only,
6103				  struct btrfs_log_ctx *ctx)
6104{
6105	struct btrfs_root *root = inode->root;
6106	struct btrfs_fs_info *fs_info = root->fs_info;
6107	struct super_block *sb;
6108	int ret = 0;
6109	u64 last_committed = fs_info->last_trans_committed;
6110	bool log_dentries = false;
6111
6112	sb = inode->vfs_inode.i_sb;
6113
6114	if (btrfs_test_opt(fs_info, NOTREELOG)) {
6115		ret = 1;
6116		goto end_no_trans;
6117	}
6118
6119	/*
6120	 * If the previous transaction commit didn't complete, we must
6121	 * do a full commit ourselves.
6122	 */
6123	if (fs_info->last_trans_log_full_commit >
6124	    fs_info->last_trans_committed) {
6125		ret = 1;
6126		goto end_no_trans;
6127	}
6128
6129	if (btrfs_root_refs(&root->root_item) == 0) {
6130		ret = 1;
6131		goto end_no_trans;
6132	}
6133
6134	ret = check_parent_dirs_for_sync(trans, inode, parent, sb,
6135			last_committed);
6136	if (ret)
6137		goto end_no_trans;
6138
6139	/*
6140	 * Skip already logged inodes or inodes corresponding to tmpfiles
6141	 * (since logging them is pointless, a link count of 0 means they
6142	 * will never be accessible).
6143	 */
6144	if ((btrfs_inode_in_log(inode, trans->transid) &&
6145	     list_empty(&ctx->ordered_extents)) ||
6146	    inode->vfs_inode.i_nlink == 0) {
6147		ret = BTRFS_NO_LOG_SYNC;
6148		goto end_no_trans;
6149	}
6150
6151	ret = start_log_trans(trans, root, ctx);
6152	if (ret)
6153		goto end_no_trans;
6154
6155	ret = btrfs_log_inode(trans, root, inode, inode_only, ctx);
6156	if (ret)
6157		goto end_trans;
6158
6159	/*
6160	 * for regular files, if the inode is already on disk, we don't
6161	 * have to worry about the parents at all.  This is because
6162	 * we can use the last_unlink_trans field to record renames
6163	 * and other fun in this file.
6164	 */
6165	if (S_ISREG(inode->vfs_inode.i_mode) &&
6166	    inode->generation <= last_committed &&
6167	    inode->last_unlink_trans <= last_committed) {
6168		ret = 0;
6169		goto end_trans;
6170	}
6171
6172	if (S_ISDIR(inode->vfs_inode.i_mode) && ctx && ctx->log_new_dentries)
6173		log_dentries = true;
6174
6175	/*
6176	 * On unlink we must make sure all our current and old parent directory
6177	 * inodes are fully logged. This is to prevent leaving dangling
6178	 * directory index entries in directories that were our parents but are
6179	 * not anymore. Not doing this results in the old parent directory being
6180	 * impossible to delete after log replay (rmdir will always fail with
6181	 * error -ENOTEMPTY).
6182	 *
6183	 * Example 1:
6184	 *
6185	 * mkdir testdir
6186	 * touch testdir/foo
6187	 * ln testdir/foo testdir/bar
6188	 * sync
6189	 * unlink testdir/bar
6190	 * xfs_io -c fsync testdir/foo
6191	 * <power failure>
6192	 * mount fs, triggers log replay
6193	 *
6194	 * If we don't log the parent directory (testdir), after log replay the
6195	 * directory still has an entry pointing to the file inode using the bar
6196	 * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and
6197	 * the file inode has a link count of 1.
6198	 *
6199	 * Example 2:
6200	 *
6201	 * mkdir testdir
6202	 * touch foo
6203	 * ln foo testdir/foo2
6204	 * ln foo testdir/foo3
6205	 * sync
6206	 * unlink testdir/foo3
6207	 * xfs_io -c fsync foo
6208	 * <power failure>
6209	 * mount fs, triggers log replay
6210	 *
6211	 * Similar to the first example, after log replay the parent directory
6212	 * testdir still has an entry pointing to the file inode using the foo3
6213	 * name, but the file inode does not have a matching BTRFS_INODE_REF_KEY item
6214	 * and has a link count of 2.
6215	 */
6216	if (inode->last_unlink_trans > last_committed) {
6217		ret = btrfs_log_all_parents(trans, inode, ctx);
6218		if (ret)
6219			goto end_trans;
6220	}
6221
6222	ret = log_all_new_ancestors(trans, inode, parent, ctx);
6223	if (ret)
6224		goto end_trans;
6225
6226	if (log_dentries)
6227		ret = log_new_dir_dentries(trans, root, inode, ctx);
6228	else
6229		ret = 0;
6230end_trans:
6231	if (ret < 0) {
6232		btrfs_set_log_full_commit(trans);
6233		ret = 1;
6234	}
6235
6236	if (ret)
6237		btrfs_remove_log_ctx(root, ctx);
6238	btrfs_end_log_trans(root);
6239end_no_trans:
6240	return ret;
6241}
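
/*
 * Return convention of btrfs_log_inode_parent() (summarizing the code above):
 * 0 means the inode was logged and the caller may sync the log, 1 means the
 * caller must fall back to a full transaction commit, and BTRFS_NO_LOG_SYNC
 * means there is nothing to sync (the inode is already in the log or is an
 * unlinked tmpfile).
 */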
6242
6243/*
6244 * it is not safe to log a dentry if the chunk root has added new
6245 * chunks.  This returns 0 if the dentry was logged, and 1 otherwise.
6246 * If this returns 1, you must commit the transaction to safely get your
6247 * data on disk.
6248 */
6249int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
6250			  struct dentry *dentry,
6251			  struct btrfs_log_ctx *ctx)
6252{
6253	struct dentry *parent = dget_parent(dentry);
6254	int ret;
6255
6256	ret = btrfs_log_inode_parent(trans, BTRFS_I(d_inode(dentry)), parent,
6257				     LOG_INODE_ALL, ctx);
6258	dput(parent);
6259
6260	return ret;
6261}
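
/*
 * A simplified sketch of how the fsync path is expected to consume these
 * return values (mirroring the structure of btrfs_sync_file(); error
 * handling omitted):
 *
 *	ret = btrfs_log_dentry_safe(trans, dentry, &ctx);
 *	if (ret == 0)
 *		ret = btrfs_sync_log(trans, root, &ctx);   // log sync is enough
 *	else if (ret != BTRFS_NO_LOG_SYNC)
 *		ret = btrfs_commit_transaction(trans);     // full commit needed
 */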
6262
6263/*
6264 * should be called during mount to replay any log trees
6265 * from the FS
6266 */
6267int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
6268{
6269	int ret;
6270	struct btrfs_path *path;
6271	struct btrfs_trans_handle *trans;
6272	struct btrfs_key key;
6273	struct btrfs_key found_key;
6274	struct btrfs_root *log;
6275	struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
6276	struct walk_control wc = {
6277		.process_func = process_one_buffer,
6278		.stage = LOG_WALK_PIN_ONLY,
6279	};
6280
6281	path = btrfs_alloc_path();
6282	if (!path)
6283		return -ENOMEM;
6284
6285	set_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
6286
6287	trans = btrfs_start_transaction(fs_info->tree_root, 0);
6288	if (IS_ERR(trans)) {
6289		ret = PTR_ERR(trans);
6290		goto error;
6291	}
6292
6293	wc.trans = trans;
6294	wc.pin = 1;
6295
6296	ret = walk_log_tree(trans, log_root_tree, &wc);
6297	if (ret) {
6298		btrfs_handle_fs_error(fs_info, ret,
6299			"Failed to pin buffers while recovering log root tree.");
6300		goto error;
6301	}
6302
6303again:
6304	key.objectid = BTRFS_TREE_LOG_OBJECTID;
6305	key.offset = (u64)-1;
6306	key.type = BTRFS_ROOT_ITEM_KEY;
6307
6308	while (1) {
6309		ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
6310
6311		if (ret < 0) {
6312			btrfs_handle_fs_error(fs_info, ret,
6313				    "Couldn't find tree log root.");
6314			goto error;
6315		}
6316		if (ret > 0) {
6317			if (path->slots[0] == 0)
6318				break;
6319			path->slots[0]--;
6320		}
6321		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
6322				      path->slots[0]);
6323		btrfs_release_path(path);
6324		if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
6325			break;
6326
6327		log = btrfs_read_tree_root(log_root_tree, &found_key);
6328		if (IS_ERR(log)) {
6329			ret = PTR_ERR(log);
6330			btrfs_handle_fs_error(fs_info, ret,
6331				    "Couldn't read tree log root.");
6332			goto error;
6333		}
6334
6335		wc.replay_dest = btrfs_get_fs_root(fs_info, found_key.offset,
6336						   true);
6337		if (IS_ERR(wc.replay_dest)) {
6338			ret = PTR_ERR(wc.replay_dest);
6339
6340			/*
6341			 * We didn't find the subvol, likely because it was
6342			 * deleted.  This is ok, simply skip this log and go to
6343			 * the next one.
6344			 *
6345			 * We need to exclude the root because we can't have
6346			 * other log replays overwriting this log as we'll read
6347			 * it back in a few more times.  This will keep our
6348			 * block from being modified, and we'll just bail for
6349			 * each subsequent pass.
6350			 */
6351			if (ret == -ENOENT)
6352				ret = btrfs_pin_extent_for_log_replay(trans,
6353							log->node->start,
6354							log->node->len);
6355			btrfs_put_root(log);
6356
6357			if (!ret)
6358				goto next;
6359			btrfs_handle_fs_error(fs_info, ret,
6360				"Couldn't read target root for tree log recovery.");
6361			goto error;
6362		}
6363
6364		wc.replay_dest->log_root = log;
6365		btrfs_record_root_in_trans(trans, wc.replay_dest);
6366		ret = walk_log_tree(trans, log, &wc);
6367
6368		if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
6369			ret = fixup_inode_link_counts(trans, wc.replay_dest,
6370						      path);
6371		}
6372
6373		if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
6374			struct btrfs_root *root = wc.replay_dest;
6375
6376			btrfs_release_path(path);
6377
6378			/*
6379			 * We have just replayed everything, and the highest
6380			 * objectid of fs roots probably has changed in case
6381			 * some inode_items got replayed.
6382			 *
6383			 * root->objectid_mutex is not acquired as log replay
6384			 * could only happen during mount.
6385			 */
6386			ret = btrfs_find_highest_objectid(root,
6387						  &root->highest_objectid);
6388		}
6389
6390		wc.replay_dest->log_root = NULL;
6391		btrfs_put_root(wc.replay_dest);
6392		btrfs_put_root(log);
6393
6394		if (ret)
6395			goto error;
6396next:
6397		if (found_key.offset == 0)
6398			break;
6399		key.offset = found_key.offset - 1;
6400	}
6401	btrfs_release_path(path);
6402
6403	/* step one is to pin it all, step two is to replay just inodes */
6404	if (wc.pin) {
6405		wc.pin = 0;
6406		wc.process_func = replay_one_buffer;
6407		wc.stage = LOG_WALK_REPLAY_INODES;
6408		goto again;
6409	}
6410	/* step three is to replay everything */
6411	if (wc.stage < LOG_WALK_REPLAY_ALL) {
6412		wc.stage++;
6413		goto again;
6414	}
6415
6416	btrfs_free_path(path);
6417
6418	/* step four: commit the transaction, which also unpins the blocks */
6419	ret = btrfs_commit_transaction(trans);
6420	if (ret)
6421		return ret;
6422
6423	log_root_tree->log_root = NULL;
6424	clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
6425	btrfs_put_root(log_root_tree);
6426
6427	return 0;
6428error:
6429	if (wc.trans)
6430		btrfs_end_transaction(wc.trans);
6431	clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
6432	btrfs_free_path(path);
6433	return ret;
6434}
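
/*
 * To recap the passes above: every log tree is walked once with wc.pin set to
 * pin all its blocks, then re-walked with replay_one_buffer() through the
 * LOG_WALK_REPLAY_INODES, LOG_WALK_REPLAY_DIR_INDEX and LOG_WALK_REPLAY_ALL
 * stages, and finally the transaction commit unpins everything the first pass
 * pinned.
 */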
6435
6436/*
6437 * there are some corner cases where we want to force a full
6438 * commit instead of allowing a directory to be logged.
6439 *
6440 * They revolve around files that were unlinked from the directory, and
6441 * this function updates the parent directory so that a full commit is
6442 * properly done if it is fsync'd later after the unlinks are done.
6443 *
6444 * Must be called before the unlink operations (updates to the subvolume tree,
6445 * inodes, etc) are done.
6446 */
6447void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
6448			     struct btrfs_inode *dir, struct btrfs_inode *inode,
6449			     int for_rename)
6450{
6451	/*
6452	 * when we're logging a file, if it hasn't been renamed
6453	 * or unlinked, and its inode is fully committed on disk,
6454	 * we don't have to worry about walking up the directory chain
6455	 * to log its parents.
6456	 *
6457	 * So, we use the last_unlink_trans field to put this transid
6458	 * into the file.  When the file is logged we check it and
6459	 * don't log the parents if the file is fully on disk.
6460	 */
6461	mutex_lock(&inode->log_mutex);
6462	inode->last_unlink_trans = trans->transid;
6463	mutex_unlock(&inode->log_mutex);
6464
6465	/*
6466	 * if this directory was already logged, any new
6467	 * names for this file/dir will get recorded
6468	 */
6469	if (dir->logged_trans == trans->transid)
6470		return;
6471
6472	/*
6473	 * if the inode we're about to unlink was logged,
6474	 * the log will be properly updated for any new names
6475	 */
6476	if (inode->logged_trans == trans->transid)
6477		return;
6478
6479	/*
6480	 * when renaming files across directories, if the directory
6481	 * we're unlinking from gets fsync'd later on, there's
6482	 * no way to find the destination directory later and fsync it
6483	 * properly.  So, we have to be conservative and force commits
6484	 * so the new name gets discovered.
6485	 */
6486	if (for_rename)
6487		goto record;
6488
6489	/* we can safely do the unlink without any special recording */
6490	return;
6491
6492record:
6493	mutex_lock(&dir->log_mutex);
6494	dir->last_unlink_trans = trans->transid;
6495	mutex_unlock(&dir->log_mutex);
6496}
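
/*
 * Example of the rename case recorded above, in the style of the other
 * examples in this file:
 *
 * mkdir dir1
 * mkdir dir2
 * touch dir1/foo
 * sync
 * mv dir1/foo dir2/foo     <- records the transid in dir1->last_unlink_trans
 * fsync dir1
 *
 * The fsync of dir1 sees the recorded transid and falls back to a full
 * transaction commit, so the new name dir2/foo survives a power failure.
 */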
6497
6498/*
6499 * Make sure that if someone attempts to fsync the parent directory of a deleted
6500 * snapshot, it ends up triggering a transaction commit. This is to guarantee
6501 * that after replaying the log tree of the parent directory's root we will not
6502 * see the snapshot anymore and at log replay time we will not see any log tree
6503 * corresponding to the deleted snapshot's root, which could lead to replaying
6504 * it after replaying the log tree of the parent directory (which would replay
6505 * the snapshot delete operation).
6506 *
6507 * Must be called before the actual snapshot destroy operation (updates to the
6508 * parent root and tree of tree roots trees, etc) are done.
6509 */
6510void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
6511				   struct btrfs_inode *dir)
6512{
6513	mutex_lock(&dir->log_mutex);
6514	dir->last_unlink_trans = trans->transid;
6515	mutex_unlock(&dir->log_mutex);
6516}
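
/*
 * Example (a sketch): after deleting a snapshot at dir/snap, an fsync of dir
 * finds dir->last_unlink_trans set to the current transaction and falls back
 * to a full commit, so log replay can never resurrect the deleted snapshot's
 * directory entry.
 */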
6517
6518/*
6519 * Call this after adding a new name for a file and it will properly
6520 * update the log to reflect the new name.
6521 */
6522void btrfs_log_new_name(struct btrfs_trans_handle *trans,
6523			struct btrfs_inode *inode, struct btrfs_inode *old_dir,
6524			struct dentry *parent)
6525{
6526	struct btrfs_log_ctx ctx;
6527
6528	/*
6529	 * this will force the logging code to walk the dentry chain
6530	 * up for the file
6531	 */
6532	if (!S_ISDIR(inode->vfs_inode.i_mode))
6533		inode->last_unlink_trans = trans->transid;
6534
6535	/*
6536	 * if this inode hasn't been logged and the directory we're renaming it
6537	 * from hasn't been logged, we don't need to log it
6538	 */
6539	if (!inode_logged(trans, inode) &&
6540	    (!old_dir || !inode_logged(trans, old_dir)))
6541		return;
6542
6543	btrfs_init_log_ctx(&ctx, &inode->vfs_inode);
6544	ctx.logging_new_name = true;
6545	/*
6546	 * We don't care about the return value. If we fail to log the new name
6547	 * then we know the next attempt to sync the log will fall back to a full
6548	 * transaction commit (due to a call to btrfs_set_log_full_commit()), so
6549	 * we don't need to worry about getting a log committed that has an
6550	 * inconsistent state after a rename operation.
6551	 */
6552	btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx);
6553}
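
/*
 * A usage sketch: the rename and link paths call this right after the new
 * name has been added to the subvolume tree, roughly:
 *
 *	btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir), parent);
 *
 * with old_dir being NULL for a plain link(2), since in that case there is
 * no directory the name is being moved away from.
 */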
6554
6555