tree-log.c - OpenGrok cross reference for /kernel/linux/linux-6.6/fs/btrfs/tree-log.c

Lines Matching defs:log
13 #include "tree-log.h"
38  * LOG_INODE_ALL means to log everything
39  * LOG_INODE_EXISTS means to log just enough to recreate the inode
40  * during log replay
51  * log, we must force a full commit before doing an fsync of the directory
65  * 2) we must log any new names for any file or dir that is in the fsync
66  * log. ---> check inode while renaming/linking.
68  * 2a) we must log any new names for any file or dir during rename
94  * we find in the log are created in the subvolume.
115 				       struct btrfs_root *log,
121  * tree logging is a special write ahead log used to make sure that
134  * After a crash, items are copied out of the log-tree back into the
136  * allocation tree, and the log-tree freed.
138  * The log tree is read three times, once to pin down all the extents it is
144  * start a sub transaction and setup the log tree
145  * this increments the log tree writer count to make the people
159 	 * First check if the log root tree was already created. If not, create
202 		 * nodes from multiple log transactions to do sequential
232  * returns 0 if there was a log transaction running and we were able
261  * This either makes the current running log transaction wait
263  * log transactions wait until you call btrfs_end_log_trans()
271  * indicate we're done making changes to the log tree
284  * processing the log tree.  The stage field tells us which part
285  * of the log tree processing we are currently doing.  The others
290 	 * at transaction commit time while freeing a log tree
295 	 * log trees
320 	int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
327 static int process_one_buffer(struct btrfs_root *log,
331 	struct btrfs_fs_info *fs_info = log->fs_info;
391 	 * This is only used during log replay, so the root is always from a
392 	 * fs/subvolume tree. In case we ever need to support a log root, then
394 	 * the leaf before writing into the log tree. See the comments at
467 			 * the rest of the items in this log.
487 		 * the items in this log.
521 	 * log replay inserts and removes directory items based on the
599  * This can only be called for subvolume roots and not for the log
616  * extents in the log tree have not been allocated out of the extent
744 		 * as the owner of the file extent changed from log tree
815 			 * overlap each other. For example, imagine our log has
827 			 * that starts at disk byte 12845056, and the log tree
912 	 * that future checks for a name during log replay see that the name
920  * subvolume, directory names in the log and directory names in the
1009  * helper function to check a log tree for a named back reference in
1011  * found in the subvolume conflicts with what we find in the log.
1016  * link is also in the log.
1018 static noinline int backref_in_log(struct btrfs_root *log,
1030 	ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
1085 		 * if they are in the log.  if so, we allow them to stay
1257  * Take an inode reference item from the log tree and iterate all names from the
1259  * For any name that is not in the inode reference item from the log tree, do a
1342  * replay one inode back reference item found in the log tree.
1343  * eb, slot and key refer to the buffer and key found in the log tree.
1349 				  struct btrfs_root *log,
1383 	 * it is possible that we didn't log all the parent directories
1432 			ret = __add_inode_ref(trans, root, path, log,
1465 	 * with the item from the log tree, we must unlink all names from the
1590  * lots of complexity to the log code, we just scan the backrefs
1745  * when replaying the log for a directory, we only insert names
1808  * take a single entry in a log directory item and replay it into
1815  * If a name from the log points to a file or directory that does
1901 	 * Check if the inode reference exists in the log for the given name,
1961 	/* We only log dir index keys, which only contain a single dir item. */
1988 	 * mount fs, log replay happens
2013  * items in the log copied from the subvolume, and range items
2014  * created in the log while the subvolume was logged.
2016  * The range items tell us which parts of the key space the log
2018  * directory is in a logged range item, but not actually in the log
2094  * this looks for a given directory item in the log.  If the directory
2095  * item is not in the log, the item is removed and the inode it points
2099 				      struct btrfs_root *log,
2115 	 * Currently we only log dir index keys. Even if we replay a log created
2129 	if (log) {
2132 		log_di = btrfs_lookup_dir_index_item(trans, log, log_path,
2139 			/* The dentry exists in the log, we have nothing to do. */
2176 			      struct btrfs_root *log,
2229 			log_di = btrfs_lookup_xattr(NULL, log, log_path, ino,
2233 				/* Doesn't exist in log tree, so delete it. */
2274  * out of the log or out of backreferences from inodes.  It
2275  * scans the log to find ranges of keys that log is authoritative for,
2277  * not present in the log.
2279  * Anything we don't find in the log is unlinked and removed from the
2284 				       struct btrfs_root *log,
2305 	 * from the log
2318 			ret = find_dir_range(log, path, dirid,
2353 			ret = check_item_in_log(trans, log, path,
2376  * the process_func used to replay items from the log tree.  This
2380  * The second stage copies all the other item types from the log into
2383  * only in the log (references come from either directory items or inode
2386 static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
2430 			 * possible we are replaying a log created with a kernel
2431 			 * that used to log tmpfiles.
2439 			ret = replay_xattr_deletes(wc->trans, root, log,
2446 					 root, log, path, key.objectid, 0);
2457 			 * size. We need to do it now and not after log replay
2521 			ret = add_inode_ref(wc->trans, root, log, path,
2533 		 * We don't log BTRFS_DIR_ITEM_KEY keys anymore, only the
2535 		 * BTRFS_DIR_ITEM_KEY items. If we are replaying a log from an
2544  * Correctly adjust the reserved bytes occupied by a log tree extent buffer
2710 			 struct btrfs_root *log, struct walk_control *wc)
2722 	level = btrfs_header_level(log->node);
2724 	path->nodes[level] = log->node;
2725 	atomic_inc(&log->node->refs);
2729 		wret = walk_down_log_tree(trans, log, path, &level, wc);
2737 		wret = walk_up_log_tree(trans, log, path, &level, wc);
2748 		ret = wc->process_func(log, path->nodes[orig_level], wc,
2763  * helper function to update the item for a given subvolume's log root
2764  * in the tree of log roots
2767 			   struct btrfs_root *log,
2770 	struct btrfs_fs_info *fs_info = log->fs_info;
2773 	if (log->log_transid == 1) {
2776 				&log->root_key, root_item);
2779 				&log->root_key, root_item);
2790 	 * we only allow two pending log transactions at a time,
2835  * Invoked in log mutex context, or be sure there is no other task which
2851  * btrfs_sync_log sends a given tree log down to the disk and
2870 	struct btrfs_root *log = root->log_root;
2895 	/* wait for previous tree log sync to complete */
2901 		/* when we're on an ssd, just kick the log commit out */
2929 	ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark);
2932 	 *  commit, writes a dirty extent in this tree-log commit. This
2950 	 * have a consistent view of the log root we are trying to commit at
2961 	btrfs_set_root_node(&log->root_item, log->node);
2962 	memcpy(&new_root_item, &log->root_item, sizeof(new_root_item));
2965 	log->log_transid = root->log_transid;
2968 	 * IO has been started, blocks of the log tree have WRITTEN flag set
2969 	 * in their headers. new modifications of the log will be written to
2970 	 * new positions. so it's safe to allow log writers to go in.
3000 	ret = update_log_root(trans, log, &new_root_item);
3009 				  "failed to update log for root %llu ret %d",
3011 		btrfs_wait_tree_log_extents(log, mark);
3027 		ret = btrfs_wait_tree_log_extents(log, mark);
3044 	 * now that we've moved on to the tree of log tree roots,
3049 		btrfs_wait_tree_log_extents(log, mark);
3066 		btrfs_wait_tree_log_extents(log, mark);
3074 	ret = btrfs_wait_tree_log_extents(log, mark);
3134 	 * log must wait for the previous log transaction to commit if it's
3135 	 * still in progress or wait for the current log transaction commit if
3137 	 * first log transaction has an ID of 0.
3173 			  struct btrfs_root *log)
3181 	if (log->node) {
3182 		ret = walk_log_tree(trans, log, &wc);
3185 			 * We weren't able to traverse the entire log tree, the
3191 				&log->fs_info->fs_state);
3194 			 * Some extent buffers of the log tree may still be dirty
3196 			 * have updates to a log tree without syncing a log tree,
3201 			btrfs_write_marked_extents(log->fs_info,
3202 						   &log->dirty_log_pages,
3204 			btrfs_wait_tree_log_extents(log,
3210 				btrfs_handle_fs_error(log->fs_info, ret, NULL);
3214 	clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1,
3216 	extent_io_tree_release(&log->log_csum_range);
3218 	btrfs_put_root(log);
3222  * free all the extents used by the tree log.  This should be called
3274 	 * If no log tree was created for this root in this transaction, then
3278 	 * a log tree gets created after this.
3286 	 * We have a log tree and the inode's logged_trans is 0. We can't tell
3290 	 * operations, and then further updating the log in followup rename and
3306 	 * case 3) and return true. So we do a search in the log root for the inode
3350 	 * maximum possible value, so that the next attempt to log the inode does
3353 	 * to insert duplicate dir index keys in the log tree. This must be done
3365  * Delete a directory entry from the log if it exists.
3372 			     struct btrfs_root *log,
3381 	 * We only log dir index items of a directory, so we don't need to look
3384 	di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
3393 	 * inode item because on log replay we update the field to reflect
3396 	return btrfs_delete_one_dir_name(trans, log, path, di);
3410  * didn't get fsync'd again so the log has back refs for X and X.link.
3413  * log when a file that was logged in the current transaction is
3414  * unlinked.  Any later fsync will include the updated log entries, and
3464 	struct btrfs_root *log;
3479 	log = root->log_root;
3482 	ret = btrfs_del_inode_ref(trans, log, name, btrfs_ino(inode),
3491  * creates a range item in the log for 'dirid'.  first_offset and
3492  * last_offset tell us which parts of the key space the log should
3496 				       struct btrfs_root *log,
3508 	ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
3545 	struct btrfs_root *log = inode->root->log_root;
3590 	ret = btrfs_insert_empty_items(trans, log, dst_path, &batch);
3637 	struct btrfs_root *log = inode->root->log_root;
3648 	 * clone before modifying the log tree. See the comment at copy_items()
3676 		 * in past transactions. However if we find a gap, we must log a
3678 		 * gap are deleted during log replay.
3682 				ret = insert_dir_log_key(trans, log, dst_path,
3698 		 * We must make sure that when we log a directory entry, the
3699 		 * corresponding inode, after log replay, has a matching link
3708 		 * <mount fs and log replay>
3710 		 * Would result in a fsync log that when replayed, our file inode
3745  * log all the items included in the current transaction for a given
3746  * directory.  This also creates the range items in the log tree required
3758 	struct btrfs_root *log = root->log_root;
3816 		 * happened. So the key range item we log (key type
3833 	 * need to log the parent directory of an inode. This means a dir index
3837 	 * not log dir index keys that come after the one that was just deleted
3840 	 * entries we should not remove at log replay time.
3856 	 * we have a block from this transaction, log every item in it
3892 			 * that next key. So the key range item we log (key type
3916 		 * change in the current transaction), then we don't need to log
3921 			ret = insert_dir_log_key(trans, log, path, ino,
3933  * is to avoid lookups in the log tree every time we try to insert a dir index
3996  * from the current transaction and write them to the log.
3999  * key in the range logged that is not present in the log tree, then it means
4037  * a helper function to drop items from the log before we relog an
4043 				  struct btrfs_root *log,
4058 		ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
4079 		ret = btrfs_del_items(trans, log, path, start_slot,
4159 	 * only waits for writeback to complete. During log replay as we find
4174 			  struct btrfs_root *log, struct btrfs_path *path,
4183 	 * it exists in the log tree. For performance reasons, in this case use
4191 		ret = btrfs_search_slot(trans, log, &inode->location, path, 0, 1);
4198 		 * so the inode item is not in the log and we need to insert it.
4205 		ret = btrfs_insert_empty_item(trans, log, path, &inode->location,
4240 	 * can leave checksum items in the log with overlapping ranges.
4250 	 * This would leave csum items in the log tree that cover the same range
4251 	 * and break the searches for checksums in the log tree, resulting in
4253 	 * trim and adjust) any existing csum items in the log for this range.
4272 	struct btrfs_root *log = inode->root->log_root;
4287 	 * use the clone. This is because otherwise we would be changing the log
4290 	 * creates a nasty lock dependency when COWing log tree nodes/leaves:
4292 	 * 1) Modifying the log tree triggers an extent buffer allocation while
4293 	 *    holding a write lock on a parent extent buffer from the log tree.
4299 	 * 2) Allocating a metadata extent for a log tree can trigger the async
4305 	 *    on the parent extent buffer in the log tree.
4360 		 * log a lot more metadata for common cases like doing only a
4367 		 * generations copied into it. We also must always log prealloc
4369 		 * them on log replay.
4386 		 * no need to log them.
4416 				ret = log_csums(trans, inode, log, sums);
4437 	ret = btrfs_insert_empty_items(trans, log, dst_path, &batch);
4629 	struct btrfs_root *log = inode->root->log_root;
4658 	ret = log_extent_csums(trans, inode, log, em, ctx);
4667 	 * but also adds significant contention in a log tree, since log trees
4677 		ret = btrfs_drop_extents(trans, log, inode, &drop_args);
4687 		ret = btrfs_insert_empty_item(trans, log, path, &key,
4705  * lose them after doing a full/fast fsync and replaying the log. We scan the
4707  * otherwise we can log incorrect extent items based on extent map conversion.
4742 	 * otherwise we end up losing part of the prealloc extent after a log
4806 			 * and leading to duplicate keys in the log tree.
4868 		/* We log prealloc extents beyond eof later. */
4915 	 * before it commits and wipes out the log trees, otherwise we would
4937 static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode,
4947 	ret = btrfs_search_slot(NULL, log, &key, path, 0, 0);
4961 		 * that we get a correct inode size after replaying the log
4965 		 * data loss when replaying a log due to previously doing a
4978  * At the moment we always log all xattrs. This is to figure out at log replay
4980  * in the log tree and exists in the fs/subvol tree, we delete it. This is
4982  * happens, causing the log to be replayed the next time the fs is mounted,
5061  * if any holes exist and then log them. We also need to log holes after any
5105 		/* We have a hole, log it. */
5112 			 * leaves from the log root.
5159  * log replay time we can lose inode Y (and all its files if it's a directory):
5168  * mount fs, trigger log replay
5170  * After the log replay procedure, we would lose the first directory and all its
5182  * entry (file or directory) otherwise we end up with an unreplayable log due to
5295  * Check if we need to log an inode. This is used in contexts where while
5296  * logging an inode we need to log another inode (either that it exists or in
5298  * requires the inode to be in the log and have the log transaction committed,
5299  * while here we do not care if the log transaction was already committed - our
5300  * caller will commit the log later - and we want to avoid logging an inode
5301  * multiple times when multiple tasks have joined the same log transaction.
5317 	 * the log by link/unlink/rename operations.
5319 	 * logged_trans will be 0, in which case we have to fully log it since
5354  * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible
5358  *    at log replay time we compute the real number of links and correct the
5362  * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that
5366  *    names - this is ok, not a problem, because at log replay time we set the
5385 	 * don't bother logging new dentries, as we just want to log the names
5601 	 *    during log replay. So we just log the parent, which will result in
5605 	 * 2) Do nothing if it's not a directory. During log replay we simply
5623 		/* Conflicting inode is a directory, so we'll log its parent. */
5657 	 *   on reference "zz", and log it;
5660 	 *   on reference "a", and log it;
5663 	 *   on reference "zz_link", and log it - again! After this we
5666 	 * Here we can use need_log_inode() because we only need to log the
5667 	 * inode in LOG_INODE_EXISTS mode and rename operations update the log,
5668 	 * so that the log ends up with the new name and without the old name.
5726 		 * deleted in the current transaction, we need to log its parent
5741 			 * Always log the directory, we cannot make this
5757 		 * Here we can use need_log_inode() because we only need to log
5759 		 * update the log, so that the log ends up with the new name and
5773 		 * lock as long as we log with the LOG_INODE_EXISTS mode. We
5775 		 * well because during a rename we pin the log and update the
5776 		 * log with the new name before we unpin it.
5950 				      struct btrfs_root *log,
5958 	ret = btrfs_insert_empty_items(trans, log, path, batch);
5987 	struct btrfs_root *log = inode->root->log_root;
6001 	/* We are adding dir index items to the log tree. */
6006 	 * to the log tree. However just after we collected them, they may have
6008 	 * could have copied them from the subvolume tree to the log tree.
6038 			ret = insert_delayed_items_batch(trans, log, path,
6061 	ret = insert_delayed_items_batch(trans, log, path, &batch, first);
6092 		 * this we log a single dir range item spanning several contiguous
6157 	struct btrfs_root *log = inode->root->log_root;
6176 		ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
6193 		 * existing one. Otherwise we have to log a dir range item.
6209 		ret = insert_dir_log_key(trans, log, path, key.objectid,
6230 	 * We are deleting dir index items from the log tree or adding range
6261 	 * No need for the log mutex, plus to avoid potential deadlocks or
6262 	 * lockdep annotations due to nesting of delayed inode mutexes and log
6314 /* log a single inode in the tree log.
6319  * to the log tree.  An extra reference is taken on any extents in this
6337 	struct btrfs_root *log = inode->root->log_root;
6382 	 * items of this directory and not log the delayed items directly. This
6391 	 * So in such a case we directly log the delayed items of the main
6418 	 * For symlinks, we must always log their content, which is stored in an
6420 	 * log replay, which is invalid on linux (symlink(2) returns -ENOENT if
6442 	 * a file after replaying the log. For example, if we move a file from a
6445 	 * result in losing the file after a log replay.
6459 			ret = drop_inode_items(trans, log, path, inode,
6464 			 * Make sure the new inode item we write to the log has
6466 			 * This is necessary to prevent data loss after log
6470 			 * fsync some other file (to sync log), power fail - if
6471 			 * we use the inode's current i_size, after log replay
6476 			ret = logged_inode_size(log, inode, path, &logged_isize);
6485 					ret = drop_inode_items(trans, log, path,
6493 					ret = truncate_inode_items(trans, log,
6503 				ret = drop_inode_items(trans, log, path, inode,
6550 		ret = log_inode_item(trans, log, dst_path, inode, inode_item_dropped);
6555 		 * in this transaction, we don't need to log the xattrs because
6603 	 *    transaction, but we did not previously log the inode with
6605 	 *    it was loaded again and this LOG_INODE_EXISTS log operation
6608 	 *    updated inode item, etc, and syncs the log. The same logic
6611 	 * 2) When we log the inode with LOG_INODE_EXISTS, its inode item
6614 	 *    truncate operation, the log is synced through an fsync of
6618 	 *    the new i_size, and syncs the log.
6623 	 *    against an ancestor, the fsync considers the inode in the log
6624 	 *    and doesn't sync the log, resulting in the ancestor missing after
6625 	 *    a power failure unless the log was synced as part of an fsync
6739 			 * a log replay (and the old parent still existing).
6752 			 * after a log replay we would have file bar linked
6972  * parent directories also end up in the log.  A minimal inode and backref
7038 	 * impossible to delete after log replay (rmdir will always fail with
7050 	 * mount fs, triggers log replay
7052 	 * If we don't log the parent directory (testdir), after log replay the
7067 	 * mount fs, triggers log replay
7069 	 * Similar to the first example, after log replay the parent directory
7102  * it is not safe to log dentry if the chunk root has added new
7122  * should be called during mount to recover and replay any log trees
7132 	struct btrfs_root *log;
7183 		log = btrfs_read_tree_root(log_root_tree, &found_key);
7184 		if (IS_ERR(log)) {
7185 			ret = PTR_ERR(log);
7197 			 * deleted.  This is ok, simply skip this log and go to
7201 			 * other log replays overwriting this log as we'll read
7208 							log->node->start,
7209 							log->node->len);
7210 			btrfs_put_root(log);
7218 		wc.replay_dest->log_root = log;
7224 			ret = walk_log_tree(trans, log, &wc);
7243 			 * root->objectid_mutex is not acquired as log replay
7253 		btrfs_put_root(log);
7316 	 * to log its parents.
7320 	 * don't log the parents if the file is fully on disk.
7331 	 * with btrfs_log_new_name() and old names will be deleted from the log
7339 	 * If the inode we're about to unlink was logged before, the log will be
7362  * that after replaying the log tree of the parent directory's root we will not
7363  * see the snapshot anymore and at log replay time we will not see any log tree
7365  * it after replaying the log tree of the parent directory (which would replay
7380  * Update the log after adding a new name for an inode.
7394  * rename operation, and it will properly update the log to reflect the new name.
7415 	 * from hasn't been logged, we don't need to log it
7438 	 * was previously logged, make sure that on log replay we get the old
7439 	 * dir entry deleted. This is needed because we will also log the new
7440 	 * name of the renamed inode, so we need to make sure that after log
7444 		struct btrfs_root *log = old_dir->root->log_root;
7455 		 * We have two inodes to update in the log, the old directory and
7456 		 * the inode that got renamed, so we must pin the log to prevent
7457 		 * anyone from syncing the log until we have updated both inodes
7458 		 * in the log.
7464 		 * mark the log for a full commit.
7491 		ret = del_logged_dentry(trans, log, path, btrfs_ino(old_dir),
7495 			 * The dentry does not exist in the log, so record its
7499 			ret = insert_dir_log_key(trans, log, path,
7514 	 * We don't care about the return value. If we fail to log the new name
7515 	 * then we know the next attempt to sync the log will fallback to a full
7517 	 * we don't need to worry about getting a log committed that has an
7524 	 * If an error happened mark the log for a full commit because it's not
7527 	 * the log, to avoid any races with someone else trying to commit it.