// SPDX-License-Identifier: GPL-2.0
/*
 * fs/hmdfs/stash.c
 *
 * Copyright (c) 2020-2021 Huawei Device Co., Ltd.
 */

/* Bracketed kernel headers below are inferred from the APIs used in this file. */
#include <linux/version.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/namei.h>
#include <linux/mount.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/sched/mm.h>
#include <linux/uio.h>
#include <linux/crc32.h>
#include <linux/errseq.h>
#include <linux/atomic.h>
#include <linux/writeback.h>

#include "stash.h"
#include "comm/node_cb.h"
#include "comm/protocol.h"
#include "comm/connection.h"
#include "file_remote.h"
#include "hmdfs_dentryfile.h"
#include "authority/authentication.h"

/* Head magic used to identify a stash file */
#define HMDFS_STASH_FILE_HEAD_MAGIC	0xF7AB06C3

/* Head and path in stash file are aligned with HMDFS_STASH_BLK_SIZE */
#define HMDFS_STASH_BLK_SIZE		4096
#define HMDFS_STASH_BLK_SHIFT		12
#define HMDFS_STASH_PAGE_TO_SECTOR_SHIFT	3

#define HMDFS_STASH_DIR_NAME		"stash"
#define HMDFS_STASH_FMT_DIR_NAME	"v1"
#define HMDFS_STASH_WORK_DIR_NAME \
	(HMDFS_STASH_DIR_NAME "/" HMDFS_STASH_FMT_DIR_NAME)

#define HMDFS_STASH_FILE_NAME_LEN	20

#define HMDFS_STASH_FLUSH_CNT		2

#define HMDFS_STASH_PATH_LEN (HMDFS_CID_SIZE + HMDFS_STASH_FILE_NAME_LEN + 1)

struct hmdfs_cache_file_head {
	__le32 magic;
	__le32 crc_offset;
	__le64 ino;
	__le64 size;
	__le64 blocks;
	__le64 last_write_pos;
	__le64 ctime;
	__le32 ctime_nsec;
	__le32 change_detect_cap;
	__le64 ichange_count;
	__le32 path_offs;
	__le32 path_len;
	__le32 path_cnt;
	__le32 data_offs;
	/* Attention: expand new fields here to stay compatible with old versions */
	__le32 crc32;
} __packed;

struct hmdfs_stash_work {
	struct hmdfs_peer *conn;
	struct list_head *list;
	struct work_struct work;
	struct completion done;
};

struct hmdfs_inode_tbl {
	unsigned int cnt;
	unsigned int max;
	uint64_t inodes[0];
};

struct hmdfs_stash_dir_context {
	struct dir_context dctx;
	char name[NAME_MAX + 1];
	struct hmdfs_inode_tbl *tbl;
};

struct hmdfs_restore_stats {
	unsigned int succeed;
	unsigned int fail;
	unsigned int keep;
	unsigned long long ok_pages;
	unsigned long long fail_pages;
};

struct hmdfs_stash_stats {
	unsigned int succeed;
	unsigned int donothing;
	unsigned int fail;
	unsigned long long ok_pages;
	unsigned long long fail_pages;
};

struct hmdfs_file_restore_ctx {
	struct hmdfs_peer *conn;
	struct path src_dir_path;
	struct path dst_root_path;
	char *dst;
	char *page;
	struct file *src_filp;
	uint64_t inum;
	uint64_t pages;
	unsigned int seq;
	unsigned int data_offs;
	/* output */
	bool keep;
};

struct hmdfs_copy_args {
	struct file *src;
	struct file *dst;
	void *buf;
	size_t buf_len;
	unsigned int seq;
	unsigned int data_offs;
	uint64_t inum;
};

struct hmdfs_copy_ctx {
	struct hmdfs_copy_args args;
	loff_t src_pos;
	loff_t dst_pos;
	/* output */
	size_t copied;
	bool eof;
};

struct hmdfs_rebuild_stats {
	unsigned int succeed;
	unsigned int total;
	unsigned int fail;
	unsigned int invalid;
};

struct hmdfs_check_work {
	struct hmdfs_peer *conn;
	struct work_struct work;
	struct completion done;
};

typedef int (*stash_operation_func)(struct hmdfs_peer *, unsigned int,
				    struct path *,
				    const struct hmdfs_inode_tbl *, void *);

static struct dentry *hmdfs_do_vfs_mkdir(struct dentry *parent,
					 const char *name, int namelen,
					 umode_t mode)
{
	struct inode *dir = d_inode(parent);
	struct dentry *child = NULL;
	int err;

	inode_lock_nested(dir, I_MUTEX_PARENT);

	child = lookup_one_len(name, parent, namelen);
	if (IS_ERR(child))
		goto out;

	if (d_is_positive(child)) {
		if (d_can_lookup(child))
			goto out;

		dput(child);
		child = ERR_PTR(-EINVAL);
		goto out;
	}

	err = vfs_mkdir(dir, child, mode);
	if (err) {
		dput(child);
		child = ERR_PTR(err);
		goto out;
	}

out:
	inode_unlock(dir);
	return child;
}

struct dentry *hmdfs_stash_new_work_dir(struct dentry *parent)
{
	struct dentry *base = NULL;
	struct dentry *work
= NULL; base = hmdfs_do_vfs_mkdir(parent, HMDFS_STASH_DIR_NAME, strlen(HMDFS_STASH_DIR_NAME), 0700); if (IS_ERR(base)) return base; work = hmdfs_do_vfs_mkdir(base, HMDFS_STASH_FMT_DIR_NAME, strlen(HMDFS_STASH_FMT_DIR_NAME), 0700); dput(base); return work; } static struct file *hmdfs_new_stash_file(struct path *d_path, const char *cid) { struct dentry *parent = NULL; struct dentry *child = NULL; struct file *filp = NULL; struct path stash; int err; parent = hmdfs_do_vfs_mkdir(d_path->dentry, cid, strlen(cid), 0700); if (IS_ERR(parent)) { err = PTR_ERR(parent); hmdfs_err("mkdir error %d", err); goto mkdir_err; } child = vfs_tmpfile(parent, S_IFREG | 0600, 0); if (IS_ERR(child)) { err = PTR_ERR(child); hmdfs_err("new stash file error %d", err); goto tmpfile_err; } stash.mnt = d_path->mnt; stash.dentry = child; filp = dentry_open(&stash, O_LARGEFILE | O_WRONLY, current_cred()); if (IS_ERR(filp)) { err = PTR_ERR(filp); hmdfs_err("open stash file error %d", err); goto open_err; } dput(child); dput(parent); return filp; open_err: dput(child); tmpfile_err: dput(parent); mkdir_err: return ERR_PTR(err); } static inline bool hmdfs_is_dir(struct dentry *child) { return d_is_positive(child) && d_can_lookup(child); } static inline bool hmdfs_is_reg(struct dentry *child) { return d_is_positive(child) && d_is_reg(child); } static void hmdfs_set_stash_file_head(const struct hmdfs_cache_info *cache, uint64_t ino, struct hmdfs_cache_file_head *head) { long long blocks; unsigned int crc_offset; memset(head, 0, sizeof(*head)); head->magic = cpu_to_le32(HMDFS_STASH_FILE_HEAD_MAGIC); head->ino = cpu_to_le64(ino); head->size = cpu_to_le64(i_size_read(file_inode(cache->cache_file))); blocks = atomic64_read(&cache->written_pgs) << HMDFS_STASH_PAGE_TO_SECTOR_SHIFT; head->blocks = cpu_to_le64(blocks); head->path_offs = cpu_to_le32(cache->path_offs); head->path_len = cpu_to_le32(cache->path_len); head->path_cnt = cpu_to_le32(cache->path_cnt); head->data_offs = cpu_to_le32(cache->data_offs); crc_offset = offsetof(struct hmdfs_cache_file_head, crc32); head->crc_offset = cpu_to_le32(crc_offset); head->crc32 = cpu_to_le32(crc32(0, head, crc_offset)); } static int hmdfs_flush_stash_file_metadata(struct hmdfs_inode_info *info) { struct hmdfs_cache_info *cache = NULL; struct hmdfs_peer *conn = info->conn; struct hmdfs_cache_file_head cache_head; size_t written; loff_t pos; unsigned int head_size; /* No metadata if no cache file info */ cache = info->cache; if (!cache) return -EINVAL; if (strlen(cache->path) == 0) { long long to_write_pgs = atomic64_read(&cache->to_write_pgs); /* Nothing to stash. No need to flush meta data. 
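	 * No page was ever submitted for stashing, so there is no head or path worth writing.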
*/ if (to_write_pgs == 0) return 0; hmdfs_err("peer 0x%x:0x%llx inode 0x%llx lost %lld pages due to no path", conn->owner, conn->device_id, info->remote_ino, to_write_pgs); return -EINVAL; } hmdfs_set_stash_file_head(cache, info->remote_ino, &cache_head); /* Write head */ pos = 0; head_size = sizeof(cache_head); written = kernel_write(cache->cache_file, &cache_head, head_size, &pos); if (written != head_size) { hmdfs_err("stash peer 0x%x:0x%llx ino 0x%llx write head len %u err %zd", conn->owner, conn->device_id, info->remote_ino, head_size, written); return -EIO; } /* Write path */ pos = (loff_t)cache->path_offs << HMDFS_STASH_BLK_SHIFT; written = kernel_write(cache->cache_file, cache->path, cache->path_len, &pos); if (written != cache->path_len) { hmdfs_err("stash peer 0x%x:0x%llx ino 0x%llx write path len %u err %zd", conn->owner, conn->device_id, info->remote_ino, cache->path_len, written); return -EIO; } return 0; } /* Mainly from inode_wait_for_writeback() */ static void hmdfs_wait_remote_writeback_once(struct hmdfs_peer *conn, struct hmdfs_inode_info *info) { struct inode *inode = &info->vfs_inode; DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC); wait_queue_head_t *wq_head = NULL; bool in_sync = false; spin_lock(&inode->i_lock); in_sync = inode->i_state & I_SYNC; spin_unlock(&inode->i_lock); if (!in_sync) return; hmdfs_info("peer 0x%x:0x%llx ino 0x%llx wait for wb once", conn->owner, conn->device_id, info->remote_ino); wq_head = bit_waitqueue(&inode->i_state, __I_SYNC); __wait_on_bit(wq_head, &wq, bit_wait, TASK_UNINTERRUPTIBLE); } static void hmdfs_reset_remote_write_err(struct hmdfs_peer *conn, struct hmdfs_inode_info *info) { struct address_space *mapping = info->vfs_inode.i_mapping; int flags_err; errseq_t old; int wb_err; flags_err = filemap_check_errors(mapping); old = errseq_sample(&mapping->wb_err); wb_err = errseq_check_and_advance(&mapping->wb_err, &old); if (flags_err || wb_err) hmdfs_warning("peer 0x%x:0x%llx inode 0x%llx wb error %d %d before stash", conn->owner, conn->device_id, info->remote_ino, flags_err, wb_err); } static bool hmdfs_is_mapping_clean(struct address_space *mapping) { bool clean = false; /* b93b016313b3b ("page cache: use xa_lock") introduces i_pages */ #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 17, 0) xa_lock_irq(&mapping->i_pages); #else spin_lock_irq(&mapping->tree_lock); #endif clean = !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && !mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK); #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 17, 0) xa_unlock_irq(&mapping->i_pages); #else spin_unlock_irq(&mapping->tree_lock); #endif return clean; } static int hmdfs_flush_stash_file_data(struct hmdfs_peer *conn, struct hmdfs_inode_info *info) { struct inode *inode = &info->vfs_inode; struct address_space *mapping = inode->i_mapping; bool all_clean = true; int err = 0; int i; /* Wait for the completion of write syscall */ inode_lock(inode); inode_unlock(inode); all_clean = hmdfs_is_mapping_clean(mapping); if (all_clean) { hmdfs_reset_remote_write_err(conn, info); return 0; } /* * No-sync_all writeback during offline may have not seen * the setting of stash_status as HMDFS_REMOTE_INODE_STASHING * and will call mapping_set_error() after we just reset * the previous error. So waiting for these writeback once, * and the following writeback will do local write. */ hmdfs_wait_remote_writeback_once(conn, info); /* Need to clear previous error ? */ hmdfs_reset_remote_write_err(conn, info); /* * 1. dirty page: do write back * 2. writeback page: wait for its completion * 3. 
writeback -> redirty page: do filemap_write_and_wait() * twice, so 2th writeback should not allow * writeback -> redirty transition */ for (i = 0; i < HMDFS_STASH_FLUSH_CNT; i++) { err = filemap_write_and_wait(mapping); if (err) { hmdfs_err("peer 0x%x:0x%llx inode 0x%llx #%d stash flush error %d", conn->owner, conn->device_id, info->remote_ino, i, err); return err; } } if (!hmdfs_is_mapping_clean(mapping)) hmdfs_err("peer 0x%x:0x%llx inode 0x%llx is still dirty dt %d wb %d", conn->owner, conn->device_id, info->remote_ino, !!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY), !!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)); return 0; } static int hmdfs_flush_stash_file(struct hmdfs_inode_info *info) { int err; err = hmdfs_flush_stash_file_data(info->conn, info); if (!err) err = hmdfs_flush_stash_file_metadata(info); return err; } static int hmdfs_enable_stash_file(struct hmdfs_inode_info *info, struct dentry *stash) { char name[HMDFS_STASH_FILE_NAME_LEN]; struct dentry *parent = NULL; struct inode *dir = NULL; struct dentry *child = NULL; int err = 0; bool retried = false; snprintf(name, sizeof(name), "0x%llx", info->remote_ino); parent = lock_parent(stash); dir = d_inode(parent); lookup_again: child = lookup_one_len(name, parent, strlen(name)); if (IS_ERR(child)) { err = PTR_ERR(child); child = NULL; hmdfs_err("lookup %s err %d", name, err); goto out; } if (d_is_positive(child)) { hmdfs_warning("%s exists (mode 0%o)", name, d_inode(child)->i_mode); err = vfs_unlink(dir, child, NULL); if (err) { hmdfs_err("unlink %s err %d", name, err); goto out; } if (retried) { err = -EEXIST; goto out; } retried = true; dput(child); goto lookup_again; } err = vfs_link(stash, dir, child, NULL); if (err) { hmdfs_err("link stash file to %s err %d", name, err); goto out; } out: unlock_dir(parent); if (child) dput(child); return err; } /* Return 1 if stash is done, 0 if nothing is stashed */ static int hmdfs_close_stash_file(struct hmdfs_peer *conn, struct hmdfs_inode_info *info) { struct file *cache_file = info->cache->cache_file; struct dentry *c_dentry = file_dentry(cache_file); struct inode *c_inode = d_inode(c_dentry); long long to_write_pgs = atomic64_read(&info->cache->to_write_pgs); int err; hmdfs_info("peer 0x%x:0x%llx inode 0x%llx stashed bytes %lld pages %lld", conn->owner, conn->device_id, info->remote_ino, i_size_read(c_inode), to_write_pgs); if (to_write_pgs == 0) return 0; err = vfs_fsync(cache_file, 0); if (!err) err = hmdfs_enable_stash_file(info, c_dentry); else hmdfs_err("fsync stash file err %d", err); return err < 0 ? err : 1; } static void hmdfs_del_file_cache(struct hmdfs_cache_info *cache) { if (!cache) return; fput(cache->cache_file); kfree(cache->path_buf); kfree(cache); } static struct hmdfs_cache_info * hmdfs_new_file_cache(struct hmdfs_peer *conn, struct hmdfs_inode_info *info) { struct hmdfs_cache_info *cache = NULL; struct dentry *stash_dentry = NULL; int err; cache = kzalloc(sizeof(*cache), GFP_KERNEL); if (!cache) return ERR_PTR(-ENOMEM); atomic64_set(&cache->to_write_pgs, 0); atomic64_set(&cache->written_pgs, 0); cache->path_buf = kmalloc(PATH_MAX, GFP_KERNEL); if (!cache->path_buf) { err = -ENOMEM; goto free_cache; } /* Need to handle "hardlink" ? 
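	 * Only one alias path is recorded below (path_cnt is set to 1), so a hard-linked file is stashed under a single path.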
*/ stash_dentry = d_find_any_alias(&info->vfs_inode); if (stash_dentry) { /* Needs full path in hmdfs, will be a device-view path */ cache->path = dentry_path_raw(stash_dentry, cache->path_buf, PATH_MAX); dput(stash_dentry); if (IS_ERR(cache->path)) { err = PTR_ERR(cache->path); hmdfs_err("peer 0x%x:0x%llx inode 0x%llx gen path err %d", conn->owner, conn->device_id, info->remote_ino, err); goto free_path; } } else { /* Write-opened file was closed before finding dentry */ hmdfs_info("peer 0x%x:0x%llx inode 0x%llx no dentry found", conn->owner, conn->device_id, info->remote_ino); cache->path_buf[0] = '\0'; cache->path = cache->path_buf; } cache->path_cnt = 1; cache->path_len = strlen(cache->path) + 1; cache->path_offs = DIV_ROUND_UP(sizeof(struct hmdfs_cache_file_head), HMDFS_STASH_BLK_SIZE); cache->data_offs = cache->path_offs + DIV_ROUND_UP(cache->path_len, HMDFS_STASH_BLK_SIZE); cache->cache_file = hmdfs_new_stash_file(&conn->sbi->stash_work_dir, conn->cid); if (IS_ERR(cache->cache_file)) { err = PTR_ERR(cache->cache_file); goto free_path; } return cache; free_path: kfree(cache->path_buf); free_cache: kfree(cache); return ERR_PTR(err); } static void hmdfs_init_stash_file_cache(struct hmdfs_peer *conn, struct hmdfs_inode_info *info) { struct hmdfs_cache_info *cache = NULL; cache = hmdfs_new_file_cache(conn, info); if (IS_ERR(cache)) /* * Continue even creating stash info failed. * We need to ensure there is no dirty pages * after stash completes */ cache = NULL; /* Make write() returns */ spin_lock(&info->stash_lock); info->cache = cache; info->stash_status = HMDFS_REMOTE_INODE_STASHING; spin_unlock(&info->stash_lock); } static void hmdfs_update_stash_stats(struct hmdfs_stash_stats *stats, const struct hmdfs_cache_info *cache, int err) { unsigned long long ok_pages, fail_pages; if (cache) { ok_pages = err > 0 ? atomic64_read(&cache->written_pgs) : 0; fail_pages = atomic64_read(&cache->to_write_pgs) - ok_pages; stats->ok_pages += ok_pages; stats->fail_pages += fail_pages; } if (err > 0) stats->succeed++; else if (!err) stats->donothing++; else stats->fail++; } /* Return 1 if stash is done, 0 if nothing is stashed */ static int hmdfs_stash_remote_inode(struct hmdfs_inode_info *info, struct hmdfs_stash_stats *stats) { struct hmdfs_cache_info *cache = info->cache; struct hmdfs_peer *conn = info->conn; unsigned int status; int err = 0; hmdfs_info("stash peer 0x%x:0x%llx ino 0x%llx", conn->owner, conn->device_id, info->remote_ino); err = hmdfs_flush_stash_file(info); if (!err) err = hmdfs_close_stash_file(conn, info); if (err <= 0) set_bit(HMDFS_FID_NEED_OPEN, &info->fid_flags); status = err > 0 ? HMDFS_REMOTE_INODE_RESTORING : HMDFS_REMOTE_INODE_NONE; spin_lock(&info->stash_lock); info->cache = NULL; /* * Use smp_store_release() to ensure order between HMDFS_FID_NEED_OPEN * and HMDFS_REMOTE_INODE_NONE. 
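	 * Readers use smp_load_acquire() on stash_status (see hmdfs_stash_fetch_ready_files()).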
*/ smp_store_release(&info->stash_status, status); spin_unlock(&info->stash_lock); hmdfs_update_stash_stats(stats, cache, err); hmdfs_del_file_cache(cache); return err; } static void hmdfs_init_cache_for_stash_files(struct hmdfs_peer *conn, struct list_head *list) { const struct cred *old_cred = NULL; struct hmdfs_inode_info *info = NULL; /* For file creation under stash_work_dir */ old_cred = hmdfs_override_creds(conn->sbi->cred); list_for_each_entry(info, list, stash_node) hmdfs_init_stash_file_cache(conn, info); hmdfs_revert_creds(old_cred); } static void hmdfs_init_stash_cache_work_fn(struct work_struct *base) { struct hmdfs_stash_work *work = container_of(base, struct hmdfs_stash_work, work); hmdfs_init_cache_for_stash_files(work->conn, work->list); complete(&work->done); } static void hmdfs_init_cache_for_stash_files_by_work(struct hmdfs_peer *conn, struct list_head *list) { struct hmdfs_stash_work work = { .conn = conn, .list = list, .done = COMPLETION_INITIALIZER_ONSTACK(work.done), }; INIT_WORK_ONSTACK(&work.work, hmdfs_init_stash_cache_work_fn); schedule_work(&work.work); wait_for_completion(&work.done); } static void hmdfs_stash_fetch_ready_files(struct hmdfs_peer *conn, bool check, struct list_head *list) { struct hmdfs_inode_info *info = NULL; spin_lock(&conn->wr_opened_inode_lock); list_for_each_entry(info, &conn->wr_opened_inode_list, wr_opened_node) { int status; /* Paired with *_release() in hmdfs_reset_stashed_inode() */ status = smp_load_acquire(&info->stash_status); if (status == HMDFS_REMOTE_INODE_NONE) { list_add_tail(&info->stash_node, list); /* * Prevent close() removing the inode from * writeable-opened inode list */ hmdfs_remote_add_wr_opened_inode_nolock(conn, info); /* Prevent the inode from eviction */ ihold(&info->vfs_inode); } else if (check && status == HMDFS_REMOTE_INODE_STASHING) { hmdfs_warning("peer 0x%x:0x%llx inode 0x%llx unexpected stash status %d", conn->owner, conn->device_id, info->remote_ino, status); } } spin_unlock(&conn->wr_opened_inode_lock); } static void hmdfs_stash_offline_prepare(struct hmdfs_peer *conn, int evt, unsigned int seq) { LIST_HEAD(preparing); if (!hmdfs_is_stash_enabled(conn->sbi)) return; mutex_lock(&conn->offline_cb_lock); hmdfs_stash_fetch_ready_files(conn, true, &preparing); if (list_empty(&preparing)) goto out; hmdfs_init_cache_for_stash_files_by_work(conn, &preparing); out: mutex_unlock(&conn->offline_cb_lock); } static void hmdfs_track_inode_locked(struct hmdfs_peer *conn, struct hmdfs_inode_info *info) { spin_lock(&conn->stashed_inode_lock); list_add_tail(&info->stash_node, &conn->stashed_inode_list); conn->stashed_inode_nr++; spin_unlock(&conn->stashed_inode_lock); } static void hmdfs_update_peer_stash_stats(struct hmdfs_stash_statistics *stash_stats, const struct hmdfs_stash_stats *stats) { stash_stats->cur_ok = stats->succeed; stash_stats->cur_nothing = stats->donothing; stash_stats->cur_fail = stats->fail; stash_stats->total_ok += stats->succeed; stash_stats->total_nothing += stats->donothing; stash_stats->total_fail += stats->fail; stash_stats->ok_pages += stats->ok_pages; stash_stats->fail_pages += stats->fail_pages; } static void hmdfs_stash_remote_inodes(struct hmdfs_peer *conn, struct list_head *list) { const struct cred *old_cred = NULL; struct hmdfs_inode_info *info = NULL; struct hmdfs_inode_info *next = NULL; struct hmdfs_stash_stats stats; /* For file creation, write and relink under stash_work_dir */ old_cred = hmdfs_override_creds(conn->sbi->cred); memset(&stats, 0, sizeof(stats)); 
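	/*
	 * Stash each prepared inode: on success keep it on the stashed list
	 * for restore after the peer reconnects, otherwise drop the extra
	 * references taken in hmdfs_stash_fetch_ready_files().
	 */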
list_for_each_entry_safe(info, next, list, stash_node) { int err; list_del_init(&info->stash_node); err = hmdfs_stash_remote_inode(info, &stats); if (err > 0) hmdfs_track_inode_locked(conn, info); hmdfs_remote_del_wr_opened_inode(conn, info); if (err <= 0) iput(&info->vfs_inode); } hmdfs_revert_creds(old_cred); hmdfs_update_peer_stash_stats(&conn->stats.stash, &stats); hmdfs_info("peer 0x%x:0x%llx total stashed %u cur ok %u none %u fail %u", conn->owner, conn->device_id, conn->stashed_inode_nr, stats.succeed, stats.donothing, stats.fail); } static void hmdfs_stash_offline_do_stash(struct hmdfs_peer *conn, int evt, unsigned int seq) { struct hmdfs_inode_info *info = NULL; LIST_HEAD(preparing); LIST_HEAD(stashing); if (!hmdfs_is_stash_enabled(conn->sbi)) return; /* release seq_lock to prevent blocking no-offline sync cb */ mutex_unlock(&conn->seq_lock); /* acquire offline_cb_lock to serialized with offline sync cb */ mutex_lock(&conn->offline_cb_lock); hmdfs_stash_fetch_ready_files(conn, false, &preparing); if (!list_empty(&preparing)) hmdfs_init_cache_for_stash_files(conn, &preparing); spin_lock(&conn->wr_opened_inode_lock); list_for_each_entry(info, &conn->wr_opened_inode_list, wr_opened_node) { int status = READ_ONCE(info->stash_status); if (status == HMDFS_REMOTE_INODE_STASHING) list_add_tail(&info->stash_node, &stashing); } spin_unlock(&conn->wr_opened_inode_lock); if (list_empty(&stashing)) goto unlock; hmdfs_stash_remote_inodes(conn, &stashing); unlock: mutex_unlock(&conn->offline_cb_lock); mutex_lock(&conn->seq_lock); } static struct hmdfs_inode_info * hmdfs_lookup_stash_inode(struct hmdfs_peer *conn, uint64_t inum) { struct hmdfs_inode_info *info = NULL; list_for_each_entry(info, &conn->stashed_inode_list, stash_node) { if (info->remote_ino == inum) return info; } return NULL; } static void hmdfs_untrack_stashed_inode(struct hmdfs_peer *conn, struct hmdfs_inode_info *info) { list_del_init(&info->stash_node); iput(&info->vfs_inode); conn->stashed_inode_nr--; } static void hmdfs_reset_stashed_inode(struct hmdfs_peer *conn, struct hmdfs_inode_info *info) { struct inode *ino = &info->vfs_inode; /* * For updating stash_status after iput() * in hmdfs_untrack_stashed_inode() */ ihold(ino); hmdfs_untrack_stashed_inode(conn, info); /* * Ensure the order of stash_node and stash_status: * only update stash_status to NONE after removal of * stash_node is completed. 
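	 * Paired with the smp_load_acquire() of stash_status in hmdfs_stash_fetch_ready_files().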
*/ smp_store_release(&info->stash_status, HMDFS_REMOTE_INODE_NONE); iput(ino); } static void hmdfs_drop_stashed_inodes(struct hmdfs_peer *conn) { struct hmdfs_inode_info *info = NULL; struct hmdfs_inode_info *next = NULL; if (list_empty(&conn->stashed_inode_list)) return; hmdfs_warning("peer 0x%x:0x%llx drop unrestorable file %u", conn->owner, conn->device_id, conn->stashed_inode_nr); list_for_each_entry_safe(info, next, &conn->stashed_inode_list, stash_node) { hmdfs_warning("peer 0x%x:0x%llx inode 0x%llx unrestorable status %u", conn->owner, conn->device_id, info->remote_ino, READ_ONCE(info->stash_status)); hmdfs_reset_stashed_inode(conn, info); } } static struct file *hmdfs_open_stash_dir(struct path *d_path, const char *cid) { int err = 0; struct dentry *parent = d_path->dentry; struct inode *dir = d_inode(parent); struct dentry *child = NULL; struct path peer_path; struct file *filp = NULL; inode_lock_nested(dir, I_MUTEX_PARENT); child = lookup_one_len(cid, parent, strlen(cid)); if (!IS_ERR(child)) { if (!hmdfs_is_dir(child)) { if (d_is_positive(child)) { hmdfs_err("invalid stash dir mode 0%o", d_inode(child)->i_mode); err = -EINVAL; } else { err = -ENOENT; } dput(child); } } else { err = PTR_ERR(child); hmdfs_err("lookup stash dir err %d", err); } inode_unlock(dir); if (err) return ERR_PTR(err); peer_path.mnt = d_path->mnt; peer_path.dentry = child; filp = dentry_open(&peer_path, O_RDONLY | O_DIRECTORY, current_cred()); if (IS_ERR(filp)) hmdfs_err("open err %d", (int)PTR_ERR(filp)); dput(child); return filp; } static int hmdfs_new_inode_tbl(struct hmdfs_inode_tbl **tbl) { struct hmdfs_inode_tbl *new = NULL; new = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!new) return -ENOMEM; new->cnt = 0; new->max = (PAGE_SIZE - offsetof(struct hmdfs_inode_tbl, inodes)) / sizeof(new->inodes[0]); *tbl = new; return 0; } static int hmdfs_parse_stash_file_name(struct dir_context *dctx, const char *name, int namelen, unsigned int d_type, uint64_t *stash_inum) { struct hmdfs_stash_dir_context *ctx = NULL; int err; if (d_type != DT_UNKNOWN && d_type != DT_REG) return 0; if (namelen > NAME_MAX) return 0; ctx = container_of(dctx, struct hmdfs_stash_dir_context, dctx); memcpy(ctx->name, name, namelen); ctx->name[namelen] = '\0'; err = kstrtoull(ctx->name, 16, stash_inum); if (err) { hmdfs_err("unexpected stash file err %d", err); return 0; } return 1; } static int hmdfs_has_stash_file(struct dir_context *dctx, const char *name, int namelen, loff_t offset, u64 inum, unsigned int d_type) { struct hmdfs_stash_dir_context *ctx = NULL; uint64_t stash_inum; int err; ctx = container_of(dctx, struct hmdfs_stash_dir_context, dctx); err = hmdfs_parse_stash_file_name(dctx, name, namelen, d_type, &stash_inum); if (!err) return 0; ctx->tbl->cnt++; return 1; } static int hmdfs_fill_stash_file(struct dir_context *dctx, const char *name, int namelen, loff_t offset, u64 inum, unsigned int d_type) { struct hmdfs_stash_dir_context *ctx = NULL; uint64_t stash_inum; int err; ctx = container_of(dctx, struct hmdfs_stash_dir_context, dctx); err = hmdfs_parse_stash_file_name(dctx, name, namelen, d_type, &stash_inum); if (!err) return 0; if (ctx->tbl->cnt >= ctx->tbl->max) return 1; ctx->tbl->inodes[ctx->tbl->cnt++] = stash_inum; return 0; } static int hmdfs_del_stash_file(struct dentry *parent, struct dentry *child) { struct inode *dir = d_inode(parent); int err = 0; /* Prevent d_delete() from calling dentry_unlink_inode() */ dget(child); inode_lock_nested(dir, I_MUTEX_PARENT); err = vfs_unlink(dir, child, NULL); if (err) hmdfs_err("remove 
stash file err %d", err); inode_unlock(dir); dput(child); return err; } static inline bool hmdfs_is_node_offlined(const struct hmdfs_peer *conn, unsigned int seq) { /* * open()/fsync() may fail due to "status = NODE_STAT_OFFLINE" * in hmdfs_disconnect_node(). * Pair with smp_mb() in hmdfs_disconnect_node() to ensure * getting the newest event sequence. */ smp_mb__before_atomic(); return hmdfs_node_evt_seq(conn) != seq; } static int hmdfs_verify_restore_file_head(struct hmdfs_file_restore_ctx *ctx, const struct hmdfs_cache_file_head *head) { struct inode *inode = file_inode(ctx->src_filp); struct hmdfs_peer *conn = ctx->conn; unsigned int crc, read_crc, crc_offset; loff_t path_offs, data_offs, isize; int err = 0; if (le32_to_cpu(head->magic) != HMDFS_STASH_FILE_HEAD_MAGIC) { err = -EUCLEAN; hmdfs_err("peer 0x%x:0x%llx ino 0x%llx invalid magic: got 0x%x, exp 0x%x", conn->owner, conn->device_id, ctx->inum, le32_to_cpu(head->magic), HMDFS_STASH_FILE_HEAD_MAGIC); goto out; } crc_offset = le32_to_cpu(head->crc_offset); read_crc = le32_to_cpu(*((__le32 *)((char *)head + crc_offset))); crc = crc32(0, head, crc_offset); if (read_crc != crc) { err = -EUCLEAN; hmdfs_err("peer 0x%x:0x%llx ino 0x%llx invalid crc: got 0x%x, exp 0x%x", conn->owner, conn->device_id, ctx->inum, read_crc, crc); goto out; } if (le64_to_cpu(head->ino) != ctx->inum) { err = -EUCLEAN; hmdfs_err("peer 0x%x:0x%llx ino 0x%llx invalid ino: got %llu, exp %llu", conn->owner, conn->device_id, ctx->inum, le64_to_cpu(head->ino), ctx->inum); goto out; } path_offs = (loff_t)le32_to_cpu(head->path_offs) << HMDFS_STASH_BLK_SHIFT; if (path_offs <= 0 || path_offs >= i_size_read(inode)) { err = -EUCLEAN; hmdfs_err("peer 0x%x:0x%llx ino 0x%llx invalid path_offs %d, stash file size %llu", conn->owner, conn->device_id, ctx->inum, le32_to_cpu(head->path_offs), i_size_read(inode)); goto out; } data_offs = (loff_t)le32_to_cpu(head->data_offs) << HMDFS_STASH_BLK_SHIFT; if (path_offs >= data_offs) { err = -EUCLEAN; hmdfs_err("peer 0x%x:0x%llx ino 0x%llx invalid data_offs %d, path_offs %d", conn->owner, conn->device_id, ctx->inum, le32_to_cpu(head->data_offs), le32_to_cpu(head->path_offs)); goto out; } if (data_offs <= 0 || data_offs >= i_size_read(inode)) { err = -EUCLEAN; hmdfs_err("peer 0x%x:0x%llx ino 0x%llx invalid data_offs %d, stash file size %llu", conn->owner, conn->device_id, ctx->inum, le32_to_cpu(head->data_offs), i_size_read(inode)); goto out; } isize = le64_to_cpu(head->size); if (isize != i_size_read(inode)) { err = -EUCLEAN; hmdfs_err("peer 0x%x:0x%llx ino 0x%llx invalid isize: got %llu, exp %llu", conn->owner, conn->device_id, ctx->inum, le64_to_cpu(head->size), i_size_read(inode)); goto out; } if (le32_to_cpu(head->path_cnt) < 1) { err = -EUCLEAN; hmdfs_err("peer 0x%x:0x%llx ino 0x%llx invalid path_cnt %d", conn->owner, conn->device_id, ctx->inum, le32_to_cpu(head->path_cnt)); goto out; } out: return err; } static int hmdfs_get_restore_file_metadata(struct hmdfs_file_restore_ctx *ctx) { struct hmdfs_cache_file_head head; struct hmdfs_peer *conn = ctx->conn; unsigned int head_size, read_size, head_crc_offset; loff_t pos; ssize_t rd; int err = 0; head_size = sizeof(struct hmdfs_cache_file_head); memset(&head, 0, head_size); /* Read part head */ pos = 0; read_size = offsetof(struct hmdfs_cache_file_head, crc_offset) + sizeof(head.crc_offset); rd = kernel_read(ctx->src_filp, &head, read_size, &pos); if (rd != read_size) { err = rd < 0 ? 
rd : -ENODATA; hmdfs_err("peer 0x%x:0x%llx ino 0x%llx read part head err %d", conn->owner, conn->device_id, ctx->inum, err); goto out; } head_crc_offset = le32_to_cpu(head.crc_offset); if (head_crc_offset + sizeof(head.crc32) < head_crc_offset || head_crc_offset + sizeof(head.crc32) > head_size) { err = -EUCLEAN; hmdfs_err("peer 0x%x:0x%llx ino 0x%llx got bad head: Too long crc_offset %u which exceeds head size %u", conn->owner, conn->device_id, ctx->inum, head_crc_offset, head_size); goto out; } /* Read full head */ pos = 0; read_size = le32_to_cpu(head.crc_offset) + sizeof(head.crc32); rd = kernel_read(ctx->src_filp, &head, read_size, &pos); if (rd != read_size) { err = rd < 0 ? rd : -ENODATA; hmdfs_err("peer 0x%x:0x%llx ino 0x%llx read full head err %d", conn->owner, conn->device_id, ctx->inum, err); goto out; } err = hmdfs_verify_restore_file_head(ctx, &head); if (err) goto out; ctx->pages = le64_to_cpu(head.blocks) >> HMDFS_STASH_PAGE_TO_SECTOR_SHIFT; ctx->data_offs = le32_to_cpu(head.data_offs); /* Read path */ read_size = min_t(unsigned int, le32_to_cpu(head.path_len), PATH_MAX); pos = (loff_t)le32_to_cpu(head.path_offs) << HMDFS_STASH_BLK_SHIFT; rd = kernel_read(ctx->src_filp, ctx->dst, read_size, &pos); if (rd != read_size) { err = rd < 0 ? rd : -ENODATA; hmdfs_err("peer 0x%x:0x%llx ino 0x%llx read path err %d", conn->owner, conn->device_id, ctx->inum, err); goto out; } if (strnlen(ctx->dst, read_size) >= read_size) { err = -EUCLEAN; hmdfs_err("peer 0x%x:0x%llx ino 0x%llx read path not end with \\0", conn->owner, conn->device_id, ctx->inum); goto out; } /* TODO: Pick a valid path from all paths */ out: return err; } static int hmdfs_open_restore_dst_file(struct hmdfs_file_restore_ctx *ctx, unsigned int rw_flag, struct file **filp) { struct hmdfs_peer *conn = ctx->conn; struct file *dst = NULL; int err = 0; err = hmdfs_get_restore_file_metadata(ctx); if (err) goto out; /* Error comes from connection or server ? */ dst = file_open_root(&ctx->dst_root_path, ctx->dst, O_LARGEFILE | rw_flag, 0); if (IS_ERR(dst)) { err = PTR_ERR(dst); hmdfs_err("open remote file ino 0x%llx err %d", ctx->inum, err); if (hmdfs_is_node_offlined(conn, ctx->seq)) err = -ESHUTDOWN; goto out; } *filp = dst; out: return err; } static bool hmdfs_need_abort_restore(struct hmdfs_file_restore_ctx *ctx, struct hmdfs_inode_info *pinned, struct file *opened_file) { struct hmdfs_inode_info *opened = hmdfs_i(file_inode(opened_file)); if (opened->inode_type != HMDFS_LAYER_OTHER_REMOTE) goto abort; if (opened == pinned) return false; abort: hmdfs_warning("peer 0x%x:0x%llx inode 0x%llx invalid remote file", ctx->conn->owner, ctx->conn->device_id, ctx->inum); hmdfs_warning("got: peer 0x%x:0x%llx inode 0x%llx type %d status %d", opened->conn ? opened->conn->owner : 0, opened->conn ? 
opened->conn->device_id : 0, opened->remote_ino, opened->inode_type, opened->stash_status); hmdfs_warning("pinned: peer 0x%x:0x%llx inode 0x%llx type %d status %d", pinned->conn->owner, pinned->conn->device_id, pinned->remote_ino, pinned->inode_type, pinned->stash_status); return true; } static void hmdfs_init_copy_args(const struct hmdfs_file_restore_ctx *ctx, struct file *dst, struct hmdfs_copy_args *args) { args->src = ctx->src_filp; args->dst = dst; args->buf = ctx->page; args->buf_len = PAGE_SIZE; args->seq = ctx->seq; args->data_offs = ctx->data_offs; args->inum = ctx->inum; } static ssize_t hmdfs_write_dst(struct hmdfs_peer *conn, struct file *filp, void *buf, size_t len, loff_t pos) { mm_segment_t old_fs; struct kiocb kiocb; struct iovec iov; struct iov_iter iter; ssize_t wr; int err = 0; file_start_write(filp); old_fs = force_uaccess_begin(); init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = pos; iov.iov_base = buf; iov.iov_len = len; iov_iter_init(&iter, WRITE, &iov, 1, len); wr = hmdfs_file_write_iter_remote_nocheck(&kiocb, &iter); force_uaccess_end(old_fs); file_end_write(filp); if (wr != len) { struct hmdfs_inode_info *info = hmdfs_i(file_inode(filp)); hmdfs_err("peer 0x%x:0x%llx ino 0x%llx short write ret %zd exp %zu", conn->owner, conn->device_id, info->remote_ino, wr, len); err = wr < 0 ? (int)wr : -EFAULT; } return err; } static int hmdfs_rd_src_wr_dst(struct hmdfs_peer *conn, struct hmdfs_copy_ctx *ctx) { const struct hmdfs_copy_args *args = NULL; int err = 0; loff_t rd_pos; ssize_t rd; ctx->eof = false; ctx->copied = 0; args = &ctx->args; rd_pos = ctx->src_pos; rd = kernel_read(args->src, args->buf, args->buf_len, &rd_pos); if (rd < 0) { err = (int)rd; hmdfs_err("peer 0x%x:0x%llx ino 0x%llx short read err %d", conn->owner, conn->device_id, args->inum, err); goto out; } else if (rd == 0) { ctx->eof = true; goto out; } err = hmdfs_write_dst(conn, args->dst, args->buf, rd, ctx->dst_pos); if (!err) ctx->copied = rd; else if (hmdfs_is_node_offlined(conn, args->seq)) err = -ESHUTDOWN; out: return err; } static int hmdfs_copy_src_to_dst(struct hmdfs_peer *conn, const struct hmdfs_copy_args *args) { int err = 0; struct file *src = NULL; struct hmdfs_copy_ctx ctx; loff_t seek_pos, data_init_pos; loff_t src_size; ctx.args = *args; src = ctx.args.src; data_init_pos = (loff_t)ctx.args.data_offs << HMDFS_STASH_BLK_SHIFT; seek_pos = data_init_pos; src_size = i_size_read(file_inode(src)); while (true) { loff_t data_pos; data_pos = vfs_llseek(src, seek_pos, SEEK_DATA); if (data_pos > seek_pos) { seek_pos = data_pos; continue; } else if (data_pos < 0) { if (data_pos == -ENXIO) { loff_t src_blks = file_inode(src)->i_blocks; hmdfs_info("peer 0x%x:0x%llx ino 0x%llx end at 0x%llx (sz 0x%llx blk 0x%llx)", conn->owner, conn->device_id, args->inum, seek_pos, src_size, src_blks); } else { err = (int)data_pos; hmdfs_err("peer 0x%x:0x%llx ino 0x%llx seek pos 0x%llx err %d", conn->owner, conn->device_id, args->inum, seek_pos, err); } break; } hmdfs_debug("peer 0x%x:0x%llx ino 0x%llx seek to 0x%llx", conn->owner, conn->device_id, args->inum, data_pos); ctx.src_pos = data_pos; ctx.dst_pos = data_pos - data_init_pos; err = hmdfs_rd_src_wr_dst(conn, &ctx); if (err || ctx.eof) break; seek_pos += ctx.copied; if (seek_pos >= src_size) break; } return err; } static int hmdfs_restore_src_to_dst(struct hmdfs_file_restore_ctx *ctx, struct file *dst) { struct file *src = ctx->src_filp; struct hmdfs_copy_args args; int err; hmdfs_init_copy_args(ctx, dst, &args); err = hmdfs_copy_src_to_dst(ctx->conn, &args); if 
(err) goto out; err = vfs_fsync(dst, 0); if (err) { hmdfs_err("fsync remote file ino 0x%llx err %d", ctx->inum, err); if (hmdfs_is_node_offlined(ctx->conn, ctx->seq)) err = -ESHUTDOWN; } out: if (err) truncate_inode_pages(file_inode(dst)->i_mapping, 0); /* Remove the unnecessary cache */ invalidate_mapping_pages(file_inode(src)->i_mapping, 0, -1); return err; } static int hmdfs_restore_file(struct hmdfs_file_restore_ctx *ctx) { struct hmdfs_peer *conn = ctx->conn; uint64_t inum = ctx->inum; struct hmdfs_inode_info *pinned_info = NULL; struct file *dst_filp = NULL; int err = 0; bool keep = false; hmdfs_info("peer 0x%x:0x%llx ino 0x%llx do restore", conn->owner, conn->device_id, inum); pinned_info = hmdfs_lookup_stash_inode(conn, inum); if (pinned_info) { unsigned int status = READ_ONCE(pinned_info->stash_status); if (status != HMDFS_REMOTE_INODE_RESTORING) { hmdfs_err("peer 0x%x:0x%llx ino 0x%llx invalid status %u", conn->owner, conn->device_id, inum, status); err = -EINVAL; goto clean; } } else { hmdfs_warning("peer 0x%x:0x%llx ino 0x%llx doesn't being pinned", conn->owner, conn->device_id, inum); err = -EINVAL; goto clean; } set_bit(HMDFS_FID_NEED_OPEN, &pinned_info->fid_flags); err = hmdfs_open_restore_dst_file(ctx, O_RDWR, &dst_filp); if (err) { if (err == -ESHUTDOWN) keep = true; goto clean; } if (hmdfs_need_abort_restore(ctx, pinned_info, dst_filp)) goto abort; err = hmdfs_restore_src_to_dst(ctx, dst_filp); if (err == -ESHUTDOWN) keep = true; abort: fput(dst_filp); clean: if (pinned_info && !keep) hmdfs_reset_stashed_inode(conn, pinned_info); ctx->keep = keep; hmdfs_info("peer 0x%x:0x%llx ino 0x%llx restore err %d keep %d", conn->owner, conn->device_id, inum, err, ctx->keep); return err; } static int hmdfs_init_file_restore_ctx(struct hmdfs_peer *conn, unsigned int seq, struct path *src_dir, struct hmdfs_file_restore_ctx *ctx) { struct hmdfs_sb_info *sbi = conn->sbi; struct path dst_root; char *dst = NULL; char *page = NULL; int err = 0; err = hmdfs_get_path_in_sb(sbi->sb, sbi->real_dst, LOOKUP_DIRECTORY, &dst_root); if (err) return err; dst = kmalloc(PATH_MAX, GFP_KERNEL); if (!dst) { err = -ENOMEM; goto put_path; } page = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!page) { err = -ENOMEM; goto free_dst; } ctx->conn = conn; ctx->src_dir_path = *src_dir; ctx->dst_root_path = dst_root; ctx->dst = dst; ctx->page = page; ctx->seq = seq; return 0; free_dst: kfree(dst); put_path: path_put(&dst_root); return err; } static void hmdfs_exit_file_restore_ctx(struct hmdfs_file_restore_ctx *ctx) { path_put(&ctx->dst_root_path); kfree(ctx->dst); kfree(ctx->page); } static struct file *hmdfs_open_stash_file(struct path *p_path, char *name) { struct dentry *parent = NULL; struct inode *dir = NULL; struct dentry *child = NULL; struct file *filp = NULL; struct path c_path; int err = 0; parent = p_path->dentry; dir = d_inode(parent); inode_lock_nested(dir, I_MUTEX_PARENT); child = lookup_one_len(name, parent, strlen(name)); if (!IS_ERR(child) && !hmdfs_is_reg(child)) { if (d_is_positive(child)) { hmdfs_err("invalid stash file (mode 0%o)", d_inode(child)->i_mode); err = -EINVAL; } else { hmdfs_err("missing stash file"); err = -ENOENT; } dput(child); } else if (IS_ERR(child)) { err = PTR_ERR(child); hmdfs_err("lookup stash file err %d", err); } inode_unlock(dir); if (err) return ERR_PTR(err); c_path.mnt = p_path->mnt; c_path.dentry = child; filp = dentry_open(&c_path, O_RDONLY | O_LARGEFILE, current_cred()); if (IS_ERR(filp)) hmdfs_err("open stash file err %d", (int)PTR_ERR(filp)); dput(child); return filp; } 
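/*
 * Account one restore attempt: "keep" means the stash file is intentionally
 * left on disk for a retry after the peer reconnects, so it is counted
 * separately from success and failure.
 */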
static void hmdfs_update_restore_stats(struct hmdfs_restore_stats *stats, bool keep, uint64_t pages, int err) { if (!err) { stats->succeed++; stats->ok_pages += pages; } else if (keep) { stats->keep++; } else { stats->fail++; stats->fail_pages += pages; } } static int hmdfs_restore_files(struct hmdfs_peer *conn, unsigned int seq, struct path *dir, const struct hmdfs_inode_tbl *tbl, void *priv) { unsigned int i; struct hmdfs_file_restore_ctx ctx; int err = 0; struct hmdfs_restore_stats *stats = priv; err = hmdfs_init_file_restore_ctx(conn, seq, dir, &ctx); if (err) return err; for (i = 0; i < tbl->cnt; i++) { char name[HMDFS_STASH_FILE_NAME_LEN]; struct file *filp = NULL; snprintf(name, sizeof(name), "0x%llx", tbl->inodes[i]); filp = hmdfs_open_stash_file(dir, name); /* Continue to restore if any error */ if (IS_ERR(filp)) { stats->fail++; continue; } ctx.inum = tbl->inodes[i]; ctx.src_filp = filp; ctx.keep = false; ctx.pages = 0; err = hmdfs_restore_file(&ctx); hmdfs_update_restore_stats(stats, ctx.keep, ctx.pages, err); if (!ctx.keep) hmdfs_del_stash_file(dir->dentry, file_dentry(ctx.src_filp)); fput(ctx.src_filp); /* Continue to restore */ if (err == -ESHUTDOWN) break; err = 0; } hmdfs_exit_file_restore_ctx(&ctx); return err; } static bool hmdfs_is_valid_stash_status(struct hmdfs_inode_info *inode_info, uint64_t ino) { return (inode_info->inode_type == HMDFS_LAYER_OTHER_REMOTE && inode_info->stash_status == HMDFS_REMOTE_INODE_RESTORING && inode_info->remote_ino == ino); } static int hmdfs_rebuild_stash_list(struct hmdfs_peer *conn, unsigned int seq, struct path *dir, const struct hmdfs_inode_tbl *tbl, void *priv) { struct hmdfs_file_restore_ctx ctx; unsigned int i; int err; struct hmdfs_rebuild_stats *stats = priv; err = hmdfs_init_file_restore_ctx(conn, seq, dir, &ctx); if (err) return err; stats->total += tbl->cnt; for (i = 0; i < tbl->cnt; i++) { char name[HMDFS_STASH_FILE_NAME_LEN]; struct file *src_filp = NULL; struct file *dst_filp = NULL; struct hmdfs_inode_info *inode_info = NULL; bool is_valid = true; snprintf(name, sizeof(name), "0x%llx", tbl->inodes[i]); src_filp = hmdfs_open_stash_file(dir, name); if (IS_ERR(src_filp)) { stats->fail++; continue; } ctx.inum = tbl->inodes[i]; ctx.src_filp = src_filp; /* No need to track the open which only needs meta info */ err = hmdfs_open_restore_dst_file(&ctx, O_RDONLY, &dst_filp); if (err) { fput(src_filp); if (err == -ESHUTDOWN) break; stats->fail++; err = 0; continue; } inode_info = hmdfs_i(file_inode(dst_filp)); is_valid = hmdfs_is_valid_stash_status(inode_info, ctx.inum); if (is_valid) { stats->succeed++; } else { hmdfs_err("peer 0x%x:0x%llx inode 0x%llx invalid state: type: %d, status: %u, inode: %llu", conn->owner, conn->device_id, ctx.inum, inode_info->inode_type, READ_ONCE(inode_info->stash_status), inode_info->remote_ino); stats->invalid++; } fput(ctx.src_filp); fput(dst_filp); } hmdfs_exit_file_restore_ctx(&ctx); return err; } static int hmdfs_iter_stash_file(struct hmdfs_peer *conn, unsigned int seq, struct file *filp, stash_operation_func op, void *priv) { int err = 0; struct hmdfs_stash_dir_context ctx = { .dctx.actor = hmdfs_fill_stash_file, }; struct hmdfs_inode_tbl *tbl = NULL; struct path dir; err = hmdfs_new_inode_tbl(&tbl); if (err) goto out; dir.mnt = filp->f_path.mnt; dir.dentry = file_dentry(filp); ctx.tbl = tbl; ctx.dctx.pos = 0; do { tbl->cnt = 0; err = iterate_dir(filp, &ctx.dctx); if (err || !tbl->cnt) { if (err) hmdfs_err("iterate stash dir err %d", err); break; } err = op(conn, seq, &dir, tbl, priv); } while 
(!err); out: kfree(tbl); return err; } static void hmdfs_rebuild_check_work_fn(struct work_struct *base) { struct hmdfs_check_work *work = container_of(base, struct hmdfs_check_work, work); struct hmdfs_peer *conn = work->conn; struct hmdfs_sb_info *sbi = conn->sbi; struct file *filp = NULL; const struct cred *old_cred = NULL; struct hmdfs_stash_dir_context ctx = { .dctx.actor = hmdfs_has_stash_file, }; struct hmdfs_inode_tbl tbl; int err; old_cred = hmdfs_override_creds(sbi->cred); filp = hmdfs_open_stash_dir(&sbi->stash_work_dir, conn->cid); if (IS_ERR(filp)) goto out; memset(&tbl, 0, sizeof(tbl)); ctx.tbl = &tbl; err = iterate_dir(filp, &ctx.dctx); if (!err && ctx.tbl->cnt > 0) conn->need_rebuild_stash_list = true; fput(filp); out: hmdfs_revert_creds(old_cred); hmdfs_info("peer 0x%x:0x%llx %sneed to rebuild stash list", conn->owner, conn->device_id, conn->need_rebuild_stash_list ? "" : "don't "); complete(&work->done); } static void hmdfs_stash_add_do_check(struct hmdfs_peer *conn, int evt, unsigned int seq) { struct hmdfs_sb_info *sbi = conn->sbi; struct hmdfs_check_work work = { .conn = conn, .done = COMPLETION_INITIALIZER_ONSTACK(work.done), }; if (!hmdfs_is_stash_enabled(sbi)) return; INIT_WORK_ONSTACK(&work.work, hmdfs_rebuild_check_work_fn); schedule_work(&work.work); wait_for_completion(&work.done); } static void hmdfs_update_peer_rebuild_stats(struct hmdfs_rebuild_statistics *rebuild_stats, const struct hmdfs_rebuild_stats *stats) { rebuild_stats->cur_ok = stats->succeed; rebuild_stats->cur_fail = stats->fail; rebuild_stats->cur_invalid = stats->invalid; rebuild_stats->total_ok += stats->succeed; rebuild_stats->total_fail += stats->fail; rebuild_stats->total_invalid += stats->invalid; } /* rebuild stash inode list */ static void hmdfs_stash_online_prepare(struct hmdfs_peer *conn, int evt, unsigned int seq) { struct hmdfs_sb_info *sbi = conn->sbi; struct file *filp = NULL; const struct cred *old_cred = NULL; int err; struct hmdfs_rebuild_stats stats; if (!hmdfs_is_stash_enabled(sbi) || !conn->need_rebuild_stash_list) return; /* release seq_lock to prevent blocking no-online sync cb */ mutex_unlock(&conn->seq_lock); old_cred = hmdfs_override_creds(sbi->cred); filp = hmdfs_open_stash_dir(&sbi->stash_work_dir, conn->cid); if (IS_ERR(filp)) goto out; memset(&stats, 0, sizeof(stats)); err = hmdfs_iter_stash_file(conn, seq, filp, hmdfs_rebuild_stash_list, &stats); if (err == -ESHUTDOWN) { hmdfs_info("peer 0x%x:0x%llx offline again during rebuild", conn->owner, conn->device_id); } else { WRITE_ONCE(conn->need_rebuild_stash_list, false); if (err) hmdfs_warning("partial rebuild fail err %d", err); } hmdfs_update_peer_rebuild_stats(&conn->stats.rebuild, &stats); hmdfs_info("peer 0x%x:0x%llx rebuild stashed-file total %u succeed %u fail %u invalid %u", conn->owner, conn->device_id, stats.total, stats.succeed, stats.fail, stats.invalid); fput(filp); out: conn->stats.rebuild.time++; hmdfs_revert_creds(old_cred); if (!READ_ONCE(conn->need_rebuild_stash_list)) { /* * Use smp_mb__before_atomic() to ensure order between * writing @conn->need_rebuild_stash_list and * reading conn->rebuild_inode_status_nr. */ smp_mb__before_atomic(); /* * Wait until all inodes finish rebuilding stash status before * accessing @conn->stashed_inode_list in restoring. 
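		 * The counter is dropped and this waitqueue is woken in hmdfs_remote_init_stash_status().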
*/ wait_event(conn->rebuild_inode_status_wq, !atomic_read(&conn->rebuild_inode_status_nr)); } mutex_lock(&conn->seq_lock); } static void hmdfs_update_peer_restore_stats(struct hmdfs_restore_statistics *restore_stats, const struct hmdfs_restore_stats *stats) { restore_stats->cur_ok = stats->succeed; restore_stats->cur_fail = stats->fail; restore_stats->cur_keep = stats->keep; restore_stats->total_ok += stats->succeed; restore_stats->total_fail += stats->fail; restore_stats->total_keep += stats->keep; restore_stats->ok_pages += stats->ok_pages; restore_stats->fail_pages += stats->fail_pages; } static void hmdfs_stash_online_do_restore(struct hmdfs_peer *conn, int evt, unsigned int seq) { struct hmdfs_sb_info *sbi = conn->sbi; struct file *filp = NULL; const struct cred *old_cred = NULL; struct hmdfs_restore_stats stats; int err = 0; if (!hmdfs_is_stash_enabled(sbi) || conn->need_rebuild_stash_list) { if (conn->need_rebuild_stash_list) hmdfs_info("peer 0x%x:0x%llx skip restoring due to rebuild-need", conn->owner, conn->device_id); return; } /* release seq_lock to prevent blocking no-online sync cb */ mutex_unlock(&conn->seq_lock); /* For dir iteration, file read and unlink */ old_cred = hmdfs_override_creds(conn->sbi->cred); memset(&stats, 0, sizeof(stats)); filp = hmdfs_open_stash_dir(&sbi->stash_work_dir, conn->cid); if (IS_ERR(filp)) { err = PTR_ERR(filp); goto out; } err = hmdfs_iter_stash_file(conn, seq, filp, hmdfs_restore_files, &stats); fput(filp); out: hmdfs_revert_creds(old_cred); /* offline again ? */ if (err != -ESHUTDOWN) hmdfs_drop_stashed_inodes(conn); hmdfs_update_peer_restore_stats(&conn->stats.restore, &stats); hmdfs_info("peer 0x%x:0x%llx restore stashed-file ok %u fail %u keep %u", conn->owner, conn->device_id, stats.succeed, stats.fail, stats.keep); mutex_lock(&conn->seq_lock); } static void hmdfs_stash_del_do_cleanup(struct hmdfs_peer *conn, int evt, unsigned int seq) { struct hmdfs_inode_info *info = NULL; struct hmdfs_inode_info *next = NULL; unsigned int preparing; if (!hmdfs_is_stash_enabled(conn->sbi)) return; /* Async cb is cancelled */ preparing = 0; list_for_each_entry_safe(info, next, &conn->wr_opened_inode_list, wr_opened_node) { int status = READ_ONCE(info->stash_status); if (status == HMDFS_REMOTE_INODE_STASHING) { struct hmdfs_cache_info *cache = NULL; spin_lock(&info->stash_lock); cache = info->cache; info->cache = NULL; info->stash_status = HMDFS_REMOTE_INODE_NONE; spin_unlock(&info->stash_lock); hmdfs_remote_del_wr_opened_inode(conn, info); hmdfs_del_file_cache(cache); /* put inode after all access are completed */ iput(&info->vfs_inode); preparing++; } } hmdfs_info("release %u preparing inodes", preparing); hmdfs_info("release %u pinned inodes", conn->stashed_inode_nr); if (list_empty(&conn->stashed_inode_list)) return; list_for_each_entry_safe(info, next, &conn->stashed_inode_list, stash_node) hmdfs_untrack_stashed_inode(conn, info); } void hmdfs_exit_stash(struct hmdfs_sb_info *sbi) { if (!sbi->s_offline_stash) return; if (sbi->stash_work_dir.dentry) { path_put(&sbi->stash_work_dir); sbi->stash_work_dir.dentry = NULL; } } int hmdfs_init_stash(struct hmdfs_sb_info *sbi) { int err = 0; struct path parent; struct dentry *child = NULL; if (!sbi->s_offline_stash) return 0; err = kern_path(sbi->cache_dir, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &parent); if (err) { hmdfs_err("invalid cache dir err %d", err); goto out; } child = hmdfs_stash_new_work_dir(parent.dentry); if (!IS_ERR(child)) { sbi->stash_work_dir.mnt = mntget(parent.mnt); sbi->stash_work_dir.dentry = 
child; } else { err = PTR_ERR(child); hmdfs_err("create stash work dir err %d", err); } path_put(&parent); out: return err; } static int hmdfs_stash_write_local_file(struct hmdfs_peer *conn, struct hmdfs_inode_info *info, struct hmdfs_writepage_context *ctx, struct hmdfs_cache_info *cache) { struct page *page = ctx->page; const struct cred *old_cred = NULL; void *buf = NULL; loff_t pos; unsigned int flags; ssize_t written; int err = 0; buf = kmap(page); pos = (loff_t)page->index << PAGE_SHIFT; /* enable NOFS for memory allocation */ flags = memalloc_nofs_save(); old_cred = hmdfs_override_creds(conn->sbi->cred); pos += cache->data_offs << HMDFS_STASH_BLK_SHIFT; written = kernel_write(cache->cache_file, buf, ctx->count, &pos); hmdfs_revert_creds(old_cred); memalloc_nofs_restore(flags); kunmap(page); if (written != ctx->count) { hmdfs_err("stash peer 0x%x:0x%llx ino 0x%llx page 0x%lx data_offs 0x%x len %u err %zd", conn->owner, conn->device_id, info->remote_ino, page->index, cache->data_offs, ctx->count, written); err = -EIO; } return err; } int hmdfs_stash_writepage(struct hmdfs_peer *conn, struct hmdfs_writepage_context *ctx) { struct inode *inode = ctx->page->mapping->host; struct hmdfs_inode_info *info = hmdfs_i(inode); struct hmdfs_cache_info *cache = NULL; int err; /* e.g. fail to create stash file */ cache = info->cache; if (!cache) return -EIO; err = hmdfs_stash_write_local_file(conn, info, ctx, cache); if (!err) { hmdfs_client_writepage_done(info, ctx); atomic64_inc(&cache->written_pgs); put_task_struct(ctx->caller); kfree(ctx); } atomic64_inc(&cache->to_write_pgs); return err; } static void hmdfs_stash_rebuild_status(struct hmdfs_peer *conn, struct inode *inode) { char *path_str = NULL; struct hmdfs_inode_info *info = NULL; const struct cred *old_cred = NULL; struct path path; struct path *stash_path = NULL; int err = 0; path_str = kmalloc(HMDFS_STASH_PATH_LEN, GFP_KERNEL); if (!path_str) { err = -ENOMEM; return; } info = hmdfs_i(inode); err = snprintf(path_str, HMDFS_STASH_PATH_LEN, "%s/0x%llx", conn->cid, info->remote_ino); if (err >= HMDFS_STASH_PATH_LEN) { kfree(path_str); hmdfs_err("peer 0x%x:0x%llx inode 0x%llx too long name len", conn->owner, conn->device_id, info->remote_ino); return; } old_cred = hmdfs_override_creds(conn->sbi->cred); stash_path = &conn->sbi->stash_work_dir; err = vfs_path_lookup(stash_path->dentry, stash_path->mnt, path_str, 0, &path); hmdfs_revert_creds(old_cred); if (!err) { if (hmdfs_is_reg(path.dentry)) { WRITE_ONCE(info->stash_status, HMDFS_REMOTE_INODE_RESTORING); ihold(&info->vfs_inode); hmdfs_track_inode_locked(conn, info); } else { hmdfs_info("peer 0x%x:0x%llx inode 0x%llx unexpected stashed file mode 0%o", conn->owner, conn->device_id, info->remote_ino, d_inode(path.dentry)->i_mode); } path_put(&path); } else if (err && err != -ENOENT) { hmdfs_err("peer 0x%x:0x%llx inode 0x%llx find %s err %d", conn->owner, conn->device_id, info->remote_ino, path_str, err); } kfree(path_str); } static inline bool hmdfs_need_rebuild_inode_stash_status(struct hmdfs_peer *conn, umode_t mode) { return hmdfs_is_stash_enabled(conn->sbi) && READ_ONCE(conn->need_rebuild_stash_list) && (S_ISREG(mode) || S_ISLNK(mode)); } void hmdfs_remote_init_stash_status(struct hmdfs_peer *conn, struct inode *inode, umode_t mode) { if (!hmdfs_need_rebuild_inode_stash_status(conn, mode)) return; atomic_inc(&conn->rebuild_inode_status_nr); /* * Use smp_mb__after_atomic() to ensure order between writing * @conn->rebuild_inode_status_nr and reading * @conn->need_rebuild_stash_list. 
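	 * Paired with smp_mb__before_atomic() in hmdfs_stash_online_prepare().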
*/ smp_mb__after_atomic(); if (READ_ONCE(conn->need_rebuild_stash_list)) hmdfs_stash_rebuild_status(conn, inode); if (atomic_dec_and_test(&conn->rebuild_inode_status_nr)) wake_up(&conn->rebuild_inode_status_wq); } static struct hmdfs_node_cb_desc stash_cb[] = { { .evt = NODE_EVT_OFFLINE, .sync = true, .fn = hmdfs_stash_offline_prepare, }, { .evt = NODE_EVT_OFFLINE, .sync = false, .fn = hmdfs_stash_offline_do_stash, }, { .evt = NODE_EVT_ADD, .sync = true, .fn = hmdfs_stash_add_do_check, }, { .evt = NODE_EVT_ONLINE, .sync = false, .fn = hmdfs_stash_online_prepare, }, { .evt = NODE_EVT_ONLINE, .sync = false, .fn = hmdfs_stash_online_do_restore, }, { .evt = NODE_EVT_DEL, .sync = true, .fn = hmdfs_stash_del_do_cleanup, }, }; void __init hmdfs_stash_add_node_evt_cb(void) { hmdfs_node_add_evt_cb(stash_cb, ARRAY_SIZE(stash_cb)); }