162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0 262306a36Sopenharmony_ci#include <linux/ceph/ceph_debug.h> 362306a36Sopenharmony_ci 462306a36Sopenharmony_ci#include <linux/fs.h> 562306a36Sopenharmony_ci#include <linux/sort.h> 662306a36Sopenharmony_ci#include <linux/slab.h> 762306a36Sopenharmony_ci#include <linux/iversion.h> 862306a36Sopenharmony_ci#include "super.h" 962306a36Sopenharmony_ci#include "mds_client.h" 1062306a36Sopenharmony_ci#include <linux/ceph/decode.h> 1162306a36Sopenharmony_ci 1262306a36Sopenharmony_ci/* unused map expires after 5 minutes */ 1362306a36Sopenharmony_ci#define CEPH_SNAPID_MAP_TIMEOUT (5 * 60 * HZ) 1462306a36Sopenharmony_ci 1562306a36Sopenharmony_ci/* 1662306a36Sopenharmony_ci * Snapshots in ceph are driven in large part by cooperation from the 1762306a36Sopenharmony_ci * client. In contrast to local file systems or file servers that 1862306a36Sopenharmony_ci * implement snapshots at a single point in the system, ceph's 1962306a36Sopenharmony_ci * distributed access to storage requires clients to help decide 2062306a36Sopenharmony_ci * whether a write logically occurs before or after a recently created 2162306a36Sopenharmony_ci * snapshot. 2262306a36Sopenharmony_ci * 2362306a36Sopenharmony_ci * This provides a perfect instantanous client-wide snapshot. Between 2462306a36Sopenharmony_ci * clients, however, snapshots may appear to be applied at slightly 2562306a36Sopenharmony_ci * different points in time, depending on delays in delivering the 2662306a36Sopenharmony_ci * snapshot notification. 2762306a36Sopenharmony_ci * 2862306a36Sopenharmony_ci * Snapshots are _not_ file system-wide. Instead, each snapshot 2962306a36Sopenharmony_ci * applies to the subdirectory nested beneath some directory. This 3062306a36Sopenharmony_ci * effectively divides the hierarchy into multiple "realms," where all 3162306a36Sopenharmony_ci * of the files contained by each realm share the same set of 3262306a36Sopenharmony_ci * snapshots. An individual realm's snap set contains snapshots 3362306a36Sopenharmony_ci * explicitly created on that realm, as well as any snaps in its 3462306a36Sopenharmony_ci * parent's snap set _after_ the point at which the parent became it's 3562306a36Sopenharmony_ci * parent (due to, say, a rename). Similarly, snaps from prior parents 3662306a36Sopenharmony_ci * during the time intervals during which they were the parent are included. 3762306a36Sopenharmony_ci * 3862306a36Sopenharmony_ci * The client is spared most of this detail, fortunately... it must only 3962306a36Sopenharmony_ci * maintains a hierarchy of realms reflecting the current parent/child 4062306a36Sopenharmony_ci * realm relationship, and for each realm has an explicit list of snaps 4162306a36Sopenharmony_ci * inherited from prior parents. 4262306a36Sopenharmony_ci * 4362306a36Sopenharmony_ci * A snap_realm struct is maintained for realms containing every inode 4462306a36Sopenharmony_ci * with an open cap in the system. (The needed snap realm information is 4562306a36Sopenharmony_ci * provided by the MDS whenever a cap is issued, i.e., on open.) A 'seq' 4662306a36Sopenharmony_ci * version number is used to ensure that as realm parameters change (new 4762306a36Sopenharmony_ci * snapshot, new parent, etc.) the client's realm hierarchy is updated. 4862306a36Sopenharmony_ci * 4962306a36Sopenharmony_ci * The realm hierarchy drives the generation of a 'snap context' for each 5062306a36Sopenharmony_ci * realm, which simply lists the resulting set of snaps for the realm. This 5162306a36Sopenharmony_ci * is attached to any writes sent to OSDs. 5262306a36Sopenharmony_ci */ 5362306a36Sopenharmony_ci/* 5462306a36Sopenharmony_ci * Unfortunately error handling is a bit mixed here. If we get a snap 5562306a36Sopenharmony_ci * update, but don't have enough memory to update our realm hierarchy, 5662306a36Sopenharmony_ci * it's not clear what we can do about it (besides complaining to the 5762306a36Sopenharmony_ci * console). 5862306a36Sopenharmony_ci */ 5962306a36Sopenharmony_ci 6062306a36Sopenharmony_ci 6162306a36Sopenharmony_ci/* 6262306a36Sopenharmony_ci * increase ref count for the realm 6362306a36Sopenharmony_ci * 6462306a36Sopenharmony_ci * caller must hold snap_rwsem. 6562306a36Sopenharmony_ci */ 6662306a36Sopenharmony_civoid ceph_get_snap_realm(struct ceph_mds_client *mdsc, 6762306a36Sopenharmony_ci struct ceph_snap_realm *realm) 6862306a36Sopenharmony_ci{ 6962306a36Sopenharmony_ci lockdep_assert_held(&mdsc->snap_rwsem); 7062306a36Sopenharmony_ci 7162306a36Sopenharmony_ci /* 7262306a36Sopenharmony_ci * The 0->1 and 1->0 transitions must take the snap_empty_lock 7362306a36Sopenharmony_ci * atomically with the refcount change. Go ahead and bump the 7462306a36Sopenharmony_ci * nref here, unless it's 0, in which case we take the spinlock 7562306a36Sopenharmony_ci * and then do the increment and remove it from the list. 7662306a36Sopenharmony_ci */ 7762306a36Sopenharmony_ci if (atomic_inc_not_zero(&realm->nref)) 7862306a36Sopenharmony_ci return; 7962306a36Sopenharmony_ci 8062306a36Sopenharmony_ci spin_lock(&mdsc->snap_empty_lock); 8162306a36Sopenharmony_ci if (atomic_inc_return(&realm->nref) == 1) 8262306a36Sopenharmony_ci list_del_init(&realm->empty_item); 8362306a36Sopenharmony_ci spin_unlock(&mdsc->snap_empty_lock); 8462306a36Sopenharmony_ci} 8562306a36Sopenharmony_ci 8662306a36Sopenharmony_cistatic void __insert_snap_realm(struct rb_root *root, 8762306a36Sopenharmony_ci struct ceph_snap_realm *new) 8862306a36Sopenharmony_ci{ 8962306a36Sopenharmony_ci struct rb_node **p = &root->rb_node; 9062306a36Sopenharmony_ci struct rb_node *parent = NULL; 9162306a36Sopenharmony_ci struct ceph_snap_realm *r = NULL; 9262306a36Sopenharmony_ci 9362306a36Sopenharmony_ci while (*p) { 9462306a36Sopenharmony_ci parent = *p; 9562306a36Sopenharmony_ci r = rb_entry(parent, struct ceph_snap_realm, node); 9662306a36Sopenharmony_ci if (new->ino < r->ino) 9762306a36Sopenharmony_ci p = &(*p)->rb_left; 9862306a36Sopenharmony_ci else if (new->ino > r->ino) 9962306a36Sopenharmony_ci p = &(*p)->rb_right; 10062306a36Sopenharmony_ci else 10162306a36Sopenharmony_ci BUG(); 10262306a36Sopenharmony_ci } 10362306a36Sopenharmony_ci 10462306a36Sopenharmony_ci rb_link_node(&new->node, parent, p); 10562306a36Sopenharmony_ci rb_insert_color(&new->node, root); 10662306a36Sopenharmony_ci} 10762306a36Sopenharmony_ci 10862306a36Sopenharmony_ci/* 10962306a36Sopenharmony_ci * create and get the realm rooted at @ino and bump its ref count. 11062306a36Sopenharmony_ci * 11162306a36Sopenharmony_ci * caller must hold snap_rwsem for write. 11262306a36Sopenharmony_ci */ 11362306a36Sopenharmony_cistatic struct ceph_snap_realm *ceph_create_snap_realm( 11462306a36Sopenharmony_ci struct ceph_mds_client *mdsc, 11562306a36Sopenharmony_ci u64 ino) 11662306a36Sopenharmony_ci{ 11762306a36Sopenharmony_ci struct ceph_snap_realm *realm; 11862306a36Sopenharmony_ci 11962306a36Sopenharmony_ci lockdep_assert_held_write(&mdsc->snap_rwsem); 12062306a36Sopenharmony_ci 12162306a36Sopenharmony_ci realm = kzalloc(sizeof(*realm), GFP_NOFS); 12262306a36Sopenharmony_ci if (!realm) 12362306a36Sopenharmony_ci return ERR_PTR(-ENOMEM); 12462306a36Sopenharmony_ci 12562306a36Sopenharmony_ci /* Do not release the global dummy snaprealm until unmouting */ 12662306a36Sopenharmony_ci if (ino == CEPH_INO_GLOBAL_SNAPREALM) 12762306a36Sopenharmony_ci atomic_set(&realm->nref, 2); 12862306a36Sopenharmony_ci else 12962306a36Sopenharmony_ci atomic_set(&realm->nref, 1); 13062306a36Sopenharmony_ci realm->ino = ino; 13162306a36Sopenharmony_ci INIT_LIST_HEAD(&realm->children); 13262306a36Sopenharmony_ci INIT_LIST_HEAD(&realm->child_item); 13362306a36Sopenharmony_ci INIT_LIST_HEAD(&realm->empty_item); 13462306a36Sopenharmony_ci INIT_LIST_HEAD(&realm->dirty_item); 13562306a36Sopenharmony_ci INIT_LIST_HEAD(&realm->rebuild_item); 13662306a36Sopenharmony_ci INIT_LIST_HEAD(&realm->inodes_with_caps); 13762306a36Sopenharmony_ci spin_lock_init(&realm->inodes_with_caps_lock); 13862306a36Sopenharmony_ci __insert_snap_realm(&mdsc->snap_realms, realm); 13962306a36Sopenharmony_ci mdsc->num_snap_realms++; 14062306a36Sopenharmony_ci 14162306a36Sopenharmony_ci dout("%s %llx %p\n", __func__, realm->ino, realm); 14262306a36Sopenharmony_ci return realm; 14362306a36Sopenharmony_ci} 14462306a36Sopenharmony_ci 14562306a36Sopenharmony_ci/* 14662306a36Sopenharmony_ci * lookup the realm rooted at @ino. 14762306a36Sopenharmony_ci * 14862306a36Sopenharmony_ci * caller must hold snap_rwsem. 14962306a36Sopenharmony_ci */ 15062306a36Sopenharmony_cistatic struct ceph_snap_realm *__lookup_snap_realm(struct ceph_mds_client *mdsc, 15162306a36Sopenharmony_ci u64 ino) 15262306a36Sopenharmony_ci{ 15362306a36Sopenharmony_ci struct rb_node *n = mdsc->snap_realms.rb_node; 15462306a36Sopenharmony_ci struct ceph_snap_realm *r; 15562306a36Sopenharmony_ci 15662306a36Sopenharmony_ci lockdep_assert_held(&mdsc->snap_rwsem); 15762306a36Sopenharmony_ci 15862306a36Sopenharmony_ci while (n) { 15962306a36Sopenharmony_ci r = rb_entry(n, struct ceph_snap_realm, node); 16062306a36Sopenharmony_ci if (ino < r->ino) 16162306a36Sopenharmony_ci n = n->rb_left; 16262306a36Sopenharmony_ci else if (ino > r->ino) 16362306a36Sopenharmony_ci n = n->rb_right; 16462306a36Sopenharmony_ci else { 16562306a36Sopenharmony_ci dout("%s %llx %p\n", __func__, r->ino, r); 16662306a36Sopenharmony_ci return r; 16762306a36Sopenharmony_ci } 16862306a36Sopenharmony_ci } 16962306a36Sopenharmony_ci return NULL; 17062306a36Sopenharmony_ci} 17162306a36Sopenharmony_ci 17262306a36Sopenharmony_cistruct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc, 17362306a36Sopenharmony_ci u64 ino) 17462306a36Sopenharmony_ci{ 17562306a36Sopenharmony_ci struct ceph_snap_realm *r; 17662306a36Sopenharmony_ci r = __lookup_snap_realm(mdsc, ino); 17762306a36Sopenharmony_ci if (r) 17862306a36Sopenharmony_ci ceph_get_snap_realm(mdsc, r); 17962306a36Sopenharmony_ci return r; 18062306a36Sopenharmony_ci} 18162306a36Sopenharmony_ci 18262306a36Sopenharmony_cistatic void __put_snap_realm(struct ceph_mds_client *mdsc, 18362306a36Sopenharmony_ci struct ceph_snap_realm *realm); 18462306a36Sopenharmony_ci 18562306a36Sopenharmony_ci/* 18662306a36Sopenharmony_ci * called with snap_rwsem (write) 18762306a36Sopenharmony_ci */ 18862306a36Sopenharmony_cistatic void __destroy_snap_realm(struct ceph_mds_client *mdsc, 18962306a36Sopenharmony_ci struct ceph_snap_realm *realm) 19062306a36Sopenharmony_ci{ 19162306a36Sopenharmony_ci lockdep_assert_held_write(&mdsc->snap_rwsem); 19262306a36Sopenharmony_ci 19362306a36Sopenharmony_ci dout("%s %p %llx\n", __func__, realm, realm->ino); 19462306a36Sopenharmony_ci 19562306a36Sopenharmony_ci rb_erase(&realm->node, &mdsc->snap_realms); 19662306a36Sopenharmony_ci mdsc->num_snap_realms--; 19762306a36Sopenharmony_ci 19862306a36Sopenharmony_ci if (realm->parent) { 19962306a36Sopenharmony_ci list_del_init(&realm->child_item); 20062306a36Sopenharmony_ci __put_snap_realm(mdsc, realm->parent); 20162306a36Sopenharmony_ci } 20262306a36Sopenharmony_ci 20362306a36Sopenharmony_ci kfree(realm->prior_parent_snaps); 20462306a36Sopenharmony_ci kfree(realm->snaps); 20562306a36Sopenharmony_ci ceph_put_snap_context(realm->cached_context); 20662306a36Sopenharmony_ci kfree(realm); 20762306a36Sopenharmony_ci} 20862306a36Sopenharmony_ci 20962306a36Sopenharmony_ci/* 21062306a36Sopenharmony_ci * caller holds snap_rwsem (write) 21162306a36Sopenharmony_ci */ 21262306a36Sopenharmony_cistatic void __put_snap_realm(struct ceph_mds_client *mdsc, 21362306a36Sopenharmony_ci struct ceph_snap_realm *realm) 21462306a36Sopenharmony_ci{ 21562306a36Sopenharmony_ci lockdep_assert_held_write(&mdsc->snap_rwsem); 21662306a36Sopenharmony_ci 21762306a36Sopenharmony_ci /* 21862306a36Sopenharmony_ci * We do not require the snap_empty_lock here, as any caller that 21962306a36Sopenharmony_ci * increments the value must hold the snap_rwsem. 22062306a36Sopenharmony_ci */ 22162306a36Sopenharmony_ci if (atomic_dec_and_test(&realm->nref)) 22262306a36Sopenharmony_ci __destroy_snap_realm(mdsc, realm); 22362306a36Sopenharmony_ci} 22462306a36Sopenharmony_ci 22562306a36Sopenharmony_ci/* 22662306a36Sopenharmony_ci * See comments in ceph_get_snap_realm. Caller needn't hold any locks. 22762306a36Sopenharmony_ci */ 22862306a36Sopenharmony_civoid ceph_put_snap_realm(struct ceph_mds_client *mdsc, 22962306a36Sopenharmony_ci struct ceph_snap_realm *realm) 23062306a36Sopenharmony_ci{ 23162306a36Sopenharmony_ci if (!atomic_dec_and_lock(&realm->nref, &mdsc->snap_empty_lock)) 23262306a36Sopenharmony_ci return; 23362306a36Sopenharmony_ci 23462306a36Sopenharmony_ci if (down_write_trylock(&mdsc->snap_rwsem)) { 23562306a36Sopenharmony_ci spin_unlock(&mdsc->snap_empty_lock); 23662306a36Sopenharmony_ci __destroy_snap_realm(mdsc, realm); 23762306a36Sopenharmony_ci up_write(&mdsc->snap_rwsem); 23862306a36Sopenharmony_ci } else { 23962306a36Sopenharmony_ci list_add(&realm->empty_item, &mdsc->snap_empty); 24062306a36Sopenharmony_ci spin_unlock(&mdsc->snap_empty_lock); 24162306a36Sopenharmony_ci } 24262306a36Sopenharmony_ci} 24362306a36Sopenharmony_ci 24462306a36Sopenharmony_ci/* 24562306a36Sopenharmony_ci * Clean up any realms whose ref counts have dropped to zero. Note 24662306a36Sopenharmony_ci * that this does not include realms who were created but not yet 24762306a36Sopenharmony_ci * used. 24862306a36Sopenharmony_ci * 24962306a36Sopenharmony_ci * Called under snap_rwsem (write) 25062306a36Sopenharmony_ci */ 25162306a36Sopenharmony_cistatic void __cleanup_empty_realms(struct ceph_mds_client *mdsc) 25262306a36Sopenharmony_ci{ 25362306a36Sopenharmony_ci struct ceph_snap_realm *realm; 25462306a36Sopenharmony_ci 25562306a36Sopenharmony_ci lockdep_assert_held_write(&mdsc->snap_rwsem); 25662306a36Sopenharmony_ci 25762306a36Sopenharmony_ci spin_lock(&mdsc->snap_empty_lock); 25862306a36Sopenharmony_ci while (!list_empty(&mdsc->snap_empty)) { 25962306a36Sopenharmony_ci realm = list_first_entry(&mdsc->snap_empty, 26062306a36Sopenharmony_ci struct ceph_snap_realm, empty_item); 26162306a36Sopenharmony_ci list_del(&realm->empty_item); 26262306a36Sopenharmony_ci spin_unlock(&mdsc->snap_empty_lock); 26362306a36Sopenharmony_ci __destroy_snap_realm(mdsc, realm); 26462306a36Sopenharmony_ci spin_lock(&mdsc->snap_empty_lock); 26562306a36Sopenharmony_ci } 26662306a36Sopenharmony_ci spin_unlock(&mdsc->snap_empty_lock); 26762306a36Sopenharmony_ci} 26862306a36Sopenharmony_ci 26962306a36Sopenharmony_civoid ceph_cleanup_global_and_empty_realms(struct ceph_mds_client *mdsc) 27062306a36Sopenharmony_ci{ 27162306a36Sopenharmony_ci struct ceph_snap_realm *global_realm; 27262306a36Sopenharmony_ci 27362306a36Sopenharmony_ci down_write(&mdsc->snap_rwsem); 27462306a36Sopenharmony_ci global_realm = __lookup_snap_realm(mdsc, CEPH_INO_GLOBAL_SNAPREALM); 27562306a36Sopenharmony_ci if (global_realm) 27662306a36Sopenharmony_ci ceph_put_snap_realm(mdsc, global_realm); 27762306a36Sopenharmony_ci __cleanup_empty_realms(mdsc); 27862306a36Sopenharmony_ci up_write(&mdsc->snap_rwsem); 27962306a36Sopenharmony_ci} 28062306a36Sopenharmony_ci 28162306a36Sopenharmony_ci/* 28262306a36Sopenharmony_ci * adjust the parent realm of a given @realm. adjust child list, and parent 28362306a36Sopenharmony_ci * pointers, and ref counts appropriately. 28462306a36Sopenharmony_ci * 28562306a36Sopenharmony_ci * return true if parent was changed, 0 if unchanged, <0 on error. 28662306a36Sopenharmony_ci * 28762306a36Sopenharmony_ci * caller must hold snap_rwsem for write. 28862306a36Sopenharmony_ci */ 28962306a36Sopenharmony_cistatic int adjust_snap_realm_parent(struct ceph_mds_client *mdsc, 29062306a36Sopenharmony_ci struct ceph_snap_realm *realm, 29162306a36Sopenharmony_ci u64 parentino) 29262306a36Sopenharmony_ci{ 29362306a36Sopenharmony_ci struct ceph_snap_realm *parent; 29462306a36Sopenharmony_ci 29562306a36Sopenharmony_ci lockdep_assert_held_write(&mdsc->snap_rwsem); 29662306a36Sopenharmony_ci 29762306a36Sopenharmony_ci if (realm->parent_ino == parentino) 29862306a36Sopenharmony_ci return 0; 29962306a36Sopenharmony_ci 30062306a36Sopenharmony_ci parent = ceph_lookup_snap_realm(mdsc, parentino); 30162306a36Sopenharmony_ci if (!parent) { 30262306a36Sopenharmony_ci parent = ceph_create_snap_realm(mdsc, parentino); 30362306a36Sopenharmony_ci if (IS_ERR(parent)) 30462306a36Sopenharmony_ci return PTR_ERR(parent); 30562306a36Sopenharmony_ci } 30662306a36Sopenharmony_ci dout("%s %llx %p: %llx %p -> %llx %p\n", __func__, realm->ino, 30762306a36Sopenharmony_ci realm, realm->parent_ino, realm->parent, parentino, parent); 30862306a36Sopenharmony_ci if (realm->parent) { 30962306a36Sopenharmony_ci list_del_init(&realm->child_item); 31062306a36Sopenharmony_ci ceph_put_snap_realm(mdsc, realm->parent); 31162306a36Sopenharmony_ci } 31262306a36Sopenharmony_ci realm->parent_ino = parentino; 31362306a36Sopenharmony_ci realm->parent = parent; 31462306a36Sopenharmony_ci list_add(&realm->child_item, &parent->children); 31562306a36Sopenharmony_ci return 1; 31662306a36Sopenharmony_ci} 31762306a36Sopenharmony_ci 31862306a36Sopenharmony_ci 31962306a36Sopenharmony_cistatic int cmpu64_rev(const void *a, const void *b) 32062306a36Sopenharmony_ci{ 32162306a36Sopenharmony_ci if (*(u64 *)a < *(u64 *)b) 32262306a36Sopenharmony_ci return 1; 32362306a36Sopenharmony_ci if (*(u64 *)a > *(u64 *)b) 32462306a36Sopenharmony_ci return -1; 32562306a36Sopenharmony_ci return 0; 32662306a36Sopenharmony_ci} 32762306a36Sopenharmony_ci 32862306a36Sopenharmony_ci 32962306a36Sopenharmony_ci/* 33062306a36Sopenharmony_ci * build the snap context for a given realm. 33162306a36Sopenharmony_ci */ 33262306a36Sopenharmony_cistatic int build_snap_context(struct ceph_snap_realm *realm, 33362306a36Sopenharmony_ci struct list_head *realm_queue, 33462306a36Sopenharmony_ci struct list_head *dirty_realms) 33562306a36Sopenharmony_ci{ 33662306a36Sopenharmony_ci struct ceph_snap_realm *parent = realm->parent; 33762306a36Sopenharmony_ci struct ceph_snap_context *snapc; 33862306a36Sopenharmony_ci int err = 0; 33962306a36Sopenharmony_ci u32 num = realm->num_prior_parent_snaps + realm->num_snaps; 34062306a36Sopenharmony_ci 34162306a36Sopenharmony_ci /* 34262306a36Sopenharmony_ci * build parent context, if it hasn't been built. 34362306a36Sopenharmony_ci * conservatively estimate that all parent snaps might be 34462306a36Sopenharmony_ci * included by us. 34562306a36Sopenharmony_ci */ 34662306a36Sopenharmony_ci if (parent) { 34762306a36Sopenharmony_ci if (!parent->cached_context) { 34862306a36Sopenharmony_ci /* add to the queue head */ 34962306a36Sopenharmony_ci list_add(&parent->rebuild_item, realm_queue); 35062306a36Sopenharmony_ci return 1; 35162306a36Sopenharmony_ci } 35262306a36Sopenharmony_ci num += parent->cached_context->num_snaps; 35362306a36Sopenharmony_ci } 35462306a36Sopenharmony_ci 35562306a36Sopenharmony_ci /* do i actually need to update? not if my context seq 35662306a36Sopenharmony_ci matches realm seq, and my parents' does to. (this works 35762306a36Sopenharmony_ci because we rebuild_snap_realms() works _downward_ in 35862306a36Sopenharmony_ci hierarchy after each update.) */ 35962306a36Sopenharmony_ci if (realm->cached_context && 36062306a36Sopenharmony_ci realm->cached_context->seq == realm->seq && 36162306a36Sopenharmony_ci (!parent || 36262306a36Sopenharmony_ci realm->cached_context->seq >= parent->cached_context->seq)) { 36362306a36Sopenharmony_ci dout("%s %llx %p: %p seq %lld (%u snaps) (unchanged)\n", 36462306a36Sopenharmony_ci __func__, realm->ino, realm, realm->cached_context, 36562306a36Sopenharmony_ci realm->cached_context->seq, 36662306a36Sopenharmony_ci (unsigned int)realm->cached_context->num_snaps); 36762306a36Sopenharmony_ci return 0; 36862306a36Sopenharmony_ci } 36962306a36Sopenharmony_ci 37062306a36Sopenharmony_ci /* alloc new snap context */ 37162306a36Sopenharmony_ci err = -ENOMEM; 37262306a36Sopenharmony_ci if (num > (SIZE_MAX - sizeof(*snapc)) / sizeof(u64)) 37362306a36Sopenharmony_ci goto fail; 37462306a36Sopenharmony_ci snapc = ceph_create_snap_context(num, GFP_NOFS); 37562306a36Sopenharmony_ci if (!snapc) 37662306a36Sopenharmony_ci goto fail; 37762306a36Sopenharmony_ci 37862306a36Sopenharmony_ci /* build (reverse sorted) snap vector */ 37962306a36Sopenharmony_ci num = 0; 38062306a36Sopenharmony_ci snapc->seq = realm->seq; 38162306a36Sopenharmony_ci if (parent) { 38262306a36Sopenharmony_ci u32 i; 38362306a36Sopenharmony_ci 38462306a36Sopenharmony_ci /* include any of parent's snaps occurring _after_ my 38562306a36Sopenharmony_ci parent became my parent */ 38662306a36Sopenharmony_ci for (i = 0; i < parent->cached_context->num_snaps; i++) 38762306a36Sopenharmony_ci if (parent->cached_context->snaps[i] >= 38862306a36Sopenharmony_ci realm->parent_since) 38962306a36Sopenharmony_ci snapc->snaps[num++] = 39062306a36Sopenharmony_ci parent->cached_context->snaps[i]; 39162306a36Sopenharmony_ci if (parent->cached_context->seq > snapc->seq) 39262306a36Sopenharmony_ci snapc->seq = parent->cached_context->seq; 39362306a36Sopenharmony_ci } 39462306a36Sopenharmony_ci memcpy(snapc->snaps + num, realm->snaps, 39562306a36Sopenharmony_ci sizeof(u64)*realm->num_snaps); 39662306a36Sopenharmony_ci num += realm->num_snaps; 39762306a36Sopenharmony_ci memcpy(snapc->snaps + num, realm->prior_parent_snaps, 39862306a36Sopenharmony_ci sizeof(u64)*realm->num_prior_parent_snaps); 39962306a36Sopenharmony_ci num += realm->num_prior_parent_snaps; 40062306a36Sopenharmony_ci 40162306a36Sopenharmony_ci sort(snapc->snaps, num, sizeof(u64), cmpu64_rev, NULL); 40262306a36Sopenharmony_ci snapc->num_snaps = num; 40362306a36Sopenharmony_ci dout("%s %llx %p: %p seq %lld (%u snaps)\n", __func__, realm->ino, 40462306a36Sopenharmony_ci realm, snapc, snapc->seq, (unsigned int) snapc->num_snaps); 40562306a36Sopenharmony_ci 40662306a36Sopenharmony_ci ceph_put_snap_context(realm->cached_context); 40762306a36Sopenharmony_ci realm->cached_context = snapc; 40862306a36Sopenharmony_ci /* queue realm for cap_snap creation */ 40962306a36Sopenharmony_ci list_add_tail(&realm->dirty_item, dirty_realms); 41062306a36Sopenharmony_ci return 0; 41162306a36Sopenharmony_ci 41262306a36Sopenharmony_cifail: 41362306a36Sopenharmony_ci /* 41462306a36Sopenharmony_ci * if we fail, clear old (incorrect) cached_context... hopefully 41562306a36Sopenharmony_ci * we'll have better luck building it later 41662306a36Sopenharmony_ci */ 41762306a36Sopenharmony_ci if (realm->cached_context) { 41862306a36Sopenharmony_ci ceph_put_snap_context(realm->cached_context); 41962306a36Sopenharmony_ci realm->cached_context = NULL; 42062306a36Sopenharmony_ci } 42162306a36Sopenharmony_ci pr_err("%s %llx %p fail %d\n", __func__, realm->ino, realm, err); 42262306a36Sopenharmony_ci return err; 42362306a36Sopenharmony_ci} 42462306a36Sopenharmony_ci 42562306a36Sopenharmony_ci/* 42662306a36Sopenharmony_ci * rebuild snap context for the given realm and all of its children. 42762306a36Sopenharmony_ci */ 42862306a36Sopenharmony_cistatic void rebuild_snap_realms(struct ceph_snap_realm *realm, 42962306a36Sopenharmony_ci struct list_head *dirty_realms) 43062306a36Sopenharmony_ci{ 43162306a36Sopenharmony_ci LIST_HEAD(realm_queue); 43262306a36Sopenharmony_ci int last = 0; 43362306a36Sopenharmony_ci bool skip = false; 43462306a36Sopenharmony_ci 43562306a36Sopenharmony_ci list_add_tail(&realm->rebuild_item, &realm_queue); 43662306a36Sopenharmony_ci 43762306a36Sopenharmony_ci while (!list_empty(&realm_queue)) { 43862306a36Sopenharmony_ci struct ceph_snap_realm *_realm, *child; 43962306a36Sopenharmony_ci 44062306a36Sopenharmony_ci _realm = list_first_entry(&realm_queue, 44162306a36Sopenharmony_ci struct ceph_snap_realm, 44262306a36Sopenharmony_ci rebuild_item); 44362306a36Sopenharmony_ci 44462306a36Sopenharmony_ci /* 44562306a36Sopenharmony_ci * If the last building failed dues to memory 44662306a36Sopenharmony_ci * issue, just empty the realm_queue and return 44762306a36Sopenharmony_ci * to avoid infinite loop. 44862306a36Sopenharmony_ci */ 44962306a36Sopenharmony_ci if (last < 0) { 45062306a36Sopenharmony_ci list_del_init(&_realm->rebuild_item); 45162306a36Sopenharmony_ci continue; 45262306a36Sopenharmony_ci } 45362306a36Sopenharmony_ci 45462306a36Sopenharmony_ci last = build_snap_context(_realm, &realm_queue, dirty_realms); 45562306a36Sopenharmony_ci dout("%s %llx %p, %s\n", __func__, _realm->ino, _realm, 45662306a36Sopenharmony_ci last > 0 ? "is deferred" : !last ? "succeeded" : "failed"); 45762306a36Sopenharmony_ci 45862306a36Sopenharmony_ci /* is any child in the list ? */ 45962306a36Sopenharmony_ci list_for_each_entry(child, &_realm->children, child_item) { 46062306a36Sopenharmony_ci if (!list_empty(&child->rebuild_item)) { 46162306a36Sopenharmony_ci skip = true; 46262306a36Sopenharmony_ci break; 46362306a36Sopenharmony_ci } 46462306a36Sopenharmony_ci } 46562306a36Sopenharmony_ci 46662306a36Sopenharmony_ci if (!skip) { 46762306a36Sopenharmony_ci list_for_each_entry(child, &_realm->children, child_item) 46862306a36Sopenharmony_ci list_add_tail(&child->rebuild_item, &realm_queue); 46962306a36Sopenharmony_ci } 47062306a36Sopenharmony_ci 47162306a36Sopenharmony_ci /* last == 1 means need to build parent first */ 47262306a36Sopenharmony_ci if (last <= 0) 47362306a36Sopenharmony_ci list_del_init(&_realm->rebuild_item); 47462306a36Sopenharmony_ci } 47562306a36Sopenharmony_ci} 47662306a36Sopenharmony_ci 47762306a36Sopenharmony_ci 47862306a36Sopenharmony_ci/* 47962306a36Sopenharmony_ci * helper to allocate and decode an array of snapids. free prior 48062306a36Sopenharmony_ci * instance, if any. 48162306a36Sopenharmony_ci */ 48262306a36Sopenharmony_cistatic int dup_array(u64 **dst, __le64 *src, u32 num) 48362306a36Sopenharmony_ci{ 48462306a36Sopenharmony_ci u32 i; 48562306a36Sopenharmony_ci 48662306a36Sopenharmony_ci kfree(*dst); 48762306a36Sopenharmony_ci if (num) { 48862306a36Sopenharmony_ci *dst = kcalloc(num, sizeof(u64), GFP_NOFS); 48962306a36Sopenharmony_ci if (!*dst) 49062306a36Sopenharmony_ci return -ENOMEM; 49162306a36Sopenharmony_ci for (i = 0; i < num; i++) 49262306a36Sopenharmony_ci (*dst)[i] = get_unaligned_le64(src + i); 49362306a36Sopenharmony_ci } else { 49462306a36Sopenharmony_ci *dst = NULL; 49562306a36Sopenharmony_ci } 49662306a36Sopenharmony_ci return 0; 49762306a36Sopenharmony_ci} 49862306a36Sopenharmony_ci 49962306a36Sopenharmony_cistatic bool has_new_snaps(struct ceph_snap_context *o, 50062306a36Sopenharmony_ci struct ceph_snap_context *n) 50162306a36Sopenharmony_ci{ 50262306a36Sopenharmony_ci if (n->num_snaps == 0) 50362306a36Sopenharmony_ci return false; 50462306a36Sopenharmony_ci /* snaps are in descending order */ 50562306a36Sopenharmony_ci return n->snaps[0] > o->seq; 50662306a36Sopenharmony_ci} 50762306a36Sopenharmony_ci 50862306a36Sopenharmony_ci/* 50962306a36Sopenharmony_ci * When a snapshot is applied, the size/mtime inode metadata is queued 51062306a36Sopenharmony_ci * in a ceph_cap_snap (one for each snapshot) until writeback 51162306a36Sopenharmony_ci * completes and the metadata can be flushed back to the MDS. 51262306a36Sopenharmony_ci * 51362306a36Sopenharmony_ci * However, if a (sync) write is currently in-progress when we apply 51462306a36Sopenharmony_ci * the snapshot, we have to wait until the write succeeds or fails 51562306a36Sopenharmony_ci * (and a final size/mtime is known). In this case the 51662306a36Sopenharmony_ci * cap_snap->writing = 1, and is said to be "pending." When the write 51762306a36Sopenharmony_ci * finishes, we __ceph_finish_cap_snap(). 51862306a36Sopenharmony_ci * 51962306a36Sopenharmony_ci * Caller must hold snap_rwsem for read (i.e., the realm topology won't 52062306a36Sopenharmony_ci * change). 52162306a36Sopenharmony_ci */ 52262306a36Sopenharmony_cistatic void ceph_queue_cap_snap(struct ceph_inode_info *ci, 52362306a36Sopenharmony_ci struct ceph_cap_snap **pcapsnap) 52462306a36Sopenharmony_ci{ 52562306a36Sopenharmony_ci struct inode *inode = &ci->netfs.inode; 52662306a36Sopenharmony_ci struct ceph_snap_context *old_snapc, *new_snapc; 52762306a36Sopenharmony_ci struct ceph_cap_snap *capsnap = *pcapsnap; 52862306a36Sopenharmony_ci struct ceph_buffer *old_blob = NULL; 52962306a36Sopenharmony_ci int used, dirty; 53062306a36Sopenharmony_ci 53162306a36Sopenharmony_ci spin_lock(&ci->i_ceph_lock); 53262306a36Sopenharmony_ci used = __ceph_caps_used(ci); 53362306a36Sopenharmony_ci dirty = __ceph_caps_dirty(ci); 53462306a36Sopenharmony_ci 53562306a36Sopenharmony_ci old_snapc = ci->i_head_snapc; 53662306a36Sopenharmony_ci new_snapc = ci->i_snap_realm->cached_context; 53762306a36Sopenharmony_ci 53862306a36Sopenharmony_ci /* 53962306a36Sopenharmony_ci * If there is a write in progress, treat that as a dirty Fw, 54062306a36Sopenharmony_ci * even though it hasn't completed yet; by the time we finish 54162306a36Sopenharmony_ci * up this capsnap it will be. 54262306a36Sopenharmony_ci */ 54362306a36Sopenharmony_ci if (used & CEPH_CAP_FILE_WR) 54462306a36Sopenharmony_ci dirty |= CEPH_CAP_FILE_WR; 54562306a36Sopenharmony_ci 54662306a36Sopenharmony_ci if (__ceph_have_pending_cap_snap(ci)) { 54762306a36Sopenharmony_ci /* there is no point in queuing multiple "pending" cap_snaps, 54862306a36Sopenharmony_ci as no new writes are allowed to start when pending, so any 54962306a36Sopenharmony_ci writes in progress now were started before the previous 55062306a36Sopenharmony_ci cap_snap. lucky us. */ 55162306a36Sopenharmony_ci dout("%s %p %llx.%llx already pending\n", 55262306a36Sopenharmony_ci __func__, inode, ceph_vinop(inode)); 55362306a36Sopenharmony_ci goto update_snapc; 55462306a36Sopenharmony_ci } 55562306a36Sopenharmony_ci if (ci->i_wrbuffer_ref_head == 0 && 55662306a36Sopenharmony_ci !(dirty & (CEPH_CAP_ANY_EXCL|CEPH_CAP_FILE_WR))) { 55762306a36Sopenharmony_ci dout("%s %p %llx.%llx nothing dirty|writing\n", 55862306a36Sopenharmony_ci __func__, inode, ceph_vinop(inode)); 55962306a36Sopenharmony_ci goto update_snapc; 56062306a36Sopenharmony_ci } 56162306a36Sopenharmony_ci 56262306a36Sopenharmony_ci BUG_ON(!old_snapc); 56362306a36Sopenharmony_ci 56462306a36Sopenharmony_ci /* 56562306a36Sopenharmony_ci * There is no need to send FLUSHSNAP message to MDS if there is 56662306a36Sopenharmony_ci * no new snapshot. But when there is dirty pages or on-going 56762306a36Sopenharmony_ci * writes, we still need to create cap_snap. cap_snap is needed 56862306a36Sopenharmony_ci * by the write path and page writeback path. 56962306a36Sopenharmony_ci * 57062306a36Sopenharmony_ci * also see ceph_try_drop_cap_snap() 57162306a36Sopenharmony_ci */ 57262306a36Sopenharmony_ci if (has_new_snaps(old_snapc, new_snapc)) { 57362306a36Sopenharmony_ci if (dirty & (CEPH_CAP_ANY_EXCL|CEPH_CAP_FILE_WR)) 57462306a36Sopenharmony_ci capsnap->need_flush = true; 57562306a36Sopenharmony_ci } else { 57662306a36Sopenharmony_ci if (!(used & CEPH_CAP_FILE_WR) && 57762306a36Sopenharmony_ci ci->i_wrbuffer_ref_head == 0) { 57862306a36Sopenharmony_ci dout("%s %p %llx.%llx no new_snap|dirty_page|writing\n", 57962306a36Sopenharmony_ci __func__, inode, ceph_vinop(inode)); 58062306a36Sopenharmony_ci goto update_snapc; 58162306a36Sopenharmony_ci } 58262306a36Sopenharmony_ci } 58362306a36Sopenharmony_ci 58462306a36Sopenharmony_ci dout("%s %p %llx.%llx cap_snap %p queuing under %p %s %s\n", 58562306a36Sopenharmony_ci __func__, inode, ceph_vinop(inode), capsnap, old_snapc, 58662306a36Sopenharmony_ci ceph_cap_string(dirty), capsnap->need_flush ? "" : "no_flush"); 58762306a36Sopenharmony_ci ihold(inode); 58862306a36Sopenharmony_ci 58962306a36Sopenharmony_ci capsnap->follows = old_snapc->seq; 59062306a36Sopenharmony_ci capsnap->issued = __ceph_caps_issued(ci, NULL); 59162306a36Sopenharmony_ci capsnap->dirty = dirty; 59262306a36Sopenharmony_ci 59362306a36Sopenharmony_ci capsnap->mode = inode->i_mode; 59462306a36Sopenharmony_ci capsnap->uid = inode->i_uid; 59562306a36Sopenharmony_ci capsnap->gid = inode->i_gid; 59662306a36Sopenharmony_ci 59762306a36Sopenharmony_ci if (dirty & CEPH_CAP_XATTR_EXCL) { 59862306a36Sopenharmony_ci old_blob = __ceph_build_xattrs_blob(ci); 59962306a36Sopenharmony_ci capsnap->xattr_blob = 60062306a36Sopenharmony_ci ceph_buffer_get(ci->i_xattrs.blob); 60162306a36Sopenharmony_ci capsnap->xattr_version = ci->i_xattrs.version; 60262306a36Sopenharmony_ci } else { 60362306a36Sopenharmony_ci capsnap->xattr_blob = NULL; 60462306a36Sopenharmony_ci capsnap->xattr_version = 0; 60562306a36Sopenharmony_ci } 60662306a36Sopenharmony_ci 60762306a36Sopenharmony_ci capsnap->inline_data = ci->i_inline_version != CEPH_INLINE_NONE; 60862306a36Sopenharmony_ci 60962306a36Sopenharmony_ci /* dirty page count moved from _head to this cap_snap; 61062306a36Sopenharmony_ci all subsequent writes page dirties occur _after_ this 61162306a36Sopenharmony_ci snapshot. */ 61262306a36Sopenharmony_ci capsnap->dirty_pages = ci->i_wrbuffer_ref_head; 61362306a36Sopenharmony_ci ci->i_wrbuffer_ref_head = 0; 61462306a36Sopenharmony_ci capsnap->context = old_snapc; 61562306a36Sopenharmony_ci list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps); 61662306a36Sopenharmony_ci 61762306a36Sopenharmony_ci if (used & CEPH_CAP_FILE_WR) { 61862306a36Sopenharmony_ci dout("%s %p %llx.%llx cap_snap %p snapc %p seq %llu used WR," 61962306a36Sopenharmony_ci " now pending\n", __func__, inode, ceph_vinop(inode), 62062306a36Sopenharmony_ci capsnap, old_snapc, old_snapc->seq); 62162306a36Sopenharmony_ci capsnap->writing = 1; 62262306a36Sopenharmony_ci } else { 62362306a36Sopenharmony_ci /* note mtime, size NOW. */ 62462306a36Sopenharmony_ci __ceph_finish_cap_snap(ci, capsnap); 62562306a36Sopenharmony_ci } 62662306a36Sopenharmony_ci *pcapsnap = NULL; 62762306a36Sopenharmony_ci old_snapc = NULL; 62862306a36Sopenharmony_ci 62962306a36Sopenharmony_ciupdate_snapc: 63062306a36Sopenharmony_ci if (ci->i_wrbuffer_ref_head == 0 && 63162306a36Sopenharmony_ci ci->i_wr_ref == 0 && 63262306a36Sopenharmony_ci ci->i_dirty_caps == 0 && 63362306a36Sopenharmony_ci ci->i_flushing_caps == 0) { 63462306a36Sopenharmony_ci ci->i_head_snapc = NULL; 63562306a36Sopenharmony_ci } else { 63662306a36Sopenharmony_ci ci->i_head_snapc = ceph_get_snap_context(new_snapc); 63762306a36Sopenharmony_ci dout(" new snapc is %p\n", new_snapc); 63862306a36Sopenharmony_ci } 63962306a36Sopenharmony_ci spin_unlock(&ci->i_ceph_lock); 64062306a36Sopenharmony_ci 64162306a36Sopenharmony_ci ceph_buffer_put(old_blob); 64262306a36Sopenharmony_ci ceph_put_snap_context(old_snapc); 64362306a36Sopenharmony_ci} 64462306a36Sopenharmony_ci 64562306a36Sopenharmony_ci/* 64662306a36Sopenharmony_ci * Finalize the size, mtime for a cap_snap.. that is, settle on final values 64762306a36Sopenharmony_ci * to be used for the snapshot, to be flushed back to the mds. 64862306a36Sopenharmony_ci * 64962306a36Sopenharmony_ci * If capsnap can now be flushed, add to snap_flush list, and return 1. 65062306a36Sopenharmony_ci * 65162306a36Sopenharmony_ci * Caller must hold i_ceph_lock. 65262306a36Sopenharmony_ci */ 65362306a36Sopenharmony_ciint __ceph_finish_cap_snap(struct ceph_inode_info *ci, 65462306a36Sopenharmony_ci struct ceph_cap_snap *capsnap) 65562306a36Sopenharmony_ci{ 65662306a36Sopenharmony_ci struct inode *inode = &ci->netfs.inode; 65762306a36Sopenharmony_ci struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); 65862306a36Sopenharmony_ci 65962306a36Sopenharmony_ci BUG_ON(capsnap->writing); 66062306a36Sopenharmony_ci capsnap->size = i_size_read(inode); 66162306a36Sopenharmony_ci capsnap->mtime = inode->i_mtime; 66262306a36Sopenharmony_ci capsnap->atime = inode->i_atime; 66362306a36Sopenharmony_ci capsnap->ctime = inode_get_ctime(inode); 66462306a36Sopenharmony_ci capsnap->btime = ci->i_btime; 66562306a36Sopenharmony_ci capsnap->change_attr = inode_peek_iversion_raw(inode); 66662306a36Sopenharmony_ci capsnap->time_warp_seq = ci->i_time_warp_seq; 66762306a36Sopenharmony_ci capsnap->truncate_size = ci->i_truncate_size; 66862306a36Sopenharmony_ci capsnap->truncate_seq = ci->i_truncate_seq; 66962306a36Sopenharmony_ci if (capsnap->dirty_pages) { 67062306a36Sopenharmony_ci dout("%s %p %llx.%llx cap_snap %p snapc %p %llu %s s=%llu " 67162306a36Sopenharmony_ci "still has %d dirty pages\n", __func__, inode, 67262306a36Sopenharmony_ci ceph_vinop(inode), capsnap, capsnap->context, 67362306a36Sopenharmony_ci capsnap->context->seq, ceph_cap_string(capsnap->dirty), 67462306a36Sopenharmony_ci capsnap->size, capsnap->dirty_pages); 67562306a36Sopenharmony_ci return 0; 67662306a36Sopenharmony_ci } 67762306a36Sopenharmony_ci 67862306a36Sopenharmony_ci /* 67962306a36Sopenharmony_ci * Defer flushing the capsnap if the dirty buffer not flushed yet. 68062306a36Sopenharmony_ci * And trigger to flush the buffer immediately. 68162306a36Sopenharmony_ci */ 68262306a36Sopenharmony_ci if (ci->i_wrbuffer_ref) { 68362306a36Sopenharmony_ci dout("%s %p %llx.%llx cap_snap %p snapc %p %llu %s s=%llu " 68462306a36Sopenharmony_ci "used WRBUFFER, delaying\n", __func__, inode, 68562306a36Sopenharmony_ci ceph_vinop(inode), capsnap, capsnap->context, 68662306a36Sopenharmony_ci capsnap->context->seq, ceph_cap_string(capsnap->dirty), 68762306a36Sopenharmony_ci capsnap->size); 68862306a36Sopenharmony_ci ceph_queue_writeback(inode); 68962306a36Sopenharmony_ci return 0; 69062306a36Sopenharmony_ci } 69162306a36Sopenharmony_ci 69262306a36Sopenharmony_ci ci->i_ceph_flags |= CEPH_I_FLUSH_SNAPS; 69362306a36Sopenharmony_ci dout("%s %p %llx.%llx cap_snap %p snapc %p %llu %s s=%llu\n", 69462306a36Sopenharmony_ci __func__, inode, ceph_vinop(inode), capsnap, capsnap->context, 69562306a36Sopenharmony_ci capsnap->context->seq, ceph_cap_string(capsnap->dirty), 69662306a36Sopenharmony_ci capsnap->size); 69762306a36Sopenharmony_ci 69862306a36Sopenharmony_ci spin_lock(&mdsc->snap_flush_lock); 69962306a36Sopenharmony_ci if (list_empty(&ci->i_snap_flush_item)) { 70062306a36Sopenharmony_ci ihold(inode); 70162306a36Sopenharmony_ci list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list); 70262306a36Sopenharmony_ci } 70362306a36Sopenharmony_ci spin_unlock(&mdsc->snap_flush_lock); 70462306a36Sopenharmony_ci return 1; /* caller may want to ceph_flush_snaps */ 70562306a36Sopenharmony_ci} 70662306a36Sopenharmony_ci 70762306a36Sopenharmony_ci/* 70862306a36Sopenharmony_ci * Queue cap_snaps for snap writeback for this realm and its children. 70962306a36Sopenharmony_ci * Called under snap_rwsem, so realm topology won't change. 71062306a36Sopenharmony_ci */ 71162306a36Sopenharmony_cistatic void queue_realm_cap_snaps(struct ceph_snap_realm *realm) 71262306a36Sopenharmony_ci{ 71362306a36Sopenharmony_ci struct ceph_inode_info *ci; 71462306a36Sopenharmony_ci struct inode *lastinode = NULL; 71562306a36Sopenharmony_ci struct ceph_cap_snap *capsnap = NULL; 71662306a36Sopenharmony_ci 71762306a36Sopenharmony_ci dout("%s %p %llx inode\n", __func__, realm, realm->ino); 71862306a36Sopenharmony_ci 71962306a36Sopenharmony_ci spin_lock(&realm->inodes_with_caps_lock); 72062306a36Sopenharmony_ci list_for_each_entry(ci, &realm->inodes_with_caps, i_snap_realm_item) { 72162306a36Sopenharmony_ci struct inode *inode = igrab(&ci->netfs.inode); 72262306a36Sopenharmony_ci if (!inode) 72362306a36Sopenharmony_ci continue; 72462306a36Sopenharmony_ci spin_unlock(&realm->inodes_with_caps_lock); 72562306a36Sopenharmony_ci iput(lastinode); 72662306a36Sopenharmony_ci lastinode = inode; 72762306a36Sopenharmony_ci 72862306a36Sopenharmony_ci /* 72962306a36Sopenharmony_ci * Allocate the capsnap memory outside of ceph_queue_cap_snap() 73062306a36Sopenharmony_ci * to reduce very possible but unnecessary frequently memory 73162306a36Sopenharmony_ci * allocate/free in this loop. 73262306a36Sopenharmony_ci */ 73362306a36Sopenharmony_ci if (!capsnap) { 73462306a36Sopenharmony_ci capsnap = kmem_cache_zalloc(ceph_cap_snap_cachep, GFP_NOFS); 73562306a36Sopenharmony_ci if (!capsnap) { 73662306a36Sopenharmony_ci pr_err("ENOMEM allocating ceph_cap_snap on %p\n", 73762306a36Sopenharmony_ci inode); 73862306a36Sopenharmony_ci return; 73962306a36Sopenharmony_ci } 74062306a36Sopenharmony_ci } 74162306a36Sopenharmony_ci capsnap->cap_flush.is_capsnap = true; 74262306a36Sopenharmony_ci refcount_set(&capsnap->nref, 1); 74362306a36Sopenharmony_ci INIT_LIST_HEAD(&capsnap->cap_flush.i_list); 74462306a36Sopenharmony_ci INIT_LIST_HEAD(&capsnap->cap_flush.g_list); 74562306a36Sopenharmony_ci INIT_LIST_HEAD(&capsnap->ci_item); 74662306a36Sopenharmony_ci 74762306a36Sopenharmony_ci ceph_queue_cap_snap(ci, &capsnap); 74862306a36Sopenharmony_ci spin_lock(&realm->inodes_with_caps_lock); 74962306a36Sopenharmony_ci } 75062306a36Sopenharmony_ci spin_unlock(&realm->inodes_with_caps_lock); 75162306a36Sopenharmony_ci iput(lastinode); 75262306a36Sopenharmony_ci 75362306a36Sopenharmony_ci if (capsnap) 75462306a36Sopenharmony_ci kmem_cache_free(ceph_cap_snap_cachep, capsnap); 75562306a36Sopenharmony_ci dout("%s %p %llx done\n", __func__, realm, realm->ino); 75662306a36Sopenharmony_ci} 75762306a36Sopenharmony_ci 75862306a36Sopenharmony_ci/* 75962306a36Sopenharmony_ci * Parse and apply a snapblob "snap trace" from the MDS. This specifies 76062306a36Sopenharmony_ci * the snap realm parameters from a given realm and all of its ancestors, 76162306a36Sopenharmony_ci * up to the root. 76262306a36Sopenharmony_ci * 76362306a36Sopenharmony_ci * Caller must hold snap_rwsem for write. 76462306a36Sopenharmony_ci */ 76562306a36Sopenharmony_ciint ceph_update_snap_trace(struct ceph_mds_client *mdsc, 76662306a36Sopenharmony_ci void *p, void *e, bool deletion, 76762306a36Sopenharmony_ci struct ceph_snap_realm **realm_ret) 76862306a36Sopenharmony_ci{ 76962306a36Sopenharmony_ci struct ceph_mds_snap_realm *ri; /* encoded */ 77062306a36Sopenharmony_ci __le64 *snaps; /* encoded */ 77162306a36Sopenharmony_ci __le64 *prior_parent_snaps; /* encoded */ 77262306a36Sopenharmony_ci struct ceph_snap_realm *realm; 77362306a36Sopenharmony_ci struct ceph_snap_realm *first_realm = NULL; 77462306a36Sopenharmony_ci struct ceph_snap_realm *realm_to_rebuild = NULL; 77562306a36Sopenharmony_ci struct ceph_client *client = mdsc->fsc->client; 77662306a36Sopenharmony_ci int rebuild_snapcs; 77762306a36Sopenharmony_ci int err = -ENOMEM; 77862306a36Sopenharmony_ci int ret; 77962306a36Sopenharmony_ci LIST_HEAD(dirty_realms); 78062306a36Sopenharmony_ci 78162306a36Sopenharmony_ci lockdep_assert_held_write(&mdsc->snap_rwsem); 78262306a36Sopenharmony_ci 78362306a36Sopenharmony_ci dout("%s deletion=%d\n", __func__, deletion); 78462306a36Sopenharmony_cimore: 78562306a36Sopenharmony_ci realm = NULL; 78662306a36Sopenharmony_ci rebuild_snapcs = 0; 78762306a36Sopenharmony_ci ceph_decode_need(&p, e, sizeof(*ri), bad); 78862306a36Sopenharmony_ci ri = p; 78962306a36Sopenharmony_ci p += sizeof(*ri); 79062306a36Sopenharmony_ci ceph_decode_need(&p, e, sizeof(u64)*(le32_to_cpu(ri->num_snaps) + 79162306a36Sopenharmony_ci le32_to_cpu(ri->num_prior_parent_snaps)), bad); 79262306a36Sopenharmony_ci snaps = p; 79362306a36Sopenharmony_ci p += sizeof(u64) * le32_to_cpu(ri->num_snaps); 79462306a36Sopenharmony_ci prior_parent_snaps = p; 79562306a36Sopenharmony_ci p += sizeof(u64) * le32_to_cpu(ri->num_prior_parent_snaps); 79662306a36Sopenharmony_ci 79762306a36Sopenharmony_ci realm = ceph_lookup_snap_realm(mdsc, le64_to_cpu(ri->ino)); 79862306a36Sopenharmony_ci if (!realm) { 79962306a36Sopenharmony_ci realm = ceph_create_snap_realm(mdsc, le64_to_cpu(ri->ino)); 80062306a36Sopenharmony_ci if (IS_ERR(realm)) { 80162306a36Sopenharmony_ci err = PTR_ERR(realm); 80262306a36Sopenharmony_ci goto fail; 80362306a36Sopenharmony_ci } 80462306a36Sopenharmony_ci } 80562306a36Sopenharmony_ci 80662306a36Sopenharmony_ci /* ensure the parent is correct */ 80762306a36Sopenharmony_ci err = adjust_snap_realm_parent(mdsc, realm, le64_to_cpu(ri->parent)); 80862306a36Sopenharmony_ci if (err < 0) 80962306a36Sopenharmony_ci goto fail; 81062306a36Sopenharmony_ci rebuild_snapcs += err; 81162306a36Sopenharmony_ci 81262306a36Sopenharmony_ci if (le64_to_cpu(ri->seq) > realm->seq) { 81362306a36Sopenharmony_ci dout("%s updating %llx %p %lld -> %lld\n", __func__, 81462306a36Sopenharmony_ci realm->ino, realm, realm->seq, le64_to_cpu(ri->seq)); 81562306a36Sopenharmony_ci /* update realm parameters, snap lists */ 81662306a36Sopenharmony_ci realm->seq = le64_to_cpu(ri->seq); 81762306a36Sopenharmony_ci realm->created = le64_to_cpu(ri->created); 81862306a36Sopenharmony_ci realm->parent_since = le64_to_cpu(ri->parent_since); 81962306a36Sopenharmony_ci 82062306a36Sopenharmony_ci realm->num_snaps = le32_to_cpu(ri->num_snaps); 82162306a36Sopenharmony_ci err = dup_array(&realm->snaps, snaps, realm->num_snaps); 82262306a36Sopenharmony_ci if (err < 0) 82362306a36Sopenharmony_ci goto fail; 82462306a36Sopenharmony_ci 82562306a36Sopenharmony_ci realm->num_prior_parent_snaps = 82662306a36Sopenharmony_ci le32_to_cpu(ri->num_prior_parent_snaps); 82762306a36Sopenharmony_ci err = dup_array(&realm->prior_parent_snaps, prior_parent_snaps, 82862306a36Sopenharmony_ci realm->num_prior_parent_snaps); 82962306a36Sopenharmony_ci if (err < 0) 83062306a36Sopenharmony_ci goto fail; 83162306a36Sopenharmony_ci 83262306a36Sopenharmony_ci if (realm->seq > mdsc->last_snap_seq) 83362306a36Sopenharmony_ci mdsc->last_snap_seq = realm->seq; 83462306a36Sopenharmony_ci 83562306a36Sopenharmony_ci rebuild_snapcs = 1; 83662306a36Sopenharmony_ci } else if (!realm->cached_context) { 83762306a36Sopenharmony_ci dout("%s %llx %p seq %lld new\n", __func__, 83862306a36Sopenharmony_ci realm->ino, realm, realm->seq); 83962306a36Sopenharmony_ci rebuild_snapcs = 1; 84062306a36Sopenharmony_ci } else { 84162306a36Sopenharmony_ci dout("%s %llx %p seq %lld unchanged\n", __func__, 84262306a36Sopenharmony_ci realm->ino, realm, realm->seq); 84362306a36Sopenharmony_ci } 84462306a36Sopenharmony_ci 84562306a36Sopenharmony_ci dout("done with %llx %p, rebuild_snapcs=%d, %p %p\n", realm->ino, 84662306a36Sopenharmony_ci realm, rebuild_snapcs, p, e); 84762306a36Sopenharmony_ci 84862306a36Sopenharmony_ci /* 84962306a36Sopenharmony_ci * this will always track the uppest parent realm from which 85062306a36Sopenharmony_ci * we need to rebuild the snapshot contexts _downward_ in 85162306a36Sopenharmony_ci * hierarchy. 85262306a36Sopenharmony_ci */ 85362306a36Sopenharmony_ci if (rebuild_snapcs) 85462306a36Sopenharmony_ci realm_to_rebuild = realm; 85562306a36Sopenharmony_ci 85662306a36Sopenharmony_ci /* rebuild_snapcs when we reach the _end_ (root) of the trace */ 85762306a36Sopenharmony_ci if (realm_to_rebuild && p >= e) 85862306a36Sopenharmony_ci rebuild_snap_realms(realm_to_rebuild, &dirty_realms); 85962306a36Sopenharmony_ci 86062306a36Sopenharmony_ci if (!first_realm) 86162306a36Sopenharmony_ci first_realm = realm; 86262306a36Sopenharmony_ci else 86362306a36Sopenharmony_ci ceph_put_snap_realm(mdsc, realm); 86462306a36Sopenharmony_ci 86562306a36Sopenharmony_ci if (p < e) 86662306a36Sopenharmony_ci goto more; 86762306a36Sopenharmony_ci 86862306a36Sopenharmony_ci /* 86962306a36Sopenharmony_ci * queue cap snaps _after_ we've built the new snap contexts, 87062306a36Sopenharmony_ci * so that i_head_snapc can be set appropriately. 87162306a36Sopenharmony_ci */ 87262306a36Sopenharmony_ci while (!list_empty(&dirty_realms)) { 87362306a36Sopenharmony_ci realm = list_first_entry(&dirty_realms, struct ceph_snap_realm, 87462306a36Sopenharmony_ci dirty_item); 87562306a36Sopenharmony_ci list_del_init(&realm->dirty_item); 87662306a36Sopenharmony_ci queue_realm_cap_snaps(realm); 87762306a36Sopenharmony_ci } 87862306a36Sopenharmony_ci 87962306a36Sopenharmony_ci if (realm_ret) 88062306a36Sopenharmony_ci *realm_ret = first_realm; 88162306a36Sopenharmony_ci else 88262306a36Sopenharmony_ci ceph_put_snap_realm(mdsc, first_realm); 88362306a36Sopenharmony_ci 88462306a36Sopenharmony_ci __cleanup_empty_realms(mdsc); 88562306a36Sopenharmony_ci return 0; 88662306a36Sopenharmony_ci 88762306a36Sopenharmony_cibad: 88862306a36Sopenharmony_ci err = -EIO; 88962306a36Sopenharmony_cifail: 89062306a36Sopenharmony_ci if (realm && !IS_ERR(realm)) 89162306a36Sopenharmony_ci ceph_put_snap_realm(mdsc, realm); 89262306a36Sopenharmony_ci if (first_realm) 89362306a36Sopenharmony_ci ceph_put_snap_realm(mdsc, first_realm); 89462306a36Sopenharmony_ci pr_err("%s error %d\n", __func__, err); 89562306a36Sopenharmony_ci 89662306a36Sopenharmony_ci /* 89762306a36Sopenharmony_ci * When receiving a corrupted snap trace we don't know what 89862306a36Sopenharmony_ci * exactly has happened in MDS side. And we shouldn't continue 89962306a36Sopenharmony_ci * writing to OSD, which may corrupt the snapshot contents. 90062306a36Sopenharmony_ci * 90162306a36Sopenharmony_ci * Just try to blocklist this kclient and then this kclient 90262306a36Sopenharmony_ci * must be remounted to continue after the corrupted metadata 90362306a36Sopenharmony_ci * fixed in the MDS side. 90462306a36Sopenharmony_ci */ 90562306a36Sopenharmony_ci WRITE_ONCE(mdsc->fsc->mount_state, CEPH_MOUNT_FENCE_IO); 90662306a36Sopenharmony_ci ret = ceph_monc_blocklist_add(&client->monc, &client->msgr.inst.addr); 90762306a36Sopenharmony_ci if (ret) 90862306a36Sopenharmony_ci pr_err("%s failed to blocklist %s: %d\n", __func__, 90962306a36Sopenharmony_ci ceph_pr_addr(&client->msgr.inst.addr), ret); 91062306a36Sopenharmony_ci 91162306a36Sopenharmony_ci WARN(1, "%s: %s%sdo remount to continue%s", 91262306a36Sopenharmony_ci __func__, ret ? "" : ceph_pr_addr(&client->msgr.inst.addr), 91362306a36Sopenharmony_ci ret ? "" : " was blocklisted, ", 91462306a36Sopenharmony_ci err == -EIO ? " after corrupted snaptrace is fixed" : ""); 91562306a36Sopenharmony_ci 91662306a36Sopenharmony_ci return err; 91762306a36Sopenharmony_ci} 91862306a36Sopenharmony_ci 91962306a36Sopenharmony_ci 92062306a36Sopenharmony_ci/* 92162306a36Sopenharmony_ci * Send any cap_snaps that are queued for flush. Try to carry 92262306a36Sopenharmony_ci * s_mutex across multiple snap flushes to avoid locking overhead. 92362306a36Sopenharmony_ci * 92462306a36Sopenharmony_ci * Caller holds no locks. 92562306a36Sopenharmony_ci */ 92662306a36Sopenharmony_cistatic void flush_snaps(struct ceph_mds_client *mdsc) 92762306a36Sopenharmony_ci{ 92862306a36Sopenharmony_ci struct ceph_inode_info *ci; 92962306a36Sopenharmony_ci struct inode *inode; 93062306a36Sopenharmony_ci struct ceph_mds_session *session = NULL; 93162306a36Sopenharmony_ci 93262306a36Sopenharmony_ci dout("%s\n", __func__); 93362306a36Sopenharmony_ci spin_lock(&mdsc->snap_flush_lock); 93462306a36Sopenharmony_ci while (!list_empty(&mdsc->snap_flush_list)) { 93562306a36Sopenharmony_ci ci = list_first_entry(&mdsc->snap_flush_list, 93662306a36Sopenharmony_ci struct ceph_inode_info, i_snap_flush_item); 93762306a36Sopenharmony_ci inode = &ci->netfs.inode; 93862306a36Sopenharmony_ci ihold(inode); 93962306a36Sopenharmony_ci spin_unlock(&mdsc->snap_flush_lock); 94062306a36Sopenharmony_ci ceph_flush_snaps(ci, &session); 94162306a36Sopenharmony_ci iput(inode); 94262306a36Sopenharmony_ci spin_lock(&mdsc->snap_flush_lock); 94362306a36Sopenharmony_ci } 94462306a36Sopenharmony_ci spin_unlock(&mdsc->snap_flush_lock); 94562306a36Sopenharmony_ci 94662306a36Sopenharmony_ci ceph_put_mds_session(session); 94762306a36Sopenharmony_ci dout("%s done\n", __func__); 94862306a36Sopenharmony_ci} 94962306a36Sopenharmony_ci 95062306a36Sopenharmony_ci/** 95162306a36Sopenharmony_ci * ceph_change_snap_realm - change the snap_realm for an inode 95262306a36Sopenharmony_ci * @inode: inode to move to new snap realm 95362306a36Sopenharmony_ci * @realm: new realm to move inode into (may be NULL) 95462306a36Sopenharmony_ci * 95562306a36Sopenharmony_ci * Detach an inode from its old snaprealm (if any) and attach it to 95662306a36Sopenharmony_ci * the new snaprealm (if any). The old snap realm reference held by 95762306a36Sopenharmony_ci * the inode is put. If realm is non-NULL, then the caller's reference 95862306a36Sopenharmony_ci * to it is taken over by the inode. 95962306a36Sopenharmony_ci */ 96062306a36Sopenharmony_civoid ceph_change_snap_realm(struct inode *inode, struct ceph_snap_realm *realm) 96162306a36Sopenharmony_ci{ 96262306a36Sopenharmony_ci struct ceph_inode_info *ci = ceph_inode(inode); 96362306a36Sopenharmony_ci struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; 96462306a36Sopenharmony_ci struct ceph_snap_realm *oldrealm = ci->i_snap_realm; 96562306a36Sopenharmony_ci 96662306a36Sopenharmony_ci lockdep_assert_held(&ci->i_ceph_lock); 96762306a36Sopenharmony_ci 96862306a36Sopenharmony_ci if (oldrealm) { 96962306a36Sopenharmony_ci spin_lock(&oldrealm->inodes_with_caps_lock); 97062306a36Sopenharmony_ci list_del_init(&ci->i_snap_realm_item); 97162306a36Sopenharmony_ci if (oldrealm->ino == ci->i_vino.ino) 97262306a36Sopenharmony_ci oldrealm->inode = NULL; 97362306a36Sopenharmony_ci spin_unlock(&oldrealm->inodes_with_caps_lock); 97462306a36Sopenharmony_ci ceph_put_snap_realm(mdsc, oldrealm); 97562306a36Sopenharmony_ci } 97662306a36Sopenharmony_ci 97762306a36Sopenharmony_ci ci->i_snap_realm = realm; 97862306a36Sopenharmony_ci 97962306a36Sopenharmony_ci if (realm) { 98062306a36Sopenharmony_ci spin_lock(&realm->inodes_with_caps_lock); 98162306a36Sopenharmony_ci list_add(&ci->i_snap_realm_item, &realm->inodes_with_caps); 98262306a36Sopenharmony_ci if (realm->ino == ci->i_vino.ino) 98362306a36Sopenharmony_ci realm->inode = inode; 98462306a36Sopenharmony_ci spin_unlock(&realm->inodes_with_caps_lock); 98562306a36Sopenharmony_ci } 98662306a36Sopenharmony_ci} 98762306a36Sopenharmony_ci 98862306a36Sopenharmony_ci/* 98962306a36Sopenharmony_ci * Handle a snap notification from the MDS. 99062306a36Sopenharmony_ci * 99162306a36Sopenharmony_ci * This can take two basic forms: the simplest is just a snap creation 99262306a36Sopenharmony_ci * or deletion notification on an existing realm. This should update the 99362306a36Sopenharmony_ci * realm and its children. 99462306a36Sopenharmony_ci * 99562306a36Sopenharmony_ci * The more difficult case is realm creation, due to snap creation at a 99662306a36Sopenharmony_ci * new point in the file hierarchy, or due to a rename that moves a file or 99762306a36Sopenharmony_ci * directory into another realm. 99862306a36Sopenharmony_ci */ 99962306a36Sopenharmony_civoid ceph_handle_snap(struct ceph_mds_client *mdsc, 100062306a36Sopenharmony_ci struct ceph_mds_session *session, 100162306a36Sopenharmony_ci struct ceph_msg *msg) 100262306a36Sopenharmony_ci{ 100362306a36Sopenharmony_ci struct super_block *sb = mdsc->fsc->sb; 100462306a36Sopenharmony_ci int mds = session->s_mds; 100562306a36Sopenharmony_ci u64 split; 100662306a36Sopenharmony_ci int op; 100762306a36Sopenharmony_ci int trace_len; 100862306a36Sopenharmony_ci struct ceph_snap_realm *realm = NULL; 100962306a36Sopenharmony_ci void *p = msg->front.iov_base; 101062306a36Sopenharmony_ci void *e = p + msg->front.iov_len; 101162306a36Sopenharmony_ci struct ceph_mds_snap_head *h; 101262306a36Sopenharmony_ci int num_split_inos, num_split_realms; 101362306a36Sopenharmony_ci __le64 *split_inos = NULL, *split_realms = NULL; 101462306a36Sopenharmony_ci int i; 101562306a36Sopenharmony_ci int locked_rwsem = 0; 101662306a36Sopenharmony_ci bool close_sessions = false; 101762306a36Sopenharmony_ci 101862306a36Sopenharmony_ci if (!ceph_inc_mds_stopping_blocker(mdsc, session)) 101962306a36Sopenharmony_ci return; 102062306a36Sopenharmony_ci 102162306a36Sopenharmony_ci /* decode */ 102262306a36Sopenharmony_ci if (msg->front.iov_len < sizeof(*h)) 102362306a36Sopenharmony_ci goto bad; 102462306a36Sopenharmony_ci h = p; 102562306a36Sopenharmony_ci op = le32_to_cpu(h->op); 102662306a36Sopenharmony_ci split = le64_to_cpu(h->split); /* non-zero if we are splitting an 102762306a36Sopenharmony_ci * existing realm */ 102862306a36Sopenharmony_ci num_split_inos = le32_to_cpu(h->num_split_inos); 102962306a36Sopenharmony_ci num_split_realms = le32_to_cpu(h->num_split_realms); 103062306a36Sopenharmony_ci trace_len = le32_to_cpu(h->trace_len); 103162306a36Sopenharmony_ci p += sizeof(*h); 103262306a36Sopenharmony_ci 103362306a36Sopenharmony_ci dout("%s from mds%d op %s split %llx tracelen %d\n", __func__, 103462306a36Sopenharmony_ci mds, ceph_snap_op_name(op), split, trace_len); 103562306a36Sopenharmony_ci 103662306a36Sopenharmony_ci down_write(&mdsc->snap_rwsem); 103762306a36Sopenharmony_ci locked_rwsem = 1; 103862306a36Sopenharmony_ci 103962306a36Sopenharmony_ci if (op == CEPH_SNAP_OP_SPLIT) { 104062306a36Sopenharmony_ci struct ceph_mds_snap_realm *ri; 104162306a36Sopenharmony_ci 104262306a36Sopenharmony_ci /* 104362306a36Sopenharmony_ci * A "split" breaks part of an existing realm off into 104462306a36Sopenharmony_ci * a new realm. The MDS provides a list of inodes 104562306a36Sopenharmony_ci * (with caps) and child realms that belong to the new 104662306a36Sopenharmony_ci * child. 104762306a36Sopenharmony_ci */ 104862306a36Sopenharmony_ci split_inos = p; 104962306a36Sopenharmony_ci p += sizeof(u64) * num_split_inos; 105062306a36Sopenharmony_ci split_realms = p; 105162306a36Sopenharmony_ci p += sizeof(u64) * num_split_realms; 105262306a36Sopenharmony_ci ceph_decode_need(&p, e, sizeof(*ri), bad); 105362306a36Sopenharmony_ci /* we will peek at realm info here, but will _not_ 105462306a36Sopenharmony_ci * advance p, as the realm update will occur below in 105562306a36Sopenharmony_ci * ceph_update_snap_trace. */ 105662306a36Sopenharmony_ci ri = p; 105762306a36Sopenharmony_ci 105862306a36Sopenharmony_ci realm = ceph_lookup_snap_realm(mdsc, split); 105962306a36Sopenharmony_ci if (!realm) { 106062306a36Sopenharmony_ci realm = ceph_create_snap_realm(mdsc, split); 106162306a36Sopenharmony_ci if (IS_ERR(realm)) 106262306a36Sopenharmony_ci goto out; 106362306a36Sopenharmony_ci } 106462306a36Sopenharmony_ci 106562306a36Sopenharmony_ci dout("splitting snap_realm %llx %p\n", realm->ino, realm); 106662306a36Sopenharmony_ci for (i = 0; i < num_split_inos; i++) { 106762306a36Sopenharmony_ci struct ceph_vino vino = { 106862306a36Sopenharmony_ci .ino = le64_to_cpu(split_inos[i]), 106962306a36Sopenharmony_ci .snap = CEPH_NOSNAP, 107062306a36Sopenharmony_ci }; 107162306a36Sopenharmony_ci struct inode *inode = ceph_find_inode(sb, vino); 107262306a36Sopenharmony_ci struct ceph_inode_info *ci; 107362306a36Sopenharmony_ci 107462306a36Sopenharmony_ci if (!inode) 107562306a36Sopenharmony_ci continue; 107662306a36Sopenharmony_ci ci = ceph_inode(inode); 107762306a36Sopenharmony_ci 107862306a36Sopenharmony_ci spin_lock(&ci->i_ceph_lock); 107962306a36Sopenharmony_ci if (!ci->i_snap_realm) 108062306a36Sopenharmony_ci goto skip_inode; 108162306a36Sopenharmony_ci /* 108262306a36Sopenharmony_ci * If this inode belongs to a realm that was 108362306a36Sopenharmony_ci * created after our new realm, we experienced 108462306a36Sopenharmony_ci * a race (due to another split notifications 108562306a36Sopenharmony_ci * arriving from a different MDS). So skip 108662306a36Sopenharmony_ci * this inode. 108762306a36Sopenharmony_ci */ 108862306a36Sopenharmony_ci if (ci->i_snap_realm->created > 108962306a36Sopenharmony_ci le64_to_cpu(ri->created)) { 109062306a36Sopenharmony_ci dout(" leaving %p %llx.%llx in newer realm %llx %p\n", 109162306a36Sopenharmony_ci inode, ceph_vinop(inode), ci->i_snap_realm->ino, 109262306a36Sopenharmony_ci ci->i_snap_realm); 109362306a36Sopenharmony_ci goto skip_inode; 109462306a36Sopenharmony_ci } 109562306a36Sopenharmony_ci dout(" will move %p %llx.%llx to split realm %llx %p\n", 109662306a36Sopenharmony_ci inode, ceph_vinop(inode), realm->ino, realm); 109762306a36Sopenharmony_ci 109862306a36Sopenharmony_ci ceph_get_snap_realm(mdsc, realm); 109962306a36Sopenharmony_ci ceph_change_snap_realm(inode, realm); 110062306a36Sopenharmony_ci spin_unlock(&ci->i_ceph_lock); 110162306a36Sopenharmony_ci iput(inode); 110262306a36Sopenharmony_ci continue; 110362306a36Sopenharmony_ci 110462306a36Sopenharmony_ciskip_inode: 110562306a36Sopenharmony_ci spin_unlock(&ci->i_ceph_lock); 110662306a36Sopenharmony_ci iput(inode); 110762306a36Sopenharmony_ci } 110862306a36Sopenharmony_ci 110962306a36Sopenharmony_ci /* we may have taken some of the old realm's children. */ 111062306a36Sopenharmony_ci for (i = 0; i < num_split_realms; i++) { 111162306a36Sopenharmony_ci struct ceph_snap_realm *child = 111262306a36Sopenharmony_ci __lookup_snap_realm(mdsc, 111362306a36Sopenharmony_ci le64_to_cpu(split_realms[i])); 111462306a36Sopenharmony_ci if (!child) 111562306a36Sopenharmony_ci continue; 111662306a36Sopenharmony_ci adjust_snap_realm_parent(mdsc, child, realm->ino); 111762306a36Sopenharmony_ci } 111862306a36Sopenharmony_ci } else { 111962306a36Sopenharmony_ci /* 112062306a36Sopenharmony_ci * In the non-split case both 'num_split_inos' and 112162306a36Sopenharmony_ci * 'num_split_realms' should be 0, making this a no-op. 112262306a36Sopenharmony_ci * However the MDS happens to populate 'split_realms' list 112362306a36Sopenharmony_ci * in one of the UPDATE op cases by mistake. 112462306a36Sopenharmony_ci * 112562306a36Sopenharmony_ci * Skip both lists just in case to ensure that 'p' is 112662306a36Sopenharmony_ci * positioned at the start of realm info, as expected by 112762306a36Sopenharmony_ci * ceph_update_snap_trace(). 112862306a36Sopenharmony_ci */ 112962306a36Sopenharmony_ci p += sizeof(u64) * num_split_inos; 113062306a36Sopenharmony_ci p += sizeof(u64) * num_split_realms; 113162306a36Sopenharmony_ci } 113262306a36Sopenharmony_ci 113362306a36Sopenharmony_ci /* 113462306a36Sopenharmony_ci * update using the provided snap trace. if we are deleting a 113562306a36Sopenharmony_ci * snap, we can avoid queueing cap_snaps. 113662306a36Sopenharmony_ci */ 113762306a36Sopenharmony_ci if (ceph_update_snap_trace(mdsc, p, e, 113862306a36Sopenharmony_ci op == CEPH_SNAP_OP_DESTROY, 113962306a36Sopenharmony_ci NULL)) { 114062306a36Sopenharmony_ci close_sessions = true; 114162306a36Sopenharmony_ci goto bad; 114262306a36Sopenharmony_ci } 114362306a36Sopenharmony_ci 114462306a36Sopenharmony_ci if (op == CEPH_SNAP_OP_SPLIT) 114562306a36Sopenharmony_ci /* we took a reference when we created the realm, above */ 114662306a36Sopenharmony_ci ceph_put_snap_realm(mdsc, realm); 114762306a36Sopenharmony_ci 114862306a36Sopenharmony_ci __cleanup_empty_realms(mdsc); 114962306a36Sopenharmony_ci 115062306a36Sopenharmony_ci up_write(&mdsc->snap_rwsem); 115162306a36Sopenharmony_ci 115262306a36Sopenharmony_ci flush_snaps(mdsc); 115362306a36Sopenharmony_ci ceph_dec_mds_stopping_blocker(mdsc); 115462306a36Sopenharmony_ci return; 115562306a36Sopenharmony_ci 115662306a36Sopenharmony_cibad: 115762306a36Sopenharmony_ci pr_err("%s corrupt snap message from mds%d\n", __func__, mds); 115862306a36Sopenharmony_ci ceph_msg_dump(msg); 115962306a36Sopenharmony_ciout: 116062306a36Sopenharmony_ci if (locked_rwsem) 116162306a36Sopenharmony_ci up_write(&mdsc->snap_rwsem); 116262306a36Sopenharmony_ci 116362306a36Sopenharmony_ci ceph_dec_mds_stopping_blocker(mdsc); 116462306a36Sopenharmony_ci 116562306a36Sopenharmony_ci if (close_sessions) 116662306a36Sopenharmony_ci ceph_mdsc_close_sessions(mdsc); 116762306a36Sopenharmony_ci return; 116862306a36Sopenharmony_ci} 116962306a36Sopenharmony_ci 117062306a36Sopenharmony_cistruct ceph_snapid_map* ceph_get_snapid_map(struct ceph_mds_client *mdsc, 117162306a36Sopenharmony_ci u64 snap) 117262306a36Sopenharmony_ci{ 117362306a36Sopenharmony_ci struct ceph_snapid_map *sm, *exist; 117462306a36Sopenharmony_ci struct rb_node **p, *parent; 117562306a36Sopenharmony_ci int ret; 117662306a36Sopenharmony_ci 117762306a36Sopenharmony_ci exist = NULL; 117862306a36Sopenharmony_ci spin_lock(&mdsc->snapid_map_lock); 117962306a36Sopenharmony_ci p = &mdsc->snapid_map_tree.rb_node; 118062306a36Sopenharmony_ci while (*p) { 118162306a36Sopenharmony_ci exist = rb_entry(*p, struct ceph_snapid_map, node); 118262306a36Sopenharmony_ci if (snap > exist->snap) { 118362306a36Sopenharmony_ci p = &(*p)->rb_left; 118462306a36Sopenharmony_ci } else if (snap < exist->snap) { 118562306a36Sopenharmony_ci p = &(*p)->rb_right; 118662306a36Sopenharmony_ci } else { 118762306a36Sopenharmony_ci if (atomic_inc_return(&exist->ref) == 1) 118862306a36Sopenharmony_ci list_del_init(&exist->lru); 118962306a36Sopenharmony_ci break; 119062306a36Sopenharmony_ci } 119162306a36Sopenharmony_ci exist = NULL; 119262306a36Sopenharmony_ci } 119362306a36Sopenharmony_ci spin_unlock(&mdsc->snapid_map_lock); 119462306a36Sopenharmony_ci if (exist) { 119562306a36Sopenharmony_ci dout("%s found snapid map %llx -> %x\n", __func__, 119662306a36Sopenharmony_ci exist->snap, exist->dev); 119762306a36Sopenharmony_ci return exist; 119862306a36Sopenharmony_ci } 119962306a36Sopenharmony_ci 120062306a36Sopenharmony_ci sm = kmalloc(sizeof(*sm), GFP_NOFS); 120162306a36Sopenharmony_ci if (!sm) 120262306a36Sopenharmony_ci return NULL; 120362306a36Sopenharmony_ci 120462306a36Sopenharmony_ci ret = get_anon_bdev(&sm->dev); 120562306a36Sopenharmony_ci if (ret < 0) { 120662306a36Sopenharmony_ci kfree(sm); 120762306a36Sopenharmony_ci return NULL; 120862306a36Sopenharmony_ci } 120962306a36Sopenharmony_ci 121062306a36Sopenharmony_ci INIT_LIST_HEAD(&sm->lru); 121162306a36Sopenharmony_ci atomic_set(&sm->ref, 1); 121262306a36Sopenharmony_ci sm->snap = snap; 121362306a36Sopenharmony_ci 121462306a36Sopenharmony_ci exist = NULL; 121562306a36Sopenharmony_ci parent = NULL; 121662306a36Sopenharmony_ci p = &mdsc->snapid_map_tree.rb_node; 121762306a36Sopenharmony_ci spin_lock(&mdsc->snapid_map_lock); 121862306a36Sopenharmony_ci while (*p) { 121962306a36Sopenharmony_ci parent = *p; 122062306a36Sopenharmony_ci exist = rb_entry(*p, struct ceph_snapid_map, node); 122162306a36Sopenharmony_ci if (snap > exist->snap) 122262306a36Sopenharmony_ci p = &(*p)->rb_left; 122362306a36Sopenharmony_ci else if (snap < exist->snap) 122462306a36Sopenharmony_ci p = &(*p)->rb_right; 122562306a36Sopenharmony_ci else 122662306a36Sopenharmony_ci break; 122762306a36Sopenharmony_ci exist = NULL; 122862306a36Sopenharmony_ci } 122962306a36Sopenharmony_ci if (exist) { 123062306a36Sopenharmony_ci if (atomic_inc_return(&exist->ref) == 1) 123162306a36Sopenharmony_ci list_del_init(&exist->lru); 123262306a36Sopenharmony_ci } else { 123362306a36Sopenharmony_ci rb_link_node(&sm->node, parent, p); 123462306a36Sopenharmony_ci rb_insert_color(&sm->node, &mdsc->snapid_map_tree); 123562306a36Sopenharmony_ci } 123662306a36Sopenharmony_ci spin_unlock(&mdsc->snapid_map_lock); 123762306a36Sopenharmony_ci if (exist) { 123862306a36Sopenharmony_ci free_anon_bdev(sm->dev); 123962306a36Sopenharmony_ci kfree(sm); 124062306a36Sopenharmony_ci dout("%s found snapid map %llx -> %x\n", __func__, 124162306a36Sopenharmony_ci exist->snap, exist->dev); 124262306a36Sopenharmony_ci return exist; 124362306a36Sopenharmony_ci } 124462306a36Sopenharmony_ci 124562306a36Sopenharmony_ci dout("%s create snapid map %llx -> %x\n", __func__, 124662306a36Sopenharmony_ci sm->snap, sm->dev); 124762306a36Sopenharmony_ci return sm; 124862306a36Sopenharmony_ci} 124962306a36Sopenharmony_ci 125062306a36Sopenharmony_civoid ceph_put_snapid_map(struct ceph_mds_client* mdsc, 125162306a36Sopenharmony_ci struct ceph_snapid_map *sm) 125262306a36Sopenharmony_ci{ 125362306a36Sopenharmony_ci if (!sm) 125462306a36Sopenharmony_ci return; 125562306a36Sopenharmony_ci if (atomic_dec_and_lock(&sm->ref, &mdsc->snapid_map_lock)) { 125662306a36Sopenharmony_ci if (!RB_EMPTY_NODE(&sm->node)) { 125762306a36Sopenharmony_ci sm->last_used = jiffies; 125862306a36Sopenharmony_ci list_add_tail(&sm->lru, &mdsc->snapid_map_lru); 125962306a36Sopenharmony_ci spin_unlock(&mdsc->snapid_map_lock); 126062306a36Sopenharmony_ci } else { 126162306a36Sopenharmony_ci /* already cleaned up by 126262306a36Sopenharmony_ci * ceph_cleanup_snapid_map() */ 126362306a36Sopenharmony_ci spin_unlock(&mdsc->snapid_map_lock); 126462306a36Sopenharmony_ci kfree(sm); 126562306a36Sopenharmony_ci } 126662306a36Sopenharmony_ci } 126762306a36Sopenharmony_ci} 126862306a36Sopenharmony_ci 126962306a36Sopenharmony_civoid ceph_trim_snapid_map(struct ceph_mds_client *mdsc) 127062306a36Sopenharmony_ci{ 127162306a36Sopenharmony_ci struct ceph_snapid_map *sm; 127262306a36Sopenharmony_ci unsigned long now; 127362306a36Sopenharmony_ci LIST_HEAD(to_free); 127462306a36Sopenharmony_ci 127562306a36Sopenharmony_ci spin_lock(&mdsc->snapid_map_lock); 127662306a36Sopenharmony_ci now = jiffies; 127762306a36Sopenharmony_ci 127862306a36Sopenharmony_ci while (!list_empty(&mdsc->snapid_map_lru)) { 127962306a36Sopenharmony_ci sm = list_first_entry(&mdsc->snapid_map_lru, 128062306a36Sopenharmony_ci struct ceph_snapid_map, lru); 128162306a36Sopenharmony_ci if (time_after(sm->last_used + CEPH_SNAPID_MAP_TIMEOUT, now)) 128262306a36Sopenharmony_ci break; 128362306a36Sopenharmony_ci 128462306a36Sopenharmony_ci rb_erase(&sm->node, &mdsc->snapid_map_tree); 128562306a36Sopenharmony_ci list_move(&sm->lru, &to_free); 128662306a36Sopenharmony_ci } 128762306a36Sopenharmony_ci spin_unlock(&mdsc->snapid_map_lock); 128862306a36Sopenharmony_ci 128962306a36Sopenharmony_ci while (!list_empty(&to_free)) { 129062306a36Sopenharmony_ci sm = list_first_entry(&to_free, struct ceph_snapid_map, lru); 129162306a36Sopenharmony_ci list_del(&sm->lru); 129262306a36Sopenharmony_ci dout("trim snapid map %llx -> %x\n", sm->snap, sm->dev); 129362306a36Sopenharmony_ci free_anon_bdev(sm->dev); 129462306a36Sopenharmony_ci kfree(sm); 129562306a36Sopenharmony_ci } 129662306a36Sopenharmony_ci} 129762306a36Sopenharmony_ci 129862306a36Sopenharmony_civoid ceph_cleanup_snapid_map(struct ceph_mds_client *mdsc) 129962306a36Sopenharmony_ci{ 130062306a36Sopenharmony_ci struct ceph_snapid_map *sm; 130162306a36Sopenharmony_ci struct rb_node *p; 130262306a36Sopenharmony_ci LIST_HEAD(to_free); 130362306a36Sopenharmony_ci 130462306a36Sopenharmony_ci spin_lock(&mdsc->snapid_map_lock); 130562306a36Sopenharmony_ci while ((p = rb_first(&mdsc->snapid_map_tree))) { 130662306a36Sopenharmony_ci sm = rb_entry(p, struct ceph_snapid_map, node); 130762306a36Sopenharmony_ci rb_erase(p, &mdsc->snapid_map_tree); 130862306a36Sopenharmony_ci RB_CLEAR_NODE(p); 130962306a36Sopenharmony_ci list_move(&sm->lru, &to_free); 131062306a36Sopenharmony_ci } 131162306a36Sopenharmony_ci spin_unlock(&mdsc->snapid_map_lock); 131262306a36Sopenharmony_ci 131362306a36Sopenharmony_ci while (!list_empty(&to_free)) { 131462306a36Sopenharmony_ci sm = list_first_entry(&to_free, struct ceph_snapid_map, lru); 131562306a36Sopenharmony_ci list_del(&sm->lru); 131662306a36Sopenharmony_ci free_anon_bdev(sm->dev); 131762306a36Sopenharmony_ci if (WARN_ON_ONCE(atomic_read(&sm->ref))) { 131862306a36Sopenharmony_ci pr_err("snapid map %llx -> %x still in use\n", 131962306a36Sopenharmony_ci sm->snap, sm->dev); 132062306a36Sopenharmony_ci } 132162306a36Sopenharmony_ci kfree(sm); 132262306a36Sopenharmony_ci } 132362306a36Sopenharmony_ci} 1324