162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0
262306a36Sopenharmony_ci#include <linux/ceph/ceph_debug.h>
362306a36Sopenharmony_ci
462306a36Sopenharmony_ci#include <linux/fs.h>
562306a36Sopenharmony_ci#include <linux/wait.h>
662306a36Sopenharmony_ci#include <linux/slab.h>
762306a36Sopenharmony_ci#include <linux/gfp.h>
862306a36Sopenharmony_ci#include <linux/sched.h>
962306a36Sopenharmony_ci#include <linux/debugfs.h>
1062306a36Sopenharmony_ci#include <linux/seq_file.h>
1162306a36Sopenharmony_ci#include <linux/ratelimit.h>
1262306a36Sopenharmony_ci#include <linux/bits.h>
1362306a36Sopenharmony_ci#include <linux/ktime.h>
1462306a36Sopenharmony_ci#include <linux/bitmap.h>
1562306a36Sopenharmony_ci
1662306a36Sopenharmony_ci#include "super.h"
1762306a36Sopenharmony_ci#include "mds_client.h"
1862306a36Sopenharmony_ci#include "crypto.h"
1962306a36Sopenharmony_ci
2062306a36Sopenharmony_ci#include <linux/ceph/ceph_features.h>
2162306a36Sopenharmony_ci#include <linux/ceph/messenger.h>
2262306a36Sopenharmony_ci#include <linux/ceph/decode.h>
2362306a36Sopenharmony_ci#include <linux/ceph/pagelist.h>
2462306a36Sopenharmony_ci#include <linux/ceph/auth.h>
2562306a36Sopenharmony_ci#include <linux/ceph/debugfs.h>
2662306a36Sopenharmony_ci
/*
 * NOTE(review): presumably the upper bound on the size of a reconnect
 * message; the users of this limit are outside the visible part of the
 * file -- confirm.
 */
#define RECONNECT_MAX_SIZE	(INT_MAX - PAGE_SIZE)
2862306a36Sopenharmony_ci
/*
 * A cluster of MDS (metadata server) daemons is responsible for
 * managing the file system namespace (the directory hierarchy and
 * inodes) and for coordinating shared access to storage.  Metadata is
 * partitioned hierarchically across a number of servers, and that
 * partition varies over time as the cluster adjusts the distribution
 * in order to balance load.
 *
 * The MDS client is primarily responsible for managing synchronous
 * metadata requests for operations like open, unlink, and so forth.
 * If there is an MDS failure, we find out about it when we (possibly
 * request and) receive a new MDS map, and can resubmit affected
 * requests.
 *
 * For the most part, though, we take advantage of a lossless
 * communications channel to the MDS, and do not need to worry about
 * timing out or resubmitting requests.
 *
 * We maintain a stateful "session" with each MDS we interact with.
 * Within each session, we send periodic heartbeat messages to ensure
 * any capabilities or leases we have been issued remain valid.  If
 * the session times out and goes stale, our leases and capabilities
 * are no longer valid.
 */
5362306a36Sopenharmony_ci
5462306a36Sopenharmony_cistruct ceph_reconnect_state {
5562306a36Sopenharmony_ci	struct ceph_mds_session *session;
5662306a36Sopenharmony_ci	int nr_caps, nr_realms;
5762306a36Sopenharmony_ci	struct ceph_pagelist *pagelist;
5862306a36Sopenharmony_ci	unsigned msg_version;
5962306a36Sopenharmony_ci	bool allow_multi;
6062306a36Sopenharmony_ci};
6162306a36Sopenharmony_ci
6262306a36Sopenharmony_cistatic void __wake_requests(struct ceph_mds_client *mdsc,
6362306a36Sopenharmony_ci			    struct list_head *head);
6462306a36Sopenharmony_cistatic void ceph_cap_release_work(struct work_struct *work);
6562306a36Sopenharmony_cistatic void ceph_cap_reclaim_work(struct work_struct *work);
6662306a36Sopenharmony_ci
6762306a36Sopenharmony_cistatic const struct ceph_connection_operations mds_con_ops;
6862306a36Sopenharmony_ci
6962306a36Sopenharmony_ci
7062306a36Sopenharmony_ci/*
7162306a36Sopenharmony_ci * mds reply parsing
7262306a36Sopenharmony_ci */
7362306a36Sopenharmony_ci
7462306a36Sopenharmony_cistatic int parse_reply_info_quota(void **p, void *end,
7562306a36Sopenharmony_ci				  struct ceph_mds_reply_info_in *info)
7662306a36Sopenharmony_ci{
7762306a36Sopenharmony_ci	u8 struct_v, struct_compat;
7862306a36Sopenharmony_ci	u32 struct_len;
7962306a36Sopenharmony_ci
8062306a36Sopenharmony_ci	ceph_decode_8_safe(p, end, struct_v, bad);
8162306a36Sopenharmony_ci	ceph_decode_8_safe(p, end, struct_compat, bad);
8262306a36Sopenharmony_ci	/* struct_v is expected to be >= 1. we only
8362306a36Sopenharmony_ci	 * understand encoding with struct_compat == 1. */
8462306a36Sopenharmony_ci	if (!struct_v || struct_compat != 1)
8562306a36Sopenharmony_ci		goto bad;
8662306a36Sopenharmony_ci	ceph_decode_32_safe(p, end, struct_len, bad);
8762306a36Sopenharmony_ci	ceph_decode_need(p, end, struct_len, bad);
8862306a36Sopenharmony_ci	end = *p + struct_len;
8962306a36Sopenharmony_ci	ceph_decode_64_safe(p, end, info->max_bytes, bad);
9062306a36Sopenharmony_ci	ceph_decode_64_safe(p, end, info->max_files, bad);
9162306a36Sopenharmony_ci	*p = end;
9262306a36Sopenharmony_ci	return 0;
9362306a36Sopenharmony_cibad:
9462306a36Sopenharmony_ci	return -EIO;
9562306a36Sopenharmony_ci}
9662306a36Sopenharmony_ci
9762306a36Sopenharmony_ci/*
9862306a36Sopenharmony_ci * parse individual inode info
9962306a36Sopenharmony_ci */
10062306a36Sopenharmony_cistatic int parse_reply_info_in(void **p, void *end,
10162306a36Sopenharmony_ci			       struct ceph_mds_reply_info_in *info,
10262306a36Sopenharmony_ci			       u64 features)
10362306a36Sopenharmony_ci{
10462306a36Sopenharmony_ci	int err = 0;
10562306a36Sopenharmony_ci	u8 struct_v = 0;
10662306a36Sopenharmony_ci
10762306a36Sopenharmony_ci	if (features == (u64)-1) {
10862306a36Sopenharmony_ci		u32 struct_len;
10962306a36Sopenharmony_ci		u8 struct_compat;
11062306a36Sopenharmony_ci		ceph_decode_8_safe(p, end, struct_v, bad);
11162306a36Sopenharmony_ci		ceph_decode_8_safe(p, end, struct_compat, bad);
11262306a36Sopenharmony_ci		/* struct_v is expected to be >= 1. we only understand
11362306a36Sopenharmony_ci		 * encoding with struct_compat == 1. */
11462306a36Sopenharmony_ci		if (!struct_v || struct_compat != 1)
11562306a36Sopenharmony_ci			goto bad;
11662306a36Sopenharmony_ci		ceph_decode_32_safe(p, end, struct_len, bad);
11762306a36Sopenharmony_ci		ceph_decode_need(p, end, struct_len, bad);
11862306a36Sopenharmony_ci		end = *p + struct_len;
11962306a36Sopenharmony_ci	}
12062306a36Sopenharmony_ci
12162306a36Sopenharmony_ci	ceph_decode_need(p, end, sizeof(struct ceph_mds_reply_inode), bad);
12262306a36Sopenharmony_ci	info->in = *p;
12362306a36Sopenharmony_ci	*p += sizeof(struct ceph_mds_reply_inode) +
12462306a36Sopenharmony_ci		sizeof(*info->in->fragtree.splits) *
12562306a36Sopenharmony_ci		le32_to_cpu(info->in->fragtree.nsplits);
12662306a36Sopenharmony_ci
12762306a36Sopenharmony_ci	ceph_decode_32_safe(p, end, info->symlink_len, bad);
12862306a36Sopenharmony_ci	ceph_decode_need(p, end, info->symlink_len, bad);
12962306a36Sopenharmony_ci	info->symlink = *p;
13062306a36Sopenharmony_ci	*p += info->symlink_len;
13162306a36Sopenharmony_ci
13262306a36Sopenharmony_ci	ceph_decode_copy_safe(p, end, &info->dir_layout,
13362306a36Sopenharmony_ci			      sizeof(info->dir_layout), bad);
13462306a36Sopenharmony_ci	ceph_decode_32_safe(p, end, info->xattr_len, bad);
13562306a36Sopenharmony_ci	ceph_decode_need(p, end, info->xattr_len, bad);
13662306a36Sopenharmony_ci	info->xattr_data = *p;
13762306a36Sopenharmony_ci	*p += info->xattr_len;
13862306a36Sopenharmony_ci
13962306a36Sopenharmony_ci	if (features == (u64)-1) {
14062306a36Sopenharmony_ci		/* inline data */
14162306a36Sopenharmony_ci		ceph_decode_64_safe(p, end, info->inline_version, bad);
14262306a36Sopenharmony_ci		ceph_decode_32_safe(p, end, info->inline_len, bad);
14362306a36Sopenharmony_ci		ceph_decode_need(p, end, info->inline_len, bad);
14462306a36Sopenharmony_ci		info->inline_data = *p;
14562306a36Sopenharmony_ci		*p += info->inline_len;
14662306a36Sopenharmony_ci		/* quota */
14762306a36Sopenharmony_ci		err = parse_reply_info_quota(p, end, info);
14862306a36Sopenharmony_ci		if (err < 0)
14962306a36Sopenharmony_ci			goto out_bad;
15062306a36Sopenharmony_ci		/* pool namespace */
15162306a36Sopenharmony_ci		ceph_decode_32_safe(p, end, info->pool_ns_len, bad);
15262306a36Sopenharmony_ci		if (info->pool_ns_len > 0) {
15362306a36Sopenharmony_ci			ceph_decode_need(p, end, info->pool_ns_len, bad);
15462306a36Sopenharmony_ci			info->pool_ns_data = *p;
15562306a36Sopenharmony_ci			*p += info->pool_ns_len;
15662306a36Sopenharmony_ci		}
15762306a36Sopenharmony_ci
15862306a36Sopenharmony_ci		/* btime */
15962306a36Sopenharmony_ci		ceph_decode_need(p, end, sizeof(info->btime), bad);
16062306a36Sopenharmony_ci		ceph_decode_copy(p, &info->btime, sizeof(info->btime));
16162306a36Sopenharmony_ci
16262306a36Sopenharmony_ci		/* change attribute */
16362306a36Sopenharmony_ci		ceph_decode_64_safe(p, end, info->change_attr, bad);
16462306a36Sopenharmony_ci
16562306a36Sopenharmony_ci		/* dir pin */
16662306a36Sopenharmony_ci		if (struct_v >= 2) {
16762306a36Sopenharmony_ci			ceph_decode_32_safe(p, end, info->dir_pin, bad);
16862306a36Sopenharmony_ci		} else {
16962306a36Sopenharmony_ci			info->dir_pin = -ENODATA;
17062306a36Sopenharmony_ci		}
17162306a36Sopenharmony_ci
17262306a36Sopenharmony_ci		/* snapshot birth time, remains zero for v<=2 */
17362306a36Sopenharmony_ci		if (struct_v >= 3) {
17462306a36Sopenharmony_ci			ceph_decode_need(p, end, sizeof(info->snap_btime), bad);
17562306a36Sopenharmony_ci			ceph_decode_copy(p, &info->snap_btime,
17662306a36Sopenharmony_ci					 sizeof(info->snap_btime));
17762306a36Sopenharmony_ci		} else {
17862306a36Sopenharmony_ci			memset(&info->snap_btime, 0, sizeof(info->snap_btime));
17962306a36Sopenharmony_ci		}
18062306a36Sopenharmony_ci
18162306a36Sopenharmony_ci		/* snapshot count, remains zero for v<=3 */
18262306a36Sopenharmony_ci		if (struct_v >= 4) {
18362306a36Sopenharmony_ci			ceph_decode_64_safe(p, end, info->rsnaps, bad);
18462306a36Sopenharmony_ci		} else {
18562306a36Sopenharmony_ci			info->rsnaps = 0;
18662306a36Sopenharmony_ci		}
18762306a36Sopenharmony_ci
18862306a36Sopenharmony_ci		if (struct_v >= 5) {
18962306a36Sopenharmony_ci			u32 alen;
19062306a36Sopenharmony_ci
19162306a36Sopenharmony_ci			ceph_decode_32_safe(p, end, alen, bad);
19262306a36Sopenharmony_ci
19362306a36Sopenharmony_ci			while (alen--) {
19462306a36Sopenharmony_ci				u32 len;
19562306a36Sopenharmony_ci
19662306a36Sopenharmony_ci				/* key */
19762306a36Sopenharmony_ci				ceph_decode_32_safe(p, end, len, bad);
19862306a36Sopenharmony_ci				ceph_decode_skip_n(p, end, len, bad);
19962306a36Sopenharmony_ci				/* value */
20062306a36Sopenharmony_ci				ceph_decode_32_safe(p, end, len, bad);
20162306a36Sopenharmony_ci				ceph_decode_skip_n(p, end, len, bad);
20262306a36Sopenharmony_ci			}
20362306a36Sopenharmony_ci		}
20462306a36Sopenharmony_ci
20562306a36Sopenharmony_ci		/* fscrypt flag -- ignore */
20662306a36Sopenharmony_ci		if (struct_v >= 6)
20762306a36Sopenharmony_ci			ceph_decode_skip_8(p, end, bad);
20862306a36Sopenharmony_ci
20962306a36Sopenharmony_ci		info->fscrypt_auth = NULL;
21062306a36Sopenharmony_ci		info->fscrypt_auth_len = 0;
21162306a36Sopenharmony_ci		info->fscrypt_file = NULL;
21262306a36Sopenharmony_ci		info->fscrypt_file_len = 0;
21362306a36Sopenharmony_ci		if (struct_v >= 7) {
21462306a36Sopenharmony_ci			ceph_decode_32_safe(p, end, info->fscrypt_auth_len, bad);
21562306a36Sopenharmony_ci			if (info->fscrypt_auth_len) {
21662306a36Sopenharmony_ci				info->fscrypt_auth = kmalloc(info->fscrypt_auth_len,
21762306a36Sopenharmony_ci							     GFP_KERNEL);
21862306a36Sopenharmony_ci				if (!info->fscrypt_auth)
21962306a36Sopenharmony_ci					return -ENOMEM;
22062306a36Sopenharmony_ci				ceph_decode_copy_safe(p, end, info->fscrypt_auth,
22162306a36Sopenharmony_ci						      info->fscrypt_auth_len, bad);
22262306a36Sopenharmony_ci			}
22362306a36Sopenharmony_ci			ceph_decode_32_safe(p, end, info->fscrypt_file_len, bad);
22462306a36Sopenharmony_ci			if (info->fscrypt_file_len) {
22562306a36Sopenharmony_ci				info->fscrypt_file = kmalloc(info->fscrypt_file_len,
22662306a36Sopenharmony_ci							     GFP_KERNEL);
22762306a36Sopenharmony_ci				if (!info->fscrypt_file)
22862306a36Sopenharmony_ci					return -ENOMEM;
22962306a36Sopenharmony_ci				ceph_decode_copy_safe(p, end, info->fscrypt_file,
23062306a36Sopenharmony_ci						      info->fscrypt_file_len, bad);
23162306a36Sopenharmony_ci			}
23262306a36Sopenharmony_ci		}
23362306a36Sopenharmony_ci		*p = end;
23462306a36Sopenharmony_ci	} else {
23562306a36Sopenharmony_ci		/* legacy (unversioned) struct */
23662306a36Sopenharmony_ci		if (features & CEPH_FEATURE_MDS_INLINE_DATA) {
23762306a36Sopenharmony_ci			ceph_decode_64_safe(p, end, info->inline_version, bad);
23862306a36Sopenharmony_ci			ceph_decode_32_safe(p, end, info->inline_len, bad);
23962306a36Sopenharmony_ci			ceph_decode_need(p, end, info->inline_len, bad);
24062306a36Sopenharmony_ci			info->inline_data = *p;
24162306a36Sopenharmony_ci			*p += info->inline_len;
24262306a36Sopenharmony_ci		} else
24362306a36Sopenharmony_ci			info->inline_version = CEPH_INLINE_NONE;
24462306a36Sopenharmony_ci
24562306a36Sopenharmony_ci		if (features & CEPH_FEATURE_MDS_QUOTA) {
24662306a36Sopenharmony_ci			err = parse_reply_info_quota(p, end, info);
24762306a36Sopenharmony_ci			if (err < 0)
24862306a36Sopenharmony_ci				goto out_bad;
24962306a36Sopenharmony_ci		} else {
25062306a36Sopenharmony_ci			info->max_bytes = 0;
25162306a36Sopenharmony_ci			info->max_files = 0;
25262306a36Sopenharmony_ci		}
25362306a36Sopenharmony_ci
25462306a36Sopenharmony_ci		info->pool_ns_len = 0;
25562306a36Sopenharmony_ci		info->pool_ns_data = NULL;
25662306a36Sopenharmony_ci		if (features & CEPH_FEATURE_FS_FILE_LAYOUT_V2) {
25762306a36Sopenharmony_ci			ceph_decode_32_safe(p, end, info->pool_ns_len, bad);
25862306a36Sopenharmony_ci			if (info->pool_ns_len > 0) {
25962306a36Sopenharmony_ci				ceph_decode_need(p, end, info->pool_ns_len, bad);
26062306a36Sopenharmony_ci				info->pool_ns_data = *p;
26162306a36Sopenharmony_ci				*p += info->pool_ns_len;
26262306a36Sopenharmony_ci			}
26362306a36Sopenharmony_ci		}
26462306a36Sopenharmony_ci
26562306a36Sopenharmony_ci		if (features & CEPH_FEATURE_FS_BTIME) {
26662306a36Sopenharmony_ci			ceph_decode_need(p, end, sizeof(info->btime), bad);
26762306a36Sopenharmony_ci			ceph_decode_copy(p, &info->btime, sizeof(info->btime));
26862306a36Sopenharmony_ci			ceph_decode_64_safe(p, end, info->change_attr, bad);
26962306a36Sopenharmony_ci		}
27062306a36Sopenharmony_ci
27162306a36Sopenharmony_ci		info->dir_pin = -ENODATA;
27262306a36Sopenharmony_ci		/* info->snap_btime and info->rsnaps remain zero */
27362306a36Sopenharmony_ci	}
27462306a36Sopenharmony_ci	return 0;
27562306a36Sopenharmony_cibad:
27662306a36Sopenharmony_ci	err = -EIO;
27762306a36Sopenharmony_ciout_bad:
27862306a36Sopenharmony_ci	return err;
27962306a36Sopenharmony_ci}
28062306a36Sopenharmony_ci
28162306a36Sopenharmony_cistatic int parse_reply_info_dir(void **p, void *end,
28262306a36Sopenharmony_ci				struct ceph_mds_reply_dirfrag **dirfrag,
28362306a36Sopenharmony_ci				u64 features)
28462306a36Sopenharmony_ci{
28562306a36Sopenharmony_ci	if (features == (u64)-1) {
28662306a36Sopenharmony_ci		u8 struct_v, struct_compat;
28762306a36Sopenharmony_ci		u32 struct_len;
28862306a36Sopenharmony_ci		ceph_decode_8_safe(p, end, struct_v, bad);
28962306a36Sopenharmony_ci		ceph_decode_8_safe(p, end, struct_compat, bad);
29062306a36Sopenharmony_ci		/* struct_v is expected to be >= 1. we only understand
29162306a36Sopenharmony_ci		 * encoding whose struct_compat == 1. */
29262306a36Sopenharmony_ci		if (!struct_v || struct_compat != 1)
29362306a36Sopenharmony_ci			goto bad;
29462306a36Sopenharmony_ci		ceph_decode_32_safe(p, end, struct_len, bad);
29562306a36Sopenharmony_ci		ceph_decode_need(p, end, struct_len, bad);
29662306a36Sopenharmony_ci		end = *p + struct_len;
29762306a36Sopenharmony_ci	}
29862306a36Sopenharmony_ci
29962306a36Sopenharmony_ci	ceph_decode_need(p, end, sizeof(**dirfrag), bad);
30062306a36Sopenharmony_ci	*dirfrag = *p;
30162306a36Sopenharmony_ci	*p += sizeof(**dirfrag) + sizeof(u32) * le32_to_cpu((*dirfrag)->ndist);
30262306a36Sopenharmony_ci	if (unlikely(*p > end))
30362306a36Sopenharmony_ci		goto bad;
30462306a36Sopenharmony_ci	if (features == (u64)-1)
30562306a36Sopenharmony_ci		*p = end;
30662306a36Sopenharmony_ci	return 0;
30762306a36Sopenharmony_cibad:
30862306a36Sopenharmony_ci	return -EIO;
30962306a36Sopenharmony_ci}
31062306a36Sopenharmony_ci
31162306a36Sopenharmony_cistatic int parse_reply_info_lease(void **p, void *end,
31262306a36Sopenharmony_ci				  struct ceph_mds_reply_lease **lease,
31362306a36Sopenharmony_ci				  u64 features, u32 *altname_len, u8 **altname)
31462306a36Sopenharmony_ci{
31562306a36Sopenharmony_ci	u8 struct_v;
31662306a36Sopenharmony_ci	u32 struct_len;
31762306a36Sopenharmony_ci	void *lend;
31862306a36Sopenharmony_ci
31962306a36Sopenharmony_ci	if (features == (u64)-1) {
32062306a36Sopenharmony_ci		u8 struct_compat;
32162306a36Sopenharmony_ci
32262306a36Sopenharmony_ci		ceph_decode_8_safe(p, end, struct_v, bad);
32362306a36Sopenharmony_ci		ceph_decode_8_safe(p, end, struct_compat, bad);
32462306a36Sopenharmony_ci
32562306a36Sopenharmony_ci		/* struct_v is expected to be >= 1. we only understand
32662306a36Sopenharmony_ci		 * encoding whose struct_compat == 1. */
32762306a36Sopenharmony_ci		if (!struct_v || struct_compat != 1)
32862306a36Sopenharmony_ci			goto bad;
32962306a36Sopenharmony_ci
33062306a36Sopenharmony_ci		ceph_decode_32_safe(p, end, struct_len, bad);
33162306a36Sopenharmony_ci	} else {
33262306a36Sopenharmony_ci		struct_len = sizeof(**lease);
33362306a36Sopenharmony_ci		*altname_len = 0;
33462306a36Sopenharmony_ci		*altname = NULL;
33562306a36Sopenharmony_ci	}
33662306a36Sopenharmony_ci
33762306a36Sopenharmony_ci	lend = *p + struct_len;
33862306a36Sopenharmony_ci	ceph_decode_need(p, end, struct_len, bad);
33962306a36Sopenharmony_ci	*lease = *p;
34062306a36Sopenharmony_ci	*p += sizeof(**lease);
34162306a36Sopenharmony_ci
34262306a36Sopenharmony_ci	if (features == (u64)-1) {
34362306a36Sopenharmony_ci		if (struct_v >= 2) {
34462306a36Sopenharmony_ci			ceph_decode_32_safe(p, end, *altname_len, bad);
34562306a36Sopenharmony_ci			ceph_decode_need(p, end, *altname_len, bad);
34662306a36Sopenharmony_ci			*altname = *p;
34762306a36Sopenharmony_ci			*p += *altname_len;
34862306a36Sopenharmony_ci		} else {
34962306a36Sopenharmony_ci			*altname = NULL;
35062306a36Sopenharmony_ci			*altname_len = 0;
35162306a36Sopenharmony_ci		}
35262306a36Sopenharmony_ci	}
35362306a36Sopenharmony_ci	*p = lend;
35462306a36Sopenharmony_ci	return 0;
35562306a36Sopenharmony_cibad:
35662306a36Sopenharmony_ci	return -EIO;
35762306a36Sopenharmony_ci}
35862306a36Sopenharmony_ci
35962306a36Sopenharmony_ci/*
36062306a36Sopenharmony_ci * parse a normal reply, which may contain a (dir+)dentry and/or a
36162306a36Sopenharmony_ci * target inode.
36262306a36Sopenharmony_ci */
36362306a36Sopenharmony_cistatic int parse_reply_info_trace(void **p, void *end,
36462306a36Sopenharmony_ci				  struct ceph_mds_reply_info_parsed *info,
36562306a36Sopenharmony_ci				  u64 features)
36662306a36Sopenharmony_ci{
36762306a36Sopenharmony_ci	int err;
36862306a36Sopenharmony_ci
36962306a36Sopenharmony_ci	if (info->head->is_dentry) {
37062306a36Sopenharmony_ci		err = parse_reply_info_in(p, end, &info->diri, features);
37162306a36Sopenharmony_ci		if (err < 0)
37262306a36Sopenharmony_ci			goto out_bad;
37362306a36Sopenharmony_ci
37462306a36Sopenharmony_ci		err = parse_reply_info_dir(p, end, &info->dirfrag, features);
37562306a36Sopenharmony_ci		if (err < 0)
37662306a36Sopenharmony_ci			goto out_bad;
37762306a36Sopenharmony_ci
37862306a36Sopenharmony_ci		ceph_decode_32_safe(p, end, info->dname_len, bad);
37962306a36Sopenharmony_ci		ceph_decode_need(p, end, info->dname_len, bad);
38062306a36Sopenharmony_ci		info->dname = *p;
38162306a36Sopenharmony_ci		*p += info->dname_len;
38262306a36Sopenharmony_ci
38362306a36Sopenharmony_ci		err = parse_reply_info_lease(p, end, &info->dlease, features,
38462306a36Sopenharmony_ci					     &info->altname_len, &info->altname);
38562306a36Sopenharmony_ci		if (err < 0)
38662306a36Sopenharmony_ci			goto out_bad;
38762306a36Sopenharmony_ci	}
38862306a36Sopenharmony_ci
38962306a36Sopenharmony_ci	if (info->head->is_target) {
39062306a36Sopenharmony_ci		err = parse_reply_info_in(p, end, &info->targeti, features);
39162306a36Sopenharmony_ci		if (err < 0)
39262306a36Sopenharmony_ci			goto out_bad;
39362306a36Sopenharmony_ci	}
39462306a36Sopenharmony_ci
39562306a36Sopenharmony_ci	if (unlikely(*p != end))
39662306a36Sopenharmony_ci		goto bad;
39762306a36Sopenharmony_ci	return 0;
39862306a36Sopenharmony_ci
39962306a36Sopenharmony_cibad:
40062306a36Sopenharmony_ci	err = -EIO;
40162306a36Sopenharmony_ciout_bad:
40262306a36Sopenharmony_ci	pr_err("problem parsing mds trace %d\n", err);
40362306a36Sopenharmony_ci	return err;
40462306a36Sopenharmony_ci}
40562306a36Sopenharmony_ci
40662306a36Sopenharmony_ci/*
40762306a36Sopenharmony_ci * parse readdir results
40862306a36Sopenharmony_ci */
40962306a36Sopenharmony_cistatic int parse_reply_info_readdir(void **p, void *end,
41062306a36Sopenharmony_ci				    struct ceph_mds_request *req,
41162306a36Sopenharmony_ci				    u64 features)
41262306a36Sopenharmony_ci{
41362306a36Sopenharmony_ci	struct ceph_mds_reply_info_parsed *info = &req->r_reply_info;
41462306a36Sopenharmony_ci	u32 num, i = 0;
41562306a36Sopenharmony_ci	int err;
41662306a36Sopenharmony_ci
41762306a36Sopenharmony_ci	err = parse_reply_info_dir(p, end, &info->dir_dir, features);
41862306a36Sopenharmony_ci	if (err < 0)
41962306a36Sopenharmony_ci		goto out_bad;
42062306a36Sopenharmony_ci
42162306a36Sopenharmony_ci	ceph_decode_need(p, end, sizeof(num) + 2, bad);
42262306a36Sopenharmony_ci	num = ceph_decode_32(p);
42362306a36Sopenharmony_ci	{
42462306a36Sopenharmony_ci		u16 flags = ceph_decode_16(p);
42562306a36Sopenharmony_ci		info->dir_end = !!(flags & CEPH_READDIR_FRAG_END);
42662306a36Sopenharmony_ci		info->dir_complete = !!(flags & CEPH_READDIR_FRAG_COMPLETE);
42762306a36Sopenharmony_ci		info->hash_order = !!(flags & CEPH_READDIR_HASH_ORDER);
42862306a36Sopenharmony_ci		info->offset_hash = !!(flags & CEPH_READDIR_OFFSET_HASH);
42962306a36Sopenharmony_ci	}
43062306a36Sopenharmony_ci	if (num == 0)
43162306a36Sopenharmony_ci		goto done;
43262306a36Sopenharmony_ci
43362306a36Sopenharmony_ci	BUG_ON(!info->dir_entries);
43462306a36Sopenharmony_ci	if ((unsigned long)(info->dir_entries + num) >
43562306a36Sopenharmony_ci	    (unsigned long)info->dir_entries + info->dir_buf_size) {
43662306a36Sopenharmony_ci		pr_err("dir contents are larger than expected\n");
43762306a36Sopenharmony_ci		WARN_ON(1);
43862306a36Sopenharmony_ci		goto bad;
43962306a36Sopenharmony_ci	}
44062306a36Sopenharmony_ci
44162306a36Sopenharmony_ci	info->dir_nr = num;
44262306a36Sopenharmony_ci	while (num) {
44362306a36Sopenharmony_ci		struct inode *inode = d_inode(req->r_dentry);
44462306a36Sopenharmony_ci		struct ceph_inode_info *ci = ceph_inode(inode);
44562306a36Sopenharmony_ci		struct ceph_mds_reply_dir_entry *rde = info->dir_entries + i;
44662306a36Sopenharmony_ci		struct fscrypt_str tname = FSTR_INIT(NULL, 0);
44762306a36Sopenharmony_ci		struct fscrypt_str oname = FSTR_INIT(NULL, 0);
44862306a36Sopenharmony_ci		struct ceph_fname fname;
44962306a36Sopenharmony_ci		u32 altname_len, _name_len;
45062306a36Sopenharmony_ci		u8 *altname, *_name;
45162306a36Sopenharmony_ci
45262306a36Sopenharmony_ci		/* dentry */
45362306a36Sopenharmony_ci		ceph_decode_32_safe(p, end, _name_len, bad);
45462306a36Sopenharmony_ci		ceph_decode_need(p, end, _name_len, bad);
45562306a36Sopenharmony_ci		_name = *p;
45662306a36Sopenharmony_ci		*p += _name_len;
45762306a36Sopenharmony_ci		dout("parsed dir dname '%.*s'\n", _name_len, _name);
45862306a36Sopenharmony_ci
45962306a36Sopenharmony_ci		if (info->hash_order)
46062306a36Sopenharmony_ci			rde->raw_hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash,
46162306a36Sopenharmony_ci						      _name, _name_len);
46262306a36Sopenharmony_ci
46362306a36Sopenharmony_ci		/* dentry lease */
46462306a36Sopenharmony_ci		err = parse_reply_info_lease(p, end, &rde->lease, features,
46562306a36Sopenharmony_ci					     &altname_len, &altname);
46662306a36Sopenharmony_ci		if (err)
46762306a36Sopenharmony_ci			goto out_bad;
46862306a36Sopenharmony_ci
46962306a36Sopenharmony_ci		/*
47062306a36Sopenharmony_ci		 * Try to dencrypt the dentry names and update them
47162306a36Sopenharmony_ci		 * in the ceph_mds_reply_dir_entry struct.
47262306a36Sopenharmony_ci		 */
47362306a36Sopenharmony_ci		fname.dir = inode;
47462306a36Sopenharmony_ci		fname.name = _name;
47562306a36Sopenharmony_ci		fname.name_len = _name_len;
47662306a36Sopenharmony_ci		fname.ctext = altname;
47762306a36Sopenharmony_ci		fname.ctext_len = altname_len;
47862306a36Sopenharmony_ci		/*
47962306a36Sopenharmony_ci		 * The _name_len maybe larger than altname_len, such as
48062306a36Sopenharmony_ci		 * when the human readable name length is in range of
48162306a36Sopenharmony_ci		 * (CEPH_NOHASH_NAME_MAX, CEPH_NOHASH_NAME_MAX + SHA256_DIGEST_SIZE),
48262306a36Sopenharmony_ci		 * then the copy in ceph_fname_to_usr will corrupt the
48362306a36Sopenharmony_ci		 * data if there has no encryption key.
48462306a36Sopenharmony_ci		 *
48562306a36Sopenharmony_ci		 * Just set the no_copy flag and then if there has no
48662306a36Sopenharmony_ci		 * encryption key the oname.name will be assigned to
48762306a36Sopenharmony_ci		 * _name always.
48862306a36Sopenharmony_ci		 */
48962306a36Sopenharmony_ci		fname.no_copy = true;
49062306a36Sopenharmony_ci		if (altname_len == 0) {
49162306a36Sopenharmony_ci			/*
49262306a36Sopenharmony_ci			 * Set tname to _name, and this will be used
49362306a36Sopenharmony_ci			 * to do the base64_decode in-place. It's
49462306a36Sopenharmony_ci			 * safe because the decoded string should
49562306a36Sopenharmony_ci			 * always be shorter, which is 3/4 of origin
49662306a36Sopenharmony_ci			 * string.
49762306a36Sopenharmony_ci			 */
49862306a36Sopenharmony_ci			tname.name = _name;
49962306a36Sopenharmony_ci
50062306a36Sopenharmony_ci			/*
50162306a36Sopenharmony_ci			 * Set oname to _name too, and this will be
50262306a36Sopenharmony_ci			 * used to do the dencryption in-place.
50362306a36Sopenharmony_ci			 */
50462306a36Sopenharmony_ci			oname.name = _name;
50562306a36Sopenharmony_ci			oname.len = _name_len;
50662306a36Sopenharmony_ci		} else {
50762306a36Sopenharmony_ci			/*
50862306a36Sopenharmony_ci			 * This will do the decryption only in-place
50962306a36Sopenharmony_ci			 * from altname cryptext directly.
51062306a36Sopenharmony_ci			 */
51162306a36Sopenharmony_ci			oname.name = altname;
51262306a36Sopenharmony_ci			oname.len = altname_len;
51362306a36Sopenharmony_ci		}
51462306a36Sopenharmony_ci		rde->is_nokey = false;
51562306a36Sopenharmony_ci		err = ceph_fname_to_usr(&fname, &tname, &oname, &rde->is_nokey);
51662306a36Sopenharmony_ci		if (err) {
51762306a36Sopenharmony_ci			pr_err("%s unable to decode %.*s, got %d\n", __func__,
51862306a36Sopenharmony_ci			       _name_len, _name, err);
51962306a36Sopenharmony_ci			goto out_bad;
52062306a36Sopenharmony_ci		}
52162306a36Sopenharmony_ci		rde->name = oname.name;
52262306a36Sopenharmony_ci		rde->name_len = oname.len;
52362306a36Sopenharmony_ci
52462306a36Sopenharmony_ci		/* inode */
52562306a36Sopenharmony_ci		err = parse_reply_info_in(p, end, &rde->inode, features);
52662306a36Sopenharmony_ci		if (err < 0)
52762306a36Sopenharmony_ci			goto out_bad;
52862306a36Sopenharmony_ci		/* ceph_readdir_prepopulate() will update it */
52962306a36Sopenharmony_ci		rde->offset = 0;
53062306a36Sopenharmony_ci		i++;
53162306a36Sopenharmony_ci		num--;
53262306a36Sopenharmony_ci	}
53362306a36Sopenharmony_ci
53462306a36Sopenharmony_cidone:
53562306a36Sopenharmony_ci	/* Skip over any unrecognized fields */
53662306a36Sopenharmony_ci	*p = end;
53762306a36Sopenharmony_ci	return 0;
53862306a36Sopenharmony_ci
53962306a36Sopenharmony_cibad:
54062306a36Sopenharmony_ci	err = -EIO;
54162306a36Sopenharmony_ciout_bad:
54262306a36Sopenharmony_ci	pr_err("problem parsing dir contents %d\n", err);
54362306a36Sopenharmony_ci	return err;
54462306a36Sopenharmony_ci}
54562306a36Sopenharmony_ci
54662306a36Sopenharmony_ci/*
54762306a36Sopenharmony_ci * parse fcntl F_GETLK results
54862306a36Sopenharmony_ci */
54962306a36Sopenharmony_cistatic int parse_reply_info_filelock(void **p, void *end,
55062306a36Sopenharmony_ci				     struct ceph_mds_reply_info_parsed *info,
55162306a36Sopenharmony_ci				     u64 features)
55262306a36Sopenharmony_ci{
55362306a36Sopenharmony_ci	if (*p + sizeof(*info->filelock_reply) > end)
55462306a36Sopenharmony_ci		goto bad;
55562306a36Sopenharmony_ci
55662306a36Sopenharmony_ci	info->filelock_reply = *p;
55762306a36Sopenharmony_ci
55862306a36Sopenharmony_ci	/* Skip over any unrecognized fields */
55962306a36Sopenharmony_ci	*p = end;
56062306a36Sopenharmony_ci	return 0;
56162306a36Sopenharmony_cibad:
56262306a36Sopenharmony_ci	return -EIO;
56362306a36Sopenharmony_ci}
56462306a36Sopenharmony_ci
56562306a36Sopenharmony_ci
56662306a36Sopenharmony_ci#if BITS_PER_LONG == 64
56762306a36Sopenharmony_ci
/*
 * Value stored in s_delegated_inos for every delegated inode number that
 * has not been handed out yet.
 */
#define DELEGATED_INO_AVAILABLE		xa_mk_value(1)
56962306a36Sopenharmony_ci
57062306a36Sopenharmony_cistatic int ceph_parse_deleg_inos(void **p, void *end,
57162306a36Sopenharmony_ci				 struct ceph_mds_session *s)
57262306a36Sopenharmony_ci{
57362306a36Sopenharmony_ci	u32 sets;
57462306a36Sopenharmony_ci
57562306a36Sopenharmony_ci	ceph_decode_32_safe(p, end, sets, bad);
57662306a36Sopenharmony_ci	dout("got %u sets of delegated inodes\n", sets);
57762306a36Sopenharmony_ci	while (sets--) {
57862306a36Sopenharmony_ci		u64 start, len;
57962306a36Sopenharmony_ci
58062306a36Sopenharmony_ci		ceph_decode_64_safe(p, end, start, bad);
58162306a36Sopenharmony_ci		ceph_decode_64_safe(p, end, len, bad);
58262306a36Sopenharmony_ci
58362306a36Sopenharmony_ci		/* Don't accept a delegation of system inodes */
58462306a36Sopenharmony_ci		if (start < CEPH_INO_SYSTEM_BASE) {
58562306a36Sopenharmony_ci			pr_warn_ratelimited("ceph: ignoring reserved inode range delegation (start=0x%llx len=0x%llx)\n",
58662306a36Sopenharmony_ci					start, len);
58762306a36Sopenharmony_ci			continue;
58862306a36Sopenharmony_ci		}
58962306a36Sopenharmony_ci		while (len--) {
59062306a36Sopenharmony_ci			int err = xa_insert(&s->s_delegated_inos, start++,
59162306a36Sopenharmony_ci					    DELEGATED_INO_AVAILABLE,
59262306a36Sopenharmony_ci					    GFP_KERNEL);
59362306a36Sopenharmony_ci			if (!err) {
59462306a36Sopenharmony_ci				dout("added delegated inode 0x%llx\n",
59562306a36Sopenharmony_ci				     start - 1);
59662306a36Sopenharmony_ci			} else if (err == -EBUSY) {
59762306a36Sopenharmony_ci				pr_warn("MDS delegated inode 0x%llx more than once.\n",
59862306a36Sopenharmony_ci					start - 1);
59962306a36Sopenharmony_ci			} else {
60062306a36Sopenharmony_ci				return err;
60162306a36Sopenharmony_ci			}
60262306a36Sopenharmony_ci		}
60362306a36Sopenharmony_ci	}
60462306a36Sopenharmony_ci	return 0;
60562306a36Sopenharmony_cibad:
60662306a36Sopenharmony_ci	return -EIO;
60762306a36Sopenharmony_ci}
60862306a36Sopenharmony_ci
60962306a36Sopenharmony_ciu64 ceph_get_deleg_ino(struct ceph_mds_session *s)
61062306a36Sopenharmony_ci{
61162306a36Sopenharmony_ci	unsigned long ino;
61262306a36Sopenharmony_ci	void *val;
61362306a36Sopenharmony_ci
61462306a36Sopenharmony_ci	xa_for_each(&s->s_delegated_inos, ino, val) {
61562306a36Sopenharmony_ci		val = xa_erase(&s->s_delegated_inos, ino);
61662306a36Sopenharmony_ci		if (val == DELEGATED_INO_AVAILABLE)
61762306a36Sopenharmony_ci			return ino;
61862306a36Sopenharmony_ci	}
61962306a36Sopenharmony_ci	return 0;
62062306a36Sopenharmony_ci}
62162306a36Sopenharmony_ci
62262306a36Sopenharmony_ciint ceph_restore_deleg_ino(struct ceph_mds_session *s, u64 ino)
62362306a36Sopenharmony_ci{
62462306a36Sopenharmony_ci	return xa_insert(&s->s_delegated_inos, ino, DELEGATED_INO_AVAILABLE,
62562306a36Sopenharmony_ci			 GFP_KERNEL);
62662306a36Sopenharmony_ci}
62762306a36Sopenharmony_ci#else /* BITS_PER_LONG == 64 */
62862306a36Sopenharmony_ci/*
62962306a36Sopenharmony_ci * FIXME: xarrays can't handle 64-bit indexes on a 32-bit arch. For now, just
63062306a36Sopenharmony_ci * ignore delegated_inos on 32 bit arch. Maybe eventually add xarrays for top
63162306a36Sopenharmony_ci * and bottom words?
63262306a36Sopenharmony_ci */
63362306a36Sopenharmony_cistatic int ceph_parse_deleg_inos(void **p, void *end,
63462306a36Sopenharmony_ci				 struct ceph_mds_session *s)
63562306a36Sopenharmony_ci{
63662306a36Sopenharmony_ci	u32 sets;
63762306a36Sopenharmony_ci
63862306a36Sopenharmony_ci	ceph_decode_32_safe(p, end, sets, bad);
63962306a36Sopenharmony_ci	if (sets)
64062306a36Sopenharmony_ci		ceph_decode_skip_n(p, end, sets * 2 * sizeof(__le64), bad);
64162306a36Sopenharmony_ci	return 0;
64262306a36Sopenharmony_cibad:
64362306a36Sopenharmony_ci	return -EIO;
64462306a36Sopenharmony_ci}
64562306a36Sopenharmony_ci
64662306a36Sopenharmony_ciu64 ceph_get_deleg_ino(struct ceph_mds_session *s)
64762306a36Sopenharmony_ci{
64862306a36Sopenharmony_ci	return 0;
64962306a36Sopenharmony_ci}
65062306a36Sopenharmony_ci
65162306a36Sopenharmony_ciint ceph_restore_deleg_ino(struct ceph_mds_session *s, u64 ino)
65262306a36Sopenharmony_ci{
65362306a36Sopenharmony_ci	return 0;
65462306a36Sopenharmony_ci}
65562306a36Sopenharmony_ci#endif /* BITS_PER_LONG == 64 */
65662306a36Sopenharmony_ci
65762306a36Sopenharmony_ci/*
65862306a36Sopenharmony_ci * parse create results
65962306a36Sopenharmony_ci */
66062306a36Sopenharmony_cistatic int parse_reply_info_create(void **p, void *end,
66162306a36Sopenharmony_ci				  struct ceph_mds_reply_info_parsed *info,
66262306a36Sopenharmony_ci				  u64 features, struct ceph_mds_session *s)
66362306a36Sopenharmony_ci{
66462306a36Sopenharmony_ci	int ret;
66562306a36Sopenharmony_ci
66662306a36Sopenharmony_ci	if (features == (u64)-1 ||
66762306a36Sopenharmony_ci	    (features & CEPH_FEATURE_REPLY_CREATE_INODE)) {
66862306a36Sopenharmony_ci		if (*p == end) {
66962306a36Sopenharmony_ci			/* Malformed reply? */
67062306a36Sopenharmony_ci			info->has_create_ino = false;
67162306a36Sopenharmony_ci		} else if (test_bit(CEPHFS_FEATURE_DELEG_INO, &s->s_features)) {
67262306a36Sopenharmony_ci			info->has_create_ino = true;
67362306a36Sopenharmony_ci			/* struct_v, struct_compat, and len */
67462306a36Sopenharmony_ci			ceph_decode_skip_n(p, end, 2 + sizeof(u32), bad);
67562306a36Sopenharmony_ci			ceph_decode_64_safe(p, end, info->ino, bad);
67662306a36Sopenharmony_ci			ret = ceph_parse_deleg_inos(p, end, s);
67762306a36Sopenharmony_ci			if (ret)
67862306a36Sopenharmony_ci				return ret;
67962306a36Sopenharmony_ci		} else {
68062306a36Sopenharmony_ci			/* legacy */
68162306a36Sopenharmony_ci			ceph_decode_64_safe(p, end, info->ino, bad);
68262306a36Sopenharmony_ci			info->has_create_ino = true;
68362306a36Sopenharmony_ci		}
68462306a36Sopenharmony_ci	} else {
68562306a36Sopenharmony_ci		if (*p != end)
68662306a36Sopenharmony_ci			goto bad;
68762306a36Sopenharmony_ci	}
68862306a36Sopenharmony_ci
68962306a36Sopenharmony_ci	/* Skip over any unrecognized fields */
69062306a36Sopenharmony_ci	*p = end;
69162306a36Sopenharmony_ci	return 0;
69262306a36Sopenharmony_cibad:
69362306a36Sopenharmony_ci	return -EIO;
69462306a36Sopenharmony_ci}
69562306a36Sopenharmony_ci
69662306a36Sopenharmony_cistatic int parse_reply_info_getvxattr(void **p, void *end,
69762306a36Sopenharmony_ci				      struct ceph_mds_reply_info_parsed *info,
69862306a36Sopenharmony_ci				      u64 features)
69962306a36Sopenharmony_ci{
70062306a36Sopenharmony_ci	u32 value_len;
70162306a36Sopenharmony_ci
70262306a36Sopenharmony_ci	ceph_decode_skip_8(p, end, bad); /* skip current version: 1 */
70362306a36Sopenharmony_ci	ceph_decode_skip_8(p, end, bad); /* skip first version: 1 */
70462306a36Sopenharmony_ci	ceph_decode_skip_32(p, end, bad); /* skip payload length */
70562306a36Sopenharmony_ci
70662306a36Sopenharmony_ci	ceph_decode_32_safe(p, end, value_len, bad);
70762306a36Sopenharmony_ci
70862306a36Sopenharmony_ci	if (value_len == end - *p) {
70962306a36Sopenharmony_ci	  info->xattr_info.xattr_value = *p;
71062306a36Sopenharmony_ci	  info->xattr_info.xattr_value_len = value_len;
71162306a36Sopenharmony_ci	  *p = end;
71262306a36Sopenharmony_ci	  return value_len;
71362306a36Sopenharmony_ci	}
71462306a36Sopenharmony_cibad:
71562306a36Sopenharmony_ci	return -EIO;
71662306a36Sopenharmony_ci}
71762306a36Sopenharmony_ci
71862306a36Sopenharmony_ci/*
71962306a36Sopenharmony_ci * parse extra results
72062306a36Sopenharmony_ci */
72162306a36Sopenharmony_cistatic int parse_reply_info_extra(void **p, void *end,
72262306a36Sopenharmony_ci				  struct ceph_mds_request *req,
72362306a36Sopenharmony_ci				  u64 features, struct ceph_mds_session *s)
72462306a36Sopenharmony_ci{
72562306a36Sopenharmony_ci	struct ceph_mds_reply_info_parsed *info = &req->r_reply_info;
72662306a36Sopenharmony_ci	u32 op = le32_to_cpu(info->head->op);
72762306a36Sopenharmony_ci
72862306a36Sopenharmony_ci	if (op == CEPH_MDS_OP_GETFILELOCK)
72962306a36Sopenharmony_ci		return parse_reply_info_filelock(p, end, info, features);
73062306a36Sopenharmony_ci	else if (op == CEPH_MDS_OP_READDIR || op == CEPH_MDS_OP_LSSNAP)
73162306a36Sopenharmony_ci		return parse_reply_info_readdir(p, end, req, features);
73262306a36Sopenharmony_ci	else if (op == CEPH_MDS_OP_CREATE)
73362306a36Sopenharmony_ci		return parse_reply_info_create(p, end, info, features, s);
73462306a36Sopenharmony_ci	else if (op == CEPH_MDS_OP_GETVXATTR)
73562306a36Sopenharmony_ci		return parse_reply_info_getvxattr(p, end, info, features);
73662306a36Sopenharmony_ci	else
73762306a36Sopenharmony_ci		return -EIO;
73862306a36Sopenharmony_ci}
73962306a36Sopenharmony_ci
74062306a36Sopenharmony_ci/*
74162306a36Sopenharmony_ci * parse entire mds reply
74262306a36Sopenharmony_ci */
74362306a36Sopenharmony_cistatic int parse_reply_info(struct ceph_mds_session *s, struct ceph_msg *msg,
74462306a36Sopenharmony_ci			    struct ceph_mds_request *req, u64 features)
74562306a36Sopenharmony_ci{
74662306a36Sopenharmony_ci	struct ceph_mds_reply_info_parsed *info = &req->r_reply_info;
74762306a36Sopenharmony_ci	void *p, *end;
74862306a36Sopenharmony_ci	u32 len;
74962306a36Sopenharmony_ci	int err;
75062306a36Sopenharmony_ci
75162306a36Sopenharmony_ci	info->head = msg->front.iov_base;
75262306a36Sopenharmony_ci	p = msg->front.iov_base + sizeof(struct ceph_mds_reply_head);
75362306a36Sopenharmony_ci	end = p + msg->front.iov_len - sizeof(struct ceph_mds_reply_head);
75462306a36Sopenharmony_ci
75562306a36Sopenharmony_ci	/* trace */
75662306a36Sopenharmony_ci	ceph_decode_32_safe(&p, end, len, bad);
75762306a36Sopenharmony_ci	if (len > 0) {
75862306a36Sopenharmony_ci		ceph_decode_need(&p, end, len, bad);
75962306a36Sopenharmony_ci		err = parse_reply_info_trace(&p, p+len, info, features);
76062306a36Sopenharmony_ci		if (err < 0)
76162306a36Sopenharmony_ci			goto out_bad;
76262306a36Sopenharmony_ci	}
76362306a36Sopenharmony_ci
76462306a36Sopenharmony_ci	/* extra */
76562306a36Sopenharmony_ci	ceph_decode_32_safe(&p, end, len, bad);
76662306a36Sopenharmony_ci	if (len > 0) {
76762306a36Sopenharmony_ci		ceph_decode_need(&p, end, len, bad);
76862306a36Sopenharmony_ci		err = parse_reply_info_extra(&p, p+len, req, features, s);
76962306a36Sopenharmony_ci		if (err < 0)
77062306a36Sopenharmony_ci			goto out_bad;
77162306a36Sopenharmony_ci	}
77262306a36Sopenharmony_ci
77362306a36Sopenharmony_ci	/* snap blob */
77462306a36Sopenharmony_ci	ceph_decode_32_safe(&p, end, len, bad);
77562306a36Sopenharmony_ci	info->snapblob_len = len;
77662306a36Sopenharmony_ci	info->snapblob = p;
77762306a36Sopenharmony_ci	p += len;
77862306a36Sopenharmony_ci
77962306a36Sopenharmony_ci	if (p != end)
78062306a36Sopenharmony_ci		goto bad;
78162306a36Sopenharmony_ci	return 0;
78262306a36Sopenharmony_ci
78362306a36Sopenharmony_cibad:
78462306a36Sopenharmony_ci	err = -EIO;
78562306a36Sopenharmony_ciout_bad:
78662306a36Sopenharmony_ci	pr_err("mds parse_reply err %d\n", err);
78762306a36Sopenharmony_ci	ceph_msg_dump(msg);
78862306a36Sopenharmony_ci	return err;
78962306a36Sopenharmony_ci}
79062306a36Sopenharmony_ci
79162306a36Sopenharmony_cistatic void destroy_reply_info(struct ceph_mds_reply_info_parsed *info)
79262306a36Sopenharmony_ci{
79362306a36Sopenharmony_ci	int i;
79462306a36Sopenharmony_ci
79562306a36Sopenharmony_ci	kfree(info->diri.fscrypt_auth);
79662306a36Sopenharmony_ci	kfree(info->diri.fscrypt_file);
79762306a36Sopenharmony_ci	kfree(info->targeti.fscrypt_auth);
79862306a36Sopenharmony_ci	kfree(info->targeti.fscrypt_file);
79962306a36Sopenharmony_ci	if (!info->dir_entries)
80062306a36Sopenharmony_ci		return;
80162306a36Sopenharmony_ci
80262306a36Sopenharmony_ci	for (i = 0; i < info->dir_nr; i++) {
80362306a36Sopenharmony_ci		struct ceph_mds_reply_dir_entry *rde = info->dir_entries + i;
80462306a36Sopenharmony_ci
80562306a36Sopenharmony_ci		kfree(rde->inode.fscrypt_auth);
80662306a36Sopenharmony_ci		kfree(rde->inode.fscrypt_file);
80762306a36Sopenharmony_ci	}
80862306a36Sopenharmony_ci	free_pages((unsigned long)info->dir_entries, get_order(info->dir_buf_size));
80962306a36Sopenharmony_ci}
81062306a36Sopenharmony_ci
/*
 * In the async unlink case the kclient won't wait for the first reply
 * from the MDS; it just drops all the links, unhashes the dentry and
 * then succeeds immediately.
 *
 * For any new create/link/rename, etc. requests that reuse the
 * same file names we must wait for the first reply of the inflight
 * unlink request, or the MDS may fail these following
 * requests with -EEXIST if the inflight async unlink request was
 * delayed for some reason.
 *
 * And the worst case is that a non-async openc request will
 * successfully open the file if the CDentry hasn't been unlinked yet,
 * but later the previously delayed async unlink request will remove the
 * CDentry. That means the just-created file may be deleted later
 * by accident.
 *
 * We need to wait for the inflight async unlink requests to finish
 * when creating new files/directories by using the same file names.
 */
83162306a36Sopenharmony_ciint ceph_wait_on_conflict_unlink(struct dentry *dentry)
83262306a36Sopenharmony_ci{
83362306a36Sopenharmony_ci	struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
83462306a36Sopenharmony_ci	struct dentry *pdentry = dentry->d_parent;
83562306a36Sopenharmony_ci	struct dentry *udentry, *found = NULL;
83662306a36Sopenharmony_ci	struct ceph_dentry_info *di;
83762306a36Sopenharmony_ci	struct qstr dname;
83862306a36Sopenharmony_ci	u32 hash = dentry->d_name.hash;
83962306a36Sopenharmony_ci	int err;
84062306a36Sopenharmony_ci
84162306a36Sopenharmony_ci	dname.name = dentry->d_name.name;
84262306a36Sopenharmony_ci	dname.len = dentry->d_name.len;
84362306a36Sopenharmony_ci
84462306a36Sopenharmony_ci	rcu_read_lock();
84562306a36Sopenharmony_ci	hash_for_each_possible_rcu(fsc->async_unlink_conflict, di,
84662306a36Sopenharmony_ci				   hnode, hash) {
84762306a36Sopenharmony_ci		udentry = di->dentry;
84862306a36Sopenharmony_ci
84962306a36Sopenharmony_ci		spin_lock(&udentry->d_lock);
85062306a36Sopenharmony_ci		if (udentry->d_name.hash != hash)
85162306a36Sopenharmony_ci			goto next;
85262306a36Sopenharmony_ci		if (unlikely(udentry->d_parent != pdentry))
85362306a36Sopenharmony_ci			goto next;
85462306a36Sopenharmony_ci		if (!hash_hashed(&di->hnode))
85562306a36Sopenharmony_ci			goto next;
85662306a36Sopenharmony_ci
85762306a36Sopenharmony_ci		if (!test_bit(CEPH_DENTRY_ASYNC_UNLINK_BIT, &di->flags))
85862306a36Sopenharmony_ci			pr_warn("%s dentry %p:%pd async unlink bit is not set\n",
85962306a36Sopenharmony_ci				__func__, dentry, dentry);
86062306a36Sopenharmony_ci
86162306a36Sopenharmony_ci		if (!d_same_name(udentry, pdentry, &dname))
86262306a36Sopenharmony_ci			goto next;
86362306a36Sopenharmony_ci
86462306a36Sopenharmony_ci		found = dget_dlock(udentry);
86562306a36Sopenharmony_ci		spin_unlock(&udentry->d_lock);
86662306a36Sopenharmony_ci		break;
86762306a36Sopenharmony_cinext:
86862306a36Sopenharmony_ci		spin_unlock(&udentry->d_lock);
86962306a36Sopenharmony_ci	}
87062306a36Sopenharmony_ci	rcu_read_unlock();
87162306a36Sopenharmony_ci
87262306a36Sopenharmony_ci	if (likely(!found))
87362306a36Sopenharmony_ci		return 0;
87462306a36Sopenharmony_ci
87562306a36Sopenharmony_ci	dout("%s dentry %p:%pd conflict with old %p:%pd\n", __func__,
87662306a36Sopenharmony_ci	     dentry, dentry, found, found);
87762306a36Sopenharmony_ci
87862306a36Sopenharmony_ci	err = wait_on_bit(&di->flags, CEPH_DENTRY_ASYNC_UNLINK_BIT,
87962306a36Sopenharmony_ci			  TASK_KILLABLE);
88062306a36Sopenharmony_ci	dput(found);
88162306a36Sopenharmony_ci	return err;
88262306a36Sopenharmony_ci}
88362306a36Sopenharmony_ci
88462306a36Sopenharmony_ci
88562306a36Sopenharmony_ci/*
88662306a36Sopenharmony_ci * sessions
88762306a36Sopenharmony_ci */
88862306a36Sopenharmony_ciconst char *ceph_session_state_name(int s)
88962306a36Sopenharmony_ci{
89062306a36Sopenharmony_ci	switch (s) {
89162306a36Sopenharmony_ci	case CEPH_MDS_SESSION_NEW: return "new";
89262306a36Sopenharmony_ci	case CEPH_MDS_SESSION_OPENING: return "opening";
89362306a36Sopenharmony_ci	case CEPH_MDS_SESSION_OPEN: return "open";
89462306a36Sopenharmony_ci	case CEPH_MDS_SESSION_HUNG: return "hung";
89562306a36Sopenharmony_ci	case CEPH_MDS_SESSION_CLOSING: return "closing";
89662306a36Sopenharmony_ci	case CEPH_MDS_SESSION_CLOSED: return "closed";
89762306a36Sopenharmony_ci	case CEPH_MDS_SESSION_RESTARTING: return "restarting";
89862306a36Sopenharmony_ci	case CEPH_MDS_SESSION_RECONNECTING: return "reconnecting";
89962306a36Sopenharmony_ci	case CEPH_MDS_SESSION_REJECTED: return "rejected";
90062306a36Sopenharmony_ci	default: return "???";
90162306a36Sopenharmony_ci	}
90262306a36Sopenharmony_ci}
90362306a36Sopenharmony_ci
90462306a36Sopenharmony_cistruct ceph_mds_session *ceph_get_mds_session(struct ceph_mds_session *s)
90562306a36Sopenharmony_ci{
90662306a36Sopenharmony_ci	if (refcount_inc_not_zero(&s->s_ref))
90762306a36Sopenharmony_ci		return s;
90862306a36Sopenharmony_ci	return NULL;
90962306a36Sopenharmony_ci}
91062306a36Sopenharmony_ci
91162306a36Sopenharmony_civoid ceph_put_mds_session(struct ceph_mds_session *s)
91262306a36Sopenharmony_ci{
91362306a36Sopenharmony_ci	if (IS_ERR_OR_NULL(s))
91462306a36Sopenharmony_ci		return;
91562306a36Sopenharmony_ci
91662306a36Sopenharmony_ci	if (refcount_dec_and_test(&s->s_ref)) {
91762306a36Sopenharmony_ci		if (s->s_auth.authorizer)
91862306a36Sopenharmony_ci			ceph_auth_destroy_authorizer(s->s_auth.authorizer);
91962306a36Sopenharmony_ci		WARN_ON(mutex_is_locked(&s->s_mutex));
92062306a36Sopenharmony_ci		xa_destroy(&s->s_delegated_inos);
92162306a36Sopenharmony_ci		kfree(s);
92262306a36Sopenharmony_ci	}
92362306a36Sopenharmony_ci}
92462306a36Sopenharmony_ci
92562306a36Sopenharmony_ci/*
92662306a36Sopenharmony_ci * called under mdsc->mutex
92762306a36Sopenharmony_ci */
92862306a36Sopenharmony_cistruct ceph_mds_session *__ceph_lookup_mds_session(struct ceph_mds_client *mdsc,
92962306a36Sopenharmony_ci						   int mds)
93062306a36Sopenharmony_ci{
93162306a36Sopenharmony_ci	if (mds >= mdsc->max_sessions || !mdsc->sessions[mds])
93262306a36Sopenharmony_ci		return NULL;
93362306a36Sopenharmony_ci	return ceph_get_mds_session(mdsc->sessions[mds]);
93462306a36Sopenharmony_ci}
93562306a36Sopenharmony_ci
93662306a36Sopenharmony_cistatic bool __have_session(struct ceph_mds_client *mdsc, int mds)
93762306a36Sopenharmony_ci{
93862306a36Sopenharmony_ci	if (mds >= mdsc->max_sessions || !mdsc->sessions[mds])
93962306a36Sopenharmony_ci		return false;
94062306a36Sopenharmony_ci	else
94162306a36Sopenharmony_ci		return true;
94262306a36Sopenharmony_ci}
94362306a36Sopenharmony_ci
94462306a36Sopenharmony_cistatic int __verify_registered_session(struct ceph_mds_client *mdsc,
94562306a36Sopenharmony_ci				       struct ceph_mds_session *s)
94662306a36Sopenharmony_ci{
94762306a36Sopenharmony_ci	if (s->s_mds >= mdsc->max_sessions ||
94862306a36Sopenharmony_ci	    mdsc->sessions[s->s_mds] != s)
94962306a36Sopenharmony_ci		return -ENOENT;
95062306a36Sopenharmony_ci	return 0;
95162306a36Sopenharmony_ci}
95262306a36Sopenharmony_ci
95362306a36Sopenharmony_ci/*
95462306a36Sopenharmony_ci * create+register a new session for given mds.
95562306a36Sopenharmony_ci * called under mdsc->mutex.
95662306a36Sopenharmony_ci */
95762306a36Sopenharmony_cistatic struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
95862306a36Sopenharmony_ci						 int mds)
95962306a36Sopenharmony_ci{
96062306a36Sopenharmony_ci	struct ceph_mds_session *s;
96162306a36Sopenharmony_ci
96262306a36Sopenharmony_ci	if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_FENCE_IO)
96362306a36Sopenharmony_ci		return ERR_PTR(-EIO);
96462306a36Sopenharmony_ci
96562306a36Sopenharmony_ci	if (mds >= mdsc->mdsmap->possible_max_rank)
96662306a36Sopenharmony_ci		return ERR_PTR(-EINVAL);
96762306a36Sopenharmony_ci
96862306a36Sopenharmony_ci	s = kzalloc(sizeof(*s), GFP_NOFS);
96962306a36Sopenharmony_ci	if (!s)
97062306a36Sopenharmony_ci		return ERR_PTR(-ENOMEM);
97162306a36Sopenharmony_ci
97262306a36Sopenharmony_ci	if (mds >= mdsc->max_sessions) {
97362306a36Sopenharmony_ci		int newmax = 1 << get_count_order(mds + 1);
97462306a36Sopenharmony_ci		struct ceph_mds_session **sa;
97562306a36Sopenharmony_ci
97662306a36Sopenharmony_ci		dout("%s: realloc to %d\n", __func__, newmax);
97762306a36Sopenharmony_ci		sa = kcalloc(newmax, sizeof(void *), GFP_NOFS);
97862306a36Sopenharmony_ci		if (!sa)
97962306a36Sopenharmony_ci			goto fail_realloc;
98062306a36Sopenharmony_ci		if (mdsc->sessions) {
98162306a36Sopenharmony_ci			memcpy(sa, mdsc->sessions,
98262306a36Sopenharmony_ci			       mdsc->max_sessions * sizeof(void *));
98362306a36Sopenharmony_ci			kfree(mdsc->sessions);
98462306a36Sopenharmony_ci		}
98562306a36Sopenharmony_ci		mdsc->sessions = sa;
98662306a36Sopenharmony_ci		mdsc->max_sessions = newmax;
98762306a36Sopenharmony_ci	}
98862306a36Sopenharmony_ci
98962306a36Sopenharmony_ci	dout("%s: mds%d\n", __func__, mds);
99062306a36Sopenharmony_ci	s->s_mdsc = mdsc;
99162306a36Sopenharmony_ci	s->s_mds = mds;
99262306a36Sopenharmony_ci	s->s_state = CEPH_MDS_SESSION_NEW;
99362306a36Sopenharmony_ci	mutex_init(&s->s_mutex);
99462306a36Sopenharmony_ci
99562306a36Sopenharmony_ci	ceph_con_init(&s->s_con, s, &mds_con_ops, &mdsc->fsc->client->msgr);
99662306a36Sopenharmony_ci
99762306a36Sopenharmony_ci	atomic_set(&s->s_cap_gen, 1);
99862306a36Sopenharmony_ci	s->s_cap_ttl = jiffies - 1;
99962306a36Sopenharmony_ci
100062306a36Sopenharmony_ci	spin_lock_init(&s->s_cap_lock);
100162306a36Sopenharmony_ci	INIT_LIST_HEAD(&s->s_caps);
100262306a36Sopenharmony_ci	refcount_set(&s->s_ref, 1);
100362306a36Sopenharmony_ci	INIT_LIST_HEAD(&s->s_waiting);
100462306a36Sopenharmony_ci	INIT_LIST_HEAD(&s->s_unsafe);
100562306a36Sopenharmony_ci	xa_init(&s->s_delegated_inos);
100662306a36Sopenharmony_ci	INIT_LIST_HEAD(&s->s_cap_releases);
100762306a36Sopenharmony_ci	INIT_WORK(&s->s_cap_release_work, ceph_cap_release_work);
100862306a36Sopenharmony_ci
100962306a36Sopenharmony_ci	INIT_LIST_HEAD(&s->s_cap_dirty);
101062306a36Sopenharmony_ci	INIT_LIST_HEAD(&s->s_cap_flushing);
101162306a36Sopenharmony_ci
101262306a36Sopenharmony_ci	mdsc->sessions[mds] = s;
101362306a36Sopenharmony_ci	atomic_inc(&mdsc->num_sessions);
101462306a36Sopenharmony_ci	refcount_inc(&s->s_ref);  /* one ref to sessions[], one to caller */
101562306a36Sopenharmony_ci
101662306a36Sopenharmony_ci	ceph_con_open(&s->s_con, CEPH_ENTITY_TYPE_MDS, mds,
101762306a36Sopenharmony_ci		      ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
101862306a36Sopenharmony_ci
101962306a36Sopenharmony_ci	return s;
102062306a36Sopenharmony_ci
102162306a36Sopenharmony_cifail_realloc:
102262306a36Sopenharmony_ci	kfree(s);
102362306a36Sopenharmony_ci	return ERR_PTR(-ENOMEM);
102462306a36Sopenharmony_ci}
102562306a36Sopenharmony_ci
102662306a36Sopenharmony_ci/*
102762306a36Sopenharmony_ci * called under mdsc->mutex
102862306a36Sopenharmony_ci */
102962306a36Sopenharmony_cistatic void __unregister_session(struct ceph_mds_client *mdsc,
103062306a36Sopenharmony_ci			       struct ceph_mds_session *s)
103162306a36Sopenharmony_ci{
103262306a36Sopenharmony_ci	dout("__unregister_session mds%d %p\n", s->s_mds, s);
103362306a36Sopenharmony_ci	BUG_ON(mdsc->sessions[s->s_mds] != s);
103462306a36Sopenharmony_ci	mdsc->sessions[s->s_mds] = NULL;
103562306a36Sopenharmony_ci	ceph_con_close(&s->s_con);
103662306a36Sopenharmony_ci	ceph_put_mds_session(s);
103762306a36Sopenharmony_ci	atomic_dec(&mdsc->num_sessions);
103862306a36Sopenharmony_ci}
103962306a36Sopenharmony_ci
104062306a36Sopenharmony_ci/*
104162306a36Sopenharmony_ci * drop session refs in request.
104262306a36Sopenharmony_ci *
104362306a36Sopenharmony_ci * should be last request ref, or hold mdsc->mutex
104462306a36Sopenharmony_ci */
104562306a36Sopenharmony_cistatic void put_request_session(struct ceph_mds_request *req)
104662306a36Sopenharmony_ci{
104762306a36Sopenharmony_ci	if (req->r_session) {
104862306a36Sopenharmony_ci		ceph_put_mds_session(req->r_session);
104962306a36Sopenharmony_ci		req->r_session = NULL;
105062306a36Sopenharmony_ci	}
105162306a36Sopenharmony_ci}
105262306a36Sopenharmony_ci
105362306a36Sopenharmony_civoid ceph_mdsc_iterate_sessions(struct ceph_mds_client *mdsc,
105462306a36Sopenharmony_ci				void (*cb)(struct ceph_mds_session *),
105562306a36Sopenharmony_ci				bool check_state)
105662306a36Sopenharmony_ci{
105762306a36Sopenharmony_ci	int mds;
105862306a36Sopenharmony_ci
105962306a36Sopenharmony_ci	mutex_lock(&mdsc->mutex);
106062306a36Sopenharmony_ci	for (mds = 0; mds < mdsc->max_sessions; ++mds) {
106162306a36Sopenharmony_ci		struct ceph_mds_session *s;
106262306a36Sopenharmony_ci
106362306a36Sopenharmony_ci		s = __ceph_lookup_mds_session(mdsc, mds);
106462306a36Sopenharmony_ci		if (!s)
106562306a36Sopenharmony_ci			continue;
106662306a36Sopenharmony_ci
106762306a36Sopenharmony_ci		if (check_state && !check_session_state(s)) {
106862306a36Sopenharmony_ci			ceph_put_mds_session(s);
106962306a36Sopenharmony_ci			continue;
107062306a36Sopenharmony_ci		}
107162306a36Sopenharmony_ci
107262306a36Sopenharmony_ci		mutex_unlock(&mdsc->mutex);
107362306a36Sopenharmony_ci		cb(s);
107462306a36Sopenharmony_ci		ceph_put_mds_session(s);
107562306a36Sopenharmony_ci		mutex_lock(&mdsc->mutex);
107662306a36Sopenharmony_ci	}
107762306a36Sopenharmony_ci	mutex_unlock(&mdsc->mutex);
107862306a36Sopenharmony_ci}
107962306a36Sopenharmony_ci
108062306a36Sopenharmony_civoid ceph_mdsc_release_request(struct kref *kref)
108162306a36Sopenharmony_ci{
108262306a36Sopenharmony_ci	struct ceph_mds_request *req = container_of(kref,
108362306a36Sopenharmony_ci						    struct ceph_mds_request,
108462306a36Sopenharmony_ci						    r_kref);
108562306a36Sopenharmony_ci	ceph_mdsc_release_dir_caps_no_check(req);
108662306a36Sopenharmony_ci	destroy_reply_info(&req->r_reply_info);
108762306a36Sopenharmony_ci	if (req->r_request)
108862306a36Sopenharmony_ci		ceph_msg_put(req->r_request);
108962306a36Sopenharmony_ci	if (req->r_reply)
109062306a36Sopenharmony_ci		ceph_msg_put(req->r_reply);
109162306a36Sopenharmony_ci	if (req->r_inode) {
109262306a36Sopenharmony_ci		ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
109362306a36Sopenharmony_ci		iput(req->r_inode);
109462306a36Sopenharmony_ci	}
109562306a36Sopenharmony_ci	if (req->r_parent) {
109662306a36Sopenharmony_ci		ceph_put_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN);
109762306a36Sopenharmony_ci		iput(req->r_parent);
109862306a36Sopenharmony_ci	}
109962306a36Sopenharmony_ci	iput(req->r_target_inode);
110062306a36Sopenharmony_ci	iput(req->r_new_inode);
110162306a36Sopenharmony_ci	if (req->r_dentry)
110262306a36Sopenharmony_ci		dput(req->r_dentry);
110362306a36Sopenharmony_ci	if (req->r_old_dentry)
110462306a36Sopenharmony_ci		dput(req->r_old_dentry);
110562306a36Sopenharmony_ci	if (req->r_old_dentry_dir) {
110662306a36Sopenharmony_ci		/*
110762306a36Sopenharmony_ci		 * track (and drop pins for) r_old_dentry_dir
110862306a36Sopenharmony_ci		 * separately, since r_old_dentry's d_parent may have
110962306a36Sopenharmony_ci		 * changed between the dir mutex being dropped and
111062306a36Sopenharmony_ci		 * this request being freed.
111162306a36Sopenharmony_ci		 */
111262306a36Sopenharmony_ci		ceph_put_cap_refs(ceph_inode(req->r_old_dentry_dir),
111362306a36Sopenharmony_ci				  CEPH_CAP_PIN);
111462306a36Sopenharmony_ci		iput(req->r_old_dentry_dir);
111562306a36Sopenharmony_ci	}
111662306a36Sopenharmony_ci	kfree(req->r_path1);
111762306a36Sopenharmony_ci	kfree(req->r_path2);
111862306a36Sopenharmony_ci	put_cred(req->r_cred);
111962306a36Sopenharmony_ci	if (req->r_pagelist)
112062306a36Sopenharmony_ci		ceph_pagelist_release(req->r_pagelist);
112162306a36Sopenharmony_ci	kfree(req->r_fscrypt_auth);
112262306a36Sopenharmony_ci	kfree(req->r_altname);
112362306a36Sopenharmony_ci	put_request_session(req);
112462306a36Sopenharmony_ci	ceph_unreserve_caps(req->r_mdsc, &req->r_caps_reservation);
112562306a36Sopenharmony_ci	WARN_ON_ONCE(!list_empty(&req->r_wait));
112662306a36Sopenharmony_ci	kmem_cache_free(ceph_mds_request_cachep, req);
112762306a36Sopenharmony_ci}
112862306a36Sopenharmony_ci
/*
 * rbtree helpers for struct ceph_mds_request, keyed by r_tid and linked
 * through r_node.  Presumably this is what provides the
 * insert_request()/erase_request()/lookup_request() helpers used in this
 * file -- confirm against the DEFINE_RB_FUNCS definition.
 */
112962306a36Sopenharmony_ciDEFINE_RB_FUNCS(request, struct ceph_mds_request, r_tid, r_node)
113062306a36Sopenharmony_ci
/*
 * lookup request, bump ref if found.
 *
 * called under mdsc->mutex.
 */
113662306a36Sopenharmony_cistatic struct ceph_mds_request *
113762306a36Sopenharmony_cilookup_get_request(struct ceph_mds_client *mdsc, u64 tid)
113862306a36Sopenharmony_ci{
113962306a36Sopenharmony_ci	struct ceph_mds_request *req;
114062306a36Sopenharmony_ci
114162306a36Sopenharmony_ci	req = lookup_request(&mdsc->request_tree, tid);
114262306a36Sopenharmony_ci	if (req)
114362306a36Sopenharmony_ci		ceph_mdsc_get_request(req);
114462306a36Sopenharmony_ci
114562306a36Sopenharmony_ci	return req;
114662306a36Sopenharmony_ci}
114762306a36Sopenharmony_ci
/*
 * Register an in-flight request, and assign a tid.  Link it to the
 * directory we are modifying (if any).
 *
 * Called under mdsc->mutex.
 */
115462306a36Sopenharmony_cistatic void __register_request(struct ceph_mds_client *mdsc,
115562306a36Sopenharmony_ci			       struct ceph_mds_request *req,
115662306a36Sopenharmony_ci			       struct inode *dir)
115762306a36Sopenharmony_ci{
115862306a36Sopenharmony_ci	int ret = 0;
115962306a36Sopenharmony_ci
116062306a36Sopenharmony_ci	req->r_tid = ++mdsc->last_tid;
116162306a36Sopenharmony_ci	if (req->r_num_caps) {
116262306a36Sopenharmony_ci		ret = ceph_reserve_caps(mdsc, &req->r_caps_reservation,
116362306a36Sopenharmony_ci					req->r_num_caps);
116462306a36Sopenharmony_ci		if (ret < 0) {
116562306a36Sopenharmony_ci			pr_err("__register_request %p "
116662306a36Sopenharmony_ci			       "failed to reserve caps: %d\n", req, ret);
116762306a36Sopenharmony_ci			/* set req->r_err to fail early from __do_request */
116862306a36Sopenharmony_ci			req->r_err = ret;
116962306a36Sopenharmony_ci			return;
117062306a36Sopenharmony_ci		}
117162306a36Sopenharmony_ci	}
117262306a36Sopenharmony_ci	dout("__register_request %p tid %lld\n", req, req->r_tid);
117362306a36Sopenharmony_ci	ceph_mdsc_get_request(req);
117462306a36Sopenharmony_ci	insert_request(&mdsc->request_tree, req);
117562306a36Sopenharmony_ci
117662306a36Sopenharmony_ci	req->r_cred = get_current_cred();
117762306a36Sopenharmony_ci
117862306a36Sopenharmony_ci	if (mdsc->oldest_tid == 0 && req->r_op != CEPH_MDS_OP_SETFILELOCK)
117962306a36Sopenharmony_ci		mdsc->oldest_tid = req->r_tid;
118062306a36Sopenharmony_ci
118162306a36Sopenharmony_ci	if (dir) {
118262306a36Sopenharmony_ci		struct ceph_inode_info *ci = ceph_inode(dir);
118362306a36Sopenharmony_ci
118462306a36Sopenharmony_ci		ihold(dir);
118562306a36Sopenharmony_ci		req->r_unsafe_dir = dir;
118662306a36Sopenharmony_ci		spin_lock(&ci->i_unsafe_lock);
118762306a36Sopenharmony_ci		list_add_tail(&req->r_unsafe_dir_item, &ci->i_unsafe_dirops);
118862306a36Sopenharmony_ci		spin_unlock(&ci->i_unsafe_lock);
118962306a36Sopenharmony_ci	}
119062306a36Sopenharmony_ci}
119162306a36Sopenharmony_ci
119262306a36Sopenharmony_cistatic void __unregister_request(struct ceph_mds_client *mdsc,
119362306a36Sopenharmony_ci				 struct ceph_mds_request *req)
119462306a36Sopenharmony_ci{
119562306a36Sopenharmony_ci	dout("__unregister_request %p tid %lld\n", req, req->r_tid);
119662306a36Sopenharmony_ci
119762306a36Sopenharmony_ci	/* Never leave an unregistered request on an unsafe list! */
119862306a36Sopenharmony_ci	list_del_init(&req->r_unsafe_item);
119962306a36Sopenharmony_ci
120062306a36Sopenharmony_ci	if (req->r_tid == mdsc->oldest_tid) {
120162306a36Sopenharmony_ci		struct rb_node *p = rb_next(&req->r_node);
120262306a36Sopenharmony_ci		mdsc->oldest_tid = 0;
120362306a36Sopenharmony_ci		while (p) {
120462306a36Sopenharmony_ci			struct ceph_mds_request *next_req =
120562306a36Sopenharmony_ci				rb_entry(p, struct ceph_mds_request, r_node);
120662306a36Sopenharmony_ci			if (next_req->r_op != CEPH_MDS_OP_SETFILELOCK) {
120762306a36Sopenharmony_ci				mdsc->oldest_tid = next_req->r_tid;
120862306a36Sopenharmony_ci				break;
120962306a36Sopenharmony_ci			}
121062306a36Sopenharmony_ci			p = rb_next(p);
121162306a36Sopenharmony_ci		}
121262306a36Sopenharmony_ci	}
121362306a36Sopenharmony_ci
121462306a36Sopenharmony_ci	erase_request(&mdsc->request_tree, req);
121562306a36Sopenharmony_ci
121662306a36Sopenharmony_ci	if (req->r_unsafe_dir) {
121762306a36Sopenharmony_ci		struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir);
121862306a36Sopenharmony_ci		spin_lock(&ci->i_unsafe_lock);
121962306a36Sopenharmony_ci		list_del_init(&req->r_unsafe_dir_item);
122062306a36Sopenharmony_ci		spin_unlock(&ci->i_unsafe_lock);
122162306a36Sopenharmony_ci	}
122262306a36Sopenharmony_ci	if (req->r_target_inode &&
122362306a36Sopenharmony_ci	    test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
122462306a36Sopenharmony_ci		struct ceph_inode_info *ci = ceph_inode(req->r_target_inode);
122562306a36Sopenharmony_ci		spin_lock(&ci->i_unsafe_lock);
122662306a36Sopenharmony_ci		list_del_init(&req->r_unsafe_target_item);
122762306a36Sopenharmony_ci		spin_unlock(&ci->i_unsafe_lock);
122862306a36Sopenharmony_ci	}
122962306a36Sopenharmony_ci
123062306a36Sopenharmony_ci	if (req->r_unsafe_dir) {
123162306a36Sopenharmony_ci		iput(req->r_unsafe_dir);
123262306a36Sopenharmony_ci		req->r_unsafe_dir = NULL;
123362306a36Sopenharmony_ci	}
123462306a36Sopenharmony_ci
123562306a36Sopenharmony_ci	complete_all(&req->r_safe_completion);
123662306a36Sopenharmony_ci
123762306a36Sopenharmony_ci	ceph_mdsc_put_request(req);
123862306a36Sopenharmony_ci}
123962306a36Sopenharmony_ci
124062306a36Sopenharmony_ci/*
124162306a36Sopenharmony_ci * Walk back up the dentry tree until we hit a dentry representing a
124262306a36Sopenharmony_ci * non-snapshot inode. We do this using the rcu_read_lock (which must be held
124362306a36Sopenharmony_ci * when calling this) to ensure that the objects won't disappear while we're
124462306a36Sopenharmony_ci * working with them. Once we hit a candidate dentry, we attempt to take a
124562306a36Sopenharmony_ci * reference to it, and return that as the result.
124662306a36Sopenharmony_ci */
124762306a36Sopenharmony_cistatic struct inode *get_nonsnap_parent(struct dentry *dentry)
124862306a36Sopenharmony_ci{
124962306a36Sopenharmony_ci	struct inode *inode = NULL;
125062306a36Sopenharmony_ci
125162306a36Sopenharmony_ci	while (dentry && !IS_ROOT(dentry)) {
125262306a36Sopenharmony_ci		inode = d_inode_rcu(dentry);
125362306a36Sopenharmony_ci		if (!inode || ceph_snap(inode) == CEPH_NOSNAP)
125462306a36Sopenharmony_ci			break;
125562306a36Sopenharmony_ci		dentry = dentry->d_parent;
125662306a36Sopenharmony_ci	}
125762306a36Sopenharmony_ci	if (inode)
125862306a36Sopenharmony_ci		inode = igrab(inode);
125962306a36Sopenharmony_ci	return inode;
126062306a36Sopenharmony_ci}
126162306a36Sopenharmony_ci
126262306a36Sopenharmony_ci/*
126362306a36Sopenharmony_ci * Choose mds to send request to next.  If there is a hint set in the
126462306a36Sopenharmony_ci * request (e.g., due to a prior forward hint from the mds), use that.
126562306a36Sopenharmony_ci * Otherwise, consult frag tree and/or caps to identify the
126662306a36Sopenharmony_ci * appropriate mds.  If all else fails, choose randomly.
126762306a36Sopenharmony_ci *
126862306a36Sopenharmony_ci * Called under mdsc->mutex.
126962306a36Sopenharmony_ci */
127062306a36Sopenharmony_cistatic int __choose_mds(struct ceph_mds_client *mdsc,
127162306a36Sopenharmony_ci			struct ceph_mds_request *req,
127262306a36Sopenharmony_ci			bool *random)
127362306a36Sopenharmony_ci{
127462306a36Sopenharmony_ci	struct inode *inode;
127562306a36Sopenharmony_ci	struct ceph_inode_info *ci;
127662306a36Sopenharmony_ci	struct ceph_cap *cap;
127762306a36Sopenharmony_ci	int mode = req->r_direct_mode;
127862306a36Sopenharmony_ci	int mds = -1;
127962306a36Sopenharmony_ci	u32 hash = req->r_direct_hash;
128062306a36Sopenharmony_ci	bool is_hash = test_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags);
128162306a36Sopenharmony_ci
128262306a36Sopenharmony_ci	if (random)
128362306a36Sopenharmony_ci		*random = false;
128462306a36Sopenharmony_ci
128562306a36Sopenharmony_ci	/*
128662306a36Sopenharmony_ci	 * is there a specific mds we should try?  ignore hint if we have
128762306a36Sopenharmony_ci	 * no session and the mds is not up (active or recovering).
128862306a36Sopenharmony_ci	 */
128962306a36Sopenharmony_ci	if (req->r_resend_mds >= 0 &&
129062306a36Sopenharmony_ci	    (__have_session(mdsc, req->r_resend_mds) ||
129162306a36Sopenharmony_ci	     ceph_mdsmap_get_state(mdsc->mdsmap, req->r_resend_mds) > 0)) {
129262306a36Sopenharmony_ci		dout("%s using resend_mds mds%d\n", __func__,
129362306a36Sopenharmony_ci		     req->r_resend_mds);
129462306a36Sopenharmony_ci		return req->r_resend_mds;
129562306a36Sopenharmony_ci	}
129662306a36Sopenharmony_ci
129762306a36Sopenharmony_ci	if (mode == USE_RANDOM_MDS)
129862306a36Sopenharmony_ci		goto random;
129962306a36Sopenharmony_ci
130062306a36Sopenharmony_ci	inode = NULL;
130162306a36Sopenharmony_ci	if (req->r_inode) {
130262306a36Sopenharmony_ci		if (ceph_snap(req->r_inode) != CEPH_SNAPDIR) {
130362306a36Sopenharmony_ci			inode = req->r_inode;
130462306a36Sopenharmony_ci			ihold(inode);
130562306a36Sopenharmony_ci		} else {
130662306a36Sopenharmony_ci			/* req->r_dentry is non-null for LSSNAP request */
130762306a36Sopenharmony_ci			rcu_read_lock();
130862306a36Sopenharmony_ci			inode = get_nonsnap_parent(req->r_dentry);
130962306a36Sopenharmony_ci			rcu_read_unlock();
131062306a36Sopenharmony_ci			dout("%s using snapdir's parent %p\n", __func__, inode);
131162306a36Sopenharmony_ci		}
131262306a36Sopenharmony_ci	} else if (req->r_dentry) {
131362306a36Sopenharmony_ci		/* ignore race with rename; old or new d_parent is okay */
131462306a36Sopenharmony_ci		struct dentry *parent;
131562306a36Sopenharmony_ci		struct inode *dir;
131662306a36Sopenharmony_ci
131762306a36Sopenharmony_ci		rcu_read_lock();
131862306a36Sopenharmony_ci		parent = READ_ONCE(req->r_dentry->d_parent);
131962306a36Sopenharmony_ci		dir = req->r_parent ? : d_inode_rcu(parent);
132062306a36Sopenharmony_ci
132162306a36Sopenharmony_ci		if (!dir || dir->i_sb != mdsc->fsc->sb) {
132262306a36Sopenharmony_ci			/*  not this fs or parent went negative */
132362306a36Sopenharmony_ci			inode = d_inode(req->r_dentry);
132462306a36Sopenharmony_ci			if (inode)
132562306a36Sopenharmony_ci				ihold(inode);
132662306a36Sopenharmony_ci		} else if (ceph_snap(dir) != CEPH_NOSNAP) {
132762306a36Sopenharmony_ci			/* direct snapped/virtual snapdir requests
132862306a36Sopenharmony_ci			 * based on parent dir inode */
132962306a36Sopenharmony_ci			inode = get_nonsnap_parent(parent);
133062306a36Sopenharmony_ci			dout("%s using nonsnap parent %p\n", __func__, inode);
133162306a36Sopenharmony_ci		} else {
133262306a36Sopenharmony_ci			/* dentry target */
133362306a36Sopenharmony_ci			inode = d_inode(req->r_dentry);
133462306a36Sopenharmony_ci			if (!inode || mode == USE_AUTH_MDS) {
133562306a36Sopenharmony_ci				/* dir + name */
133662306a36Sopenharmony_ci				inode = igrab(dir);
133762306a36Sopenharmony_ci				hash = ceph_dentry_hash(dir, req->r_dentry);
133862306a36Sopenharmony_ci				is_hash = true;
133962306a36Sopenharmony_ci			} else {
134062306a36Sopenharmony_ci				ihold(inode);
134162306a36Sopenharmony_ci			}
134262306a36Sopenharmony_ci		}
134362306a36Sopenharmony_ci		rcu_read_unlock();
134462306a36Sopenharmony_ci	}
134562306a36Sopenharmony_ci
134662306a36Sopenharmony_ci	dout("%s %p is_hash=%d (0x%x) mode %d\n", __func__, inode, (int)is_hash,
134762306a36Sopenharmony_ci	     hash, mode);
134862306a36Sopenharmony_ci	if (!inode)
134962306a36Sopenharmony_ci		goto random;
135062306a36Sopenharmony_ci	ci = ceph_inode(inode);
135162306a36Sopenharmony_ci
135262306a36Sopenharmony_ci	if (is_hash && S_ISDIR(inode->i_mode)) {
135362306a36Sopenharmony_ci		struct ceph_inode_frag frag;
135462306a36Sopenharmony_ci		int found;
135562306a36Sopenharmony_ci
135662306a36Sopenharmony_ci		ceph_choose_frag(ci, hash, &frag, &found);
135762306a36Sopenharmony_ci		if (found) {
135862306a36Sopenharmony_ci			if (mode == USE_ANY_MDS && frag.ndist > 0) {
135962306a36Sopenharmony_ci				u8 r;
136062306a36Sopenharmony_ci
136162306a36Sopenharmony_ci				/* choose a random replica */
136262306a36Sopenharmony_ci				get_random_bytes(&r, 1);
136362306a36Sopenharmony_ci				r %= frag.ndist;
136462306a36Sopenharmony_ci				mds = frag.dist[r];
136562306a36Sopenharmony_ci				dout("%s %p %llx.%llx frag %u mds%d (%d/%d)\n",
136662306a36Sopenharmony_ci				     __func__, inode, ceph_vinop(inode),
136762306a36Sopenharmony_ci				     frag.frag, mds, (int)r, frag.ndist);
136862306a36Sopenharmony_ci				if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
136962306a36Sopenharmony_ci				    CEPH_MDS_STATE_ACTIVE &&
137062306a36Sopenharmony_ci				    !ceph_mdsmap_is_laggy(mdsc->mdsmap, mds))
137162306a36Sopenharmony_ci					goto out;
137262306a36Sopenharmony_ci			}
137362306a36Sopenharmony_ci
137462306a36Sopenharmony_ci			/* since this file/dir wasn't known to be
137562306a36Sopenharmony_ci			 * replicated, then we want to look for the
137662306a36Sopenharmony_ci			 * authoritative mds. */
137762306a36Sopenharmony_ci			if (frag.mds >= 0) {
137862306a36Sopenharmony_ci				/* choose auth mds */
137962306a36Sopenharmony_ci				mds = frag.mds;
138062306a36Sopenharmony_ci				dout("%s %p %llx.%llx frag %u mds%d (auth)\n",
138162306a36Sopenharmony_ci				     __func__, inode, ceph_vinop(inode),
138262306a36Sopenharmony_ci				     frag.frag, mds);
138362306a36Sopenharmony_ci				if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
138462306a36Sopenharmony_ci				    CEPH_MDS_STATE_ACTIVE) {
138562306a36Sopenharmony_ci					if (!ceph_mdsmap_is_laggy(mdsc->mdsmap,
138662306a36Sopenharmony_ci								  mds))
138762306a36Sopenharmony_ci						goto out;
138862306a36Sopenharmony_ci				}
138962306a36Sopenharmony_ci			}
139062306a36Sopenharmony_ci			mode = USE_AUTH_MDS;
139162306a36Sopenharmony_ci		}
139262306a36Sopenharmony_ci	}
139362306a36Sopenharmony_ci
139462306a36Sopenharmony_ci	spin_lock(&ci->i_ceph_lock);
139562306a36Sopenharmony_ci	cap = NULL;
139662306a36Sopenharmony_ci	if (mode == USE_AUTH_MDS)
139762306a36Sopenharmony_ci		cap = ci->i_auth_cap;
139862306a36Sopenharmony_ci	if (!cap && !RB_EMPTY_ROOT(&ci->i_caps))
139962306a36Sopenharmony_ci		cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node);
140062306a36Sopenharmony_ci	if (!cap) {
140162306a36Sopenharmony_ci		spin_unlock(&ci->i_ceph_lock);
140262306a36Sopenharmony_ci		iput(inode);
140362306a36Sopenharmony_ci		goto random;
140462306a36Sopenharmony_ci	}
140562306a36Sopenharmony_ci	mds = cap->session->s_mds;
140662306a36Sopenharmony_ci	dout("%s %p %llx.%llx mds%d (%scap %p)\n", __func__,
140762306a36Sopenharmony_ci	     inode, ceph_vinop(inode), mds,
140862306a36Sopenharmony_ci	     cap == ci->i_auth_cap ? "auth " : "", cap);
140962306a36Sopenharmony_ci	spin_unlock(&ci->i_ceph_lock);
141062306a36Sopenharmony_ciout:
141162306a36Sopenharmony_ci	iput(inode);
141262306a36Sopenharmony_ci	return mds;
141362306a36Sopenharmony_ci
141462306a36Sopenharmony_cirandom:
141562306a36Sopenharmony_ci	if (random)
141662306a36Sopenharmony_ci		*random = true;
141762306a36Sopenharmony_ci
141862306a36Sopenharmony_ci	mds = ceph_mdsmap_get_random_mds(mdsc->mdsmap);
141962306a36Sopenharmony_ci	dout("%s chose random mds%d\n", __func__, mds);
142062306a36Sopenharmony_ci	return mds;
142162306a36Sopenharmony_ci}
142262306a36Sopenharmony_ci
142362306a36Sopenharmony_ci
142462306a36Sopenharmony_ci/*
142562306a36Sopenharmony_ci * session messages
142662306a36Sopenharmony_ci */
142762306a36Sopenharmony_cistruct ceph_msg *ceph_create_session_msg(u32 op, u64 seq)
142862306a36Sopenharmony_ci{
142962306a36Sopenharmony_ci	struct ceph_msg *msg;
143062306a36Sopenharmony_ci	struct ceph_mds_session_head *h;
143162306a36Sopenharmony_ci
143262306a36Sopenharmony_ci	msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS,
143362306a36Sopenharmony_ci			   false);
143462306a36Sopenharmony_ci	if (!msg) {
143562306a36Sopenharmony_ci		pr_err("ENOMEM creating session %s msg\n",
143662306a36Sopenharmony_ci		       ceph_session_op_name(op));
143762306a36Sopenharmony_ci		return NULL;
143862306a36Sopenharmony_ci	}
143962306a36Sopenharmony_ci	h = msg->front.iov_base;
144062306a36Sopenharmony_ci	h->op = cpu_to_le32(op);
144162306a36Sopenharmony_ci	h->seq = cpu_to_le64(seq);
144262306a36Sopenharmony_ci
144362306a36Sopenharmony_ci	return msg;
144462306a36Sopenharmony_ci}
144562306a36Sopenharmony_ci
144662306a36Sopenharmony_cistatic const unsigned char feature_bits[] = CEPHFS_FEATURES_CLIENT_SUPPORTED;
144762306a36Sopenharmony_ci#define FEATURE_BYTES(c) (DIV_ROUND_UP((size_t)feature_bits[c - 1] + 1, 64) * 8)
144862306a36Sopenharmony_cistatic int encode_supported_features(void **p, void *end)
144962306a36Sopenharmony_ci{
145062306a36Sopenharmony_ci	static const size_t count = ARRAY_SIZE(feature_bits);
145162306a36Sopenharmony_ci
145262306a36Sopenharmony_ci	if (count > 0) {
145362306a36Sopenharmony_ci		size_t i;
145462306a36Sopenharmony_ci		size_t size = FEATURE_BYTES(count);
145562306a36Sopenharmony_ci		unsigned long bit;
145662306a36Sopenharmony_ci
145762306a36Sopenharmony_ci		if (WARN_ON_ONCE(*p + 4 + size > end))
145862306a36Sopenharmony_ci			return -ERANGE;
145962306a36Sopenharmony_ci
146062306a36Sopenharmony_ci		ceph_encode_32(p, size);
146162306a36Sopenharmony_ci		memset(*p, 0, size);
146262306a36Sopenharmony_ci		for (i = 0; i < count; i++) {
146362306a36Sopenharmony_ci			bit = feature_bits[i];
146462306a36Sopenharmony_ci			((unsigned char *)(*p))[bit / 8] |= BIT(bit % 8);
146562306a36Sopenharmony_ci		}
146662306a36Sopenharmony_ci		*p += size;
146762306a36Sopenharmony_ci	} else {
146862306a36Sopenharmony_ci		if (WARN_ON_ONCE(*p + 4 > end))
146962306a36Sopenharmony_ci			return -ERANGE;
147062306a36Sopenharmony_ci
147162306a36Sopenharmony_ci		ceph_encode_32(p, 0);
147262306a36Sopenharmony_ci	}
147362306a36Sopenharmony_ci
147462306a36Sopenharmony_ci	return 0;
147562306a36Sopenharmony_ci}
147662306a36Sopenharmony_ci
147762306a36Sopenharmony_cistatic const unsigned char metric_bits[] = CEPHFS_METRIC_SPEC_CLIENT_SUPPORTED;
147862306a36Sopenharmony_ci#define METRIC_BYTES(cnt) (DIV_ROUND_UP((size_t)metric_bits[cnt - 1] + 1, 64) * 8)
147962306a36Sopenharmony_cistatic int encode_metric_spec(void **p, void *end)
148062306a36Sopenharmony_ci{
148162306a36Sopenharmony_ci	static const size_t count = ARRAY_SIZE(metric_bits);
148262306a36Sopenharmony_ci
148362306a36Sopenharmony_ci	/* header */
148462306a36Sopenharmony_ci	if (WARN_ON_ONCE(*p + 2 > end))
148562306a36Sopenharmony_ci		return -ERANGE;
148662306a36Sopenharmony_ci
148762306a36Sopenharmony_ci	ceph_encode_8(p, 1); /* version */
148862306a36Sopenharmony_ci	ceph_encode_8(p, 1); /* compat */
148962306a36Sopenharmony_ci
149062306a36Sopenharmony_ci	if (count > 0) {
149162306a36Sopenharmony_ci		size_t i;
149262306a36Sopenharmony_ci		size_t size = METRIC_BYTES(count);
149362306a36Sopenharmony_ci
149462306a36Sopenharmony_ci		if (WARN_ON_ONCE(*p + 4 + 4 + size > end))
149562306a36Sopenharmony_ci			return -ERANGE;
149662306a36Sopenharmony_ci
149762306a36Sopenharmony_ci		/* metric spec info length */
149862306a36Sopenharmony_ci		ceph_encode_32(p, 4 + size);
149962306a36Sopenharmony_ci
150062306a36Sopenharmony_ci		/* metric spec */
150162306a36Sopenharmony_ci		ceph_encode_32(p, size);
150262306a36Sopenharmony_ci		memset(*p, 0, size);
150362306a36Sopenharmony_ci		for (i = 0; i < count; i++)
150462306a36Sopenharmony_ci			((unsigned char *)(*p))[i / 8] |= BIT(metric_bits[i] % 8);
150562306a36Sopenharmony_ci		*p += size;
150662306a36Sopenharmony_ci	} else {
150762306a36Sopenharmony_ci		if (WARN_ON_ONCE(*p + 4 + 4 > end))
150862306a36Sopenharmony_ci			return -ERANGE;
150962306a36Sopenharmony_ci
151062306a36Sopenharmony_ci		/* metric spec info length */
151162306a36Sopenharmony_ci		ceph_encode_32(p, 4);
151262306a36Sopenharmony_ci		/* metric spec */
151362306a36Sopenharmony_ci		ceph_encode_32(p, 0);
151462306a36Sopenharmony_ci	}
151562306a36Sopenharmony_ci
151662306a36Sopenharmony_ci	return 0;
151762306a36Sopenharmony_ci}
151862306a36Sopenharmony_ci
151962306a36Sopenharmony_ci/*
152062306a36Sopenharmony_ci * session message, specialization for CEPH_SESSION_REQUEST_OPEN
152162306a36Sopenharmony_ci * to include additional client metadata fields.
152262306a36Sopenharmony_ci */
152362306a36Sopenharmony_cistatic struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u64 seq)
152462306a36Sopenharmony_ci{
152562306a36Sopenharmony_ci	struct ceph_msg *msg;
152662306a36Sopenharmony_ci	struct ceph_mds_session_head *h;
152762306a36Sopenharmony_ci	int i;
152862306a36Sopenharmony_ci	int extra_bytes = 0;
152962306a36Sopenharmony_ci	int metadata_key_count = 0;
153062306a36Sopenharmony_ci	struct ceph_options *opt = mdsc->fsc->client->options;
153162306a36Sopenharmony_ci	struct ceph_mount_options *fsopt = mdsc->fsc->mount_options;
153262306a36Sopenharmony_ci	size_t size, count;
153362306a36Sopenharmony_ci	void *p, *end;
153462306a36Sopenharmony_ci	int ret;
153562306a36Sopenharmony_ci
153662306a36Sopenharmony_ci	const char* metadata[][2] = {
153762306a36Sopenharmony_ci		{"hostname", mdsc->nodename},
153862306a36Sopenharmony_ci		{"kernel_version", init_utsname()->release},
153962306a36Sopenharmony_ci		{"entity_id", opt->name ? : ""},
154062306a36Sopenharmony_ci		{"root", fsopt->server_path ? : "/"},
154162306a36Sopenharmony_ci		{NULL, NULL}
154262306a36Sopenharmony_ci	};
154362306a36Sopenharmony_ci
154462306a36Sopenharmony_ci	/* Calculate serialized length of metadata */
154562306a36Sopenharmony_ci	extra_bytes = 4;  /* map length */
154662306a36Sopenharmony_ci	for (i = 0; metadata[i][0]; ++i) {
154762306a36Sopenharmony_ci		extra_bytes += 8 + strlen(metadata[i][0]) +
154862306a36Sopenharmony_ci			strlen(metadata[i][1]);
154962306a36Sopenharmony_ci		metadata_key_count++;
155062306a36Sopenharmony_ci	}
155162306a36Sopenharmony_ci
155262306a36Sopenharmony_ci	/* supported feature */
155362306a36Sopenharmony_ci	size = 0;
155462306a36Sopenharmony_ci	count = ARRAY_SIZE(feature_bits);
155562306a36Sopenharmony_ci	if (count > 0)
155662306a36Sopenharmony_ci		size = FEATURE_BYTES(count);
155762306a36Sopenharmony_ci	extra_bytes += 4 + size;
155862306a36Sopenharmony_ci
155962306a36Sopenharmony_ci	/* metric spec */
156062306a36Sopenharmony_ci	size = 0;
156162306a36Sopenharmony_ci	count = ARRAY_SIZE(metric_bits);
156262306a36Sopenharmony_ci	if (count > 0)
156362306a36Sopenharmony_ci		size = METRIC_BYTES(count);
156462306a36Sopenharmony_ci	extra_bytes += 2 + 4 + 4 + size;
156562306a36Sopenharmony_ci
156662306a36Sopenharmony_ci	/* Allocate the message */
156762306a36Sopenharmony_ci	msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h) + extra_bytes,
156862306a36Sopenharmony_ci			   GFP_NOFS, false);
156962306a36Sopenharmony_ci	if (!msg) {
157062306a36Sopenharmony_ci		pr_err("ENOMEM creating session open msg\n");
157162306a36Sopenharmony_ci		return ERR_PTR(-ENOMEM);
157262306a36Sopenharmony_ci	}
157362306a36Sopenharmony_ci	p = msg->front.iov_base;
157462306a36Sopenharmony_ci	end = p + msg->front.iov_len;
157562306a36Sopenharmony_ci
157662306a36Sopenharmony_ci	h = p;
157762306a36Sopenharmony_ci	h->op = cpu_to_le32(CEPH_SESSION_REQUEST_OPEN);
157862306a36Sopenharmony_ci	h->seq = cpu_to_le64(seq);
157962306a36Sopenharmony_ci
158062306a36Sopenharmony_ci	/*
158162306a36Sopenharmony_ci	 * Serialize client metadata into waiting buffer space, using
158262306a36Sopenharmony_ci	 * the format that userspace expects for map<string, string>
158362306a36Sopenharmony_ci	 *
158462306a36Sopenharmony_ci	 * ClientSession messages with metadata are v4
158562306a36Sopenharmony_ci	 */
158662306a36Sopenharmony_ci	msg->hdr.version = cpu_to_le16(4);
158762306a36Sopenharmony_ci	msg->hdr.compat_version = cpu_to_le16(1);
158862306a36Sopenharmony_ci
158962306a36Sopenharmony_ci	/* The write pointer, following the session_head structure */
159062306a36Sopenharmony_ci	p += sizeof(*h);
159162306a36Sopenharmony_ci
159262306a36Sopenharmony_ci	/* Number of entries in the map */
159362306a36Sopenharmony_ci	ceph_encode_32(&p, metadata_key_count);
159462306a36Sopenharmony_ci
159562306a36Sopenharmony_ci	/* Two length-prefixed strings for each entry in the map */
159662306a36Sopenharmony_ci	for (i = 0; metadata[i][0]; ++i) {
159762306a36Sopenharmony_ci		size_t const key_len = strlen(metadata[i][0]);
159862306a36Sopenharmony_ci		size_t const val_len = strlen(metadata[i][1]);
159962306a36Sopenharmony_ci
160062306a36Sopenharmony_ci		ceph_encode_32(&p, key_len);
160162306a36Sopenharmony_ci		memcpy(p, metadata[i][0], key_len);
160262306a36Sopenharmony_ci		p += key_len;
160362306a36Sopenharmony_ci		ceph_encode_32(&p, val_len);
160462306a36Sopenharmony_ci		memcpy(p, metadata[i][1], val_len);
160562306a36Sopenharmony_ci		p += val_len;
160662306a36Sopenharmony_ci	}
160762306a36Sopenharmony_ci
160862306a36Sopenharmony_ci	ret = encode_supported_features(&p, end);
160962306a36Sopenharmony_ci	if (ret) {
161062306a36Sopenharmony_ci		pr_err("encode_supported_features failed!\n");
161162306a36Sopenharmony_ci		ceph_msg_put(msg);
161262306a36Sopenharmony_ci		return ERR_PTR(ret);
161362306a36Sopenharmony_ci	}
161462306a36Sopenharmony_ci
161562306a36Sopenharmony_ci	ret = encode_metric_spec(&p, end);
161662306a36Sopenharmony_ci	if (ret) {
161762306a36Sopenharmony_ci		pr_err("encode_metric_spec failed!\n");
161862306a36Sopenharmony_ci		ceph_msg_put(msg);
161962306a36Sopenharmony_ci		return ERR_PTR(ret);
162062306a36Sopenharmony_ci	}
162162306a36Sopenharmony_ci
162262306a36Sopenharmony_ci	msg->front.iov_len = p - msg->front.iov_base;
162362306a36Sopenharmony_ci	msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
162462306a36Sopenharmony_ci
162562306a36Sopenharmony_ci	return msg;
162662306a36Sopenharmony_ci}
162762306a36Sopenharmony_ci
162862306a36Sopenharmony_ci/*
162962306a36Sopenharmony_ci * send session open request.
163062306a36Sopenharmony_ci *
163162306a36Sopenharmony_ci * called under mdsc->mutex
163262306a36Sopenharmony_ci */
163362306a36Sopenharmony_cistatic int __open_session(struct ceph_mds_client *mdsc,
163462306a36Sopenharmony_ci			  struct ceph_mds_session *session)
163562306a36Sopenharmony_ci{
163662306a36Sopenharmony_ci	struct ceph_msg *msg;
163762306a36Sopenharmony_ci	int mstate;
163862306a36Sopenharmony_ci	int mds = session->s_mds;
163962306a36Sopenharmony_ci
164062306a36Sopenharmony_ci	if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_FENCE_IO)
164162306a36Sopenharmony_ci		return -EIO;
164262306a36Sopenharmony_ci
164362306a36Sopenharmony_ci	/* wait for mds to go active? */
164462306a36Sopenharmony_ci	mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds);
164562306a36Sopenharmony_ci	dout("open_session to mds%d (%s)\n", mds,
164662306a36Sopenharmony_ci	     ceph_mds_state_name(mstate));
164762306a36Sopenharmony_ci	session->s_state = CEPH_MDS_SESSION_OPENING;
164862306a36Sopenharmony_ci	session->s_renew_requested = jiffies;
164962306a36Sopenharmony_ci
165062306a36Sopenharmony_ci	/* send connect message */
165162306a36Sopenharmony_ci	msg = create_session_open_msg(mdsc, session->s_seq);
165262306a36Sopenharmony_ci	if (IS_ERR(msg))
165362306a36Sopenharmony_ci		return PTR_ERR(msg);
165462306a36Sopenharmony_ci	ceph_con_send(&session->s_con, msg);
165562306a36Sopenharmony_ci	return 0;
165662306a36Sopenharmony_ci}
165762306a36Sopenharmony_ci
165862306a36Sopenharmony_ci/*
165962306a36Sopenharmony_ci * open sessions for any export targets for the given mds
166062306a36Sopenharmony_ci *
166162306a36Sopenharmony_ci * called under mdsc->mutex
166262306a36Sopenharmony_ci */
166362306a36Sopenharmony_cistatic struct ceph_mds_session *
166462306a36Sopenharmony_ci__open_export_target_session(struct ceph_mds_client *mdsc, int target)
166562306a36Sopenharmony_ci{
166662306a36Sopenharmony_ci	struct ceph_mds_session *session;
166762306a36Sopenharmony_ci	int ret;
166862306a36Sopenharmony_ci
166962306a36Sopenharmony_ci	session = __ceph_lookup_mds_session(mdsc, target);
167062306a36Sopenharmony_ci	if (!session) {
167162306a36Sopenharmony_ci		session = register_session(mdsc, target);
167262306a36Sopenharmony_ci		if (IS_ERR(session))
167362306a36Sopenharmony_ci			return session;
167462306a36Sopenharmony_ci	}
167562306a36Sopenharmony_ci	if (session->s_state == CEPH_MDS_SESSION_NEW ||
167662306a36Sopenharmony_ci	    session->s_state == CEPH_MDS_SESSION_CLOSING) {
167762306a36Sopenharmony_ci		ret = __open_session(mdsc, session);
167862306a36Sopenharmony_ci		if (ret)
167962306a36Sopenharmony_ci			return ERR_PTR(ret);
168062306a36Sopenharmony_ci	}
168162306a36Sopenharmony_ci
168262306a36Sopenharmony_ci	return session;
168362306a36Sopenharmony_ci}
168462306a36Sopenharmony_ci
168562306a36Sopenharmony_cistruct ceph_mds_session *
168662306a36Sopenharmony_ciceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target)
168762306a36Sopenharmony_ci{
168862306a36Sopenharmony_ci	struct ceph_mds_session *session;
168962306a36Sopenharmony_ci
169062306a36Sopenharmony_ci	dout("open_export_target_session to mds%d\n", target);
169162306a36Sopenharmony_ci
169262306a36Sopenharmony_ci	mutex_lock(&mdsc->mutex);
169362306a36Sopenharmony_ci	session = __open_export_target_session(mdsc, target);
169462306a36Sopenharmony_ci	mutex_unlock(&mdsc->mutex);
169562306a36Sopenharmony_ci
169662306a36Sopenharmony_ci	return session;
169762306a36Sopenharmony_ci}
169862306a36Sopenharmony_ci
169962306a36Sopenharmony_cistatic void __open_export_target_sessions(struct ceph_mds_client *mdsc,
170062306a36Sopenharmony_ci					  struct ceph_mds_session *session)
170162306a36Sopenharmony_ci{
170262306a36Sopenharmony_ci	struct ceph_mds_info *mi;
170362306a36Sopenharmony_ci	struct ceph_mds_session *ts;
170462306a36Sopenharmony_ci	int i, mds = session->s_mds;
170562306a36Sopenharmony_ci
170662306a36Sopenharmony_ci	if (mds >= mdsc->mdsmap->possible_max_rank)
170762306a36Sopenharmony_ci		return;
170862306a36Sopenharmony_ci
170962306a36Sopenharmony_ci	mi = &mdsc->mdsmap->m_info[mds];
171062306a36Sopenharmony_ci	dout("open_export_target_sessions for mds%d (%d targets)\n",
171162306a36Sopenharmony_ci	     session->s_mds, mi->num_export_targets);
171262306a36Sopenharmony_ci
171362306a36Sopenharmony_ci	for (i = 0; i < mi->num_export_targets; i++) {
171462306a36Sopenharmony_ci		ts = __open_export_target_session(mdsc, mi->export_targets[i]);
171562306a36Sopenharmony_ci		ceph_put_mds_session(ts);
171662306a36Sopenharmony_ci	}
171762306a36Sopenharmony_ci}
171862306a36Sopenharmony_ci
171962306a36Sopenharmony_civoid ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
172062306a36Sopenharmony_ci					   struct ceph_mds_session *session)
172162306a36Sopenharmony_ci{
172262306a36Sopenharmony_ci	mutex_lock(&mdsc->mutex);
172362306a36Sopenharmony_ci	__open_export_target_sessions(mdsc, session);
172462306a36Sopenharmony_ci	mutex_unlock(&mdsc->mutex);
172562306a36Sopenharmony_ci}
172662306a36Sopenharmony_ci
172762306a36Sopenharmony_ci/*
172862306a36Sopenharmony_ci * session caps
172962306a36Sopenharmony_ci */
173062306a36Sopenharmony_ci
173162306a36Sopenharmony_cistatic void detach_cap_releases(struct ceph_mds_session *session,
173262306a36Sopenharmony_ci				struct list_head *target)
173362306a36Sopenharmony_ci{
173462306a36Sopenharmony_ci	lockdep_assert_held(&session->s_cap_lock);
173562306a36Sopenharmony_ci
173662306a36Sopenharmony_ci	list_splice_init(&session->s_cap_releases, target);
173762306a36Sopenharmony_ci	session->s_num_cap_releases = 0;
173862306a36Sopenharmony_ci	dout("dispose_cap_releases mds%d\n", session->s_mds);
173962306a36Sopenharmony_ci}
174062306a36Sopenharmony_ci
174162306a36Sopenharmony_cistatic void dispose_cap_releases(struct ceph_mds_client *mdsc,
174262306a36Sopenharmony_ci				 struct list_head *dispose)
174362306a36Sopenharmony_ci{
174462306a36Sopenharmony_ci	while (!list_empty(dispose)) {
174562306a36Sopenharmony_ci		struct ceph_cap *cap;
174662306a36Sopenharmony_ci		/* zero out the in-progress message */
174762306a36Sopenharmony_ci		cap = list_first_entry(dispose, struct ceph_cap, session_caps);
174862306a36Sopenharmony_ci		list_del(&cap->session_caps);
174962306a36Sopenharmony_ci		ceph_put_cap(mdsc, cap);
175062306a36Sopenharmony_ci	}
175162306a36Sopenharmony_ci}
175262306a36Sopenharmony_ci
175362306a36Sopenharmony_cistatic void cleanup_session_requests(struct ceph_mds_client *mdsc,
175462306a36Sopenharmony_ci				     struct ceph_mds_session *session)
175562306a36Sopenharmony_ci{
175662306a36Sopenharmony_ci	struct ceph_mds_request *req;
175762306a36Sopenharmony_ci	struct rb_node *p;
175862306a36Sopenharmony_ci
175962306a36Sopenharmony_ci	dout("cleanup_session_requests mds%d\n", session->s_mds);
176062306a36Sopenharmony_ci	mutex_lock(&mdsc->mutex);
176162306a36Sopenharmony_ci	while (!list_empty(&session->s_unsafe)) {
176262306a36Sopenharmony_ci		req = list_first_entry(&session->s_unsafe,
176362306a36Sopenharmony_ci				       struct ceph_mds_request, r_unsafe_item);
176462306a36Sopenharmony_ci		pr_warn_ratelimited(" dropping unsafe request %llu\n",
176562306a36Sopenharmony_ci				    req->r_tid);
176662306a36Sopenharmony_ci		if (req->r_target_inode)
176762306a36Sopenharmony_ci			mapping_set_error(req->r_target_inode->i_mapping, -EIO);
176862306a36Sopenharmony_ci		if (req->r_unsafe_dir)
176962306a36Sopenharmony_ci			mapping_set_error(req->r_unsafe_dir->i_mapping, -EIO);
177062306a36Sopenharmony_ci		__unregister_request(mdsc, req);
177162306a36Sopenharmony_ci	}
177262306a36Sopenharmony_ci	/* zero r_attempts, so kick_requests() will re-send requests */
177362306a36Sopenharmony_ci	p = rb_first(&mdsc->request_tree);
177462306a36Sopenharmony_ci	while (p) {
177562306a36Sopenharmony_ci		req = rb_entry(p, struct ceph_mds_request, r_node);
177662306a36Sopenharmony_ci		p = rb_next(p);
177762306a36Sopenharmony_ci		if (req->r_session &&
177862306a36Sopenharmony_ci		    req->r_session->s_mds == session->s_mds)
177962306a36Sopenharmony_ci			req->r_attempts = 0;
178062306a36Sopenharmony_ci	}
178162306a36Sopenharmony_ci	mutex_unlock(&mdsc->mutex);
178262306a36Sopenharmony_ci}
178362306a36Sopenharmony_ci
178462306a36Sopenharmony_ci/*
178562306a36Sopenharmony_ci * Helper to safely iterate over all caps associated with a session, with
178662306a36Sopenharmony_ci * special care taken to handle a racing __ceph_remove_cap().
178762306a36Sopenharmony_ci *
178862306a36Sopenharmony_ci * Caller must hold session s_mutex.
178962306a36Sopenharmony_ci */
179062306a36Sopenharmony_ciint ceph_iterate_session_caps(struct ceph_mds_session *session,
179162306a36Sopenharmony_ci			      int (*cb)(struct inode *, int mds, void *),
179262306a36Sopenharmony_ci			      void *arg)
179362306a36Sopenharmony_ci{
179462306a36Sopenharmony_ci	struct list_head *p;
179562306a36Sopenharmony_ci	struct ceph_cap *cap;
179662306a36Sopenharmony_ci	struct inode *inode, *last_inode = NULL;
179762306a36Sopenharmony_ci	struct ceph_cap *old_cap = NULL;
179862306a36Sopenharmony_ci	int ret;
179962306a36Sopenharmony_ci
180062306a36Sopenharmony_ci	dout("iterate_session_caps %p mds%d\n", session, session->s_mds);
180162306a36Sopenharmony_ci	spin_lock(&session->s_cap_lock);
180262306a36Sopenharmony_ci	p = session->s_caps.next;
180362306a36Sopenharmony_ci	while (p != &session->s_caps) {
180462306a36Sopenharmony_ci		int mds;
180562306a36Sopenharmony_ci
180662306a36Sopenharmony_ci		cap = list_entry(p, struct ceph_cap, session_caps);
180762306a36Sopenharmony_ci		inode = igrab(&cap->ci->netfs.inode);
180862306a36Sopenharmony_ci		if (!inode) {
180962306a36Sopenharmony_ci			p = p->next;
181062306a36Sopenharmony_ci			continue;
181162306a36Sopenharmony_ci		}
181262306a36Sopenharmony_ci		session->s_cap_iterator = cap;
181362306a36Sopenharmony_ci		mds = cap->mds;
181462306a36Sopenharmony_ci		spin_unlock(&session->s_cap_lock);
181562306a36Sopenharmony_ci
181662306a36Sopenharmony_ci		if (last_inode) {
181762306a36Sopenharmony_ci			iput(last_inode);
181862306a36Sopenharmony_ci			last_inode = NULL;
181962306a36Sopenharmony_ci		}
182062306a36Sopenharmony_ci		if (old_cap) {
182162306a36Sopenharmony_ci			ceph_put_cap(session->s_mdsc, old_cap);
182262306a36Sopenharmony_ci			old_cap = NULL;
182362306a36Sopenharmony_ci		}
182462306a36Sopenharmony_ci
182562306a36Sopenharmony_ci		ret = cb(inode, mds, arg);
182662306a36Sopenharmony_ci		last_inode = inode;
182762306a36Sopenharmony_ci
182862306a36Sopenharmony_ci		spin_lock(&session->s_cap_lock);
182962306a36Sopenharmony_ci		p = p->next;
183062306a36Sopenharmony_ci		if (!cap->ci) {
183162306a36Sopenharmony_ci			dout("iterate_session_caps  finishing cap %p removal\n",
183262306a36Sopenharmony_ci			     cap);
183362306a36Sopenharmony_ci			BUG_ON(cap->session != session);
183462306a36Sopenharmony_ci			cap->session = NULL;
183562306a36Sopenharmony_ci			list_del_init(&cap->session_caps);
183662306a36Sopenharmony_ci			session->s_nr_caps--;
183762306a36Sopenharmony_ci			atomic64_dec(&session->s_mdsc->metric.total_caps);
183862306a36Sopenharmony_ci			if (cap->queue_release)
183962306a36Sopenharmony_ci				__ceph_queue_cap_release(session, cap);
184062306a36Sopenharmony_ci			else
184162306a36Sopenharmony_ci				old_cap = cap;  /* put_cap it w/o locks held */
184262306a36Sopenharmony_ci		}
184362306a36Sopenharmony_ci		if (ret < 0)
184462306a36Sopenharmony_ci			goto out;
184562306a36Sopenharmony_ci	}
184662306a36Sopenharmony_ci	ret = 0;
184762306a36Sopenharmony_ciout:
184862306a36Sopenharmony_ci	session->s_cap_iterator = NULL;
184962306a36Sopenharmony_ci	spin_unlock(&session->s_cap_lock);
185062306a36Sopenharmony_ci
185162306a36Sopenharmony_ci	iput(last_inode);
185262306a36Sopenharmony_ci	if (old_cap)
185362306a36Sopenharmony_ci		ceph_put_cap(session->s_mdsc, old_cap);
185462306a36Sopenharmony_ci
185562306a36Sopenharmony_ci	return ret;
185662306a36Sopenharmony_ci}
185762306a36Sopenharmony_ci
185862306a36Sopenharmony_cistatic int remove_session_caps_cb(struct inode *inode, int mds, void *arg)
185962306a36Sopenharmony_ci{
186062306a36Sopenharmony_ci	struct ceph_inode_info *ci = ceph_inode(inode);
186162306a36Sopenharmony_ci	bool invalidate = false;
186262306a36Sopenharmony_ci	struct ceph_cap *cap;
186362306a36Sopenharmony_ci	int iputs = 0;
186462306a36Sopenharmony_ci
186562306a36Sopenharmony_ci	spin_lock(&ci->i_ceph_lock);
186662306a36Sopenharmony_ci	cap = __get_cap_for_mds(ci, mds);
186762306a36Sopenharmony_ci	if (cap) {
186862306a36Sopenharmony_ci		dout(" removing cap %p, ci is %p, inode is %p\n",
186962306a36Sopenharmony_ci		     cap, ci, &ci->netfs.inode);
187062306a36Sopenharmony_ci
187162306a36Sopenharmony_ci		iputs = ceph_purge_inode_cap(inode, cap, &invalidate);
187262306a36Sopenharmony_ci	}
187362306a36Sopenharmony_ci	spin_unlock(&ci->i_ceph_lock);
187462306a36Sopenharmony_ci
187562306a36Sopenharmony_ci	if (cap)
187662306a36Sopenharmony_ci		wake_up_all(&ci->i_cap_wq);
187762306a36Sopenharmony_ci	if (invalidate)
187862306a36Sopenharmony_ci		ceph_queue_invalidate(inode);
187962306a36Sopenharmony_ci	while (iputs--)
188062306a36Sopenharmony_ci		iput(inode);
188162306a36Sopenharmony_ci	return 0;
188262306a36Sopenharmony_ci}
188362306a36Sopenharmony_ci
188462306a36Sopenharmony_ci/*
188562306a36Sopenharmony_ci * caller must hold session s_mutex
188662306a36Sopenharmony_ci */
188762306a36Sopenharmony_cistatic void remove_session_caps(struct ceph_mds_session *session)
188862306a36Sopenharmony_ci{
188962306a36Sopenharmony_ci	struct ceph_fs_client *fsc = session->s_mdsc->fsc;
189062306a36Sopenharmony_ci	struct super_block *sb = fsc->sb;
189162306a36Sopenharmony_ci	LIST_HEAD(dispose);
189262306a36Sopenharmony_ci
189362306a36Sopenharmony_ci	dout("remove_session_caps on %p\n", session);
189462306a36Sopenharmony_ci	ceph_iterate_session_caps(session, remove_session_caps_cb, fsc);
189562306a36Sopenharmony_ci
189662306a36Sopenharmony_ci	wake_up_all(&fsc->mdsc->cap_flushing_wq);
189762306a36Sopenharmony_ci
189862306a36Sopenharmony_ci	spin_lock(&session->s_cap_lock);
189962306a36Sopenharmony_ci	if (session->s_nr_caps > 0) {
190062306a36Sopenharmony_ci		struct inode *inode;
190162306a36Sopenharmony_ci		struct ceph_cap *cap, *prev = NULL;
190262306a36Sopenharmony_ci		struct ceph_vino vino;
190362306a36Sopenharmony_ci		/*
190462306a36Sopenharmony_ci		 * iterate_session_caps() skips inodes that are being
190562306a36Sopenharmony_ci		 * deleted, we need to wait until deletions are complete.
190662306a36Sopenharmony_ci		 * __wait_on_freeing_inode() is designed for the job,
190762306a36Sopenharmony_ci		 * but it is not exported, so use lookup inode function
190862306a36Sopenharmony_ci		 * to access it.
190962306a36Sopenharmony_ci		 */
191062306a36Sopenharmony_ci		while (!list_empty(&session->s_caps)) {
191162306a36Sopenharmony_ci			cap = list_entry(session->s_caps.next,
191262306a36Sopenharmony_ci					 struct ceph_cap, session_caps);
191362306a36Sopenharmony_ci			if (cap == prev)
191462306a36Sopenharmony_ci				break;
191562306a36Sopenharmony_ci			prev = cap;
191662306a36Sopenharmony_ci			vino = cap->ci->i_vino;
191762306a36Sopenharmony_ci			spin_unlock(&session->s_cap_lock);
191862306a36Sopenharmony_ci
191962306a36Sopenharmony_ci			inode = ceph_find_inode(sb, vino);
192062306a36Sopenharmony_ci			iput(inode);
192162306a36Sopenharmony_ci
192262306a36Sopenharmony_ci			spin_lock(&session->s_cap_lock);
192362306a36Sopenharmony_ci		}
192462306a36Sopenharmony_ci	}
192562306a36Sopenharmony_ci
192662306a36Sopenharmony_ci	// drop cap expires and unlock s_cap_lock
192762306a36Sopenharmony_ci	detach_cap_releases(session, &dispose);
192862306a36Sopenharmony_ci
192962306a36Sopenharmony_ci	BUG_ON(session->s_nr_caps > 0);
193062306a36Sopenharmony_ci	BUG_ON(!list_empty(&session->s_cap_flushing));
193162306a36Sopenharmony_ci	spin_unlock(&session->s_cap_lock);
193262306a36Sopenharmony_ci	dispose_cap_releases(session->s_mdsc, &dispose);
193362306a36Sopenharmony_ci}
193462306a36Sopenharmony_ci
/* Events handed to wake_up_session_cb() through its callback argument. */
enum {
	RECONNECT,	/* reset the inode's wanted/requested max_size */
	RENEWCAPS,	/* downgrade caps the mds did not re-issue */
	FORCE_RO,	/* no per-inode state change, only wake waiters */
};
194062306a36Sopenharmony_ci
194162306a36Sopenharmony_ci/*
194262306a36Sopenharmony_ci * wake up any threads waiting on this session's caps.  if the cap is
194362306a36Sopenharmony_ci * old (didn't get renewed on the client reconnect), remove it now.
194462306a36Sopenharmony_ci *
194562306a36Sopenharmony_ci * caller must hold s_mutex.
194662306a36Sopenharmony_ci */
194762306a36Sopenharmony_cistatic int wake_up_session_cb(struct inode *inode, int mds, void *arg)
194862306a36Sopenharmony_ci{
194962306a36Sopenharmony_ci	struct ceph_inode_info *ci = ceph_inode(inode);
195062306a36Sopenharmony_ci	unsigned long ev = (unsigned long)arg;
195162306a36Sopenharmony_ci
195262306a36Sopenharmony_ci	if (ev == RECONNECT) {
195362306a36Sopenharmony_ci		spin_lock(&ci->i_ceph_lock);
195462306a36Sopenharmony_ci		ci->i_wanted_max_size = 0;
195562306a36Sopenharmony_ci		ci->i_requested_max_size = 0;
195662306a36Sopenharmony_ci		spin_unlock(&ci->i_ceph_lock);
195762306a36Sopenharmony_ci	} else if (ev == RENEWCAPS) {
195862306a36Sopenharmony_ci		struct ceph_cap *cap;
195962306a36Sopenharmony_ci
196062306a36Sopenharmony_ci		spin_lock(&ci->i_ceph_lock);
196162306a36Sopenharmony_ci		cap = __get_cap_for_mds(ci, mds);
196262306a36Sopenharmony_ci		/* mds did not re-issue stale cap */
196362306a36Sopenharmony_ci		if (cap && cap->cap_gen < atomic_read(&cap->session->s_cap_gen))
196462306a36Sopenharmony_ci			cap->issued = cap->implemented = CEPH_CAP_PIN;
196562306a36Sopenharmony_ci		spin_unlock(&ci->i_ceph_lock);
196662306a36Sopenharmony_ci	} else if (ev == FORCE_RO) {
196762306a36Sopenharmony_ci	}
196862306a36Sopenharmony_ci	wake_up_all(&ci->i_cap_wq);
196962306a36Sopenharmony_ci	return 0;
197062306a36Sopenharmony_ci}
197162306a36Sopenharmony_ci
197262306a36Sopenharmony_cistatic void wake_up_session_caps(struct ceph_mds_session *session, int ev)
197362306a36Sopenharmony_ci{
197462306a36Sopenharmony_ci	dout("wake_up_session_caps %p mds%d\n", session, session->s_mds);
197562306a36Sopenharmony_ci	ceph_iterate_session_caps(session, wake_up_session_cb,
197662306a36Sopenharmony_ci				  (void *)(unsigned long)ev);
197762306a36Sopenharmony_ci}
197862306a36Sopenharmony_ci
197962306a36Sopenharmony_ci/*
198062306a36Sopenharmony_ci * Send periodic message to MDS renewing all currently held caps.  The
198162306a36Sopenharmony_ci * ack will reset the expiration for all caps from this session.
198262306a36Sopenharmony_ci *
198362306a36Sopenharmony_ci * caller holds s_mutex
198462306a36Sopenharmony_ci */
198562306a36Sopenharmony_cistatic int send_renew_caps(struct ceph_mds_client *mdsc,
198662306a36Sopenharmony_ci			   struct ceph_mds_session *session)
198762306a36Sopenharmony_ci{
198862306a36Sopenharmony_ci	struct ceph_msg *msg;
198962306a36Sopenharmony_ci	int state;
199062306a36Sopenharmony_ci
199162306a36Sopenharmony_ci	if (time_after_eq(jiffies, session->s_cap_ttl) &&
199262306a36Sopenharmony_ci	    time_after_eq(session->s_cap_ttl, session->s_renew_requested))
199362306a36Sopenharmony_ci		pr_info("mds%d caps stale\n", session->s_mds);
199462306a36Sopenharmony_ci	session->s_renew_requested = jiffies;
199562306a36Sopenharmony_ci
199662306a36Sopenharmony_ci	/* do not try to renew caps until a recovering mds has reconnected
199762306a36Sopenharmony_ci	 * with its clients. */
199862306a36Sopenharmony_ci	state = ceph_mdsmap_get_state(mdsc->mdsmap, session->s_mds);
199962306a36Sopenharmony_ci	if (state < CEPH_MDS_STATE_RECONNECT) {
200062306a36Sopenharmony_ci		dout("send_renew_caps ignoring mds%d (%s)\n",
200162306a36Sopenharmony_ci		     session->s_mds, ceph_mds_state_name(state));
200262306a36Sopenharmony_ci		return 0;
200362306a36Sopenharmony_ci	}
200462306a36Sopenharmony_ci
200562306a36Sopenharmony_ci	dout("send_renew_caps to mds%d (%s)\n", session->s_mds,
200662306a36Sopenharmony_ci		ceph_mds_state_name(state));
200762306a36Sopenharmony_ci	msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS,
200862306a36Sopenharmony_ci				      ++session->s_renew_seq);
200962306a36Sopenharmony_ci	if (!msg)
201062306a36Sopenharmony_ci		return -ENOMEM;
201162306a36Sopenharmony_ci	ceph_con_send(&session->s_con, msg);
201262306a36Sopenharmony_ci	return 0;
201362306a36Sopenharmony_ci}
201462306a36Sopenharmony_ci
201562306a36Sopenharmony_cistatic int send_flushmsg_ack(struct ceph_mds_client *mdsc,
201662306a36Sopenharmony_ci			     struct ceph_mds_session *session, u64 seq)
201762306a36Sopenharmony_ci{
201862306a36Sopenharmony_ci	struct ceph_msg *msg;
201962306a36Sopenharmony_ci
202062306a36Sopenharmony_ci	dout("send_flushmsg_ack to mds%d (%s)s seq %lld\n",
202162306a36Sopenharmony_ci	     session->s_mds, ceph_session_state_name(session->s_state), seq);
202262306a36Sopenharmony_ci	msg = ceph_create_session_msg(CEPH_SESSION_FLUSHMSG_ACK, seq);
202362306a36Sopenharmony_ci	if (!msg)
202462306a36Sopenharmony_ci		return -ENOMEM;
202562306a36Sopenharmony_ci	ceph_con_send(&session->s_con, msg);
202662306a36Sopenharmony_ci	return 0;
202762306a36Sopenharmony_ci}
202862306a36Sopenharmony_ci
202962306a36Sopenharmony_ci
203062306a36Sopenharmony_ci/*
203162306a36Sopenharmony_ci * Note new cap ttl, and any transition from stale -> not stale (fresh?).
203262306a36Sopenharmony_ci *
203362306a36Sopenharmony_ci * Called under session->s_mutex
203462306a36Sopenharmony_ci */
203562306a36Sopenharmony_cistatic void renewed_caps(struct ceph_mds_client *mdsc,
203662306a36Sopenharmony_ci			 struct ceph_mds_session *session, int is_renew)
203762306a36Sopenharmony_ci{
203862306a36Sopenharmony_ci	int was_stale;
203962306a36Sopenharmony_ci	int wake = 0;
204062306a36Sopenharmony_ci
204162306a36Sopenharmony_ci	spin_lock(&session->s_cap_lock);
204262306a36Sopenharmony_ci	was_stale = is_renew && time_after_eq(jiffies, session->s_cap_ttl);
204362306a36Sopenharmony_ci
204462306a36Sopenharmony_ci	session->s_cap_ttl = session->s_renew_requested +
204562306a36Sopenharmony_ci		mdsc->mdsmap->m_session_timeout*HZ;
204662306a36Sopenharmony_ci
204762306a36Sopenharmony_ci	if (was_stale) {
204862306a36Sopenharmony_ci		if (time_before(jiffies, session->s_cap_ttl)) {
204962306a36Sopenharmony_ci			pr_info("mds%d caps renewed\n", session->s_mds);
205062306a36Sopenharmony_ci			wake = 1;
205162306a36Sopenharmony_ci		} else {
205262306a36Sopenharmony_ci			pr_info("mds%d caps still stale\n", session->s_mds);
205362306a36Sopenharmony_ci		}
205462306a36Sopenharmony_ci	}
205562306a36Sopenharmony_ci	dout("renewed_caps mds%d ttl now %lu, was %s, now %s\n",
205662306a36Sopenharmony_ci	     session->s_mds, session->s_cap_ttl, was_stale ? "stale" : "fresh",
205762306a36Sopenharmony_ci	     time_before(jiffies, session->s_cap_ttl) ? "stale" : "fresh");
205862306a36Sopenharmony_ci	spin_unlock(&session->s_cap_lock);
205962306a36Sopenharmony_ci
206062306a36Sopenharmony_ci	if (wake)
206162306a36Sopenharmony_ci		wake_up_session_caps(session, RENEWCAPS);
206262306a36Sopenharmony_ci}
206362306a36Sopenharmony_ci
206462306a36Sopenharmony_ci/*
206562306a36Sopenharmony_ci * send a session close request
206662306a36Sopenharmony_ci */
206762306a36Sopenharmony_cistatic int request_close_session(struct ceph_mds_session *session)
206862306a36Sopenharmony_ci{
206962306a36Sopenharmony_ci	struct ceph_msg *msg;
207062306a36Sopenharmony_ci
207162306a36Sopenharmony_ci	dout("request_close_session mds%d state %s seq %lld\n",
207262306a36Sopenharmony_ci	     session->s_mds, ceph_session_state_name(session->s_state),
207362306a36Sopenharmony_ci	     session->s_seq);
207462306a36Sopenharmony_ci	msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_CLOSE,
207562306a36Sopenharmony_ci				      session->s_seq);
207662306a36Sopenharmony_ci	if (!msg)
207762306a36Sopenharmony_ci		return -ENOMEM;
207862306a36Sopenharmony_ci	ceph_con_send(&session->s_con, msg);
207962306a36Sopenharmony_ci	return 1;
208062306a36Sopenharmony_ci}
208162306a36Sopenharmony_ci
208262306a36Sopenharmony_ci/*
208362306a36Sopenharmony_ci * Called with s_mutex held.
208462306a36Sopenharmony_ci */
208562306a36Sopenharmony_cistatic int __close_session(struct ceph_mds_client *mdsc,
208662306a36Sopenharmony_ci			 struct ceph_mds_session *session)
208762306a36Sopenharmony_ci{
208862306a36Sopenharmony_ci	if (session->s_state >= CEPH_MDS_SESSION_CLOSING)
208962306a36Sopenharmony_ci		return 0;
209062306a36Sopenharmony_ci	session->s_state = CEPH_MDS_SESSION_CLOSING;
209162306a36Sopenharmony_ci	return request_close_session(session);
209262306a36Sopenharmony_ci}
209362306a36Sopenharmony_ci
209462306a36Sopenharmony_cistatic bool drop_negative_children(struct dentry *dentry)
209562306a36Sopenharmony_ci{
209662306a36Sopenharmony_ci	struct dentry *child;
209762306a36Sopenharmony_ci	bool all_negative = true;
209862306a36Sopenharmony_ci
209962306a36Sopenharmony_ci	if (!d_is_dir(dentry))
210062306a36Sopenharmony_ci		goto out;
210162306a36Sopenharmony_ci
210262306a36Sopenharmony_ci	spin_lock(&dentry->d_lock);
210362306a36Sopenharmony_ci	list_for_each_entry(child, &dentry->d_subdirs, d_child) {
210462306a36Sopenharmony_ci		if (d_really_is_positive(child)) {
210562306a36Sopenharmony_ci			all_negative = false;
210662306a36Sopenharmony_ci			break;
210762306a36Sopenharmony_ci		}
210862306a36Sopenharmony_ci	}
210962306a36Sopenharmony_ci	spin_unlock(&dentry->d_lock);
211062306a36Sopenharmony_ci
211162306a36Sopenharmony_ci	if (all_negative)
211262306a36Sopenharmony_ci		shrink_dcache_parent(dentry);
211362306a36Sopenharmony_ciout:
211462306a36Sopenharmony_ci	return all_negative;
211562306a36Sopenharmony_ci}
211662306a36Sopenharmony_ci
211762306a36Sopenharmony_ci/*
211862306a36Sopenharmony_ci * Trim old(er) caps.
211962306a36Sopenharmony_ci *
212062306a36Sopenharmony_ci * Because we can't cache an inode without one or more caps, we do
212162306a36Sopenharmony_ci * this indirectly: if a cap is unused, we prune its aliases, at which
212262306a36Sopenharmony_ci * point the inode will hopefully get dropped to.
212362306a36Sopenharmony_ci *
212462306a36Sopenharmony_ci * Yes, this is a bit sloppy.  Our only real goal here is to respond to
212562306a36Sopenharmony_ci * memory pressure from the MDS, though, so it needn't be perfect.
212662306a36Sopenharmony_ci */
212762306a36Sopenharmony_cistatic int trim_caps_cb(struct inode *inode, int mds, void *arg)
212862306a36Sopenharmony_ci{
212962306a36Sopenharmony_ci	int *remaining = arg;
213062306a36Sopenharmony_ci	struct ceph_inode_info *ci = ceph_inode(inode);
213162306a36Sopenharmony_ci	int used, wanted, oissued, mine;
213262306a36Sopenharmony_ci	struct ceph_cap *cap;
213362306a36Sopenharmony_ci
213462306a36Sopenharmony_ci	if (*remaining <= 0)
213562306a36Sopenharmony_ci		return -1;
213662306a36Sopenharmony_ci
213762306a36Sopenharmony_ci	spin_lock(&ci->i_ceph_lock);
213862306a36Sopenharmony_ci	cap = __get_cap_for_mds(ci, mds);
213962306a36Sopenharmony_ci	if (!cap) {
214062306a36Sopenharmony_ci		spin_unlock(&ci->i_ceph_lock);
214162306a36Sopenharmony_ci		return 0;
214262306a36Sopenharmony_ci	}
214362306a36Sopenharmony_ci	mine = cap->issued | cap->implemented;
214462306a36Sopenharmony_ci	used = __ceph_caps_used(ci);
214562306a36Sopenharmony_ci	wanted = __ceph_caps_file_wanted(ci);
214662306a36Sopenharmony_ci	oissued = __ceph_caps_issued_other(ci, cap);
214762306a36Sopenharmony_ci
214862306a36Sopenharmony_ci	dout("trim_caps_cb %p cap %p mine %s oissued %s used %s wanted %s\n",
214962306a36Sopenharmony_ci	     inode, cap, ceph_cap_string(mine), ceph_cap_string(oissued),
215062306a36Sopenharmony_ci	     ceph_cap_string(used), ceph_cap_string(wanted));
215162306a36Sopenharmony_ci	if (cap == ci->i_auth_cap) {
215262306a36Sopenharmony_ci		if (ci->i_dirty_caps || ci->i_flushing_caps ||
215362306a36Sopenharmony_ci		    !list_empty(&ci->i_cap_snaps))
215462306a36Sopenharmony_ci			goto out;
215562306a36Sopenharmony_ci		if ((used | wanted) & CEPH_CAP_ANY_WR)
215662306a36Sopenharmony_ci			goto out;
215762306a36Sopenharmony_ci		/* Note: it's possible that i_filelock_ref becomes non-zero
215862306a36Sopenharmony_ci		 * after dropping auth caps. It doesn't hurt because reply
215962306a36Sopenharmony_ci		 * of lock mds request will re-add auth caps. */
216062306a36Sopenharmony_ci		if (atomic_read(&ci->i_filelock_ref) > 0)
216162306a36Sopenharmony_ci			goto out;
216262306a36Sopenharmony_ci	}
216362306a36Sopenharmony_ci	/* The inode has cached pages, but it's no longer used.
216462306a36Sopenharmony_ci	 * we can safely drop it */
216562306a36Sopenharmony_ci	if (S_ISREG(inode->i_mode) &&
216662306a36Sopenharmony_ci	    wanted == 0 && used == CEPH_CAP_FILE_CACHE &&
216762306a36Sopenharmony_ci	    !(oissued & CEPH_CAP_FILE_CACHE)) {
216862306a36Sopenharmony_ci	  used = 0;
216962306a36Sopenharmony_ci	  oissued = 0;
217062306a36Sopenharmony_ci	}
217162306a36Sopenharmony_ci	if ((used | wanted) & ~oissued & mine)
217262306a36Sopenharmony_ci		goto out;   /* we need these caps */
217362306a36Sopenharmony_ci
217462306a36Sopenharmony_ci	if (oissued) {
217562306a36Sopenharmony_ci		/* we aren't the only cap.. just remove us */
217662306a36Sopenharmony_ci		ceph_remove_cap(cap, true);
217762306a36Sopenharmony_ci		(*remaining)--;
217862306a36Sopenharmony_ci	} else {
217962306a36Sopenharmony_ci		struct dentry *dentry;
218062306a36Sopenharmony_ci		/* try dropping referring dentries */
218162306a36Sopenharmony_ci		spin_unlock(&ci->i_ceph_lock);
218262306a36Sopenharmony_ci		dentry = d_find_any_alias(inode);
218362306a36Sopenharmony_ci		if (dentry && drop_negative_children(dentry)) {
218462306a36Sopenharmony_ci			int count;
218562306a36Sopenharmony_ci			dput(dentry);
218662306a36Sopenharmony_ci			d_prune_aliases(inode);
218762306a36Sopenharmony_ci			count = atomic_read(&inode->i_count);
218862306a36Sopenharmony_ci			if (count == 1)
218962306a36Sopenharmony_ci				(*remaining)--;
219062306a36Sopenharmony_ci			dout("trim_caps_cb %p cap %p pruned, count now %d\n",
219162306a36Sopenharmony_ci			     inode, cap, count);
219262306a36Sopenharmony_ci		} else {
219362306a36Sopenharmony_ci			dput(dentry);
219462306a36Sopenharmony_ci		}
219562306a36Sopenharmony_ci		return 0;
219662306a36Sopenharmony_ci	}
219762306a36Sopenharmony_ci
219862306a36Sopenharmony_ciout:
219962306a36Sopenharmony_ci	spin_unlock(&ci->i_ceph_lock);
220062306a36Sopenharmony_ci	return 0;
220162306a36Sopenharmony_ci}
220262306a36Sopenharmony_ci
220362306a36Sopenharmony_ci/*
220462306a36Sopenharmony_ci * Trim session cap count down to some max number.
220562306a36Sopenharmony_ci */
220662306a36Sopenharmony_ciint ceph_trim_caps(struct ceph_mds_client *mdsc,
220762306a36Sopenharmony_ci		   struct ceph_mds_session *session,
220862306a36Sopenharmony_ci		   int max_caps)
220962306a36Sopenharmony_ci{
221062306a36Sopenharmony_ci	int trim_caps = session->s_nr_caps - max_caps;
221162306a36Sopenharmony_ci
221262306a36Sopenharmony_ci	dout("trim_caps mds%d start: %d / %d, trim %d\n",
221362306a36Sopenharmony_ci	     session->s_mds, session->s_nr_caps, max_caps, trim_caps);
221462306a36Sopenharmony_ci	if (trim_caps > 0) {
221562306a36Sopenharmony_ci		int remaining = trim_caps;
221662306a36Sopenharmony_ci
221762306a36Sopenharmony_ci		ceph_iterate_session_caps(session, trim_caps_cb, &remaining);
221862306a36Sopenharmony_ci		dout("trim_caps mds%d done: %d / %d, trimmed %d\n",
221962306a36Sopenharmony_ci		     session->s_mds, session->s_nr_caps, max_caps,
222062306a36Sopenharmony_ci			trim_caps - remaining);
222162306a36Sopenharmony_ci	}
222262306a36Sopenharmony_ci
222362306a36Sopenharmony_ci	ceph_flush_cap_releases(mdsc, session);
222462306a36Sopenharmony_ci	return 0;
222562306a36Sopenharmony_ci}
222662306a36Sopenharmony_ci
222762306a36Sopenharmony_cistatic int check_caps_flush(struct ceph_mds_client *mdsc,
222862306a36Sopenharmony_ci			    u64 want_flush_tid)
222962306a36Sopenharmony_ci{
223062306a36Sopenharmony_ci	int ret = 1;
223162306a36Sopenharmony_ci
223262306a36Sopenharmony_ci	spin_lock(&mdsc->cap_dirty_lock);
223362306a36Sopenharmony_ci	if (!list_empty(&mdsc->cap_flush_list)) {
223462306a36Sopenharmony_ci		struct ceph_cap_flush *cf =
223562306a36Sopenharmony_ci			list_first_entry(&mdsc->cap_flush_list,
223662306a36Sopenharmony_ci					 struct ceph_cap_flush, g_list);
223762306a36Sopenharmony_ci		if (cf->tid <= want_flush_tid) {
223862306a36Sopenharmony_ci			dout("check_caps_flush still flushing tid "
223962306a36Sopenharmony_ci			     "%llu <= %llu\n", cf->tid, want_flush_tid);
224062306a36Sopenharmony_ci			ret = 0;
224162306a36Sopenharmony_ci		}
224262306a36Sopenharmony_ci	}
224362306a36Sopenharmony_ci	spin_unlock(&mdsc->cap_dirty_lock);
224462306a36Sopenharmony_ci	return ret;
224562306a36Sopenharmony_ci}
224662306a36Sopenharmony_ci
224762306a36Sopenharmony_ci/*
224862306a36Sopenharmony_ci * flush all dirty inode data to disk.
224962306a36Sopenharmony_ci *
225062306a36Sopenharmony_ci * returns true if we've flushed through want_flush_tid
225162306a36Sopenharmony_ci */
225262306a36Sopenharmony_cistatic void wait_caps_flush(struct ceph_mds_client *mdsc,
225362306a36Sopenharmony_ci			    u64 want_flush_tid)
225462306a36Sopenharmony_ci{
225562306a36Sopenharmony_ci	dout("check_caps_flush want %llu\n", want_flush_tid);
225662306a36Sopenharmony_ci
225762306a36Sopenharmony_ci	wait_event(mdsc->cap_flushing_wq,
225862306a36Sopenharmony_ci		   check_caps_flush(mdsc, want_flush_tid));
225962306a36Sopenharmony_ci
226062306a36Sopenharmony_ci	dout("check_caps_flush ok, flushed thru %llu\n", want_flush_tid);
226162306a36Sopenharmony_ci}
226262306a36Sopenharmony_ci
226362306a36Sopenharmony_ci/*
226462306a36Sopenharmony_ci * called under s_mutex
226562306a36Sopenharmony_ci */
226662306a36Sopenharmony_cistatic void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
226762306a36Sopenharmony_ci				   struct ceph_mds_session *session)
226862306a36Sopenharmony_ci{
226962306a36Sopenharmony_ci	struct ceph_msg *msg = NULL;
227062306a36Sopenharmony_ci	struct ceph_mds_cap_release *head;
227162306a36Sopenharmony_ci	struct ceph_mds_cap_item *item;
227262306a36Sopenharmony_ci	struct ceph_osd_client *osdc = &mdsc->fsc->client->osdc;
227362306a36Sopenharmony_ci	struct ceph_cap *cap;
227462306a36Sopenharmony_ci	LIST_HEAD(tmp_list);
227562306a36Sopenharmony_ci	int num_cap_releases;
227662306a36Sopenharmony_ci	__le32	barrier, *cap_barrier;
227762306a36Sopenharmony_ci
227862306a36Sopenharmony_ci	down_read(&osdc->lock);
227962306a36Sopenharmony_ci	barrier = cpu_to_le32(osdc->epoch_barrier);
228062306a36Sopenharmony_ci	up_read(&osdc->lock);
228162306a36Sopenharmony_ci
228262306a36Sopenharmony_ci	spin_lock(&session->s_cap_lock);
228362306a36Sopenharmony_ciagain:
228462306a36Sopenharmony_ci	list_splice_init(&session->s_cap_releases, &tmp_list);
228562306a36Sopenharmony_ci	num_cap_releases = session->s_num_cap_releases;
228662306a36Sopenharmony_ci	session->s_num_cap_releases = 0;
228762306a36Sopenharmony_ci	spin_unlock(&session->s_cap_lock);
228862306a36Sopenharmony_ci
228962306a36Sopenharmony_ci	while (!list_empty(&tmp_list)) {
229062306a36Sopenharmony_ci		if (!msg) {
229162306a36Sopenharmony_ci			msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE,
229262306a36Sopenharmony_ci					PAGE_SIZE, GFP_NOFS, false);
229362306a36Sopenharmony_ci			if (!msg)
229462306a36Sopenharmony_ci				goto out_err;
229562306a36Sopenharmony_ci			head = msg->front.iov_base;
229662306a36Sopenharmony_ci			head->num = cpu_to_le32(0);
229762306a36Sopenharmony_ci			msg->front.iov_len = sizeof(*head);
229862306a36Sopenharmony_ci
229962306a36Sopenharmony_ci			msg->hdr.version = cpu_to_le16(2);
230062306a36Sopenharmony_ci			msg->hdr.compat_version = cpu_to_le16(1);
230162306a36Sopenharmony_ci		}
230262306a36Sopenharmony_ci
230362306a36Sopenharmony_ci		cap = list_first_entry(&tmp_list, struct ceph_cap,
230462306a36Sopenharmony_ci					session_caps);
230562306a36Sopenharmony_ci		list_del(&cap->session_caps);
230662306a36Sopenharmony_ci		num_cap_releases--;
230762306a36Sopenharmony_ci
230862306a36Sopenharmony_ci		head = msg->front.iov_base;
230962306a36Sopenharmony_ci		put_unaligned_le32(get_unaligned_le32(&head->num) + 1,
231062306a36Sopenharmony_ci				   &head->num);
231162306a36Sopenharmony_ci		item = msg->front.iov_base + msg->front.iov_len;
231262306a36Sopenharmony_ci		item->ino = cpu_to_le64(cap->cap_ino);
231362306a36Sopenharmony_ci		item->cap_id = cpu_to_le64(cap->cap_id);
231462306a36Sopenharmony_ci		item->migrate_seq = cpu_to_le32(cap->mseq);
231562306a36Sopenharmony_ci		item->seq = cpu_to_le32(cap->issue_seq);
231662306a36Sopenharmony_ci		msg->front.iov_len += sizeof(*item);
231762306a36Sopenharmony_ci
231862306a36Sopenharmony_ci		ceph_put_cap(mdsc, cap);
231962306a36Sopenharmony_ci
232062306a36Sopenharmony_ci		if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) {
232162306a36Sopenharmony_ci			// Append cap_barrier field
232262306a36Sopenharmony_ci			cap_barrier = msg->front.iov_base + msg->front.iov_len;
232362306a36Sopenharmony_ci			*cap_barrier = barrier;
232462306a36Sopenharmony_ci			msg->front.iov_len += sizeof(*cap_barrier);
232562306a36Sopenharmony_ci
232662306a36Sopenharmony_ci			msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
232762306a36Sopenharmony_ci			dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
232862306a36Sopenharmony_ci			ceph_con_send(&session->s_con, msg);
232962306a36Sopenharmony_ci			msg = NULL;
233062306a36Sopenharmony_ci		}
233162306a36Sopenharmony_ci	}
233262306a36Sopenharmony_ci
233362306a36Sopenharmony_ci	BUG_ON(num_cap_releases != 0);
233462306a36Sopenharmony_ci
233562306a36Sopenharmony_ci	spin_lock(&session->s_cap_lock);
233662306a36Sopenharmony_ci	if (!list_empty(&session->s_cap_releases))
233762306a36Sopenharmony_ci		goto again;
233862306a36Sopenharmony_ci	spin_unlock(&session->s_cap_lock);
233962306a36Sopenharmony_ci
234062306a36Sopenharmony_ci	if (msg) {
234162306a36Sopenharmony_ci		// Append cap_barrier field
234262306a36Sopenharmony_ci		cap_barrier = msg->front.iov_base + msg->front.iov_len;
234362306a36Sopenharmony_ci		*cap_barrier = barrier;
234462306a36Sopenharmony_ci		msg->front.iov_len += sizeof(*cap_barrier);
234562306a36Sopenharmony_ci
234662306a36Sopenharmony_ci		msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
234762306a36Sopenharmony_ci		dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
234862306a36Sopenharmony_ci		ceph_con_send(&session->s_con, msg);
234962306a36Sopenharmony_ci	}
235062306a36Sopenharmony_ci	return;
235162306a36Sopenharmony_ciout_err:
235262306a36Sopenharmony_ci	pr_err("send_cap_releases mds%d, failed to allocate message\n",
235362306a36Sopenharmony_ci		session->s_mds);
235462306a36Sopenharmony_ci	spin_lock(&session->s_cap_lock);
235562306a36Sopenharmony_ci	list_splice(&tmp_list, &session->s_cap_releases);
235662306a36Sopenharmony_ci	session->s_num_cap_releases += num_cap_releases;
235762306a36Sopenharmony_ci	spin_unlock(&session->s_cap_lock);
235862306a36Sopenharmony_ci}
235962306a36Sopenharmony_ci
236062306a36Sopenharmony_cistatic void ceph_cap_release_work(struct work_struct *work)
236162306a36Sopenharmony_ci{
236262306a36Sopenharmony_ci	struct ceph_mds_session *session =
236362306a36Sopenharmony_ci		container_of(work, struct ceph_mds_session, s_cap_release_work);
236462306a36Sopenharmony_ci
236562306a36Sopenharmony_ci	mutex_lock(&session->s_mutex);
236662306a36Sopenharmony_ci	if (session->s_state == CEPH_MDS_SESSION_OPEN ||
236762306a36Sopenharmony_ci	    session->s_state == CEPH_MDS_SESSION_HUNG)
236862306a36Sopenharmony_ci		ceph_send_cap_releases(session->s_mdsc, session);
236962306a36Sopenharmony_ci	mutex_unlock(&session->s_mutex);
237062306a36Sopenharmony_ci	ceph_put_mds_session(session);
237162306a36Sopenharmony_ci}
237262306a36Sopenharmony_ci
237362306a36Sopenharmony_civoid ceph_flush_cap_releases(struct ceph_mds_client *mdsc,
237462306a36Sopenharmony_ci		             struct ceph_mds_session *session)
237562306a36Sopenharmony_ci{
237662306a36Sopenharmony_ci	if (mdsc->stopping)
237762306a36Sopenharmony_ci		return;
237862306a36Sopenharmony_ci
237962306a36Sopenharmony_ci	ceph_get_mds_session(session);
238062306a36Sopenharmony_ci	if (queue_work(mdsc->fsc->cap_wq,
238162306a36Sopenharmony_ci		       &session->s_cap_release_work)) {
238262306a36Sopenharmony_ci		dout("cap release work queued\n");
238362306a36Sopenharmony_ci	} else {
238462306a36Sopenharmony_ci		ceph_put_mds_session(session);
238562306a36Sopenharmony_ci		dout("failed to queue cap release work\n");
238662306a36Sopenharmony_ci	}
238762306a36Sopenharmony_ci}
238862306a36Sopenharmony_ci
238962306a36Sopenharmony_ci/*
239062306a36Sopenharmony_ci * caller holds session->s_cap_lock
239162306a36Sopenharmony_ci */
239262306a36Sopenharmony_civoid __ceph_queue_cap_release(struct ceph_mds_session *session,
239362306a36Sopenharmony_ci			      struct ceph_cap *cap)
239462306a36Sopenharmony_ci{
239562306a36Sopenharmony_ci	list_add_tail(&cap->session_caps, &session->s_cap_releases);
239662306a36Sopenharmony_ci	session->s_num_cap_releases++;
239762306a36Sopenharmony_ci
239862306a36Sopenharmony_ci	if (!(session->s_num_cap_releases % CEPH_CAPS_PER_RELEASE))
239962306a36Sopenharmony_ci		ceph_flush_cap_releases(session->s_mdsc, session);
240062306a36Sopenharmony_ci}
240162306a36Sopenharmony_ci
240262306a36Sopenharmony_cistatic void ceph_cap_reclaim_work(struct work_struct *work)
240362306a36Sopenharmony_ci{
240462306a36Sopenharmony_ci	struct ceph_mds_client *mdsc =
240562306a36Sopenharmony_ci		container_of(work, struct ceph_mds_client, cap_reclaim_work);
240662306a36Sopenharmony_ci	int ret = ceph_trim_dentries(mdsc);
240762306a36Sopenharmony_ci	if (ret == -EAGAIN)
240862306a36Sopenharmony_ci		ceph_queue_cap_reclaim_work(mdsc);
240962306a36Sopenharmony_ci}
241062306a36Sopenharmony_ci
241162306a36Sopenharmony_civoid ceph_queue_cap_reclaim_work(struct ceph_mds_client *mdsc)
241262306a36Sopenharmony_ci{
241362306a36Sopenharmony_ci	if (mdsc->stopping)
241462306a36Sopenharmony_ci		return;
241562306a36Sopenharmony_ci
241662306a36Sopenharmony_ci        if (queue_work(mdsc->fsc->cap_wq, &mdsc->cap_reclaim_work)) {
241762306a36Sopenharmony_ci                dout("caps reclaim work queued\n");
241862306a36Sopenharmony_ci        } else {
241962306a36Sopenharmony_ci                dout("failed to queue caps release work\n");
242062306a36Sopenharmony_ci        }
242162306a36Sopenharmony_ci}
242262306a36Sopenharmony_ci
242362306a36Sopenharmony_civoid ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr)
242462306a36Sopenharmony_ci{
242562306a36Sopenharmony_ci	int val;
242662306a36Sopenharmony_ci	if (!nr)
242762306a36Sopenharmony_ci		return;
242862306a36Sopenharmony_ci	val = atomic_add_return(nr, &mdsc->cap_reclaim_pending);
242962306a36Sopenharmony_ci	if ((val % CEPH_CAPS_PER_RELEASE) < nr) {
243062306a36Sopenharmony_ci		atomic_set(&mdsc->cap_reclaim_pending, 0);
243162306a36Sopenharmony_ci		ceph_queue_cap_reclaim_work(mdsc);
243262306a36Sopenharmony_ci	}
243362306a36Sopenharmony_ci}
243462306a36Sopenharmony_ci
243562306a36Sopenharmony_ci/*
243662306a36Sopenharmony_ci * requests
243762306a36Sopenharmony_ci */
243862306a36Sopenharmony_ci
243962306a36Sopenharmony_ciint ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
244062306a36Sopenharmony_ci				    struct inode *dir)
244162306a36Sopenharmony_ci{
244262306a36Sopenharmony_ci	struct ceph_inode_info *ci = ceph_inode(dir);
244362306a36Sopenharmony_ci	struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
244462306a36Sopenharmony_ci	struct ceph_mount_options *opt = req->r_mdsc->fsc->mount_options;
244562306a36Sopenharmony_ci	size_t size = sizeof(struct ceph_mds_reply_dir_entry);
244662306a36Sopenharmony_ci	unsigned int num_entries;
244762306a36Sopenharmony_ci	int order;
244862306a36Sopenharmony_ci
244962306a36Sopenharmony_ci	spin_lock(&ci->i_ceph_lock);
245062306a36Sopenharmony_ci	num_entries = ci->i_files + ci->i_subdirs;
245162306a36Sopenharmony_ci	spin_unlock(&ci->i_ceph_lock);
245262306a36Sopenharmony_ci	num_entries = max(num_entries, 1U);
245362306a36Sopenharmony_ci	num_entries = min(num_entries, opt->max_readdir);
245462306a36Sopenharmony_ci
245562306a36Sopenharmony_ci	order = get_order(size * num_entries);
245662306a36Sopenharmony_ci	while (order >= 0) {
245762306a36Sopenharmony_ci		rinfo->dir_entries = (void*)__get_free_pages(GFP_KERNEL |
245862306a36Sopenharmony_ci							     __GFP_NOWARN |
245962306a36Sopenharmony_ci							     __GFP_ZERO,
246062306a36Sopenharmony_ci							     order);
246162306a36Sopenharmony_ci		if (rinfo->dir_entries)
246262306a36Sopenharmony_ci			break;
246362306a36Sopenharmony_ci		order--;
246462306a36Sopenharmony_ci	}
246562306a36Sopenharmony_ci	if (!rinfo->dir_entries)
246662306a36Sopenharmony_ci		return -ENOMEM;
246762306a36Sopenharmony_ci
246862306a36Sopenharmony_ci	num_entries = (PAGE_SIZE << order) / size;
246962306a36Sopenharmony_ci	num_entries = min(num_entries, opt->max_readdir);
247062306a36Sopenharmony_ci
247162306a36Sopenharmony_ci	rinfo->dir_buf_size = PAGE_SIZE << order;
247262306a36Sopenharmony_ci	req->r_num_caps = num_entries + 1;
247362306a36Sopenharmony_ci	req->r_args.readdir.max_entries = cpu_to_le32(num_entries);
247462306a36Sopenharmony_ci	req->r_args.readdir.max_bytes = cpu_to_le32(opt->max_readdir_bytes);
247562306a36Sopenharmony_ci	return 0;
247662306a36Sopenharmony_ci}
247762306a36Sopenharmony_ci
247862306a36Sopenharmony_ci/*
247962306a36Sopenharmony_ci * Create an mds request.
248062306a36Sopenharmony_ci */
248162306a36Sopenharmony_cistruct ceph_mds_request *
248262306a36Sopenharmony_ciceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
248362306a36Sopenharmony_ci{
248462306a36Sopenharmony_ci	struct ceph_mds_request *req;
248562306a36Sopenharmony_ci
248662306a36Sopenharmony_ci	req = kmem_cache_zalloc(ceph_mds_request_cachep, GFP_NOFS);
248762306a36Sopenharmony_ci	if (!req)
248862306a36Sopenharmony_ci		return ERR_PTR(-ENOMEM);
248962306a36Sopenharmony_ci
249062306a36Sopenharmony_ci	mutex_init(&req->r_fill_mutex);
249162306a36Sopenharmony_ci	req->r_mdsc = mdsc;
249262306a36Sopenharmony_ci	req->r_started = jiffies;
249362306a36Sopenharmony_ci	req->r_start_latency = ktime_get();
249462306a36Sopenharmony_ci	req->r_resend_mds = -1;
249562306a36Sopenharmony_ci	INIT_LIST_HEAD(&req->r_unsafe_dir_item);
249662306a36Sopenharmony_ci	INIT_LIST_HEAD(&req->r_unsafe_target_item);
249762306a36Sopenharmony_ci	req->r_fmode = -1;
249862306a36Sopenharmony_ci	req->r_feature_needed = -1;
249962306a36Sopenharmony_ci	kref_init(&req->r_kref);
250062306a36Sopenharmony_ci	RB_CLEAR_NODE(&req->r_node);
250162306a36Sopenharmony_ci	INIT_LIST_HEAD(&req->r_wait);
250262306a36Sopenharmony_ci	init_completion(&req->r_completion);
250362306a36Sopenharmony_ci	init_completion(&req->r_safe_completion);
250462306a36Sopenharmony_ci	INIT_LIST_HEAD(&req->r_unsafe_item);
250562306a36Sopenharmony_ci
250662306a36Sopenharmony_ci	ktime_get_coarse_real_ts64(&req->r_stamp);
250762306a36Sopenharmony_ci
250862306a36Sopenharmony_ci	req->r_op = op;
250962306a36Sopenharmony_ci	req->r_direct_mode = mode;
251062306a36Sopenharmony_ci	return req;
251162306a36Sopenharmony_ci}
251262306a36Sopenharmony_ci
251362306a36Sopenharmony_ci/*
251462306a36Sopenharmony_ci * return oldest (lowest) request, tid in request tree, 0 if none.
251562306a36Sopenharmony_ci *
251662306a36Sopenharmony_ci * called under mdsc->mutex.
251762306a36Sopenharmony_ci */
251862306a36Sopenharmony_cistatic struct ceph_mds_request *__get_oldest_req(struct ceph_mds_client *mdsc)
251962306a36Sopenharmony_ci{
252062306a36Sopenharmony_ci	if (RB_EMPTY_ROOT(&mdsc->request_tree))
252162306a36Sopenharmony_ci		return NULL;
252262306a36Sopenharmony_ci	return rb_entry(rb_first(&mdsc->request_tree),
252362306a36Sopenharmony_ci			struct ceph_mds_request, r_node);
252462306a36Sopenharmony_ci}
252562306a36Sopenharmony_ci
252662306a36Sopenharmony_cistatic inline  u64 __get_oldest_tid(struct ceph_mds_client *mdsc)
252762306a36Sopenharmony_ci{
252862306a36Sopenharmony_ci	return mdsc->oldest_tid;
252962306a36Sopenharmony_ci}
253062306a36Sopenharmony_ci
253162306a36Sopenharmony_ci#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
253262306a36Sopenharmony_cistatic u8 *get_fscrypt_altname(const struct ceph_mds_request *req, u32 *plen)
253362306a36Sopenharmony_ci{
253462306a36Sopenharmony_ci	struct inode *dir = req->r_parent;
253562306a36Sopenharmony_ci	struct dentry *dentry = req->r_dentry;
253662306a36Sopenharmony_ci	u8 *cryptbuf = NULL;
253762306a36Sopenharmony_ci	u32 len = 0;
253862306a36Sopenharmony_ci	int ret = 0;
253962306a36Sopenharmony_ci
254062306a36Sopenharmony_ci	/* only encode if we have parent and dentry */
254162306a36Sopenharmony_ci	if (!dir || !dentry)
254262306a36Sopenharmony_ci		goto success;
254362306a36Sopenharmony_ci
254462306a36Sopenharmony_ci	/* No-op unless this is encrypted */
254562306a36Sopenharmony_ci	if (!IS_ENCRYPTED(dir))
254662306a36Sopenharmony_ci		goto success;
254762306a36Sopenharmony_ci
254862306a36Sopenharmony_ci	ret = ceph_fscrypt_prepare_readdir(dir);
254962306a36Sopenharmony_ci	if (ret < 0)
255062306a36Sopenharmony_ci		return ERR_PTR(ret);
255162306a36Sopenharmony_ci
255262306a36Sopenharmony_ci	/* No key? Just ignore it. */
255362306a36Sopenharmony_ci	if (!fscrypt_has_encryption_key(dir))
255462306a36Sopenharmony_ci		goto success;
255562306a36Sopenharmony_ci
255662306a36Sopenharmony_ci	if (!fscrypt_fname_encrypted_size(dir, dentry->d_name.len, NAME_MAX,
255762306a36Sopenharmony_ci					  &len)) {
255862306a36Sopenharmony_ci		WARN_ON_ONCE(1);
255962306a36Sopenharmony_ci		return ERR_PTR(-ENAMETOOLONG);
256062306a36Sopenharmony_ci	}
256162306a36Sopenharmony_ci
256262306a36Sopenharmony_ci	/* No need to append altname if name is short enough */
256362306a36Sopenharmony_ci	if (len <= CEPH_NOHASH_NAME_MAX) {
256462306a36Sopenharmony_ci		len = 0;
256562306a36Sopenharmony_ci		goto success;
256662306a36Sopenharmony_ci	}
256762306a36Sopenharmony_ci
256862306a36Sopenharmony_ci	cryptbuf = kmalloc(len, GFP_KERNEL);
256962306a36Sopenharmony_ci	if (!cryptbuf)
257062306a36Sopenharmony_ci		return ERR_PTR(-ENOMEM);
257162306a36Sopenharmony_ci
257262306a36Sopenharmony_ci	ret = fscrypt_fname_encrypt(dir, &dentry->d_name, cryptbuf, len);
257362306a36Sopenharmony_ci	if (ret) {
257462306a36Sopenharmony_ci		kfree(cryptbuf);
257562306a36Sopenharmony_ci		return ERR_PTR(ret);
257662306a36Sopenharmony_ci	}
257762306a36Sopenharmony_cisuccess:
257862306a36Sopenharmony_ci	*plen = len;
257962306a36Sopenharmony_ci	return cryptbuf;
258062306a36Sopenharmony_ci}
258162306a36Sopenharmony_ci#else
258262306a36Sopenharmony_cistatic u8 *get_fscrypt_altname(const struct ceph_mds_request *req, u32 *plen)
258362306a36Sopenharmony_ci{
258462306a36Sopenharmony_ci	*plen = 0;
258562306a36Sopenharmony_ci	return NULL;
258662306a36Sopenharmony_ci}
258762306a36Sopenharmony_ci#endif
258862306a36Sopenharmony_ci
258962306a36Sopenharmony_ci/**
259062306a36Sopenharmony_ci * ceph_mdsc_build_path - build a path string to a given dentry
259162306a36Sopenharmony_ci * @dentry: dentry to which path should be built
259262306a36Sopenharmony_ci * @plen: returned length of string
259362306a36Sopenharmony_ci * @pbase: returned base inode number
259462306a36Sopenharmony_ci * @for_wire: is this path going to be sent to the MDS?
259562306a36Sopenharmony_ci *
259662306a36Sopenharmony_ci * Build a string that represents the path to the dentry. This is mostly called
259762306a36Sopenharmony_ci * for two different purposes:
259862306a36Sopenharmony_ci *
259962306a36Sopenharmony_ci * 1) we need to build a path string to send to the MDS (for_wire == true)
260062306a36Sopenharmony_ci * 2) we need a path string for local presentation (e.g. debugfs)
260162306a36Sopenharmony_ci *    (for_wire == false)
260262306a36Sopenharmony_ci *
260362306a36Sopenharmony_ci * The path is built in reverse, starting with the dentry. Walk back up toward
260462306a36Sopenharmony_ci * the root, building the path until the first non-snapped inode is reached
260562306a36Sopenharmony_ci * (for_wire) or the root inode is reached (!for_wire).
260662306a36Sopenharmony_ci *
260762306a36Sopenharmony_ci * Encode hidden .snap dirs as a double /, i.e.
260862306a36Sopenharmony_ci *   foo/.snap/bar -> foo//bar
260962306a36Sopenharmony_ci */
261062306a36Sopenharmony_cichar *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *pbase,
261162306a36Sopenharmony_ci			   int for_wire)
261262306a36Sopenharmony_ci{
261362306a36Sopenharmony_ci	struct dentry *cur;
261462306a36Sopenharmony_ci	struct inode *inode;
261562306a36Sopenharmony_ci	char *path;
261662306a36Sopenharmony_ci	int pos;
261762306a36Sopenharmony_ci	unsigned seq;
261862306a36Sopenharmony_ci	u64 base;
261962306a36Sopenharmony_ci
262062306a36Sopenharmony_ci	if (!dentry)
262162306a36Sopenharmony_ci		return ERR_PTR(-EINVAL);
262262306a36Sopenharmony_ci
262362306a36Sopenharmony_ci	path = __getname();
262462306a36Sopenharmony_ci	if (!path)
262562306a36Sopenharmony_ci		return ERR_PTR(-ENOMEM);
262662306a36Sopenharmony_ciretry:
262762306a36Sopenharmony_ci	pos = PATH_MAX - 1;
262862306a36Sopenharmony_ci	path[pos] = '\0';
262962306a36Sopenharmony_ci
263062306a36Sopenharmony_ci	seq = read_seqbegin(&rename_lock);
263162306a36Sopenharmony_ci	cur = dget(dentry);
263262306a36Sopenharmony_ci	for (;;) {
263362306a36Sopenharmony_ci		struct dentry *parent;
263462306a36Sopenharmony_ci
263562306a36Sopenharmony_ci		spin_lock(&cur->d_lock);
263662306a36Sopenharmony_ci		inode = d_inode(cur);
263762306a36Sopenharmony_ci		if (inode && ceph_snap(inode) == CEPH_SNAPDIR) {
263862306a36Sopenharmony_ci			dout("build_path path+%d: %p SNAPDIR\n",
263962306a36Sopenharmony_ci			     pos, cur);
264062306a36Sopenharmony_ci			spin_unlock(&cur->d_lock);
264162306a36Sopenharmony_ci			parent = dget_parent(cur);
264262306a36Sopenharmony_ci		} else if (for_wire && inode && dentry != cur &&
264362306a36Sopenharmony_ci			   ceph_snap(inode) == CEPH_NOSNAP) {
264462306a36Sopenharmony_ci			spin_unlock(&cur->d_lock);
264562306a36Sopenharmony_ci			pos++; /* get rid of any prepended '/' */
264662306a36Sopenharmony_ci			break;
264762306a36Sopenharmony_ci		} else if (!for_wire || !IS_ENCRYPTED(d_inode(cur->d_parent))) {
264862306a36Sopenharmony_ci			pos -= cur->d_name.len;
264962306a36Sopenharmony_ci			if (pos < 0) {
265062306a36Sopenharmony_ci				spin_unlock(&cur->d_lock);
265162306a36Sopenharmony_ci				break;
265262306a36Sopenharmony_ci			}
265362306a36Sopenharmony_ci			memcpy(path + pos, cur->d_name.name, cur->d_name.len);
265462306a36Sopenharmony_ci			spin_unlock(&cur->d_lock);
265562306a36Sopenharmony_ci			parent = dget_parent(cur);
265662306a36Sopenharmony_ci		} else {
265762306a36Sopenharmony_ci			int len, ret;
265862306a36Sopenharmony_ci			char buf[NAME_MAX];
265962306a36Sopenharmony_ci
266062306a36Sopenharmony_ci			/*
266162306a36Sopenharmony_ci			 * Proactively copy name into buf, in case we need to
266262306a36Sopenharmony_ci			 * present it as-is.
266362306a36Sopenharmony_ci			 */
266462306a36Sopenharmony_ci			memcpy(buf, cur->d_name.name, cur->d_name.len);
266562306a36Sopenharmony_ci			len = cur->d_name.len;
266662306a36Sopenharmony_ci			spin_unlock(&cur->d_lock);
266762306a36Sopenharmony_ci			parent = dget_parent(cur);
266862306a36Sopenharmony_ci
266962306a36Sopenharmony_ci			ret = ceph_fscrypt_prepare_readdir(d_inode(parent));
267062306a36Sopenharmony_ci			if (ret < 0) {
267162306a36Sopenharmony_ci				dput(parent);
267262306a36Sopenharmony_ci				dput(cur);
267362306a36Sopenharmony_ci				return ERR_PTR(ret);
267462306a36Sopenharmony_ci			}
267562306a36Sopenharmony_ci
267662306a36Sopenharmony_ci			if (fscrypt_has_encryption_key(d_inode(parent))) {
267762306a36Sopenharmony_ci				len = ceph_encode_encrypted_fname(d_inode(parent),
267862306a36Sopenharmony_ci								  cur, buf);
267962306a36Sopenharmony_ci				if (len < 0) {
268062306a36Sopenharmony_ci					dput(parent);
268162306a36Sopenharmony_ci					dput(cur);
268262306a36Sopenharmony_ci					return ERR_PTR(len);
268362306a36Sopenharmony_ci				}
268462306a36Sopenharmony_ci			}
268562306a36Sopenharmony_ci			pos -= len;
268662306a36Sopenharmony_ci			if (pos < 0) {
268762306a36Sopenharmony_ci				dput(parent);
268862306a36Sopenharmony_ci				break;
268962306a36Sopenharmony_ci			}
269062306a36Sopenharmony_ci			memcpy(path + pos, buf, len);
269162306a36Sopenharmony_ci		}
269262306a36Sopenharmony_ci		dput(cur);
269362306a36Sopenharmony_ci		cur = parent;
269462306a36Sopenharmony_ci
269562306a36Sopenharmony_ci		/* Are we at the root? */
269662306a36Sopenharmony_ci		if (IS_ROOT(cur))
269762306a36Sopenharmony_ci			break;
269862306a36Sopenharmony_ci
269962306a36Sopenharmony_ci		/* Are we out of buffer? */
270062306a36Sopenharmony_ci		if (--pos < 0)
270162306a36Sopenharmony_ci			break;
270262306a36Sopenharmony_ci
270362306a36Sopenharmony_ci		path[pos] = '/';
270462306a36Sopenharmony_ci	}
270562306a36Sopenharmony_ci	inode = d_inode(cur);
270662306a36Sopenharmony_ci	base = inode ? ceph_ino(inode) : 0;
270762306a36Sopenharmony_ci	dput(cur);
270862306a36Sopenharmony_ci
270962306a36Sopenharmony_ci	if (read_seqretry(&rename_lock, seq))
271062306a36Sopenharmony_ci		goto retry;
271162306a36Sopenharmony_ci
271262306a36Sopenharmony_ci	if (pos < 0) {
271362306a36Sopenharmony_ci		/*
271462306a36Sopenharmony_ci		 * A rename didn't occur, but somehow we didn't end up where
271562306a36Sopenharmony_ci		 * we thought we would. Throw a warning and try again.
271662306a36Sopenharmony_ci		 */
271762306a36Sopenharmony_ci		pr_warn("build_path did not end path lookup where expected (pos = %d)\n",
271862306a36Sopenharmony_ci			pos);
271962306a36Sopenharmony_ci		goto retry;
272062306a36Sopenharmony_ci	}
272162306a36Sopenharmony_ci
272262306a36Sopenharmony_ci	*pbase = base;
272362306a36Sopenharmony_ci	*plen = PATH_MAX - 1 - pos;
272462306a36Sopenharmony_ci	dout("build_path on %p %d built %llx '%.*s'\n",
272562306a36Sopenharmony_ci	     dentry, d_count(dentry), base, *plen, path + pos);
272662306a36Sopenharmony_ci	return path + pos;
272762306a36Sopenharmony_ci}
272862306a36Sopenharmony_ci
272962306a36Sopenharmony_cistatic int build_dentry_path(struct dentry *dentry, struct inode *dir,
273062306a36Sopenharmony_ci			     const char **ppath, int *ppathlen, u64 *pino,
273162306a36Sopenharmony_ci			     bool *pfreepath, bool parent_locked)
273262306a36Sopenharmony_ci{
273362306a36Sopenharmony_ci	char *path;
273462306a36Sopenharmony_ci
273562306a36Sopenharmony_ci	rcu_read_lock();
273662306a36Sopenharmony_ci	if (!dir)
273762306a36Sopenharmony_ci		dir = d_inode_rcu(dentry->d_parent);
273862306a36Sopenharmony_ci	if (dir && parent_locked && ceph_snap(dir) == CEPH_NOSNAP &&
273962306a36Sopenharmony_ci	    !IS_ENCRYPTED(dir)) {
274062306a36Sopenharmony_ci		*pino = ceph_ino(dir);
274162306a36Sopenharmony_ci		rcu_read_unlock();
274262306a36Sopenharmony_ci		*ppath = dentry->d_name.name;
274362306a36Sopenharmony_ci		*ppathlen = dentry->d_name.len;
274462306a36Sopenharmony_ci		return 0;
274562306a36Sopenharmony_ci	}
274662306a36Sopenharmony_ci	rcu_read_unlock();
274762306a36Sopenharmony_ci	path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
274862306a36Sopenharmony_ci	if (IS_ERR(path))
274962306a36Sopenharmony_ci		return PTR_ERR(path);
275062306a36Sopenharmony_ci	*ppath = path;
275162306a36Sopenharmony_ci	*pfreepath = true;
275262306a36Sopenharmony_ci	return 0;
275362306a36Sopenharmony_ci}
275462306a36Sopenharmony_ci
275562306a36Sopenharmony_cistatic int build_inode_path(struct inode *inode,
275662306a36Sopenharmony_ci			    const char **ppath, int *ppathlen, u64 *pino,
275762306a36Sopenharmony_ci			    bool *pfreepath)
275862306a36Sopenharmony_ci{
275962306a36Sopenharmony_ci	struct dentry *dentry;
276062306a36Sopenharmony_ci	char *path;
276162306a36Sopenharmony_ci
276262306a36Sopenharmony_ci	if (ceph_snap(inode) == CEPH_NOSNAP) {
276362306a36Sopenharmony_ci		*pino = ceph_ino(inode);
276462306a36Sopenharmony_ci		*ppathlen = 0;
276562306a36Sopenharmony_ci		return 0;
276662306a36Sopenharmony_ci	}
276762306a36Sopenharmony_ci	dentry = d_find_alias(inode);
276862306a36Sopenharmony_ci	path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
276962306a36Sopenharmony_ci	dput(dentry);
277062306a36Sopenharmony_ci	if (IS_ERR(path))
277162306a36Sopenharmony_ci		return PTR_ERR(path);
277262306a36Sopenharmony_ci	*ppath = path;
277362306a36Sopenharmony_ci	*pfreepath = true;
277462306a36Sopenharmony_ci	return 0;
277562306a36Sopenharmony_ci}
277662306a36Sopenharmony_ci
277762306a36Sopenharmony_ci/*
277862306a36Sopenharmony_ci * request arguments may be specified via an inode *, a dentry *, or
277962306a36Sopenharmony_ci * an explicit ino+path.
278062306a36Sopenharmony_ci */
278162306a36Sopenharmony_cistatic int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
278262306a36Sopenharmony_ci				  struct inode *rdiri, const char *rpath,
278362306a36Sopenharmony_ci				  u64 rino, const char **ppath, int *pathlen,
278462306a36Sopenharmony_ci				  u64 *ino, bool *freepath, bool parent_locked)
278562306a36Sopenharmony_ci{
278662306a36Sopenharmony_ci	int r = 0;
278762306a36Sopenharmony_ci
278862306a36Sopenharmony_ci	if (rinode) {
278962306a36Sopenharmony_ci		r = build_inode_path(rinode, ppath, pathlen, ino, freepath);
279062306a36Sopenharmony_ci		dout(" inode %p %llx.%llx\n", rinode, ceph_ino(rinode),
279162306a36Sopenharmony_ci		     ceph_snap(rinode));
279262306a36Sopenharmony_ci	} else if (rdentry) {
279362306a36Sopenharmony_ci		r = build_dentry_path(rdentry, rdiri, ppath, pathlen, ino,
279462306a36Sopenharmony_ci					freepath, parent_locked);
279562306a36Sopenharmony_ci		dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen,
279662306a36Sopenharmony_ci		     *ppath);
279762306a36Sopenharmony_ci	} else if (rpath || rino) {
279862306a36Sopenharmony_ci		*ino = rino;
279962306a36Sopenharmony_ci		*ppath = rpath;
280062306a36Sopenharmony_ci		*pathlen = rpath ? strlen(rpath) : 0;
280162306a36Sopenharmony_ci		dout(" path %.*s\n", *pathlen, rpath);
280262306a36Sopenharmony_ci	}
280362306a36Sopenharmony_ci
280462306a36Sopenharmony_ci	return r;
280562306a36Sopenharmony_ci}
280662306a36Sopenharmony_ci
280762306a36Sopenharmony_cistatic void encode_mclientrequest_tail(void **p,
280862306a36Sopenharmony_ci				       const struct ceph_mds_request *req)
280962306a36Sopenharmony_ci{
281062306a36Sopenharmony_ci	struct ceph_timespec ts;
281162306a36Sopenharmony_ci	int i;
281262306a36Sopenharmony_ci
281362306a36Sopenharmony_ci	ceph_encode_timespec64(&ts, &req->r_stamp);
281462306a36Sopenharmony_ci	ceph_encode_copy(p, &ts, sizeof(ts));
281562306a36Sopenharmony_ci
281662306a36Sopenharmony_ci	/* v4: gid_list */
281762306a36Sopenharmony_ci	ceph_encode_32(p, req->r_cred->group_info->ngroups);
281862306a36Sopenharmony_ci	for (i = 0; i < req->r_cred->group_info->ngroups; i++)
281962306a36Sopenharmony_ci		ceph_encode_64(p, from_kgid(&init_user_ns,
282062306a36Sopenharmony_ci					    req->r_cred->group_info->gid[i]));
282162306a36Sopenharmony_ci
282262306a36Sopenharmony_ci	/* v5: altname */
282362306a36Sopenharmony_ci	ceph_encode_32(p, req->r_altname_len);
282462306a36Sopenharmony_ci	ceph_encode_copy(p, req->r_altname, req->r_altname_len);
282562306a36Sopenharmony_ci
282662306a36Sopenharmony_ci	/* v6: fscrypt_auth and fscrypt_file */
282762306a36Sopenharmony_ci	if (req->r_fscrypt_auth) {
282862306a36Sopenharmony_ci		u32 authlen = ceph_fscrypt_auth_len(req->r_fscrypt_auth);
282962306a36Sopenharmony_ci
283062306a36Sopenharmony_ci		ceph_encode_32(p, authlen);
283162306a36Sopenharmony_ci		ceph_encode_copy(p, req->r_fscrypt_auth, authlen);
283262306a36Sopenharmony_ci	} else {
283362306a36Sopenharmony_ci		ceph_encode_32(p, 0);
283462306a36Sopenharmony_ci	}
283562306a36Sopenharmony_ci	if (test_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags)) {
283662306a36Sopenharmony_ci		ceph_encode_32(p, sizeof(__le64));
283762306a36Sopenharmony_ci		ceph_encode_64(p, req->r_fscrypt_file);
283862306a36Sopenharmony_ci	} else {
283962306a36Sopenharmony_ci		ceph_encode_32(p, 0);
284062306a36Sopenharmony_ci	}
284162306a36Sopenharmony_ci}
284262306a36Sopenharmony_ci
284362306a36Sopenharmony_cistatic struct ceph_mds_request_head_legacy *
284462306a36Sopenharmony_cifind_legacy_request_head(void *p, u64 features)
284562306a36Sopenharmony_ci{
284662306a36Sopenharmony_ci	bool legacy = !(features & CEPH_FEATURE_FS_BTIME);
284762306a36Sopenharmony_ci	struct ceph_mds_request_head_old *ohead;
284862306a36Sopenharmony_ci
284962306a36Sopenharmony_ci	if (legacy)
285062306a36Sopenharmony_ci		return (struct ceph_mds_request_head_legacy *)p;
285162306a36Sopenharmony_ci	ohead = (struct ceph_mds_request_head_old *)p;
285262306a36Sopenharmony_ci	return (struct ceph_mds_request_head_legacy *)&ohead->oldest_client_tid;
285362306a36Sopenharmony_ci}
285462306a36Sopenharmony_ci
285562306a36Sopenharmony_ci/*
285662306a36Sopenharmony_ci * called under mdsc->mutex
285762306a36Sopenharmony_ci */
285862306a36Sopenharmony_cistatic struct ceph_msg *create_request_message(struct ceph_mds_session *session,
285962306a36Sopenharmony_ci					       struct ceph_mds_request *req,
286062306a36Sopenharmony_ci					       bool drop_cap_releases)
286162306a36Sopenharmony_ci{
286262306a36Sopenharmony_ci	int mds = session->s_mds;
286362306a36Sopenharmony_ci	struct ceph_mds_client *mdsc = session->s_mdsc;
286462306a36Sopenharmony_ci	struct ceph_msg *msg;
286562306a36Sopenharmony_ci	struct ceph_mds_request_head_legacy *lhead;
286662306a36Sopenharmony_ci	const char *path1 = NULL;
286762306a36Sopenharmony_ci	const char *path2 = NULL;
286862306a36Sopenharmony_ci	u64 ino1 = 0, ino2 = 0;
286962306a36Sopenharmony_ci	int pathlen1 = 0, pathlen2 = 0;
287062306a36Sopenharmony_ci	bool freepath1 = false, freepath2 = false;
287162306a36Sopenharmony_ci	struct dentry *old_dentry = NULL;
287262306a36Sopenharmony_ci	int len;
287362306a36Sopenharmony_ci	u16 releases;
287462306a36Sopenharmony_ci	void *p, *end;
287562306a36Sopenharmony_ci	int ret;
287662306a36Sopenharmony_ci	bool legacy = !(session->s_con.peer_features & CEPH_FEATURE_FS_BTIME);
287762306a36Sopenharmony_ci	bool old_version = !test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD,
287862306a36Sopenharmony_ci				     &session->s_features);
287962306a36Sopenharmony_ci
288062306a36Sopenharmony_ci	ret = set_request_path_attr(req->r_inode, req->r_dentry,
288162306a36Sopenharmony_ci			      req->r_parent, req->r_path1, req->r_ino1.ino,
288262306a36Sopenharmony_ci			      &path1, &pathlen1, &ino1, &freepath1,
288362306a36Sopenharmony_ci			      test_bit(CEPH_MDS_R_PARENT_LOCKED,
288462306a36Sopenharmony_ci					&req->r_req_flags));
288562306a36Sopenharmony_ci	if (ret < 0) {
288662306a36Sopenharmony_ci		msg = ERR_PTR(ret);
288762306a36Sopenharmony_ci		goto out;
288862306a36Sopenharmony_ci	}
288962306a36Sopenharmony_ci
289062306a36Sopenharmony_ci	/* If r_old_dentry is set, then assume that its parent is locked */
289162306a36Sopenharmony_ci	if (req->r_old_dentry &&
289262306a36Sopenharmony_ci	    !(req->r_old_dentry->d_flags & DCACHE_DISCONNECTED))
289362306a36Sopenharmony_ci		old_dentry = req->r_old_dentry;
289462306a36Sopenharmony_ci	ret = set_request_path_attr(NULL, old_dentry,
289562306a36Sopenharmony_ci			      req->r_old_dentry_dir,
289662306a36Sopenharmony_ci			      req->r_path2, req->r_ino2.ino,
289762306a36Sopenharmony_ci			      &path2, &pathlen2, &ino2, &freepath2, true);
289862306a36Sopenharmony_ci	if (ret < 0) {
289962306a36Sopenharmony_ci		msg = ERR_PTR(ret);
290062306a36Sopenharmony_ci		goto out_free1;
290162306a36Sopenharmony_ci	}
290262306a36Sopenharmony_ci
290362306a36Sopenharmony_ci	req->r_altname = get_fscrypt_altname(req, &req->r_altname_len);
290462306a36Sopenharmony_ci	if (IS_ERR(req->r_altname)) {
290562306a36Sopenharmony_ci		msg = ERR_CAST(req->r_altname);
290662306a36Sopenharmony_ci		req->r_altname = NULL;
290762306a36Sopenharmony_ci		goto out_free2;
290862306a36Sopenharmony_ci	}
290962306a36Sopenharmony_ci
291062306a36Sopenharmony_ci	/*
291162306a36Sopenharmony_ci	 * For old cephs without supporting the 32bit retry/fwd feature
291262306a36Sopenharmony_ci	 * it will copy the raw memories directly when decoding the
291362306a36Sopenharmony_ci	 * requests. While new cephs will decode the head depending the
291462306a36Sopenharmony_ci	 * version member, so we need to make sure it will be compatible
291562306a36Sopenharmony_ci	 * with them both.
291662306a36Sopenharmony_ci	 */
291762306a36Sopenharmony_ci	if (legacy)
291862306a36Sopenharmony_ci		len = sizeof(struct ceph_mds_request_head_legacy);
291962306a36Sopenharmony_ci	else if (old_version)
292062306a36Sopenharmony_ci		len = sizeof(struct ceph_mds_request_head_old);
292162306a36Sopenharmony_ci	else
292262306a36Sopenharmony_ci		len = sizeof(struct ceph_mds_request_head);
292362306a36Sopenharmony_ci
292462306a36Sopenharmony_ci	/* filepaths */
292562306a36Sopenharmony_ci	len += 2 * (1 + sizeof(u32) + sizeof(u64));
292662306a36Sopenharmony_ci	len += pathlen1 + pathlen2;
292762306a36Sopenharmony_ci
292862306a36Sopenharmony_ci	/* cap releases */
292962306a36Sopenharmony_ci	len += sizeof(struct ceph_mds_request_release) *
293062306a36Sopenharmony_ci		(!!req->r_inode_drop + !!req->r_dentry_drop +
293162306a36Sopenharmony_ci		 !!req->r_old_inode_drop + !!req->r_old_dentry_drop);
293262306a36Sopenharmony_ci
293362306a36Sopenharmony_ci	if (req->r_dentry_drop)
293462306a36Sopenharmony_ci		len += pathlen1;
293562306a36Sopenharmony_ci	if (req->r_old_dentry_drop)
293662306a36Sopenharmony_ci		len += pathlen2;
293762306a36Sopenharmony_ci
293862306a36Sopenharmony_ci	/* MClientRequest tail */
293962306a36Sopenharmony_ci
294062306a36Sopenharmony_ci	/* req->r_stamp */
294162306a36Sopenharmony_ci	len += sizeof(struct ceph_timespec);
294262306a36Sopenharmony_ci
294362306a36Sopenharmony_ci	/* gid list */
294462306a36Sopenharmony_ci	len += sizeof(u32) + (sizeof(u64) * req->r_cred->group_info->ngroups);
294562306a36Sopenharmony_ci
294662306a36Sopenharmony_ci	/* alternate name */
294762306a36Sopenharmony_ci	len += sizeof(u32) + req->r_altname_len;
294862306a36Sopenharmony_ci
294962306a36Sopenharmony_ci	/* fscrypt_auth */
295062306a36Sopenharmony_ci	len += sizeof(u32); // fscrypt_auth
295162306a36Sopenharmony_ci	if (req->r_fscrypt_auth)
295262306a36Sopenharmony_ci		len += ceph_fscrypt_auth_len(req->r_fscrypt_auth);
295362306a36Sopenharmony_ci
295462306a36Sopenharmony_ci	/* fscrypt_file */
295562306a36Sopenharmony_ci	len += sizeof(u32);
295662306a36Sopenharmony_ci	if (test_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags))
295762306a36Sopenharmony_ci		len += sizeof(__le64);
295862306a36Sopenharmony_ci
295962306a36Sopenharmony_ci	msg = ceph_msg_new2(CEPH_MSG_CLIENT_REQUEST, len, 1, GFP_NOFS, false);
296062306a36Sopenharmony_ci	if (!msg) {
296162306a36Sopenharmony_ci		msg = ERR_PTR(-ENOMEM);
296262306a36Sopenharmony_ci		goto out_free2;
296362306a36Sopenharmony_ci	}
296462306a36Sopenharmony_ci
296562306a36Sopenharmony_ci	msg->hdr.tid = cpu_to_le64(req->r_tid);
296662306a36Sopenharmony_ci
296762306a36Sopenharmony_ci	lhead = find_legacy_request_head(msg->front.iov_base,
296862306a36Sopenharmony_ci					 session->s_con.peer_features);
296962306a36Sopenharmony_ci
297062306a36Sopenharmony_ci	/*
297162306a36Sopenharmony_ci	 * The ceph_mds_request_head_legacy didn't contain a version field, and
297262306a36Sopenharmony_ci	 * one was added when we moved the message version from 3->4.
297362306a36Sopenharmony_ci	 */
297462306a36Sopenharmony_ci	if (legacy) {
297562306a36Sopenharmony_ci		msg->hdr.version = cpu_to_le16(3);
297662306a36Sopenharmony_ci		p = msg->front.iov_base + sizeof(*lhead);
297762306a36Sopenharmony_ci	} else if (old_version) {
297862306a36Sopenharmony_ci		struct ceph_mds_request_head_old *ohead = msg->front.iov_base;
297962306a36Sopenharmony_ci
298062306a36Sopenharmony_ci		msg->hdr.version = cpu_to_le16(4);
298162306a36Sopenharmony_ci		ohead->version = cpu_to_le16(1);
298262306a36Sopenharmony_ci		p = msg->front.iov_base + sizeof(*ohead);
298362306a36Sopenharmony_ci	} else {
298462306a36Sopenharmony_ci		struct ceph_mds_request_head *nhead = msg->front.iov_base;
298562306a36Sopenharmony_ci
298662306a36Sopenharmony_ci		msg->hdr.version = cpu_to_le16(6);
298762306a36Sopenharmony_ci		nhead->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION);
298862306a36Sopenharmony_ci		p = msg->front.iov_base + sizeof(*nhead);
298962306a36Sopenharmony_ci	}
299062306a36Sopenharmony_ci
299162306a36Sopenharmony_ci	end = msg->front.iov_base + msg->front.iov_len;
299262306a36Sopenharmony_ci
299362306a36Sopenharmony_ci	lhead->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
299462306a36Sopenharmony_ci	lhead->op = cpu_to_le32(req->r_op);
299562306a36Sopenharmony_ci	lhead->caller_uid = cpu_to_le32(from_kuid(&init_user_ns,
299662306a36Sopenharmony_ci						  req->r_cred->fsuid));
299762306a36Sopenharmony_ci	lhead->caller_gid = cpu_to_le32(from_kgid(&init_user_ns,
299862306a36Sopenharmony_ci						  req->r_cred->fsgid));
299962306a36Sopenharmony_ci	lhead->ino = cpu_to_le64(req->r_deleg_ino);
300062306a36Sopenharmony_ci	lhead->args = req->r_args;
300162306a36Sopenharmony_ci
300262306a36Sopenharmony_ci	ceph_encode_filepath(&p, end, ino1, path1);
300362306a36Sopenharmony_ci	ceph_encode_filepath(&p, end, ino2, path2);
300462306a36Sopenharmony_ci
300562306a36Sopenharmony_ci	/* make note of release offset, in case we need to replay */
300662306a36Sopenharmony_ci	req->r_request_release_offset = p - msg->front.iov_base;
300762306a36Sopenharmony_ci
300862306a36Sopenharmony_ci	/* cap releases */
300962306a36Sopenharmony_ci	releases = 0;
301062306a36Sopenharmony_ci	if (req->r_inode_drop)
301162306a36Sopenharmony_ci		releases += ceph_encode_inode_release(&p,
301262306a36Sopenharmony_ci		      req->r_inode ? req->r_inode : d_inode(req->r_dentry),
301362306a36Sopenharmony_ci		      mds, req->r_inode_drop, req->r_inode_unless,
301462306a36Sopenharmony_ci		      req->r_op == CEPH_MDS_OP_READDIR);
301562306a36Sopenharmony_ci	if (req->r_dentry_drop) {
301662306a36Sopenharmony_ci		ret = ceph_encode_dentry_release(&p, req->r_dentry,
301762306a36Sopenharmony_ci				req->r_parent, mds, req->r_dentry_drop,
301862306a36Sopenharmony_ci				req->r_dentry_unless);
301962306a36Sopenharmony_ci		if (ret < 0)
302062306a36Sopenharmony_ci			goto out_err;
302162306a36Sopenharmony_ci		releases += ret;
302262306a36Sopenharmony_ci	}
302362306a36Sopenharmony_ci	if (req->r_old_dentry_drop) {
302462306a36Sopenharmony_ci		ret = ceph_encode_dentry_release(&p, req->r_old_dentry,
302562306a36Sopenharmony_ci				req->r_old_dentry_dir, mds,
302662306a36Sopenharmony_ci				req->r_old_dentry_drop,
302762306a36Sopenharmony_ci				req->r_old_dentry_unless);
302862306a36Sopenharmony_ci		if (ret < 0)
302962306a36Sopenharmony_ci			goto out_err;
303062306a36Sopenharmony_ci		releases += ret;
303162306a36Sopenharmony_ci	}
303262306a36Sopenharmony_ci	if (req->r_old_inode_drop)
303362306a36Sopenharmony_ci		releases += ceph_encode_inode_release(&p,
303462306a36Sopenharmony_ci		      d_inode(req->r_old_dentry),
303562306a36Sopenharmony_ci		      mds, req->r_old_inode_drop, req->r_old_inode_unless, 0);
303662306a36Sopenharmony_ci
303762306a36Sopenharmony_ci	if (drop_cap_releases) {
303862306a36Sopenharmony_ci		releases = 0;
303962306a36Sopenharmony_ci		p = msg->front.iov_base + req->r_request_release_offset;
304062306a36Sopenharmony_ci	}
304162306a36Sopenharmony_ci
304262306a36Sopenharmony_ci	lhead->num_releases = cpu_to_le16(releases);
304362306a36Sopenharmony_ci
304462306a36Sopenharmony_ci	encode_mclientrequest_tail(&p, req);
304562306a36Sopenharmony_ci
304662306a36Sopenharmony_ci	if (WARN_ON_ONCE(p > end)) {
304762306a36Sopenharmony_ci		ceph_msg_put(msg);
304862306a36Sopenharmony_ci		msg = ERR_PTR(-ERANGE);
304962306a36Sopenharmony_ci		goto out_free2;
305062306a36Sopenharmony_ci	}
305162306a36Sopenharmony_ci
305262306a36Sopenharmony_ci	msg->front.iov_len = p - msg->front.iov_base;
305362306a36Sopenharmony_ci	msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
305462306a36Sopenharmony_ci
305562306a36Sopenharmony_ci	if (req->r_pagelist) {
305662306a36Sopenharmony_ci		struct ceph_pagelist *pagelist = req->r_pagelist;
305762306a36Sopenharmony_ci		ceph_msg_data_add_pagelist(msg, pagelist);
305862306a36Sopenharmony_ci		msg->hdr.data_len = cpu_to_le32(pagelist->length);
305962306a36Sopenharmony_ci	} else {
306062306a36Sopenharmony_ci		msg->hdr.data_len = 0;
306162306a36Sopenharmony_ci	}
306262306a36Sopenharmony_ci
306362306a36Sopenharmony_ci	msg->hdr.data_off = cpu_to_le16(0);
306462306a36Sopenharmony_ci
306562306a36Sopenharmony_ciout_free2:
306662306a36Sopenharmony_ci	if (freepath2)
306762306a36Sopenharmony_ci		ceph_mdsc_free_path((char *)path2, pathlen2);
306862306a36Sopenharmony_ciout_free1:
306962306a36Sopenharmony_ci	if (freepath1)
307062306a36Sopenharmony_ci		ceph_mdsc_free_path((char *)path1, pathlen1);
307162306a36Sopenharmony_ciout:
307262306a36Sopenharmony_ci	return msg;
307362306a36Sopenharmony_ciout_err:
307462306a36Sopenharmony_ci	ceph_msg_put(msg);
307562306a36Sopenharmony_ci	msg = ERR_PTR(ret);
307662306a36Sopenharmony_ci	goto out_free2;
307762306a36Sopenharmony_ci}
307862306a36Sopenharmony_ci
307962306a36Sopenharmony_ci/*
308062306a36Sopenharmony_ci * called under mdsc->mutex if error, under no mutex if
308162306a36Sopenharmony_ci * success.
308262306a36Sopenharmony_ci */
308362306a36Sopenharmony_cistatic void complete_request(struct ceph_mds_client *mdsc,
308462306a36Sopenharmony_ci			     struct ceph_mds_request *req)
308562306a36Sopenharmony_ci{
308662306a36Sopenharmony_ci	req->r_end_latency = ktime_get();
308762306a36Sopenharmony_ci
308862306a36Sopenharmony_ci	if (req->r_callback)
308962306a36Sopenharmony_ci		req->r_callback(mdsc, req);
309062306a36Sopenharmony_ci	complete_all(&req->r_completion);
309162306a36Sopenharmony_ci}
309262306a36Sopenharmony_ci
309362306a36Sopenharmony_ci/*
309462306a36Sopenharmony_ci * called under mdsc->mutex
309562306a36Sopenharmony_ci */
309662306a36Sopenharmony_cistatic int __prepare_send_request(struct ceph_mds_session *session,
309762306a36Sopenharmony_ci				  struct ceph_mds_request *req,
309862306a36Sopenharmony_ci				  bool drop_cap_releases)
309962306a36Sopenharmony_ci{
310062306a36Sopenharmony_ci	int mds = session->s_mds;
310162306a36Sopenharmony_ci	struct ceph_mds_client *mdsc = session->s_mdsc;
310262306a36Sopenharmony_ci	struct ceph_mds_request_head_legacy *lhead;
310362306a36Sopenharmony_ci	struct ceph_mds_request_head *nhead;
310462306a36Sopenharmony_ci	struct ceph_msg *msg;
310562306a36Sopenharmony_ci	int flags = 0, old_max_retry;
310662306a36Sopenharmony_ci	bool old_version = !test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD,
310762306a36Sopenharmony_ci				     &session->s_features);
310862306a36Sopenharmony_ci
310962306a36Sopenharmony_ci	/*
311062306a36Sopenharmony_ci	 * Avoid inifinite retrying after overflow. The client will
311162306a36Sopenharmony_ci	 * increase the retry count and if the MDS is old version,
311262306a36Sopenharmony_ci	 * so we limit to retry at most 256 times.
311362306a36Sopenharmony_ci	 */
311462306a36Sopenharmony_ci	if (req->r_attempts) {
311562306a36Sopenharmony_ci	       old_max_retry = sizeof_field(struct ceph_mds_request_head_old,
311662306a36Sopenharmony_ci					    num_retry);
311762306a36Sopenharmony_ci	       old_max_retry = 1 << (old_max_retry * BITS_PER_BYTE);
311862306a36Sopenharmony_ci	       if ((old_version && req->r_attempts >= old_max_retry) ||
311962306a36Sopenharmony_ci		   ((uint32_t)req->r_attempts >= U32_MAX)) {
312062306a36Sopenharmony_ci			pr_warn_ratelimited("%s request tid %llu seq overflow\n",
312162306a36Sopenharmony_ci					    __func__, req->r_tid);
312262306a36Sopenharmony_ci			return -EMULTIHOP;
312362306a36Sopenharmony_ci	       }
312462306a36Sopenharmony_ci	}
312562306a36Sopenharmony_ci
312662306a36Sopenharmony_ci	req->r_attempts++;
312762306a36Sopenharmony_ci	if (req->r_inode) {
312862306a36Sopenharmony_ci		struct ceph_cap *cap =
312962306a36Sopenharmony_ci			ceph_get_cap_for_mds(ceph_inode(req->r_inode), mds);
313062306a36Sopenharmony_ci
313162306a36Sopenharmony_ci		if (cap)
313262306a36Sopenharmony_ci			req->r_sent_on_mseq = cap->mseq;
313362306a36Sopenharmony_ci		else
313462306a36Sopenharmony_ci			req->r_sent_on_mseq = -1;
313562306a36Sopenharmony_ci	}
313662306a36Sopenharmony_ci	dout("%s %p tid %lld %s (attempt %d)\n", __func__, req,
313762306a36Sopenharmony_ci	     req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts);
313862306a36Sopenharmony_ci
313962306a36Sopenharmony_ci	if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
314062306a36Sopenharmony_ci		void *p;
314162306a36Sopenharmony_ci
314262306a36Sopenharmony_ci		/*
314362306a36Sopenharmony_ci		 * Replay.  Do not regenerate message (and rebuild
314462306a36Sopenharmony_ci		 * paths, etc.); just use the original message.
314562306a36Sopenharmony_ci		 * Rebuilding paths will break for renames because
314662306a36Sopenharmony_ci		 * d_move mangles the src name.
314762306a36Sopenharmony_ci		 */
314862306a36Sopenharmony_ci		msg = req->r_request;
314962306a36Sopenharmony_ci		lhead = find_legacy_request_head(msg->front.iov_base,
315062306a36Sopenharmony_ci						 session->s_con.peer_features);
315162306a36Sopenharmony_ci
315262306a36Sopenharmony_ci		flags = le32_to_cpu(lhead->flags);
315362306a36Sopenharmony_ci		flags |= CEPH_MDS_FLAG_REPLAY;
315462306a36Sopenharmony_ci		lhead->flags = cpu_to_le32(flags);
315562306a36Sopenharmony_ci
315662306a36Sopenharmony_ci		if (req->r_target_inode)
315762306a36Sopenharmony_ci			lhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));
315862306a36Sopenharmony_ci
315962306a36Sopenharmony_ci		lhead->num_retry = req->r_attempts - 1;
316062306a36Sopenharmony_ci		if (!old_version) {
316162306a36Sopenharmony_ci			nhead = (struct ceph_mds_request_head*)msg->front.iov_base;
316262306a36Sopenharmony_ci			nhead->ext_num_retry = cpu_to_le32(req->r_attempts - 1);
316362306a36Sopenharmony_ci		}
316462306a36Sopenharmony_ci
316562306a36Sopenharmony_ci		/* remove cap/dentry releases from message */
316662306a36Sopenharmony_ci		lhead->num_releases = 0;
316762306a36Sopenharmony_ci
316862306a36Sopenharmony_ci		p = msg->front.iov_base + req->r_request_release_offset;
316962306a36Sopenharmony_ci		encode_mclientrequest_tail(&p, req);
317062306a36Sopenharmony_ci
317162306a36Sopenharmony_ci		msg->front.iov_len = p - msg->front.iov_base;
317262306a36Sopenharmony_ci		msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
317362306a36Sopenharmony_ci		return 0;
317462306a36Sopenharmony_ci	}
317562306a36Sopenharmony_ci
317662306a36Sopenharmony_ci	if (req->r_request) {
317762306a36Sopenharmony_ci		ceph_msg_put(req->r_request);
317862306a36Sopenharmony_ci		req->r_request = NULL;
317962306a36Sopenharmony_ci	}
318062306a36Sopenharmony_ci	msg = create_request_message(session, req, drop_cap_releases);
318162306a36Sopenharmony_ci	if (IS_ERR(msg)) {
318262306a36Sopenharmony_ci		req->r_err = PTR_ERR(msg);
318362306a36Sopenharmony_ci		return PTR_ERR(msg);
318462306a36Sopenharmony_ci	}
318562306a36Sopenharmony_ci	req->r_request = msg;
318662306a36Sopenharmony_ci
318762306a36Sopenharmony_ci	lhead = find_legacy_request_head(msg->front.iov_base,
318862306a36Sopenharmony_ci					 session->s_con.peer_features);
318962306a36Sopenharmony_ci	lhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc));
319062306a36Sopenharmony_ci	if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags))
319162306a36Sopenharmony_ci		flags |= CEPH_MDS_FLAG_REPLAY;
319262306a36Sopenharmony_ci	if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags))
319362306a36Sopenharmony_ci		flags |= CEPH_MDS_FLAG_ASYNC;
319462306a36Sopenharmony_ci	if (req->r_parent)
319562306a36Sopenharmony_ci		flags |= CEPH_MDS_FLAG_WANT_DENTRY;
319662306a36Sopenharmony_ci	lhead->flags = cpu_to_le32(flags);
319762306a36Sopenharmony_ci	lhead->num_fwd = req->r_num_fwd;
319862306a36Sopenharmony_ci	lhead->num_retry = req->r_attempts - 1;
319962306a36Sopenharmony_ci	if (!old_version) {
320062306a36Sopenharmony_ci		nhead = (struct ceph_mds_request_head*)msg->front.iov_base;
320162306a36Sopenharmony_ci		nhead->ext_num_fwd = cpu_to_le32(req->r_num_fwd);
320262306a36Sopenharmony_ci		nhead->ext_num_retry = cpu_to_le32(req->r_attempts - 1);
320362306a36Sopenharmony_ci	}
320462306a36Sopenharmony_ci
320562306a36Sopenharmony_ci	dout(" r_parent = %p\n", req->r_parent);
320662306a36Sopenharmony_ci	return 0;
320762306a36Sopenharmony_ci}
320862306a36Sopenharmony_ci
320962306a36Sopenharmony_ci/*
321062306a36Sopenharmony_ci * called under mdsc->mutex
321162306a36Sopenharmony_ci */
321262306a36Sopenharmony_cistatic int __send_request(struct ceph_mds_session *session,
321362306a36Sopenharmony_ci			  struct ceph_mds_request *req,
321462306a36Sopenharmony_ci			  bool drop_cap_releases)
321562306a36Sopenharmony_ci{
321662306a36Sopenharmony_ci	int err;
321762306a36Sopenharmony_ci
321862306a36Sopenharmony_ci	err = __prepare_send_request(session, req, drop_cap_releases);
321962306a36Sopenharmony_ci	if (!err) {
322062306a36Sopenharmony_ci		ceph_msg_get(req->r_request);
322162306a36Sopenharmony_ci		ceph_con_send(&session->s_con, req->r_request);
322262306a36Sopenharmony_ci	}
322362306a36Sopenharmony_ci
322462306a36Sopenharmony_ci	return err;
322562306a36Sopenharmony_ci}
322662306a36Sopenharmony_ci
/*
 * Send the request, or put it on the appropriate wait list.
 *
 * Outcomes, in the order they are checked:
 *  - the request already carries an error or a result: nothing is
 *    sent (an aborted request is unregistered);
 *  - the mount is fenced or shut down, the request timed out, or the
 *    MDS map is unusable while mounting: fail early;
 *  - no suitable active MDS: queue on mdsc->waiting_for_map (async
 *    requests fail with -EJUKEBOX instead);
 *  - the session is neither OPEN nor HUNG: queue on
 *    session->s_waiting, opening the session first if needed;
 *  - otherwise the request is sent with __send_request().
 *
 * Whenever err ends up non-zero it is stored in req->r_err and the
 * request is completed and unregistered.
 *
 * ceph_mdsc_submit_request() calls this with mdsc->mutex held.
 */
static void __do_request(struct ceph_mds_client *mdsc,
			struct ceph_mds_request *req)
{
	struct ceph_mds_session *session = NULL;
	int mds = -1;
	int err = 0;
	bool random;	/* filled in by __choose_mds() */

	/* already failed or answered: do not (re)send */
	if (req->r_err || test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) {
		if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags))
			__unregister_request(mdsc, req);
		return;
	}

	if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_FENCE_IO) {
		dout("do_request metadata corrupted\n");
		err = -EIO;
		goto finish;
	}
	if (req->r_timeout &&
	    time_after_eq(jiffies, req->r_started + req->r_timeout)) {
		dout("do_request timed out\n");
		err = -ETIMEDOUT;
		goto finish;
	}
	if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
		dout("do_request forced umount\n");
		err = -EIO;
		goto finish;
	}
	if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_MOUNTING) {
		if (mdsc->mdsmap_err) {
			err = mdsc->mdsmap_err;
			dout("do_request mdsmap err %d\n", err);
			goto finish;
		}
		if (mdsc->mdsmap->m_epoch == 0) {
			dout("do_request no mdsmap, waiting for map\n");
			list_add(&req->r_wait, &mdsc->waiting_for_map);
			return;
		}
		/* without MOUNTWAIT, an unavailable cluster is a hard error */
		if (!(mdsc->fsc->mount_options->flags &
		      CEPH_MOUNT_OPT_MOUNTWAIT) &&
		    !ceph_mdsmap_is_cluster_available(mdsc->mdsmap)) {
			err = -EHOSTUNREACH;
			goto finish;
		}
	}

	put_request_session(req);

	mds = __choose_mds(mdsc, req, &random);
	if (mds < 0 ||
	    ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) {
		/* async requests are never parked on the map wait list */
		if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags)) {
			err = -EJUKEBOX;
			goto finish;
		}
		dout("do_request no mds or not active, waiting for map\n");
		list_add(&req->r_wait, &mdsc->waiting_for_map);
		return;
	}

	/* get, open session */
	session = __ceph_lookup_mds_session(mdsc, mds);
	if (!session) {
		session = register_session(mdsc, mds);
		if (IS_ERR(session)) {
			err = PTR_ERR(session);
			goto finish;
		}
	}
	/* the request keeps its own session reference in r_session */
	req->r_session = ceph_get_mds_session(session);

	dout("do_request mds%d session %p state %s\n", mds, session,
	     ceph_session_state_name(session->s_state));

	/*
	 * Old ceph MDSs will crash when they see unknown OPs, so refuse
	 * to send a request whose required feature bit the session lacks.
	 */
	if (req->r_feature_needed > 0 &&
	    !test_bit(req->r_feature_needed, &session->s_features)) {
		err = -EOPNOTSUPP;
		goto out_session;
	}

	if (session->s_state != CEPH_MDS_SESSION_OPEN &&
	    session->s_state != CEPH_MDS_SESSION_HUNG) {
		/*
		 * We cannot queue async requests since the caps and delegated
		 * inodes are bound to the session. Just return -EJUKEBOX and
		 * let the caller retry a sync request in that case.
		 */
		if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags)) {
			err = -EJUKEBOX;
			goto out_session;
		}

		/*
		 * If the session has been REJECTED, then return a hard error,
		 * unless it's a CLEANRECOVER mount, in which case we'll queue
		 * it to the mdsc queue.
		 */
		if (session->s_state == CEPH_MDS_SESSION_REJECTED) {
			if (ceph_test_mount_opt(mdsc->fsc, CLEANRECOVER))
				list_add(&req->r_wait, &mdsc->waiting_for_map);
			else
				err = -EACCES;
			goto out_session;
		}

		if (session->s_state == CEPH_MDS_SESSION_NEW ||
		    session->s_state == CEPH_MDS_SESSION_CLOSING) {
			err = __open_session(mdsc, session);
			if (err)
				goto out_session;
			/* retry the same mds later */
			if (random)
				req->r_resend_mds = mds;
		}
		/* wait on the session until it becomes usable */
		list_add(&req->r_wait, &session->s_waiting);
		goto out_session;
	}

	/* send request */
	req->r_resend_mds = -1;   /* forget any previous mds hint */

	if (req->r_request_started == 0)   /* note request start time */
		req->r_request_started = jiffies;

	/*
	 * For async create we choose the auth MDS of the frag in the
	 * parent directory to send the request to, and usually this
	 * works fine.  But if the directory was migrated to another MDS
	 * before that MDS could handle the request, the request will be
	 * forwarded.
	 *
	 * And then the auth cap will be changed.
	 */
	if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags) && req->r_num_fwd) {
		struct ceph_dentry_info *di = ceph_dentry(req->r_dentry);
		struct ceph_inode_info *ci;
		struct ceph_cap *cap;

		/*
		 * The request may be handled very fast, before the new
		 * inode has been linked to the dentry.  When forwarding
		 * the request we need to wait for
		 * ceph_finish_async_create() to finish; in theory it
		 * should not be stuck for long or fail.
		 */
		if (!d_inode(req->r_dentry)) {
			err = wait_on_bit(&di->flags, CEPH_DENTRY_ASYNC_CREATE_BIT,
					  TASK_KILLABLE);
			if (err) {
				/* interrupted wait: mark the request aborted */
				mutex_lock(&req->r_fill_mutex);
				set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
				mutex_unlock(&req->r_fill_mutex);
				goto out_session;
			}
		}

		ci = ceph_inode(d_inode(req->r_dentry));

		spin_lock(&ci->i_ceph_lock);
		cap = ci->i_auth_cap;
		/* NOTE(review): cap is dereferenced without a NULL check — confirm i_auth_cap is always set here */
		if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE && mds != cap->mds) {
			dout("do_request session changed for auth cap %d -> %d\n",
			     cap->session->s_mds, session->s_mds);

			/* Remove the auth cap from old session */
			spin_lock(&cap->session->s_cap_lock);
			cap->session->s_nr_caps--;
			list_del_init(&cap->session_caps);
			spin_unlock(&cap->session->s_cap_lock);

			/* Add the auth cap to the new session */
			cap->mds = mds;
			cap->session = session;
			spin_lock(&session->s_cap_lock);
			session->s_nr_caps++;
			list_add_tail(&cap->session_caps, &session->s_caps);
			spin_unlock(&session->s_cap_lock);

			change_auth_cap_ses(ci, session);
		}
		spin_unlock(&ci->i_ceph_lock);
	}

	err = __send_request(session, req, false);

out_session:
	/* drop the local reference; req->r_session holds its own */
	ceph_put_mds_session(session);
finish:
	if (err) {
		dout("__do_request early error %d\n", err);
		req->r_err = err;
		complete_request(mdsc, req);
		__unregister_request(mdsc, req);
	}
	return;
}
343162306a36Sopenharmony_ci
343262306a36Sopenharmony_ci/*
343362306a36Sopenharmony_ci * called under mdsc->mutex
343462306a36Sopenharmony_ci */
343562306a36Sopenharmony_cistatic void __wake_requests(struct ceph_mds_client *mdsc,
343662306a36Sopenharmony_ci			    struct list_head *head)
343762306a36Sopenharmony_ci{
343862306a36Sopenharmony_ci	struct ceph_mds_request *req;
343962306a36Sopenharmony_ci	LIST_HEAD(tmp_list);
344062306a36Sopenharmony_ci
344162306a36Sopenharmony_ci	list_splice_init(head, &tmp_list);
344262306a36Sopenharmony_ci
344362306a36Sopenharmony_ci	while (!list_empty(&tmp_list)) {
344462306a36Sopenharmony_ci		req = list_entry(tmp_list.next,
344562306a36Sopenharmony_ci				 struct ceph_mds_request, r_wait);
344662306a36Sopenharmony_ci		list_del_init(&req->r_wait);
344762306a36Sopenharmony_ci		dout(" wake request %p tid %llu\n", req, req->r_tid);
344862306a36Sopenharmony_ci		__do_request(mdsc, req);
344962306a36Sopenharmony_ci	}
345062306a36Sopenharmony_ci}
345162306a36Sopenharmony_ci
345262306a36Sopenharmony_ci/*
345362306a36Sopenharmony_ci * Wake up threads with requests pending for @mds, so that they can
345462306a36Sopenharmony_ci * resubmit their requests to a possibly different mds.
345562306a36Sopenharmony_ci */
345662306a36Sopenharmony_cistatic void kick_requests(struct ceph_mds_client *mdsc, int mds)
345762306a36Sopenharmony_ci{
345862306a36Sopenharmony_ci	struct ceph_mds_request *req;
345962306a36Sopenharmony_ci	struct rb_node *p = rb_first(&mdsc->request_tree);
346062306a36Sopenharmony_ci
346162306a36Sopenharmony_ci	dout("kick_requests mds%d\n", mds);
346262306a36Sopenharmony_ci	while (p) {
346362306a36Sopenharmony_ci		req = rb_entry(p, struct ceph_mds_request, r_node);
346462306a36Sopenharmony_ci		p = rb_next(p);
346562306a36Sopenharmony_ci		if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags))
346662306a36Sopenharmony_ci			continue;
346762306a36Sopenharmony_ci		if (req->r_attempts > 0)
346862306a36Sopenharmony_ci			continue; /* only new requests */
346962306a36Sopenharmony_ci		if (req->r_session &&
347062306a36Sopenharmony_ci		    req->r_session->s_mds == mds) {
347162306a36Sopenharmony_ci			dout(" kicking tid %llu\n", req->r_tid);
347262306a36Sopenharmony_ci			list_del_init(&req->r_wait);
347362306a36Sopenharmony_ci			__do_request(mdsc, req);
347462306a36Sopenharmony_ci		}
347562306a36Sopenharmony_ci	}
347662306a36Sopenharmony_ci}
347762306a36Sopenharmony_ci
347862306a36Sopenharmony_ciint ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir,
347962306a36Sopenharmony_ci			      struct ceph_mds_request *req)
348062306a36Sopenharmony_ci{
348162306a36Sopenharmony_ci	int err = 0;
348262306a36Sopenharmony_ci
348362306a36Sopenharmony_ci	/* take CAP_PIN refs for r_inode, r_parent, r_old_dentry */
348462306a36Sopenharmony_ci	if (req->r_inode)
348562306a36Sopenharmony_ci		ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
348662306a36Sopenharmony_ci	if (req->r_parent) {
348762306a36Sopenharmony_ci		struct ceph_inode_info *ci = ceph_inode(req->r_parent);
348862306a36Sopenharmony_ci		int fmode = (req->r_op & CEPH_MDS_OP_WRITE) ?
348962306a36Sopenharmony_ci			    CEPH_FILE_MODE_WR : CEPH_FILE_MODE_RD;
349062306a36Sopenharmony_ci		spin_lock(&ci->i_ceph_lock);
349162306a36Sopenharmony_ci		ceph_take_cap_refs(ci, CEPH_CAP_PIN, false);
349262306a36Sopenharmony_ci		__ceph_touch_fmode(ci, mdsc, fmode);
349362306a36Sopenharmony_ci		spin_unlock(&ci->i_ceph_lock);
349462306a36Sopenharmony_ci	}
349562306a36Sopenharmony_ci	if (req->r_old_dentry_dir)
349662306a36Sopenharmony_ci		ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir),
349762306a36Sopenharmony_ci				  CEPH_CAP_PIN);
349862306a36Sopenharmony_ci
349962306a36Sopenharmony_ci	if (req->r_inode) {
350062306a36Sopenharmony_ci		err = ceph_wait_on_async_create(req->r_inode);
350162306a36Sopenharmony_ci		if (err) {
350262306a36Sopenharmony_ci			dout("%s: wait for async create returned: %d\n",
350362306a36Sopenharmony_ci			     __func__, err);
350462306a36Sopenharmony_ci			return err;
350562306a36Sopenharmony_ci		}
350662306a36Sopenharmony_ci	}
350762306a36Sopenharmony_ci
350862306a36Sopenharmony_ci	if (!err && req->r_old_inode) {
350962306a36Sopenharmony_ci		err = ceph_wait_on_async_create(req->r_old_inode);
351062306a36Sopenharmony_ci		if (err) {
351162306a36Sopenharmony_ci			dout("%s: wait for async create returned: %d\n",
351262306a36Sopenharmony_ci			     __func__, err);
351362306a36Sopenharmony_ci			return err;
351462306a36Sopenharmony_ci		}
351562306a36Sopenharmony_ci	}
351662306a36Sopenharmony_ci
351762306a36Sopenharmony_ci	dout("submit_request on %p for inode %p\n", req, dir);
351862306a36Sopenharmony_ci	mutex_lock(&mdsc->mutex);
351962306a36Sopenharmony_ci	__register_request(mdsc, req, dir);
352062306a36Sopenharmony_ci	__do_request(mdsc, req);
352162306a36Sopenharmony_ci	err = req->r_err;
352262306a36Sopenharmony_ci	mutex_unlock(&mdsc->mutex);
352362306a36Sopenharmony_ci	return err;
352462306a36Sopenharmony_ci}
352562306a36Sopenharmony_ci
352662306a36Sopenharmony_ciint ceph_mdsc_wait_request(struct ceph_mds_client *mdsc,
352762306a36Sopenharmony_ci			   struct ceph_mds_request *req,
352862306a36Sopenharmony_ci			   ceph_mds_request_wait_callback_t wait_func)
352962306a36Sopenharmony_ci{
353062306a36Sopenharmony_ci	int err;
353162306a36Sopenharmony_ci
353262306a36Sopenharmony_ci	/* wait */
353362306a36Sopenharmony_ci	dout("do_request waiting\n");
353462306a36Sopenharmony_ci	if (wait_func) {
353562306a36Sopenharmony_ci		err = wait_func(mdsc, req);
353662306a36Sopenharmony_ci	} else {
353762306a36Sopenharmony_ci		long timeleft = wait_for_completion_killable_timeout(
353862306a36Sopenharmony_ci					&req->r_completion,
353962306a36Sopenharmony_ci					ceph_timeout_jiffies(req->r_timeout));
354062306a36Sopenharmony_ci		if (timeleft > 0)
354162306a36Sopenharmony_ci			err = 0;
354262306a36Sopenharmony_ci		else if (!timeleft)
354362306a36Sopenharmony_ci			err = -ETIMEDOUT;  /* timed out */
354462306a36Sopenharmony_ci		else
354562306a36Sopenharmony_ci			err = timeleft;  /* killed */
354662306a36Sopenharmony_ci	}
354762306a36Sopenharmony_ci	dout("do_request waited, got %d\n", err);
354862306a36Sopenharmony_ci	mutex_lock(&mdsc->mutex);
354962306a36Sopenharmony_ci
355062306a36Sopenharmony_ci	/* only abort if we didn't race with a real reply */
355162306a36Sopenharmony_ci	if (test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) {
355262306a36Sopenharmony_ci		err = le32_to_cpu(req->r_reply_info.head->result);
355362306a36Sopenharmony_ci	} else if (err < 0) {
355462306a36Sopenharmony_ci		dout("aborted request %lld with %d\n", req->r_tid, err);
355562306a36Sopenharmony_ci
355662306a36Sopenharmony_ci		/*
355762306a36Sopenharmony_ci		 * ensure we aren't running concurrently with
355862306a36Sopenharmony_ci		 * ceph_fill_trace or ceph_readdir_prepopulate, which
355962306a36Sopenharmony_ci		 * rely on locks (dir mutex) held by our caller.
356062306a36Sopenharmony_ci		 */
356162306a36Sopenharmony_ci		mutex_lock(&req->r_fill_mutex);
356262306a36Sopenharmony_ci		req->r_err = err;
356362306a36Sopenharmony_ci		set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
356462306a36Sopenharmony_ci		mutex_unlock(&req->r_fill_mutex);
356562306a36Sopenharmony_ci
356662306a36Sopenharmony_ci		if (req->r_parent &&
356762306a36Sopenharmony_ci		    (req->r_op & CEPH_MDS_OP_WRITE))
356862306a36Sopenharmony_ci			ceph_invalidate_dir_request(req);
356962306a36Sopenharmony_ci	} else {
357062306a36Sopenharmony_ci		err = req->r_err;
357162306a36Sopenharmony_ci	}
357262306a36Sopenharmony_ci
357362306a36Sopenharmony_ci	mutex_unlock(&mdsc->mutex);
357462306a36Sopenharmony_ci	return err;
357562306a36Sopenharmony_ci}
357662306a36Sopenharmony_ci
357762306a36Sopenharmony_ci/*
357862306a36Sopenharmony_ci * Synchrously perform an mds request.  Take care of all of the
357962306a36Sopenharmony_ci * session setup, forwarding, retry details.
358062306a36Sopenharmony_ci */
358162306a36Sopenharmony_ciint ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
358262306a36Sopenharmony_ci			 struct inode *dir,
358362306a36Sopenharmony_ci			 struct ceph_mds_request *req)
358462306a36Sopenharmony_ci{
358562306a36Sopenharmony_ci	int err;
358662306a36Sopenharmony_ci
358762306a36Sopenharmony_ci	dout("do_request on %p\n", req);
358862306a36Sopenharmony_ci
358962306a36Sopenharmony_ci	/* issue */
359062306a36Sopenharmony_ci	err = ceph_mdsc_submit_request(mdsc, dir, req);
359162306a36Sopenharmony_ci	if (!err)
359262306a36Sopenharmony_ci		err = ceph_mdsc_wait_request(mdsc, req, NULL);
359362306a36Sopenharmony_ci	dout("do_request %p done, result %d\n", req, err);
359462306a36Sopenharmony_ci	return err;
359562306a36Sopenharmony_ci}
359662306a36Sopenharmony_ci
359762306a36Sopenharmony_ci/*
359862306a36Sopenharmony_ci * Invalidate dir's completeness, dentry lease state on an aborted MDS
359962306a36Sopenharmony_ci * namespace request.
360062306a36Sopenharmony_ci */
360162306a36Sopenharmony_civoid ceph_invalidate_dir_request(struct ceph_mds_request *req)
360262306a36Sopenharmony_ci{
360362306a36Sopenharmony_ci	struct inode *dir = req->r_parent;
360462306a36Sopenharmony_ci	struct inode *old_dir = req->r_old_dentry_dir;
360562306a36Sopenharmony_ci
360662306a36Sopenharmony_ci	dout("invalidate_dir_request %p %p (complete, lease(s))\n", dir, old_dir);
360762306a36Sopenharmony_ci
360862306a36Sopenharmony_ci	ceph_dir_clear_complete(dir);
360962306a36Sopenharmony_ci	if (old_dir)
361062306a36Sopenharmony_ci		ceph_dir_clear_complete(old_dir);
361162306a36Sopenharmony_ci	if (req->r_dentry)
361262306a36Sopenharmony_ci		ceph_invalidate_dentry_lease(req->r_dentry);
361362306a36Sopenharmony_ci	if (req->r_old_dentry)
361462306a36Sopenharmony_ci		ceph_invalidate_dentry_lease(req->r_old_dentry);
361562306a36Sopenharmony_ci}
361662306a36Sopenharmony_ci
361762306a36Sopenharmony_ci/*
361862306a36Sopenharmony_ci * Handle mds reply.
361962306a36Sopenharmony_ci *
362062306a36Sopenharmony_ci * We take the session mutex and parse and process the reply immediately.
362162306a36Sopenharmony_ci * This preserves the logical ordering of replies, capabilities, etc., sent
362262306a36Sopenharmony_ci * by the MDS as they are applied to our local cache.
362362306a36Sopenharmony_ci */
362462306a36Sopenharmony_cistatic void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
362562306a36Sopenharmony_ci{
362662306a36Sopenharmony_ci	struct ceph_mds_client *mdsc = session->s_mdsc;
362762306a36Sopenharmony_ci	struct ceph_mds_request *req;
362862306a36Sopenharmony_ci	struct ceph_mds_reply_head *head = msg->front.iov_base;
362962306a36Sopenharmony_ci	struct ceph_mds_reply_info_parsed *rinfo;  /* parsed reply info */
363062306a36Sopenharmony_ci	struct ceph_snap_realm *realm;
363162306a36Sopenharmony_ci	u64 tid;
363262306a36Sopenharmony_ci	int err, result;
363362306a36Sopenharmony_ci	int mds = session->s_mds;
363462306a36Sopenharmony_ci	bool close_sessions = false;
363562306a36Sopenharmony_ci
363662306a36Sopenharmony_ci	if (msg->front.iov_len < sizeof(*head)) {
363762306a36Sopenharmony_ci		pr_err("mdsc_handle_reply got corrupt (short) reply\n");
363862306a36Sopenharmony_ci		ceph_msg_dump(msg);
363962306a36Sopenharmony_ci		return;
364062306a36Sopenharmony_ci	}
364162306a36Sopenharmony_ci
364262306a36Sopenharmony_ci	/* get request, session */
364362306a36Sopenharmony_ci	tid = le64_to_cpu(msg->hdr.tid);
364462306a36Sopenharmony_ci	mutex_lock(&mdsc->mutex);
364562306a36Sopenharmony_ci	req = lookup_get_request(mdsc, tid);
364662306a36Sopenharmony_ci	if (!req) {
364762306a36Sopenharmony_ci		dout("handle_reply on unknown tid %llu\n", tid);
364862306a36Sopenharmony_ci		mutex_unlock(&mdsc->mutex);
364962306a36Sopenharmony_ci		return;
365062306a36Sopenharmony_ci	}
365162306a36Sopenharmony_ci	dout("handle_reply %p\n", req);
365262306a36Sopenharmony_ci
365362306a36Sopenharmony_ci	/* correct session? */
365462306a36Sopenharmony_ci	if (req->r_session != session) {
365562306a36Sopenharmony_ci		pr_err("mdsc_handle_reply got %llu on session mds%d"
365662306a36Sopenharmony_ci		       " not mds%d\n", tid, session->s_mds,
365762306a36Sopenharmony_ci		       req->r_session ? req->r_session->s_mds : -1);
365862306a36Sopenharmony_ci		mutex_unlock(&mdsc->mutex);
365962306a36Sopenharmony_ci		goto out;
366062306a36Sopenharmony_ci	}
366162306a36Sopenharmony_ci
366262306a36Sopenharmony_ci	/* dup? */
366362306a36Sopenharmony_ci	if ((test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags) && !head->safe) ||
366462306a36Sopenharmony_ci	    (test_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags) && head->safe)) {
366562306a36Sopenharmony_ci		pr_warn("got a dup %s reply on %llu from mds%d\n",
366662306a36Sopenharmony_ci			   head->safe ? "safe" : "unsafe", tid, mds);
366762306a36Sopenharmony_ci		mutex_unlock(&mdsc->mutex);
366862306a36Sopenharmony_ci		goto out;
366962306a36Sopenharmony_ci	}
367062306a36Sopenharmony_ci	if (test_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags)) {
367162306a36Sopenharmony_ci		pr_warn("got unsafe after safe on %llu from mds%d\n",
367262306a36Sopenharmony_ci			   tid, mds);
367362306a36Sopenharmony_ci		mutex_unlock(&mdsc->mutex);
367462306a36Sopenharmony_ci		goto out;
367562306a36Sopenharmony_ci	}
367662306a36Sopenharmony_ci
367762306a36Sopenharmony_ci	result = le32_to_cpu(head->result);
367862306a36Sopenharmony_ci
367962306a36Sopenharmony_ci	if (head->safe) {
368062306a36Sopenharmony_ci		set_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags);
368162306a36Sopenharmony_ci		__unregister_request(mdsc, req);
368262306a36Sopenharmony_ci
368362306a36Sopenharmony_ci		/* last request during umount? */
368462306a36Sopenharmony_ci		if (mdsc->stopping && !__get_oldest_req(mdsc))
368562306a36Sopenharmony_ci			complete_all(&mdsc->safe_umount_waiters);
368662306a36Sopenharmony_ci
368762306a36Sopenharmony_ci		if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
368862306a36Sopenharmony_ci			/*
368962306a36Sopenharmony_ci			 * We already handled the unsafe response, now do the
369062306a36Sopenharmony_ci			 * cleanup.  No need to examine the response; the MDS
369162306a36Sopenharmony_ci			 * doesn't include any result info in the safe
369262306a36Sopenharmony_ci			 * response.  And even if it did, there is nothing
369362306a36Sopenharmony_ci			 * useful we could do with a revised return value.
369462306a36Sopenharmony_ci			 */
369562306a36Sopenharmony_ci			dout("got safe reply %llu, mds%d\n", tid, mds);
369662306a36Sopenharmony_ci
369762306a36Sopenharmony_ci			mutex_unlock(&mdsc->mutex);
369862306a36Sopenharmony_ci			goto out;
369962306a36Sopenharmony_ci		}
370062306a36Sopenharmony_ci	} else {
370162306a36Sopenharmony_ci		set_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags);
370262306a36Sopenharmony_ci		list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe);
370362306a36Sopenharmony_ci	}
370462306a36Sopenharmony_ci
370562306a36Sopenharmony_ci	dout("handle_reply tid %lld result %d\n", tid, result);
370662306a36Sopenharmony_ci	if (test_bit(CEPHFS_FEATURE_REPLY_ENCODING, &session->s_features))
370762306a36Sopenharmony_ci		err = parse_reply_info(session, msg, req, (u64)-1);
370862306a36Sopenharmony_ci	else
370962306a36Sopenharmony_ci		err = parse_reply_info(session, msg, req,
371062306a36Sopenharmony_ci				       session->s_con.peer_features);
371162306a36Sopenharmony_ci	mutex_unlock(&mdsc->mutex);
371262306a36Sopenharmony_ci
371362306a36Sopenharmony_ci	/* Must find target inode outside of mutexes to avoid deadlocks */
371462306a36Sopenharmony_ci	rinfo = &req->r_reply_info;
371562306a36Sopenharmony_ci	if ((err >= 0) && rinfo->head->is_target) {
371662306a36Sopenharmony_ci		struct inode *in = xchg(&req->r_new_inode, NULL);
371762306a36Sopenharmony_ci		struct ceph_vino tvino = {
371862306a36Sopenharmony_ci			.ino  = le64_to_cpu(rinfo->targeti.in->ino),
371962306a36Sopenharmony_ci			.snap = le64_to_cpu(rinfo->targeti.in->snapid)
372062306a36Sopenharmony_ci		};
372162306a36Sopenharmony_ci
372262306a36Sopenharmony_ci		/*
372362306a36Sopenharmony_ci		 * If we ended up opening an existing inode, discard
372462306a36Sopenharmony_ci		 * r_new_inode
372562306a36Sopenharmony_ci		 */
372662306a36Sopenharmony_ci		if (req->r_op == CEPH_MDS_OP_CREATE &&
372762306a36Sopenharmony_ci		    !req->r_reply_info.has_create_ino) {
372862306a36Sopenharmony_ci			/* This should never happen on an async create */
372962306a36Sopenharmony_ci			WARN_ON_ONCE(req->r_deleg_ino);
373062306a36Sopenharmony_ci			iput(in);
373162306a36Sopenharmony_ci			in = NULL;
373262306a36Sopenharmony_ci		}
373362306a36Sopenharmony_ci
373462306a36Sopenharmony_ci		in = ceph_get_inode(mdsc->fsc->sb, tvino, in);
373562306a36Sopenharmony_ci		if (IS_ERR(in)) {
373662306a36Sopenharmony_ci			err = PTR_ERR(in);
373762306a36Sopenharmony_ci			mutex_lock(&session->s_mutex);
373862306a36Sopenharmony_ci			goto out_err;
373962306a36Sopenharmony_ci		}
374062306a36Sopenharmony_ci		req->r_target_inode = in;
374162306a36Sopenharmony_ci	}
374262306a36Sopenharmony_ci
374362306a36Sopenharmony_ci	mutex_lock(&session->s_mutex);
374462306a36Sopenharmony_ci	if (err < 0) {
374562306a36Sopenharmony_ci		pr_err("mdsc_handle_reply got corrupt reply mds%d(tid:%lld)\n", mds, tid);
374662306a36Sopenharmony_ci		ceph_msg_dump(msg);
374762306a36Sopenharmony_ci		goto out_err;
374862306a36Sopenharmony_ci	}
374962306a36Sopenharmony_ci
375062306a36Sopenharmony_ci	/* snap trace */
375162306a36Sopenharmony_ci	realm = NULL;
375262306a36Sopenharmony_ci	if (rinfo->snapblob_len) {
375362306a36Sopenharmony_ci		down_write(&mdsc->snap_rwsem);
375462306a36Sopenharmony_ci		err = ceph_update_snap_trace(mdsc, rinfo->snapblob,
375562306a36Sopenharmony_ci				rinfo->snapblob + rinfo->snapblob_len,
375662306a36Sopenharmony_ci				le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP,
375762306a36Sopenharmony_ci				&realm);
375862306a36Sopenharmony_ci		if (err) {
375962306a36Sopenharmony_ci			up_write(&mdsc->snap_rwsem);
376062306a36Sopenharmony_ci			close_sessions = true;
376162306a36Sopenharmony_ci			if (err == -EIO)
376262306a36Sopenharmony_ci				ceph_msg_dump(msg);
376362306a36Sopenharmony_ci			goto out_err;
376462306a36Sopenharmony_ci		}
376562306a36Sopenharmony_ci		downgrade_write(&mdsc->snap_rwsem);
376662306a36Sopenharmony_ci	} else {
376762306a36Sopenharmony_ci		down_read(&mdsc->snap_rwsem);
376862306a36Sopenharmony_ci	}
376962306a36Sopenharmony_ci
377062306a36Sopenharmony_ci	/* insert trace into our cache */
377162306a36Sopenharmony_ci	mutex_lock(&req->r_fill_mutex);
377262306a36Sopenharmony_ci	current->journal_info = req;
377362306a36Sopenharmony_ci	err = ceph_fill_trace(mdsc->fsc->sb, req);
377462306a36Sopenharmony_ci	if (err == 0) {
377562306a36Sopenharmony_ci		if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR ||
377662306a36Sopenharmony_ci				    req->r_op == CEPH_MDS_OP_LSSNAP))
377762306a36Sopenharmony_ci			err = ceph_readdir_prepopulate(req, req->r_session);
377862306a36Sopenharmony_ci	}
377962306a36Sopenharmony_ci	current->journal_info = NULL;
378062306a36Sopenharmony_ci	mutex_unlock(&req->r_fill_mutex);
378162306a36Sopenharmony_ci
378262306a36Sopenharmony_ci	up_read(&mdsc->snap_rwsem);
378362306a36Sopenharmony_ci	if (realm)
378462306a36Sopenharmony_ci		ceph_put_snap_realm(mdsc, realm);
378562306a36Sopenharmony_ci
378662306a36Sopenharmony_ci	if (err == 0) {
378762306a36Sopenharmony_ci		if (req->r_target_inode &&
378862306a36Sopenharmony_ci		    test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
378962306a36Sopenharmony_ci			struct ceph_inode_info *ci =
379062306a36Sopenharmony_ci				ceph_inode(req->r_target_inode);
379162306a36Sopenharmony_ci			spin_lock(&ci->i_unsafe_lock);
379262306a36Sopenharmony_ci			list_add_tail(&req->r_unsafe_target_item,
379362306a36Sopenharmony_ci				      &ci->i_unsafe_iops);
379462306a36Sopenharmony_ci			spin_unlock(&ci->i_unsafe_lock);
379562306a36Sopenharmony_ci		}
379662306a36Sopenharmony_ci
379762306a36Sopenharmony_ci		ceph_unreserve_caps(mdsc, &req->r_caps_reservation);
379862306a36Sopenharmony_ci	}
379962306a36Sopenharmony_ciout_err:
380062306a36Sopenharmony_ci	mutex_lock(&mdsc->mutex);
380162306a36Sopenharmony_ci	if (!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
380262306a36Sopenharmony_ci		if (err) {
380362306a36Sopenharmony_ci			req->r_err = err;
380462306a36Sopenharmony_ci		} else {
380562306a36Sopenharmony_ci			req->r_reply =  ceph_msg_get(msg);
380662306a36Sopenharmony_ci			set_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags);
380762306a36Sopenharmony_ci		}
380862306a36Sopenharmony_ci	} else {
380962306a36Sopenharmony_ci		dout("reply arrived after request %lld was aborted\n", tid);
381062306a36Sopenharmony_ci	}
381162306a36Sopenharmony_ci	mutex_unlock(&mdsc->mutex);
381262306a36Sopenharmony_ci
381362306a36Sopenharmony_ci	mutex_unlock(&session->s_mutex);
381462306a36Sopenharmony_ci
381562306a36Sopenharmony_ci	/* kick calling process */
381662306a36Sopenharmony_ci	complete_request(mdsc, req);
381762306a36Sopenharmony_ci
381862306a36Sopenharmony_ci	ceph_update_metadata_metrics(&mdsc->metric, req->r_start_latency,
381962306a36Sopenharmony_ci				     req->r_end_latency, err);
382062306a36Sopenharmony_ciout:
382162306a36Sopenharmony_ci	ceph_mdsc_put_request(req);
382262306a36Sopenharmony_ci
382362306a36Sopenharmony_ci	/* Defer closing the sessions after s_mutex lock being released */
382462306a36Sopenharmony_ci	if (close_sessions)
382562306a36Sopenharmony_ci		ceph_mdsc_close_sessions(mdsc);
382662306a36Sopenharmony_ci	return;
382762306a36Sopenharmony_ci}
382862306a36Sopenharmony_ci
382962306a36Sopenharmony_ci
383062306a36Sopenharmony_ci
383162306a36Sopenharmony_ci/*
383262306a36Sopenharmony_ci * handle mds notification that our request has been forwarded.
383362306a36Sopenharmony_ci */
383462306a36Sopenharmony_cistatic void handle_forward(struct ceph_mds_client *mdsc,
383562306a36Sopenharmony_ci			   struct ceph_mds_session *session,
383662306a36Sopenharmony_ci			   struct ceph_msg *msg)
383762306a36Sopenharmony_ci{
383862306a36Sopenharmony_ci	struct ceph_mds_request *req;
383962306a36Sopenharmony_ci	u64 tid = le64_to_cpu(msg->hdr.tid);
384062306a36Sopenharmony_ci	u32 next_mds;
384162306a36Sopenharmony_ci	u32 fwd_seq;
384262306a36Sopenharmony_ci	int err = -EINVAL;
384362306a36Sopenharmony_ci	void *p = msg->front.iov_base;
384462306a36Sopenharmony_ci	void *end = p + msg->front.iov_len;
384562306a36Sopenharmony_ci	bool aborted = false;
384662306a36Sopenharmony_ci
384762306a36Sopenharmony_ci	ceph_decode_need(&p, end, 2*sizeof(u32), bad);
384862306a36Sopenharmony_ci	next_mds = ceph_decode_32(&p);
384962306a36Sopenharmony_ci	fwd_seq = ceph_decode_32(&p);
385062306a36Sopenharmony_ci
385162306a36Sopenharmony_ci	mutex_lock(&mdsc->mutex);
385262306a36Sopenharmony_ci	req = lookup_get_request(mdsc, tid);
385362306a36Sopenharmony_ci	if (!req) {
385462306a36Sopenharmony_ci		mutex_unlock(&mdsc->mutex);
385562306a36Sopenharmony_ci		dout("forward tid %llu to mds%d - req dne\n", tid, next_mds);
385662306a36Sopenharmony_ci		return;  /* dup reply? */
385762306a36Sopenharmony_ci	}
385862306a36Sopenharmony_ci
385962306a36Sopenharmony_ci	if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
386062306a36Sopenharmony_ci		dout("forward tid %llu aborted, unregistering\n", tid);
386162306a36Sopenharmony_ci		__unregister_request(mdsc, req);
386262306a36Sopenharmony_ci	} else if (fwd_seq <= req->r_num_fwd || (uint32_t)fwd_seq >= U32_MAX) {
386362306a36Sopenharmony_ci		/*
386462306a36Sopenharmony_ci		 * Avoid inifinite retrying after overflow.
386562306a36Sopenharmony_ci		 *
386662306a36Sopenharmony_ci		 * The MDS will increase the fwd count and in client side
386762306a36Sopenharmony_ci		 * if the num_fwd is less than the one saved in request
386862306a36Sopenharmony_ci		 * that means the MDS is an old version and overflowed of
386962306a36Sopenharmony_ci		 * 8 bits.
387062306a36Sopenharmony_ci		 */
387162306a36Sopenharmony_ci		mutex_lock(&req->r_fill_mutex);
387262306a36Sopenharmony_ci		req->r_err = -EMULTIHOP;
387362306a36Sopenharmony_ci		set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
387462306a36Sopenharmony_ci		mutex_unlock(&req->r_fill_mutex);
387562306a36Sopenharmony_ci		aborted = true;
387662306a36Sopenharmony_ci		pr_warn_ratelimited("forward tid %llu seq overflow\n", tid);
387762306a36Sopenharmony_ci	} else {
387862306a36Sopenharmony_ci		/* resend. forward race not possible; mds would drop */
387962306a36Sopenharmony_ci		dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds);
388062306a36Sopenharmony_ci		BUG_ON(req->r_err);
388162306a36Sopenharmony_ci		BUG_ON(test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags));
388262306a36Sopenharmony_ci		req->r_attempts = 0;
388362306a36Sopenharmony_ci		req->r_num_fwd = fwd_seq;
388462306a36Sopenharmony_ci		req->r_resend_mds = next_mds;
388562306a36Sopenharmony_ci		put_request_session(req);
388662306a36Sopenharmony_ci		__do_request(mdsc, req);
388762306a36Sopenharmony_ci	}
388862306a36Sopenharmony_ci	mutex_unlock(&mdsc->mutex);
388962306a36Sopenharmony_ci
389062306a36Sopenharmony_ci	/* kick calling process */
389162306a36Sopenharmony_ci	if (aborted)
389262306a36Sopenharmony_ci		complete_request(mdsc, req);
389362306a36Sopenharmony_ci	ceph_mdsc_put_request(req);
389462306a36Sopenharmony_ci	return;
389562306a36Sopenharmony_ci
389662306a36Sopenharmony_cibad:
389762306a36Sopenharmony_ci	pr_err("mdsc_handle_forward decode error err=%d\n", err);
389862306a36Sopenharmony_ci	ceph_msg_dump(msg);
389962306a36Sopenharmony_ci}
390062306a36Sopenharmony_ci
390162306a36Sopenharmony_cistatic int __decode_session_metadata(void **p, void *end,
390262306a36Sopenharmony_ci				     bool *blocklisted)
390362306a36Sopenharmony_ci{
390462306a36Sopenharmony_ci	/* map<string,string> */
390562306a36Sopenharmony_ci	u32 n;
390662306a36Sopenharmony_ci	bool err_str;
390762306a36Sopenharmony_ci	ceph_decode_32_safe(p, end, n, bad);
390862306a36Sopenharmony_ci	while (n-- > 0) {
390962306a36Sopenharmony_ci		u32 len;
391062306a36Sopenharmony_ci		ceph_decode_32_safe(p, end, len, bad);
391162306a36Sopenharmony_ci		ceph_decode_need(p, end, len, bad);
391262306a36Sopenharmony_ci		err_str = !strncmp(*p, "error_string", len);
391362306a36Sopenharmony_ci		*p += len;
391462306a36Sopenharmony_ci		ceph_decode_32_safe(p, end, len, bad);
391562306a36Sopenharmony_ci		ceph_decode_need(p, end, len, bad);
391662306a36Sopenharmony_ci		/*
391762306a36Sopenharmony_ci		 * Match "blocklisted (blacklisted)" from newer MDSes,
391862306a36Sopenharmony_ci		 * or "blacklisted" from older MDSes.
391962306a36Sopenharmony_ci		 */
392062306a36Sopenharmony_ci		if (err_str && strnstr(*p, "blacklisted", len))
392162306a36Sopenharmony_ci			*blocklisted = true;
392262306a36Sopenharmony_ci		*p += len;
392362306a36Sopenharmony_ci	}
392462306a36Sopenharmony_ci	return 0;
392562306a36Sopenharmony_cibad:
392662306a36Sopenharmony_ci	return -1;
392762306a36Sopenharmony_ci}
392862306a36Sopenharmony_ci
392962306a36Sopenharmony_ci/*
393062306a36Sopenharmony_ci * handle a mds session control message
393162306a36Sopenharmony_ci */
393262306a36Sopenharmony_cistatic void handle_session(struct ceph_mds_session *session,
393362306a36Sopenharmony_ci			   struct ceph_msg *msg)
393462306a36Sopenharmony_ci{
393562306a36Sopenharmony_ci	struct ceph_mds_client *mdsc = session->s_mdsc;
393662306a36Sopenharmony_ci	int mds = session->s_mds;
393762306a36Sopenharmony_ci	int msg_version = le16_to_cpu(msg->hdr.version);
393862306a36Sopenharmony_ci	void *p = msg->front.iov_base;
393962306a36Sopenharmony_ci	void *end = p + msg->front.iov_len;
394062306a36Sopenharmony_ci	struct ceph_mds_session_head *h;
394162306a36Sopenharmony_ci	u32 op;
394262306a36Sopenharmony_ci	u64 seq, features = 0;
394362306a36Sopenharmony_ci	int wake = 0;
394462306a36Sopenharmony_ci	bool blocklisted = false;
394562306a36Sopenharmony_ci
394662306a36Sopenharmony_ci	/* decode */
394762306a36Sopenharmony_ci	ceph_decode_need(&p, end, sizeof(*h), bad);
394862306a36Sopenharmony_ci	h = p;
394962306a36Sopenharmony_ci	p += sizeof(*h);
395062306a36Sopenharmony_ci
395162306a36Sopenharmony_ci	op = le32_to_cpu(h->op);
395262306a36Sopenharmony_ci	seq = le64_to_cpu(h->seq);
395362306a36Sopenharmony_ci
395462306a36Sopenharmony_ci	if (msg_version >= 3) {
395562306a36Sopenharmony_ci		u32 len;
395662306a36Sopenharmony_ci		/* version >= 2 and < 5, decode metadata, skip otherwise
395762306a36Sopenharmony_ci		 * as it's handled via flags.
395862306a36Sopenharmony_ci		 */
395962306a36Sopenharmony_ci		if (msg_version >= 5)
396062306a36Sopenharmony_ci			ceph_decode_skip_map(&p, end, string, string, bad);
396162306a36Sopenharmony_ci		else if (__decode_session_metadata(&p, end, &blocklisted) < 0)
396262306a36Sopenharmony_ci			goto bad;
396362306a36Sopenharmony_ci
396462306a36Sopenharmony_ci		/* version >= 3, feature bits */
396562306a36Sopenharmony_ci		ceph_decode_32_safe(&p, end, len, bad);
396662306a36Sopenharmony_ci		if (len) {
396762306a36Sopenharmony_ci			ceph_decode_64_safe(&p, end, features, bad);
396862306a36Sopenharmony_ci			p += len - sizeof(features);
396962306a36Sopenharmony_ci		}
397062306a36Sopenharmony_ci	}
397162306a36Sopenharmony_ci
397262306a36Sopenharmony_ci	if (msg_version >= 5) {
397362306a36Sopenharmony_ci		u32 flags, len;
397462306a36Sopenharmony_ci
397562306a36Sopenharmony_ci		/* version >= 4 */
397662306a36Sopenharmony_ci		ceph_decode_skip_16(&p, end, bad); /* struct_v, struct_cv */
397762306a36Sopenharmony_ci		ceph_decode_32_safe(&p, end, len, bad); /* len */
397862306a36Sopenharmony_ci		ceph_decode_skip_n(&p, end, len, bad); /* metric_spec */
397962306a36Sopenharmony_ci
398062306a36Sopenharmony_ci		/* version >= 5, flags   */
398162306a36Sopenharmony_ci		ceph_decode_32_safe(&p, end, flags, bad);
398262306a36Sopenharmony_ci		if (flags & CEPH_SESSION_BLOCKLISTED) {
398362306a36Sopenharmony_ci			pr_warn("mds%d session blocklisted\n", session->s_mds);
398462306a36Sopenharmony_ci			blocklisted = true;
398562306a36Sopenharmony_ci		}
398662306a36Sopenharmony_ci	}
398762306a36Sopenharmony_ci
398862306a36Sopenharmony_ci	mutex_lock(&mdsc->mutex);
398962306a36Sopenharmony_ci	if (op == CEPH_SESSION_CLOSE) {
399062306a36Sopenharmony_ci		ceph_get_mds_session(session);
399162306a36Sopenharmony_ci		__unregister_session(mdsc, session);
399262306a36Sopenharmony_ci	}
399362306a36Sopenharmony_ci	/* FIXME: this ttl calculation is generous */
399462306a36Sopenharmony_ci	session->s_ttl = jiffies + HZ*mdsc->mdsmap->m_session_autoclose;
399562306a36Sopenharmony_ci	mutex_unlock(&mdsc->mutex);
399662306a36Sopenharmony_ci
399762306a36Sopenharmony_ci	mutex_lock(&session->s_mutex);
399862306a36Sopenharmony_ci
399962306a36Sopenharmony_ci	dout("handle_session mds%d %s %p state %s seq %llu\n",
400062306a36Sopenharmony_ci	     mds, ceph_session_op_name(op), session,
400162306a36Sopenharmony_ci	     ceph_session_state_name(session->s_state), seq);
400262306a36Sopenharmony_ci
400362306a36Sopenharmony_ci	if (session->s_state == CEPH_MDS_SESSION_HUNG) {
400462306a36Sopenharmony_ci		session->s_state = CEPH_MDS_SESSION_OPEN;
400562306a36Sopenharmony_ci		pr_info("mds%d came back\n", session->s_mds);
400662306a36Sopenharmony_ci	}
400762306a36Sopenharmony_ci
400862306a36Sopenharmony_ci	switch (op) {
400962306a36Sopenharmony_ci	case CEPH_SESSION_OPEN:
401062306a36Sopenharmony_ci		if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
401162306a36Sopenharmony_ci			pr_info("mds%d reconnect success\n", session->s_mds);
401262306a36Sopenharmony_ci
401362306a36Sopenharmony_ci		session->s_features = features;
401462306a36Sopenharmony_ci		if (session->s_state == CEPH_MDS_SESSION_OPEN) {
401562306a36Sopenharmony_ci			pr_notice("mds%d is already opened\n", session->s_mds);
401662306a36Sopenharmony_ci		} else {
401762306a36Sopenharmony_ci			session->s_state = CEPH_MDS_SESSION_OPEN;
401862306a36Sopenharmony_ci			renewed_caps(mdsc, session, 0);
401962306a36Sopenharmony_ci			if (test_bit(CEPHFS_FEATURE_METRIC_COLLECT,
402062306a36Sopenharmony_ci				     &session->s_features))
402162306a36Sopenharmony_ci				metric_schedule_delayed(&mdsc->metric);
402262306a36Sopenharmony_ci		}
402362306a36Sopenharmony_ci
402462306a36Sopenharmony_ci		/*
402562306a36Sopenharmony_ci		 * The connection maybe broken and the session in client
402662306a36Sopenharmony_ci		 * side has been reinitialized, need to update the seq
402762306a36Sopenharmony_ci		 * anyway.
402862306a36Sopenharmony_ci		 */
402962306a36Sopenharmony_ci		if (!session->s_seq && seq)
403062306a36Sopenharmony_ci			session->s_seq = seq;
403162306a36Sopenharmony_ci
403262306a36Sopenharmony_ci		wake = 1;
403362306a36Sopenharmony_ci		if (mdsc->stopping)
403462306a36Sopenharmony_ci			__close_session(mdsc, session);
403562306a36Sopenharmony_ci		break;
403662306a36Sopenharmony_ci
403762306a36Sopenharmony_ci	case CEPH_SESSION_RENEWCAPS:
403862306a36Sopenharmony_ci		if (session->s_renew_seq == seq)
403962306a36Sopenharmony_ci			renewed_caps(mdsc, session, 1);
404062306a36Sopenharmony_ci		break;
404162306a36Sopenharmony_ci
404262306a36Sopenharmony_ci	case CEPH_SESSION_CLOSE:
404362306a36Sopenharmony_ci		if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
404462306a36Sopenharmony_ci			pr_info("mds%d reconnect denied\n", session->s_mds);
404562306a36Sopenharmony_ci		session->s_state = CEPH_MDS_SESSION_CLOSED;
404662306a36Sopenharmony_ci		cleanup_session_requests(mdsc, session);
404762306a36Sopenharmony_ci		remove_session_caps(session);
404862306a36Sopenharmony_ci		wake = 2; /* for good measure */
404962306a36Sopenharmony_ci		wake_up_all(&mdsc->session_close_wq);
405062306a36Sopenharmony_ci		break;
405162306a36Sopenharmony_ci
405262306a36Sopenharmony_ci	case CEPH_SESSION_STALE:
405362306a36Sopenharmony_ci		pr_info("mds%d caps went stale, renewing\n",
405462306a36Sopenharmony_ci			session->s_mds);
405562306a36Sopenharmony_ci		atomic_inc(&session->s_cap_gen);
405662306a36Sopenharmony_ci		session->s_cap_ttl = jiffies - 1;
405762306a36Sopenharmony_ci		send_renew_caps(mdsc, session);
405862306a36Sopenharmony_ci		break;
405962306a36Sopenharmony_ci
406062306a36Sopenharmony_ci	case CEPH_SESSION_RECALL_STATE:
406162306a36Sopenharmony_ci		ceph_trim_caps(mdsc, session, le32_to_cpu(h->max_caps));
406262306a36Sopenharmony_ci		break;
406362306a36Sopenharmony_ci
406462306a36Sopenharmony_ci	case CEPH_SESSION_FLUSHMSG:
406562306a36Sopenharmony_ci		/* flush cap releases */
406662306a36Sopenharmony_ci		spin_lock(&session->s_cap_lock);
406762306a36Sopenharmony_ci		if (session->s_num_cap_releases)
406862306a36Sopenharmony_ci			ceph_flush_cap_releases(mdsc, session);
406962306a36Sopenharmony_ci		spin_unlock(&session->s_cap_lock);
407062306a36Sopenharmony_ci
407162306a36Sopenharmony_ci		send_flushmsg_ack(mdsc, session, seq);
407262306a36Sopenharmony_ci		break;
407362306a36Sopenharmony_ci
407462306a36Sopenharmony_ci	case CEPH_SESSION_FORCE_RO:
407562306a36Sopenharmony_ci		dout("force_session_readonly %p\n", session);
407662306a36Sopenharmony_ci		spin_lock(&session->s_cap_lock);
407762306a36Sopenharmony_ci		session->s_readonly = true;
407862306a36Sopenharmony_ci		spin_unlock(&session->s_cap_lock);
407962306a36Sopenharmony_ci		wake_up_session_caps(session, FORCE_RO);
408062306a36Sopenharmony_ci		break;
408162306a36Sopenharmony_ci
408262306a36Sopenharmony_ci	case CEPH_SESSION_REJECT:
408362306a36Sopenharmony_ci		WARN_ON(session->s_state != CEPH_MDS_SESSION_OPENING);
408462306a36Sopenharmony_ci		pr_info("mds%d rejected session\n", session->s_mds);
408562306a36Sopenharmony_ci		session->s_state = CEPH_MDS_SESSION_REJECTED;
408662306a36Sopenharmony_ci		cleanup_session_requests(mdsc, session);
408762306a36Sopenharmony_ci		remove_session_caps(session);
408862306a36Sopenharmony_ci		if (blocklisted)
408962306a36Sopenharmony_ci			mdsc->fsc->blocklisted = true;
409062306a36Sopenharmony_ci		wake = 2; /* for good measure */
409162306a36Sopenharmony_ci		break;
409262306a36Sopenharmony_ci
409362306a36Sopenharmony_ci	default:
409462306a36Sopenharmony_ci		pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds);
409562306a36Sopenharmony_ci		WARN_ON(1);
409662306a36Sopenharmony_ci	}
409762306a36Sopenharmony_ci
409862306a36Sopenharmony_ci	mutex_unlock(&session->s_mutex);
409962306a36Sopenharmony_ci	if (wake) {
410062306a36Sopenharmony_ci		mutex_lock(&mdsc->mutex);
410162306a36Sopenharmony_ci		__wake_requests(mdsc, &session->s_waiting);
410262306a36Sopenharmony_ci		if (wake == 2)
410362306a36Sopenharmony_ci			kick_requests(mdsc, mds);
410462306a36Sopenharmony_ci		mutex_unlock(&mdsc->mutex);
410562306a36Sopenharmony_ci	}
410662306a36Sopenharmony_ci	if (op == CEPH_SESSION_CLOSE)
410762306a36Sopenharmony_ci		ceph_put_mds_session(session);
410862306a36Sopenharmony_ci	return;
410962306a36Sopenharmony_ci
411062306a36Sopenharmony_cibad:
411162306a36Sopenharmony_ci	pr_err("mdsc_handle_session corrupt message mds%d len %d\n", mds,
411262306a36Sopenharmony_ci	       (int)msg->front.iov_len);
411362306a36Sopenharmony_ci	ceph_msg_dump(msg);
411462306a36Sopenharmony_ci	return;
411562306a36Sopenharmony_ci}
411662306a36Sopenharmony_ci
411762306a36Sopenharmony_civoid ceph_mdsc_release_dir_caps(struct ceph_mds_request *req)
411862306a36Sopenharmony_ci{
411962306a36Sopenharmony_ci	int dcaps;
412062306a36Sopenharmony_ci
412162306a36Sopenharmony_ci	dcaps = xchg(&req->r_dir_caps, 0);
412262306a36Sopenharmony_ci	if (dcaps) {
412362306a36Sopenharmony_ci		dout("releasing r_dir_caps=%s\n", ceph_cap_string(dcaps));
412462306a36Sopenharmony_ci		ceph_put_cap_refs(ceph_inode(req->r_parent), dcaps);
412562306a36Sopenharmony_ci	}
412662306a36Sopenharmony_ci}
412762306a36Sopenharmony_ci
412862306a36Sopenharmony_civoid ceph_mdsc_release_dir_caps_no_check(struct ceph_mds_request *req)
412962306a36Sopenharmony_ci{
413062306a36Sopenharmony_ci	int dcaps;
413162306a36Sopenharmony_ci
413262306a36Sopenharmony_ci	dcaps = xchg(&req->r_dir_caps, 0);
413362306a36Sopenharmony_ci	if (dcaps) {
413462306a36Sopenharmony_ci		dout("releasing r_dir_caps=%s\n", ceph_cap_string(dcaps));
413562306a36Sopenharmony_ci		ceph_put_cap_refs_no_check_caps(ceph_inode(req->r_parent),
413662306a36Sopenharmony_ci						dcaps);
413762306a36Sopenharmony_ci	}
413862306a36Sopenharmony_ci}
413962306a36Sopenharmony_ci
414062306a36Sopenharmony_ci/*
414162306a36Sopenharmony_ci * called under session->mutex.
414262306a36Sopenharmony_ci */
414362306a36Sopenharmony_cistatic void replay_unsafe_requests(struct ceph_mds_client *mdsc,
414462306a36Sopenharmony_ci				   struct ceph_mds_session *session)
414562306a36Sopenharmony_ci{
414662306a36Sopenharmony_ci	struct ceph_mds_request *req, *nreq;
414762306a36Sopenharmony_ci	struct rb_node *p;
414862306a36Sopenharmony_ci
414962306a36Sopenharmony_ci	dout("replay_unsafe_requests mds%d\n", session->s_mds);
415062306a36Sopenharmony_ci
415162306a36Sopenharmony_ci	mutex_lock(&mdsc->mutex);
415262306a36Sopenharmony_ci	list_for_each_entry_safe(req, nreq, &session->s_unsafe, r_unsafe_item)
415362306a36Sopenharmony_ci		__send_request(session, req, true);
415462306a36Sopenharmony_ci
415562306a36Sopenharmony_ci	/*
415662306a36Sopenharmony_ci	 * also re-send old requests when MDS enters reconnect stage. So that MDS
415762306a36Sopenharmony_ci	 * can process completed request in clientreplay stage.
415862306a36Sopenharmony_ci	 */
415962306a36Sopenharmony_ci	p = rb_first(&mdsc->request_tree);
416062306a36Sopenharmony_ci	while (p) {
416162306a36Sopenharmony_ci		req = rb_entry(p, struct ceph_mds_request, r_node);
416262306a36Sopenharmony_ci		p = rb_next(p);
416362306a36Sopenharmony_ci		if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags))
416462306a36Sopenharmony_ci			continue;
416562306a36Sopenharmony_ci		if (req->r_attempts == 0)
416662306a36Sopenharmony_ci			continue; /* only old requests */
416762306a36Sopenharmony_ci		if (!req->r_session)
416862306a36Sopenharmony_ci			continue;
416962306a36Sopenharmony_ci		if (req->r_session->s_mds != session->s_mds)
417062306a36Sopenharmony_ci			continue;
417162306a36Sopenharmony_ci
417262306a36Sopenharmony_ci		ceph_mdsc_release_dir_caps_no_check(req);
417362306a36Sopenharmony_ci
417462306a36Sopenharmony_ci		__send_request(session, req, true);
417562306a36Sopenharmony_ci	}
417662306a36Sopenharmony_ci	mutex_unlock(&mdsc->mutex);
417762306a36Sopenharmony_ci}
417862306a36Sopenharmony_ci
417962306a36Sopenharmony_cistatic int send_reconnect_partial(struct ceph_reconnect_state *recon_state)
418062306a36Sopenharmony_ci{
418162306a36Sopenharmony_ci	struct ceph_msg *reply;
418262306a36Sopenharmony_ci	struct ceph_pagelist *_pagelist;
418362306a36Sopenharmony_ci	struct page *page;
418462306a36Sopenharmony_ci	__le32 *addr;
418562306a36Sopenharmony_ci	int err = -ENOMEM;
418662306a36Sopenharmony_ci
418762306a36Sopenharmony_ci	if (!recon_state->allow_multi)
418862306a36Sopenharmony_ci		return -ENOSPC;
418962306a36Sopenharmony_ci
419062306a36Sopenharmony_ci	/* can't handle message that contains both caps and realm */
419162306a36Sopenharmony_ci	BUG_ON(!recon_state->nr_caps == !recon_state->nr_realms);
419262306a36Sopenharmony_ci
419362306a36Sopenharmony_ci	/* pre-allocate new pagelist */
419462306a36Sopenharmony_ci	_pagelist = ceph_pagelist_alloc(GFP_NOFS);
419562306a36Sopenharmony_ci	if (!_pagelist)
419662306a36Sopenharmony_ci		return -ENOMEM;
419762306a36Sopenharmony_ci
419862306a36Sopenharmony_ci	reply = ceph_msg_new2(CEPH_MSG_CLIENT_RECONNECT, 0, 1, GFP_NOFS, false);
419962306a36Sopenharmony_ci	if (!reply)
420062306a36Sopenharmony_ci		goto fail_msg;
420162306a36Sopenharmony_ci
420262306a36Sopenharmony_ci	/* placeholder for nr_caps */
420362306a36Sopenharmony_ci	err = ceph_pagelist_encode_32(_pagelist, 0);
420462306a36Sopenharmony_ci	if (err < 0)
420562306a36Sopenharmony_ci		goto fail;
420662306a36Sopenharmony_ci
420762306a36Sopenharmony_ci	if (recon_state->nr_caps) {
420862306a36Sopenharmony_ci		/* currently encoding caps */
420962306a36Sopenharmony_ci		err = ceph_pagelist_encode_32(recon_state->pagelist, 0);
421062306a36Sopenharmony_ci		if (err)
421162306a36Sopenharmony_ci			goto fail;
421262306a36Sopenharmony_ci	} else {
421362306a36Sopenharmony_ci		/* placeholder for nr_realms (currently encoding relams) */
421462306a36Sopenharmony_ci		err = ceph_pagelist_encode_32(_pagelist, 0);
421562306a36Sopenharmony_ci		if (err < 0)
421662306a36Sopenharmony_ci			goto fail;
421762306a36Sopenharmony_ci	}
421862306a36Sopenharmony_ci
421962306a36Sopenharmony_ci	err = ceph_pagelist_encode_8(recon_state->pagelist, 1);
422062306a36Sopenharmony_ci	if (err)
422162306a36Sopenharmony_ci		goto fail;
422262306a36Sopenharmony_ci
422362306a36Sopenharmony_ci	page = list_first_entry(&recon_state->pagelist->head, struct page, lru);
422462306a36Sopenharmony_ci	addr = kmap_atomic(page);
422562306a36Sopenharmony_ci	if (recon_state->nr_caps) {
422662306a36Sopenharmony_ci		/* currently encoding caps */
422762306a36Sopenharmony_ci		*addr = cpu_to_le32(recon_state->nr_caps);
422862306a36Sopenharmony_ci	} else {
422962306a36Sopenharmony_ci		/* currently encoding relams */
423062306a36Sopenharmony_ci		*(addr + 1) = cpu_to_le32(recon_state->nr_realms);
423162306a36Sopenharmony_ci	}
423262306a36Sopenharmony_ci	kunmap_atomic(addr);
423362306a36Sopenharmony_ci
423462306a36Sopenharmony_ci	reply->hdr.version = cpu_to_le16(5);
423562306a36Sopenharmony_ci	reply->hdr.compat_version = cpu_to_le16(4);
423662306a36Sopenharmony_ci
423762306a36Sopenharmony_ci	reply->hdr.data_len = cpu_to_le32(recon_state->pagelist->length);
423862306a36Sopenharmony_ci	ceph_msg_data_add_pagelist(reply, recon_state->pagelist);
423962306a36Sopenharmony_ci
424062306a36Sopenharmony_ci	ceph_con_send(&recon_state->session->s_con, reply);
424162306a36Sopenharmony_ci	ceph_pagelist_release(recon_state->pagelist);
424262306a36Sopenharmony_ci
424362306a36Sopenharmony_ci	recon_state->pagelist = _pagelist;
424462306a36Sopenharmony_ci	recon_state->nr_caps = 0;
424562306a36Sopenharmony_ci	recon_state->nr_realms = 0;
424662306a36Sopenharmony_ci	recon_state->msg_version = 5;
424762306a36Sopenharmony_ci	return 0;
424862306a36Sopenharmony_cifail:
424962306a36Sopenharmony_ci	ceph_msg_put(reply);
425062306a36Sopenharmony_cifail_msg:
425162306a36Sopenharmony_ci	ceph_pagelist_release(_pagelist);
425262306a36Sopenharmony_ci	return err;
425362306a36Sopenharmony_ci}
425462306a36Sopenharmony_ci
425562306a36Sopenharmony_cistatic struct dentry* d_find_primary(struct inode *inode)
425662306a36Sopenharmony_ci{
425762306a36Sopenharmony_ci	struct dentry *alias, *dn = NULL;
425862306a36Sopenharmony_ci
425962306a36Sopenharmony_ci	if (hlist_empty(&inode->i_dentry))
426062306a36Sopenharmony_ci		return NULL;
426162306a36Sopenharmony_ci
426262306a36Sopenharmony_ci	spin_lock(&inode->i_lock);
426362306a36Sopenharmony_ci	if (hlist_empty(&inode->i_dentry))
426462306a36Sopenharmony_ci		goto out_unlock;
426562306a36Sopenharmony_ci
426662306a36Sopenharmony_ci	if (S_ISDIR(inode->i_mode)) {
426762306a36Sopenharmony_ci		alias = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias);
426862306a36Sopenharmony_ci		if (!IS_ROOT(alias))
426962306a36Sopenharmony_ci			dn = dget(alias);
427062306a36Sopenharmony_ci		goto out_unlock;
427162306a36Sopenharmony_ci	}
427262306a36Sopenharmony_ci
427362306a36Sopenharmony_ci	hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
427462306a36Sopenharmony_ci		spin_lock(&alias->d_lock);
427562306a36Sopenharmony_ci		if (!d_unhashed(alias) &&
427662306a36Sopenharmony_ci		    (ceph_dentry(alias)->flags & CEPH_DENTRY_PRIMARY_LINK)) {
427762306a36Sopenharmony_ci			dn = dget_dlock(alias);
427862306a36Sopenharmony_ci		}
427962306a36Sopenharmony_ci		spin_unlock(&alias->d_lock);
428062306a36Sopenharmony_ci		if (dn)
428162306a36Sopenharmony_ci			break;
428262306a36Sopenharmony_ci	}
428362306a36Sopenharmony_ciout_unlock:
428462306a36Sopenharmony_ci	spin_unlock(&inode->i_lock);
428562306a36Sopenharmony_ci	return dn;
428662306a36Sopenharmony_ci}
428762306a36Sopenharmony_ci
428862306a36Sopenharmony_ci/*
428962306a36Sopenharmony_ci * Encode information about a cap for a reconnect with the MDS.
429062306a36Sopenharmony_ci */
429162306a36Sopenharmony_cistatic int reconnect_caps_cb(struct inode *inode, int mds, void *arg)
429262306a36Sopenharmony_ci{
429362306a36Sopenharmony_ci	union {
429462306a36Sopenharmony_ci		struct ceph_mds_cap_reconnect v2;
429562306a36Sopenharmony_ci		struct ceph_mds_cap_reconnect_v1 v1;
429662306a36Sopenharmony_ci	} rec;
429762306a36Sopenharmony_ci	struct ceph_inode_info *ci = ceph_inode(inode);
429862306a36Sopenharmony_ci	struct ceph_reconnect_state *recon_state = arg;
429962306a36Sopenharmony_ci	struct ceph_pagelist *pagelist = recon_state->pagelist;
430062306a36Sopenharmony_ci	struct dentry *dentry;
430162306a36Sopenharmony_ci	struct ceph_cap *cap;
430262306a36Sopenharmony_ci	char *path;
430362306a36Sopenharmony_ci	int pathlen = 0, err;
430462306a36Sopenharmony_ci	u64 pathbase;
430562306a36Sopenharmony_ci	u64 snap_follows;
430662306a36Sopenharmony_ci
430762306a36Sopenharmony_ci	dentry = d_find_primary(inode);
430862306a36Sopenharmony_ci	if (dentry) {
430962306a36Sopenharmony_ci		/* set pathbase to parent dir when msg_version >= 2 */
431062306a36Sopenharmony_ci		path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase,
431162306a36Sopenharmony_ci					    recon_state->msg_version >= 2);
431262306a36Sopenharmony_ci		dput(dentry);
431362306a36Sopenharmony_ci		if (IS_ERR(path)) {
431462306a36Sopenharmony_ci			err = PTR_ERR(path);
431562306a36Sopenharmony_ci			goto out_err;
431662306a36Sopenharmony_ci		}
431762306a36Sopenharmony_ci	} else {
431862306a36Sopenharmony_ci		path = NULL;
431962306a36Sopenharmony_ci		pathbase = 0;
432062306a36Sopenharmony_ci	}
432162306a36Sopenharmony_ci
432262306a36Sopenharmony_ci	spin_lock(&ci->i_ceph_lock);
432362306a36Sopenharmony_ci	cap = __get_cap_for_mds(ci, mds);
432462306a36Sopenharmony_ci	if (!cap) {
432562306a36Sopenharmony_ci		spin_unlock(&ci->i_ceph_lock);
432662306a36Sopenharmony_ci		err = 0;
432762306a36Sopenharmony_ci		goto out_err;
432862306a36Sopenharmony_ci	}
432962306a36Sopenharmony_ci	dout(" adding %p ino %llx.%llx cap %p %lld %s\n",
433062306a36Sopenharmony_ci	     inode, ceph_vinop(inode), cap, cap->cap_id,
433162306a36Sopenharmony_ci	     ceph_cap_string(cap->issued));
433262306a36Sopenharmony_ci
433362306a36Sopenharmony_ci	cap->seq = 0;        /* reset cap seq */
433462306a36Sopenharmony_ci	cap->issue_seq = 0;  /* and issue_seq */
433562306a36Sopenharmony_ci	cap->mseq = 0;       /* and migrate_seq */
433662306a36Sopenharmony_ci	cap->cap_gen = atomic_read(&cap->session->s_cap_gen);
433762306a36Sopenharmony_ci
433862306a36Sopenharmony_ci	/* These are lost when the session goes away */
433962306a36Sopenharmony_ci	if (S_ISDIR(inode->i_mode)) {
434062306a36Sopenharmony_ci		if (cap->issued & CEPH_CAP_DIR_CREATE) {
434162306a36Sopenharmony_ci			ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns));
434262306a36Sopenharmony_ci			memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout));
434362306a36Sopenharmony_ci		}
434462306a36Sopenharmony_ci		cap->issued &= ~CEPH_CAP_ANY_DIR_OPS;
434562306a36Sopenharmony_ci	}
434662306a36Sopenharmony_ci
434762306a36Sopenharmony_ci	if (recon_state->msg_version >= 2) {
434862306a36Sopenharmony_ci		rec.v2.cap_id = cpu_to_le64(cap->cap_id);
434962306a36Sopenharmony_ci		rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
435062306a36Sopenharmony_ci		rec.v2.issued = cpu_to_le32(cap->issued);
435162306a36Sopenharmony_ci		rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
435262306a36Sopenharmony_ci		rec.v2.pathbase = cpu_to_le64(pathbase);
435362306a36Sopenharmony_ci		rec.v2.flock_len = (__force __le32)
435462306a36Sopenharmony_ci			((ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) ? 0 : 1);
435562306a36Sopenharmony_ci	} else {
435662306a36Sopenharmony_ci		rec.v1.cap_id = cpu_to_le64(cap->cap_id);
435762306a36Sopenharmony_ci		rec.v1.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
435862306a36Sopenharmony_ci		rec.v1.issued = cpu_to_le32(cap->issued);
435962306a36Sopenharmony_ci		rec.v1.size = cpu_to_le64(i_size_read(inode));
436062306a36Sopenharmony_ci		ceph_encode_timespec64(&rec.v1.mtime, &inode->i_mtime);
436162306a36Sopenharmony_ci		ceph_encode_timespec64(&rec.v1.atime, &inode->i_atime);
436262306a36Sopenharmony_ci		rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
436362306a36Sopenharmony_ci		rec.v1.pathbase = cpu_to_le64(pathbase);
436462306a36Sopenharmony_ci	}
436562306a36Sopenharmony_ci
436662306a36Sopenharmony_ci	if (list_empty(&ci->i_cap_snaps)) {
436762306a36Sopenharmony_ci		snap_follows = ci->i_head_snapc ? ci->i_head_snapc->seq : 0;
436862306a36Sopenharmony_ci	} else {
436962306a36Sopenharmony_ci		struct ceph_cap_snap *capsnap =
437062306a36Sopenharmony_ci			list_first_entry(&ci->i_cap_snaps,
437162306a36Sopenharmony_ci					 struct ceph_cap_snap, ci_item);
437262306a36Sopenharmony_ci		snap_follows = capsnap->follows;
437362306a36Sopenharmony_ci	}
437462306a36Sopenharmony_ci	spin_unlock(&ci->i_ceph_lock);
437562306a36Sopenharmony_ci
437662306a36Sopenharmony_ci	if (recon_state->msg_version >= 2) {
437762306a36Sopenharmony_ci		int num_fcntl_locks, num_flock_locks;
437862306a36Sopenharmony_ci		struct ceph_filelock *flocks = NULL;
437962306a36Sopenharmony_ci		size_t struct_len, total_len = sizeof(u64);
438062306a36Sopenharmony_ci		u8 struct_v = 0;
438162306a36Sopenharmony_ci
438262306a36Sopenharmony_ciencode_again:
438362306a36Sopenharmony_ci		if (rec.v2.flock_len) {
438462306a36Sopenharmony_ci			ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks);
438562306a36Sopenharmony_ci		} else {
438662306a36Sopenharmony_ci			num_fcntl_locks = 0;
438762306a36Sopenharmony_ci			num_flock_locks = 0;
438862306a36Sopenharmony_ci		}
438962306a36Sopenharmony_ci		if (num_fcntl_locks + num_flock_locks > 0) {
439062306a36Sopenharmony_ci			flocks = kmalloc_array(num_fcntl_locks + num_flock_locks,
439162306a36Sopenharmony_ci					       sizeof(struct ceph_filelock),
439262306a36Sopenharmony_ci					       GFP_NOFS);
439362306a36Sopenharmony_ci			if (!flocks) {
439462306a36Sopenharmony_ci				err = -ENOMEM;
439562306a36Sopenharmony_ci				goto out_err;
439662306a36Sopenharmony_ci			}
439762306a36Sopenharmony_ci			err = ceph_encode_locks_to_buffer(inode, flocks,
439862306a36Sopenharmony_ci							  num_fcntl_locks,
439962306a36Sopenharmony_ci							  num_flock_locks);
440062306a36Sopenharmony_ci			if (err) {
440162306a36Sopenharmony_ci				kfree(flocks);
440262306a36Sopenharmony_ci				flocks = NULL;
440362306a36Sopenharmony_ci				if (err == -ENOSPC)
440462306a36Sopenharmony_ci					goto encode_again;
440562306a36Sopenharmony_ci				goto out_err;
440662306a36Sopenharmony_ci			}
440762306a36Sopenharmony_ci		} else {
440862306a36Sopenharmony_ci			kfree(flocks);
440962306a36Sopenharmony_ci			flocks = NULL;
441062306a36Sopenharmony_ci		}
441162306a36Sopenharmony_ci
441262306a36Sopenharmony_ci		if (recon_state->msg_version >= 3) {
441362306a36Sopenharmony_ci			/* version, compat_version and struct_len */
441462306a36Sopenharmony_ci			total_len += 2 * sizeof(u8) + sizeof(u32);
441562306a36Sopenharmony_ci			struct_v = 2;
441662306a36Sopenharmony_ci		}
441762306a36Sopenharmony_ci		/*
441862306a36Sopenharmony_ci		 * number of encoded locks is stable, so copy to pagelist
441962306a36Sopenharmony_ci		 */
442062306a36Sopenharmony_ci		struct_len = 2 * sizeof(u32) +
442162306a36Sopenharmony_ci			    (num_fcntl_locks + num_flock_locks) *
442262306a36Sopenharmony_ci			    sizeof(struct ceph_filelock);
442362306a36Sopenharmony_ci		rec.v2.flock_len = cpu_to_le32(struct_len);
442462306a36Sopenharmony_ci
442562306a36Sopenharmony_ci		struct_len += sizeof(u32) + pathlen + sizeof(rec.v2);
442662306a36Sopenharmony_ci
442762306a36Sopenharmony_ci		if (struct_v >= 2)
442862306a36Sopenharmony_ci			struct_len += sizeof(u64); /* snap_follows */
442962306a36Sopenharmony_ci
443062306a36Sopenharmony_ci		total_len += struct_len;
443162306a36Sopenharmony_ci
443262306a36Sopenharmony_ci		if (pagelist->length + total_len > RECONNECT_MAX_SIZE) {
443362306a36Sopenharmony_ci			err = send_reconnect_partial(recon_state);
443462306a36Sopenharmony_ci			if (err)
443562306a36Sopenharmony_ci				goto out_freeflocks;
443662306a36Sopenharmony_ci			pagelist = recon_state->pagelist;
443762306a36Sopenharmony_ci		}
443862306a36Sopenharmony_ci
443962306a36Sopenharmony_ci		err = ceph_pagelist_reserve(pagelist, total_len);
444062306a36Sopenharmony_ci		if (err)
444162306a36Sopenharmony_ci			goto out_freeflocks;
444262306a36Sopenharmony_ci
444362306a36Sopenharmony_ci		ceph_pagelist_encode_64(pagelist, ceph_ino(inode));
444462306a36Sopenharmony_ci		if (recon_state->msg_version >= 3) {
444562306a36Sopenharmony_ci			ceph_pagelist_encode_8(pagelist, struct_v);
444662306a36Sopenharmony_ci			ceph_pagelist_encode_8(pagelist, 1);
444762306a36Sopenharmony_ci			ceph_pagelist_encode_32(pagelist, struct_len);
444862306a36Sopenharmony_ci		}
444962306a36Sopenharmony_ci		ceph_pagelist_encode_string(pagelist, path, pathlen);
445062306a36Sopenharmony_ci		ceph_pagelist_append(pagelist, &rec, sizeof(rec.v2));
445162306a36Sopenharmony_ci		ceph_locks_to_pagelist(flocks, pagelist,
445262306a36Sopenharmony_ci				       num_fcntl_locks, num_flock_locks);
445362306a36Sopenharmony_ci		if (struct_v >= 2)
445462306a36Sopenharmony_ci			ceph_pagelist_encode_64(pagelist, snap_follows);
445562306a36Sopenharmony_ciout_freeflocks:
445662306a36Sopenharmony_ci		kfree(flocks);
445762306a36Sopenharmony_ci	} else {
445862306a36Sopenharmony_ci		err = ceph_pagelist_reserve(pagelist,
445962306a36Sopenharmony_ci					    sizeof(u64) + sizeof(u32) +
446062306a36Sopenharmony_ci					    pathlen + sizeof(rec.v1));
446162306a36Sopenharmony_ci		if (err)
446262306a36Sopenharmony_ci			goto out_err;
446362306a36Sopenharmony_ci
446462306a36Sopenharmony_ci		ceph_pagelist_encode_64(pagelist, ceph_ino(inode));
446562306a36Sopenharmony_ci		ceph_pagelist_encode_string(pagelist, path, pathlen);
446662306a36Sopenharmony_ci		ceph_pagelist_append(pagelist, &rec, sizeof(rec.v1));
446762306a36Sopenharmony_ci	}
446862306a36Sopenharmony_ci
446962306a36Sopenharmony_ciout_err:
447062306a36Sopenharmony_ci	ceph_mdsc_free_path(path, pathlen);
447162306a36Sopenharmony_ci	if (!err)
447262306a36Sopenharmony_ci		recon_state->nr_caps++;
447362306a36Sopenharmony_ci	return err;
447462306a36Sopenharmony_ci}
447562306a36Sopenharmony_ci
447662306a36Sopenharmony_cistatic int encode_snap_realms(struct ceph_mds_client *mdsc,
447762306a36Sopenharmony_ci			      struct ceph_reconnect_state *recon_state)
447862306a36Sopenharmony_ci{
447962306a36Sopenharmony_ci	struct rb_node *p;
448062306a36Sopenharmony_ci	struct ceph_pagelist *pagelist = recon_state->pagelist;
448162306a36Sopenharmony_ci	int err = 0;
448262306a36Sopenharmony_ci
448362306a36Sopenharmony_ci	if (recon_state->msg_version >= 4) {
448462306a36Sopenharmony_ci		err = ceph_pagelist_encode_32(pagelist, mdsc->num_snap_realms);
448562306a36Sopenharmony_ci		if (err < 0)
448662306a36Sopenharmony_ci			goto fail;
448762306a36Sopenharmony_ci	}
448862306a36Sopenharmony_ci
448962306a36Sopenharmony_ci	/*
449062306a36Sopenharmony_ci	 * snaprealms.  we provide mds with the ino, seq (version), and
449162306a36Sopenharmony_ci	 * parent for all of our realms.  If the mds has any newer info,
449262306a36Sopenharmony_ci	 * it will tell us.
449362306a36Sopenharmony_ci	 */
449462306a36Sopenharmony_ci	for (p = rb_first(&mdsc->snap_realms); p; p = rb_next(p)) {
449562306a36Sopenharmony_ci		struct ceph_snap_realm *realm =
449662306a36Sopenharmony_ci		       rb_entry(p, struct ceph_snap_realm, node);
449762306a36Sopenharmony_ci		struct ceph_mds_snaprealm_reconnect sr_rec;
449862306a36Sopenharmony_ci
449962306a36Sopenharmony_ci		if (recon_state->msg_version >= 4) {
450062306a36Sopenharmony_ci			size_t need = sizeof(u8) * 2 + sizeof(u32) +
450162306a36Sopenharmony_ci				      sizeof(sr_rec);
450262306a36Sopenharmony_ci
450362306a36Sopenharmony_ci			if (pagelist->length + need > RECONNECT_MAX_SIZE) {
450462306a36Sopenharmony_ci				err = send_reconnect_partial(recon_state);
450562306a36Sopenharmony_ci				if (err)
450662306a36Sopenharmony_ci					goto fail;
450762306a36Sopenharmony_ci				pagelist = recon_state->pagelist;
450862306a36Sopenharmony_ci			}
450962306a36Sopenharmony_ci
451062306a36Sopenharmony_ci			err = ceph_pagelist_reserve(pagelist, need);
451162306a36Sopenharmony_ci			if (err)
451262306a36Sopenharmony_ci				goto fail;
451362306a36Sopenharmony_ci
451462306a36Sopenharmony_ci			ceph_pagelist_encode_8(pagelist, 1);
451562306a36Sopenharmony_ci			ceph_pagelist_encode_8(pagelist, 1);
451662306a36Sopenharmony_ci			ceph_pagelist_encode_32(pagelist, sizeof(sr_rec));
451762306a36Sopenharmony_ci		}
451862306a36Sopenharmony_ci
451962306a36Sopenharmony_ci		dout(" adding snap realm %llx seq %lld parent %llx\n",
452062306a36Sopenharmony_ci		     realm->ino, realm->seq, realm->parent_ino);
452162306a36Sopenharmony_ci		sr_rec.ino = cpu_to_le64(realm->ino);
452262306a36Sopenharmony_ci		sr_rec.seq = cpu_to_le64(realm->seq);
452362306a36Sopenharmony_ci		sr_rec.parent = cpu_to_le64(realm->parent_ino);
452462306a36Sopenharmony_ci
452562306a36Sopenharmony_ci		err = ceph_pagelist_append(pagelist, &sr_rec, sizeof(sr_rec));
452662306a36Sopenharmony_ci		if (err)
452762306a36Sopenharmony_ci			goto fail;
452862306a36Sopenharmony_ci
452962306a36Sopenharmony_ci		recon_state->nr_realms++;
453062306a36Sopenharmony_ci	}
453162306a36Sopenharmony_cifail:
453262306a36Sopenharmony_ci	return err;
453362306a36Sopenharmony_ci}
453462306a36Sopenharmony_ci
453562306a36Sopenharmony_ci
453662306a36Sopenharmony_ci/*
453762306a36Sopenharmony_ci * If an MDS fails and recovers, clients need to reconnect in order to
453862306a36Sopenharmony_ci * reestablish shared state.  This includes all caps issued through
453962306a36Sopenharmony_ci * this session _and_ the snap_realm hierarchy.  Because it's not
454062306a36Sopenharmony_ci * clear which snap realms the mds cares about, we send everything we
454162306a36Sopenharmony_ci * know about.. that ensures we'll then get any new info the
454262306a36Sopenharmony_ci * recovering MDS might have.
454362306a36Sopenharmony_ci *
454462306a36Sopenharmony_ci * This is a relatively heavyweight operation, but it's rare.
454562306a36Sopenharmony_ci */
454662306a36Sopenharmony_cistatic void send_mds_reconnect(struct ceph_mds_client *mdsc,
454762306a36Sopenharmony_ci			       struct ceph_mds_session *session)
454862306a36Sopenharmony_ci{
454962306a36Sopenharmony_ci	struct ceph_msg *reply;
455062306a36Sopenharmony_ci	int mds = session->s_mds;
455162306a36Sopenharmony_ci	int err = -ENOMEM;
455262306a36Sopenharmony_ci	struct ceph_reconnect_state recon_state = {
455362306a36Sopenharmony_ci		.session = session,
455462306a36Sopenharmony_ci	};
455562306a36Sopenharmony_ci	LIST_HEAD(dispose);
455662306a36Sopenharmony_ci
455762306a36Sopenharmony_ci	pr_info("mds%d reconnect start\n", mds);
455862306a36Sopenharmony_ci
455962306a36Sopenharmony_ci	recon_state.pagelist = ceph_pagelist_alloc(GFP_NOFS);
456062306a36Sopenharmony_ci	if (!recon_state.pagelist)
456162306a36Sopenharmony_ci		goto fail_nopagelist;
456262306a36Sopenharmony_ci
456362306a36Sopenharmony_ci	reply = ceph_msg_new2(CEPH_MSG_CLIENT_RECONNECT, 0, 1, GFP_NOFS, false);
456462306a36Sopenharmony_ci	if (!reply)
456562306a36Sopenharmony_ci		goto fail_nomsg;
456662306a36Sopenharmony_ci
456762306a36Sopenharmony_ci	xa_destroy(&session->s_delegated_inos);
456862306a36Sopenharmony_ci
456962306a36Sopenharmony_ci	mutex_lock(&session->s_mutex);
457062306a36Sopenharmony_ci	session->s_state = CEPH_MDS_SESSION_RECONNECTING;
457162306a36Sopenharmony_ci	session->s_seq = 0;
457262306a36Sopenharmony_ci
457362306a36Sopenharmony_ci	dout("session %p state %s\n", session,
457462306a36Sopenharmony_ci	     ceph_session_state_name(session->s_state));
457562306a36Sopenharmony_ci
457662306a36Sopenharmony_ci	atomic_inc(&session->s_cap_gen);
457762306a36Sopenharmony_ci
457862306a36Sopenharmony_ci	spin_lock(&session->s_cap_lock);
457962306a36Sopenharmony_ci	/* don't know if session is readonly */
458062306a36Sopenharmony_ci	session->s_readonly = 0;
458162306a36Sopenharmony_ci	/*
458262306a36Sopenharmony_ci	 * notify __ceph_remove_cap() that we are composing cap reconnect.
458362306a36Sopenharmony_ci	 * If a cap get released before being added to the cap reconnect,
458462306a36Sopenharmony_ci	 * __ceph_remove_cap() should skip queuing cap release.
458562306a36Sopenharmony_ci	 */
458662306a36Sopenharmony_ci	session->s_cap_reconnect = 1;
458762306a36Sopenharmony_ci	/* drop old cap expires; we're about to reestablish that state */
458862306a36Sopenharmony_ci	detach_cap_releases(session, &dispose);
458962306a36Sopenharmony_ci	spin_unlock(&session->s_cap_lock);
459062306a36Sopenharmony_ci	dispose_cap_releases(mdsc, &dispose);
459162306a36Sopenharmony_ci
459262306a36Sopenharmony_ci	/* trim unused caps to reduce MDS's cache rejoin time */
459362306a36Sopenharmony_ci	if (mdsc->fsc->sb->s_root)
459462306a36Sopenharmony_ci		shrink_dcache_parent(mdsc->fsc->sb->s_root);
459562306a36Sopenharmony_ci
459662306a36Sopenharmony_ci	ceph_con_close(&session->s_con);
459762306a36Sopenharmony_ci	ceph_con_open(&session->s_con,
459862306a36Sopenharmony_ci		      CEPH_ENTITY_TYPE_MDS, mds,
459962306a36Sopenharmony_ci		      ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
460062306a36Sopenharmony_ci
460162306a36Sopenharmony_ci	/* replay unsafe requests */
460262306a36Sopenharmony_ci	replay_unsafe_requests(mdsc, session);
460362306a36Sopenharmony_ci
460462306a36Sopenharmony_ci	ceph_early_kick_flushing_caps(mdsc, session);
460562306a36Sopenharmony_ci
460662306a36Sopenharmony_ci	down_read(&mdsc->snap_rwsem);
460762306a36Sopenharmony_ci
460862306a36Sopenharmony_ci	/* placeholder for nr_caps */
460962306a36Sopenharmony_ci	err = ceph_pagelist_encode_32(recon_state.pagelist, 0);
461062306a36Sopenharmony_ci	if (err)
461162306a36Sopenharmony_ci		goto fail;
461262306a36Sopenharmony_ci
461362306a36Sopenharmony_ci	if (test_bit(CEPHFS_FEATURE_MULTI_RECONNECT, &session->s_features)) {
461462306a36Sopenharmony_ci		recon_state.msg_version = 3;
461562306a36Sopenharmony_ci		recon_state.allow_multi = true;
461662306a36Sopenharmony_ci	} else if (session->s_con.peer_features & CEPH_FEATURE_MDSENC) {
461762306a36Sopenharmony_ci		recon_state.msg_version = 3;
461862306a36Sopenharmony_ci	} else {
461962306a36Sopenharmony_ci		recon_state.msg_version = 2;
462062306a36Sopenharmony_ci	}
462162306a36Sopenharmony_ci	/* trsaverse this session's caps */
462262306a36Sopenharmony_ci	err = ceph_iterate_session_caps(session, reconnect_caps_cb, &recon_state);
462362306a36Sopenharmony_ci
462462306a36Sopenharmony_ci	spin_lock(&session->s_cap_lock);
462562306a36Sopenharmony_ci	session->s_cap_reconnect = 0;
462662306a36Sopenharmony_ci	spin_unlock(&session->s_cap_lock);
462762306a36Sopenharmony_ci
462862306a36Sopenharmony_ci	if (err < 0)
462962306a36Sopenharmony_ci		goto fail;
463062306a36Sopenharmony_ci
463162306a36Sopenharmony_ci	/* check if all realms can be encoded into current message */
463262306a36Sopenharmony_ci	if (mdsc->num_snap_realms) {
463362306a36Sopenharmony_ci		size_t total_len =
463462306a36Sopenharmony_ci			recon_state.pagelist->length +
463562306a36Sopenharmony_ci			mdsc->num_snap_realms *
463662306a36Sopenharmony_ci			sizeof(struct ceph_mds_snaprealm_reconnect);
463762306a36Sopenharmony_ci		if (recon_state.msg_version >= 4) {
463862306a36Sopenharmony_ci			/* number of realms */
463962306a36Sopenharmony_ci			total_len += sizeof(u32);
464062306a36Sopenharmony_ci			/* version, compat_version and struct_len */
464162306a36Sopenharmony_ci			total_len += mdsc->num_snap_realms *
464262306a36Sopenharmony_ci				     (2 * sizeof(u8) + sizeof(u32));
464362306a36Sopenharmony_ci		}
464462306a36Sopenharmony_ci		if (total_len > RECONNECT_MAX_SIZE) {
464562306a36Sopenharmony_ci			if (!recon_state.allow_multi) {
464662306a36Sopenharmony_ci				err = -ENOSPC;
464762306a36Sopenharmony_ci				goto fail;
464862306a36Sopenharmony_ci			}
464962306a36Sopenharmony_ci			if (recon_state.nr_caps) {
465062306a36Sopenharmony_ci				err = send_reconnect_partial(&recon_state);
465162306a36Sopenharmony_ci				if (err)
465262306a36Sopenharmony_ci					goto fail;
465362306a36Sopenharmony_ci			}
465462306a36Sopenharmony_ci			recon_state.msg_version = 5;
465562306a36Sopenharmony_ci		}
465662306a36Sopenharmony_ci	}
465762306a36Sopenharmony_ci
465862306a36Sopenharmony_ci	err = encode_snap_realms(mdsc, &recon_state);
465962306a36Sopenharmony_ci	if (err < 0)
466062306a36Sopenharmony_ci		goto fail;
466162306a36Sopenharmony_ci
466262306a36Sopenharmony_ci	if (recon_state.msg_version >= 5) {
466362306a36Sopenharmony_ci		err = ceph_pagelist_encode_8(recon_state.pagelist, 0);
466462306a36Sopenharmony_ci		if (err < 0)
466562306a36Sopenharmony_ci			goto fail;
466662306a36Sopenharmony_ci	}
466762306a36Sopenharmony_ci
466862306a36Sopenharmony_ci	if (recon_state.nr_caps || recon_state.nr_realms) {
466962306a36Sopenharmony_ci		struct page *page =
467062306a36Sopenharmony_ci			list_first_entry(&recon_state.pagelist->head,
467162306a36Sopenharmony_ci					struct page, lru);
467262306a36Sopenharmony_ci		__le32 *addr = kmap_atomic(page);
467362306a36Sopenharmony_ci		if (recon_state.nr_caps) {
467462306a36Sopenharmony_ci			WARN_ON(recon_state.nr_realms != mdsc->num_snap_realms);
467562306a36Sopenharmony_ci			*addr = cpu_to_le32(recon_state.nr_caps);
467662306a36Sopenharmony_ci		} else if (recon_state.msg_version >= 4) {
467762306a36Sopenharmony_ci			*(addr + 1) = cpu_to_le32(recon_state.nr_realms);
467862306a36Sopenharmony_ci		}
467962306a36Sopenharmony_ci		kunmap_atomic(addr);
468062306a36Sopenharmony_ci	}
468162306a36Sopenharmony_ci
468262306a36Sopenharmony_ci	reply->hdr.version = cpu_to_le16(recon_state.msg_version);
468362306a36Sopenharmony_ci	if (recon_state.msg_version >= 4)
468462306a36Sopenharmony_ci		reply->hdr.compat_version = cpu_to_le16(4);
468562306a36Sopenharmony_ci
468662306a36Sopenharmony_ci	reply->hdr.data_len = cpu_to_le32(recon_state.pagelist->length);
468762306a36Sopenharmony_ci	ceph_msg_data_add_pagelist(reply, recon_state.pagelist);
468862306a36Sopenharmony_ci
468962306a36Sopenharmony_ci	ceph_con_send(&session->s_con, reply);
469062306a36Sopenharmony_ci
469162306a36Sopenharmony_ci	mutex_unlock(&session->s_mutex);
469262306a36Sopenharmony_ci
469362306a36Sopenharmony_ci	mutex_lock(&mdsc->mutex);
469462306a36Sopenharmony_ci	__wake_requests(mdsc, &session->s_waiting);
469562306a36Sopenharmony_ci	mutex_unlock(&mdsc->mutex);
469662306a36Sopenharmony_ci
469762306a36Sopenharmony_ci	up_read(&mdsc->snap_rwsem);
469862306a36Sopenharmony_ci	ceph_pagelist_release(recon_state.pagelist);
469962306a36Sopenharmony_ci	return;
470062306a36Sopenharmony_ci
470162306a36Sopenharmony_cifail:
470262306a36Sopenharmony_ci	ceph_msg_put(reply);
470362306a36Sopenharmony_ci	up_read(&mdsc->snap_rwsem);
470462306a36Sopenharmony_ci	mutex_unlock(&session->s_mutex);
470562306a36Sopenharmony_cifail_nomsg:
470662306a36Sopenharmony_ci	ceph_pagelist_release(recon_state.pagelist);
470762306a36Sopenharmony_cifail_nopagelist:
470862306a36Sopenharmony_ci	pr_err("error %d preparing reconnect for mds%d\n", err, mds);
470962306a36Sopenharmony_ci	return;
471062306a36Sopenharmony_ci}
471162306a36Sopenharmony_ci
471262306a36Sopenharmony_ci
471362306a36Sopenharmony_ci/*
471462306a36Sopenharmony_ci * compare old and new mdsmaps, kicking requests
471562306a36Sopenharmony_ci * and closing out old connections as necessary
471662306a36Sopenharmony_ci *
471762306a36Sopenharmony_ci * called under mdsc->mutex.
471862306a36Sopenharmony_ci */
471962306a36Sopenharmony_cistatic void check_new_map(struct ceph_mds_client *mdsc,
472062306a36Sopenharmony_ci			  struct ceph_mdsmap *newmap,
472162306a36Sopenharmony_ci			  struct ceph_mdsmap *oldmap)
472262306a36Sopenharmony_ci{
472362306a36Sopenharmony_ci	int i, j, err;
472462306a36Sopenharmony_ci	int oldstate, newstate;
472562306a36Sopenharmony_ci	struct ceph_mds_session *s;
472662306a36Sopenharmony_ci	unsigned long targets[DIV_ROUND_UP(CEPH_MAX_MDS, sizeof(unsigned long))] = {0};
472762306a36Sopenharmony_ci
472862306a36Sopenharmony_ci	dout("check_new_map new %u old %u\n",
472962306a36Sopenharmony_ci	     newmap->m_epoch, oldmap->m_epoch);
473062306a36Sopenharmony_ci
473162306a36Sopenharmony_ci	if (newmap->m_info) {
473262306a36Sopenharmony_ci		for (i = 0; i < newmap->possible_max_rank; i++) {
473362306a36Sopenharmony_ci			for (j = 0; j < newmap->m_info[i].num_export_targets; j++)
473462306a36Sopenharmony_ci				set_bit(newmap->m_info[i].export_targets[j], targets);
473562306a36Sopenharmony_ci		}
473662306a36Sopenharmony_ci	}
473762306a36Sopenharmony_ci
473862306a36Sopenharmony_ci	for (i = 0; i < oldmap->possible_max_rank && i < mdsc->max_sessions; i++) {
473962306a36Sopenharmony_ci		if (!mdsc->sessions[i])
474062306a36Sopenharmony_ci			continue;
474162306a36Sopenharmony_ci		s = mdsc->sessions[i];
474262306a36Sopenharmony_ci		oldstate = ceph_mdsmap_get_state(oldmap, i);
474362306a36Sopenharmony_ci		newstate = ceph_mdsmap_get_state(newmap, i);
474462306a36Sopenharmony_ci
474562306a36Sopenharmony_ci		dout("check_new_map mds%d state %s%s -> %s%s (session %s)\n",
474662306a36Sopenharmony_ci		     i, ceph_mds_state_name(oldstate),
474762306a36Sopenharmony_ci		     ceph_mdsmap_is_laggy(oldmap, i) ? " (laggy)" : "",
474862306a36Sopenharmony_ci		     ceph_mds_state_name(newstate),
474962306a36Sopenharmony_ci		     ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "",
475062306a36Sopenharmony_ci		     ceph_session_state_name(s->s_state));
475162306a36Sopenharmony_ci
475262306a36Sopenharmony_ci		if (i >= newmap->possible_max_rank) {
475362306a36Sopenharmony_ci			/* force close session for stopped mds */
475462306a36Sopenharmony_ci			ceph_get_mds_session(s);
475562306a36Sopenharmony_ci			__unregister_session(mdsc, s);
475662306a36Sopenharmony_ci			__wake_requests(mdsc, &s->s_waiting);
475762306a36Sopenharmony_ci			mutex_unlock(&mdsc->mutex);
475862306a36Sopenharmony_ci
475962306a36Sopenharmony_ci			mutex_lock(&s->s_mutex);
476062306a36Sopenharmony_ci			cleanup_session_requests(mdsc, s);
476162306a36Sopenharmony_ci			remove_session_caps(s);
476262306a36Sopenharmony_ci			mutex_unlock(&s->s_mutex);
476362306a36Sopenharmony_ci
476462306a36Sopenharmony_ci			ceph_put_mds_session(s);
476562306a36Sopenharmony_ci
476662306a36Sopenharmony_ci			mutex_lock(&mdsc->mutex);
476762306a36Sopenharmony_ci			kick_requests(mdsc, i);
476862306a36Sopenharmony_ci			continue;
476962306a36Sopenharmony_ci		}
477062306a36Sopenharmony_ci
477162306a36Sopenharmony_ci		if (memcmp(ceph_mdsmap_get_addr(oldmap, i),
477262306a36Sopenharmony_ci			   ceph_mdsmap_get_addr(newmap, i),
477362306a36Sopenharmony_ci			   sizeof(struct ceph_entity_addr))) {
477462306a36Sopenharmony_ci			/* just close it */
477562306a36Sopenharmony_ci			mutex_unlock(&mdsc->mutex);
477662306a36Sopenharmony_ci			mutex_lock(&s->s_mutex);
477762306a36Sopenharmony_ci			mutex_lock(&mdsc->mutex);
477862306a36Sopenharmony_ci			ceph_con_close(&s->s_con);
477962306a36Sopenharmony_ci			mutex_unlock(&s->s_mutex);
478062306a36Sopenharmony_ci			s->s_state = CEPH_MDS_SESSION_RESTARTING;
478162306a36Sopenharmony_ci		} else if (oldstate == newstate) {
478262306a36Sopenharmony_ci			continue;  /* nothing new with this mds */
478362306a36Sopenharmony_ci		}
478462306a36Sopenharmony_ci
478562306a36Sopenharmony_ci		/*
478662306a36Sopenharmony_ci		 * send reconnect?
478762306a36Sopenharmony_ci		 */
478862306a36Sopenharmony_ci		if (s->s_state == CEPH_MDS_SESSION_RESTARTING &&
478962306a36Sopenharmony_ci		    newstate >= CEPH_MDS_STATE_RECONNECT) {
479062306a36Sopenharmony_ci			mutex_unlock(&mdsc->mutex);
479162306a36Sopenharmony_ci			clear_bit(i, targets);
479262306a36Sopenharmony_ci			send_mds_reconnect(mdsc, s);
479362306a36Sopenharmony_ci			mutex_lock(&mdsc->mutex);
479462306a36Sopenharmony_ci		}
479562306a36Sopenharmony_ci
479662306a36Sopenharmony_ci		/*
479762306a36Sopenharmony_ci		 * kick request on any mds that has gone active.
479862306a36Sopenharmony_ci		 */
479962306a36Sopenharmony_ci		if (oldstate < CEPH_MDS_STATE_ACTIVE &&
480062306a36Sopenharmony_ci		    newstate >= CEPH_MDS_STATE_ACTIVE) {
480162306a36Sopenharmony_ci			if (oldstate != CEPH_MDS_STATE_CREATING &&
480262306a36Sopenharmony_ci			    oldstate != CEPH_MDS_STATE_STARTING)
480362306a36Sopenharmony_ci				pr_info("mds%d recovery completed\n", s->s_mds);
480462306a36Sopenharmony_ci			kick_requests(mdsc, i);
480562306a36Sopenharmony_ci			mutex_unlock(&mdsc->mutex);
480662306a36Sopenharmony_ci			mutex_lock(&s->s_mutex);
480762306a36Sopenharmony_ci			mutex_lock(&mdsc->mutex);
480862306a36Sopenharmony_ci			ceph_kick_flushing_caps(mdsc, s);
480962306a36Sopenharmony_ci			mutex_unlock(&s->s_mutex);
481062306a36Sopenharmony_ci			wake_up_session_caps(s, RECONNECT);
481162306a36Sopenharmony_ci		}
481262306a36Sopenharmony_ci	}
481362306a36Sopenharmony_ci
481462306a36Sopenharmony_ci	/*
481562306a36Sopenharmony_ci	 * Only open and reconnect sessions that don't exist yet.
481662306a36Sopenharmony_ci	 */
481762306a36Sopenharmony_ci	for (i = 0; i < newmap->possible_max_rank; i++) {
481862306a36Sopenharmony_ci		/*
481962306a36Sopenharmony_ci		 * In case the import MDS is crashed just after
482062306a36Sopenharmony_ci		 * the EImportStart journal is flushed, so when
482162306a36Sopenharmony_ci		 * a standby MDS takes over it and is replaying
482262306a36Sopenharmony_ci		 * the EImportStart journal the new MDS daemon
482362306a36Sopenharmony_ci		 * will wait the client to reconnect it, but the
482462306a36Sopenharmony_ci		 * client may never register/open the session yet.
482562306a36Sopenharmony_ci		 *
482662306a36Sopenharmony_ci		 * Will try to reconnect that MDS daemon if the
482762306a36Sopenharmony_ci		 * rank number is in the export targets array and
482862306a36Sopenharmony_ci		 * is the up:reconnect state.
482962306a36Sopenharmony_ci		 */
483062306a36Sopenharmony_ci		newstate = ceph_mdsmap_get_state(newmap, i);
483162306a36Sopenharmony_ci		if (!test_bit(i, targets) || newstate != CEPH_MDS_STATE_RECONNECT)
483262306a36Sopenharmony_ci			continue;
483362306a36Sopenharmony_ci
483462306a36Sopenharmony_ci		/*
483562306a36Sopenharmony_ci		 * The session maybe registered and opened by some
483662306a36Sopenharmony_ci		 * requests which were choosing random MDSes during
483762306a36Sopenharmony_ci		 * the mdsc->mutex's unlock/lock gap below in rare
483862306a36Sopenharmony_ci		 * case. But the related MDS daemon will just queue
483962306a36Sopenharmony_ci		 * that requests and be still waiting for the client's
484062306a36Sopenharmony_ci		 * reconnection request in up:reconnect state.
484162306a36Sopenharmony_ci		 */
484262306a36Sopenharmony_ci		s = __ceph_lookup_mds_session(mdsc, i);
484362306a36Sopenharmony_ci		if (likely(!s)) {
484462306a36Sopenharmony_ci			s = __open_export_target_session(mdsc, i);
484562306a36Sopenharmony_ci			if (IS_ERR(s)) {
484662306a36Sopenharmony_ci				err = PTR_ERR(s);
484762306a36Sopenharmony_ci				pr_err("failed to open export target session, err %d\n",
484862306a36Sopenharmony_ci				       err);
484962306a36Sopenharmony_ci				continue;
485062306a36Sopenharmony_ci			}
485162306a36Sopenharmony_ci		}
485262306a36Sopenharmony_ci		dout("send reconnect to export target mds.%d\n", i);
485362306a36Sopenharmony_ci		mutex_unlock(&mdsc->mutex);
485462306a36Sopenharmony_ci		send_mds_reconnect(mdsc, s);
485562306a36Sopenharmony_ci		ceph_put_mds_session(s);
485662306a36Sopenharmony_ci		mutex_lock(&mdsc->mutex);
485762306a36Sopenharmony_ci	}
485862306a36Sopenharmony_ci
485962306a36Sopenharmony_ci	for (i = 0; i < newmap->possible_max_rank && i < mdsc->max_sessions; i++) {
486062306a36Sopenharmony_ci		s = mdsc->sessions[i];
486162306a36Sopenharmony_ci		if (!s)
486262306a36Sopenharmony_ci			continue;
486362306a36Sopenharmony_ci		if (!ceph_mdsmap_is_laggy(newmap, i))
486462306a36Sopenharmony_ci			continue;
486562306a36Sopenharmony_ci		if (s->s_state == CEPH_MDS_SESSION_OPEN ||
486662306a36Sopenharmony_ci		    s->s_state == CEPH_MDS_SESSION_HUNG ||
486762306a36Sopenharmony_ci		    s->s_state == CEPH_MDS_SESSION_CLOSING) {
486862306a36Sopenharmony_ci			dout(" connecting to export targets of laggy mds%d\n",
486962306a36Sopenharmony_ci			     i);
487062306a36Sopenharmony_ci			__open_export_target_sessions(mdsc, s);
487162306a36Sopenharmony_ci		}
487262306a36Sopenharmony_ci	}
487362306a36Sopenharmony_ci}
487462306a36Sopenharmony_ci
487562306a36Sopenharmony_ci
487662306a36Sopenharmony_ci
487762306a36Sopenharmony_ci/*
487862306a36Sopenharmony_ci * leases
487962306a36Sopenharmony_ci */
488062306a36Sopenharmony_ci
488162306a36Sopenharmony_ci/*
488262306a36Sopenharmony_ci * caller must hold session s_mutex, dentry->d_lock
488362306a36Sopenharmony_ci */
488462306a36Sopenharmony_civoid __ceph_mdsc_drop_dentry_lease(struct dentry *dentry)
488562306a36Sopenharmony_ci{
488662306a36Sopenharmony_ci	struct ceph_dentry_info *di = ceph_dentry(dentry);
488762306a36Sopenharmony_ci
488862306a36Sopenharmony_ci	ceph_put_mds_session(di->lease_session);
488962306a36Sopenharmony_ci	di->lease_session = NULL;
489062306a36Sopenharmony_ci}
489162306a36Sopenharmony_ci
489262306a36Sopenharmony_cistatic void handle_lease(struct ceph_mds_client *mdsc,
489362306a36Sopenharmony_ci			 struct ceph_mds_session *session,
489462306a36Sopenharmony_ci			 struct ceph_msg *msg)
489562306a36Sopenharmony_ci{
489662306a36Sopenharmony_ci	struct super_block *sb = mdsc->fsc->sb;
489762306a36Sopenharmony_ci	struct inode *inode;
489862306a36Sopenharmony_ci	struct dentry *parent, *dentry;
489962306a36Sopenharmony_ci	struct ceph_dentry_info *di;
490062306a36Sopenharmony_ci	int mds = session->s_mds;
490162306a36Sopenharmony_ci	struct ceph_mds_lease *h = msg->front.iov_base;
490262306a36Sopenharmony_ci	u32 seq;
490362306a36Sopenharmony_ci	struct ceph_vino vino;
490462306a36Sopenharmony_ci	struct qstr dname;
490562306a36Sopenharmony_ci	int release = 0;
490662306a36Sopenharmony_ci
490762306a36Sopenharmony_ci	dout("handle_lease from mds%d\n", mds);
490862306a36Sopenharmony_ci
490962306a36Sopenharmony_ci	if (!ceph_inc_mds_stopping_blocker(mdsc, session))
491062306a36Sopenharmony_ci		return;
491162306a36Sopenharmony_ci
491262306a36Sopenharmony_ci	/* decode */
491362306a36Sopenharmony_ci	if (msg->front.iov_len < sizeof(*h) + sizeof(u32))
491462306a36Sopenharmony_ci		goto bad;
491562306a36Sopenharmony_ci	vino.ino = le64_to_cpu(h->ino);
491662306a36Sopenharmony_ci	vino.snap = CEPH_NOSNAP;
491762306a36Sopenharmony_ci	seq = le32_to_cpu(h->seq);
491862306a36Sopenharmony_ci	dname.len = get_unaligned_le32(h + 1);
491962306a36Sopenharmony_ci	if (msg->front.iov_len < sizeof(*h) + sizeof(u32) + dname.len)
492062306a36Sopenharmony_ci		goto bad;
492162306a36Sopenharmony_ci	dname.name = (void *)(h + 1) + sizeof(u32);
492262306a36Sopenharmony_ci
492362306a36Sopenharmony_ci	/* lookup inode */
492462306a36Sopenharmony_ci	inode = ceph_find_inode(sb, vino);
492562306a36Sopenharmony_ci	dout("handle_lease %s, ino %llx %p %.*s\n",
492662306a36Sopenharmony_ci	     ceph_lease_op_name(h->action), vino.ino, inode,
492762306a36Sopenharmony_ci	     dname.len, dname.name);
492862306a36Sopenharmony_ci
492962306a36Sopenharmony_ci	mutex_lock(&session->s_mutex);
493062306a36Sopenharmony_ci	if (!inode) {
493162306a36Sopenharmony_ci		dout("handle_lease no inode %llx\n", vino.ino);
493262306a36Sopenharmony_ci		goto release;
493362306a36Sopenharmony_ci	}
493462306a36Sopenharmony_ci
493562306a36Sopenharmony_ci	/* dentry */
493662306a36Sopenharmony_ci	parent = d_find_alias(inode);
493762306a36Sopenharmony_ci	if (!parent) {
493862306a36Sopenharmony_ci		dout("no parent dentry on inode %p\n", inode);
493962306a36Sopenharmony_ci		WARN_ON(1);
494062306a36Sopenharmony_ci		goto release;  /* hrm... */
494162306a36Sopenharmony_ci	}
494262306a36Sopenharmony_ci	dname.hash = full_name_hash(parent, dname.name, dname.len);
494362306a36Sopenharmony_ci	dentry = d_lookup(parent, &dname);
494462306a36Sopenharmony_ci	dput(parent);
494562306a36Sopenharmony_ci	if (!dentry)
494662306a36Sopenharmony_ci		goto release;
494762306a36Sopenharmony_ci
494862306a36Sopenharmony_ci	spin_lock(&dentry->d_lock);
494962306a36Sopenharmony_ci	di = ceph_dentry(dentry);
495062306a36Sopenharmony_ci	switch (h->action) {
495162306a36Sopenharmony_ci	case CEPH_MDS_LEASE_REVOKE:
495262306a36Sopenharmony_ci		if (di->lease_session == session) {
495362306a36Sopenharmony_ci			if (ceph_seq_cmp(di->lease_seq, seq) > 0)
495462306a36Sopenharmony_ci				h->seq = cpu_to_le32(di->lease_seq);
495562306a36Sopenharmony_ci			__ceph_mdsc_drop_dentry_lease(dentry);
495662306a36Sopenharmony_ci		}
495762306a36Sopenharmony_ci		release = 1;
495862306a36Sopenharmony_ci		break;
495962306a36Sopenharmony_ci
496062306a36Sopenharmony_ci	case CEPH_MDS_LEASE_RENEW:
496162306a36Sopenharmony_ci		if (di->lease_session == session &&
496262306a36Sopenharmony_ci		    di->lease_gen == atomic_read(&session->s_cap_gen) &&
496362306a36Sopenharmony_ci		    di->lease_renew_from &&
496462306a36Sopenharmony_ci		    di->lease_renew_after == 0) {
496562306a36Sopenharmony_ci			unsigned long duration =
496662306a36Sopenharmony_ci				msecs_to_jiffies(le32_to_cpu(h->duration_ms));
496762306a36Sopenharmony_ci
496862306a36Sopenharmony_ci			di->lease_seq = seq;
496962306a36Sopenharmony_ci			di->time = di->lease_renew_from + duration;
497062306a36Sopenharmony_ci			di->lease_renew_after = di->lease_renew_from +
497162306a36Sopenharmony_ci				(duration >> 1);
497262306a36Sopenharmony_ci			di->lease_renew_from = 0;
497362306a36Sopenharmony_ci		}
497462306a36Sopenharmony_ci		break;
497562306a36Sopenharmony_ci	}
497662306a36Sopenharmony_ci	spin_unlock(&dentry->d_lock);
497762306a36Sopenharmony_ci	dput(dentry);
497862306a36Sopenharmony_ci
497962306a36Sopenharmony_ci	if (!release)
498062306a36Sopenharmony_ci		goto out;
498162306a36Sopenharmony_ci
498262306a36Sopenharmony_cirelease:
498362306a36Sopenharmony_ci	/* let's just reuse the same message */
498462306a36Sopenharmony_ci	h->action = CEPH_MDS_LEASE_REVOKE_ACK;
498562306a36Sopenharmony_ci	ceph_msg_get(msg);
498662306a36Sopenharmony_ci	ceph_con_send(&session->s_con, msg);
498762306a36Sopenharmony_ci
498862306a36Sopenharmony_ciout:
498962306a36Sopenharmony_ci	mutex_unlock(&session->s_mutex);
499062306a36Sopenharmony_ci	iput(inode);
499162306a36Sopenharmony_ci
499262306a36Sopenharmony_ci	ceph_dec_mds_stopping_blocker(mdsc);
499362306a36Sopenharmony_ci	return;
499462306a36Sopenharmony_ci
499562306a36Sopenharmony_cibad:
499662306a36Sopenharmony_ci	ceph_dec_mds_stopping_blocker(mdsc);
499762306a36Sopenharmony_ci
499862306a36Sopenharmony_ci	pr_err("corrupt lease message\n");
499962306a36Sopenharmony_ci	ceph_msg_dump(msg);
500062306a36Sopenharmony_ci}
500162306a36Sopenharmony_ci
500262306a36Sopenharmony_civoid ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
500362306a36Sopenharmony_ci			      struct dentry *dentry, char action,
500462306a36Sopenharmony_ci			      u32 seq)
500562306a36Sopenharmony_ci{
500662306a36Sopenharmony_ci	struct ceph_msg *msg;
500762306a36Sopenharmony_ci	struct ceph_mds_lease *lease;
500862306a36Sopenharmony_ci	struct inode *dir;
500962306a36Sopenharmony_ci	int len = sizeof(*lease) + sizeof(u32) + NAME_MAX;
501062306a36Sopenharmony_ci
501162306a36Sopenharmony_ci	dout("lease_send_msg identry %p %s to mds%d\n",
501262306a36Sopenharmony_ci	     dentry, ceph_lease_op_name(action), session->s_mds);
501362306a36Sopenharmony_ci
501462306a36Sopenharmony_ci	msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS, false);
501562306a36Sopenharmony_ci	if (!msg)
501662306a36Sopenharmony_ci		return;
501762306a36Sopenharmony_ci	lease = msg->front.iov_base;
501862306a36Sopenharmony_ci	lease->action = action;
501962306a36Sopenharmony_ci	lease->seq = cpu_to_le32(seq);
502062306a36Sopenharmony_ci
502162306a36Sopenharmony_ci	spin_lock(&dentry->d_lock);
502262306a36Sopenharmony_ci	dir = d_inode(dentry->d_parent);
502362306a36Sopenharmony_ci	lease->ino = cpu_to_le64(ceph_ino(dir));
502462306a36Sopenharmony_ci	lease->first = lease->last = cpu_to_le64(ceph_snap(dir));
502562306a36Sopenharmony_ci
502662306a36Sopenharmony_ci	put_unaligned_le32(dentry->d_name.len, lease + 1);
502762306a36Sopenharmony_ci	memcpy((void *)(lease + 1) + 4,
502862306a36Sopenharmony_ci	       dentry->d_name.name, dentry->d_name.len);
502962306a36Sopenharmony_ci	spin_unlock(&dentry->d_lock);
503062306a36Sopenharmony_ci
503162306a36Sopenharmony_ci	ceph_con_send(&session->s_con, msg);
503262306a36Sopenharmony_ci}
503362306a36Sopenharmony_ci
503462306a36Sopenharmony_ci/*
503562306a36Sopenharmony_ci * lock unlock the session, to wait ongoing session activities
503662306a36Sopenharmony_ci */
503762306a36Sopenharmony_cistatic void lock_unlock_session(struct ceph_mds_session *s)
503862306a36Sopenharmony_ci{
503962306a36Sopenharmony_ci	mutex_lock(&s->s_mutex);
504062306a36Sopenharmony_ci	mutex_unlock(&s->s_mutex);
504162306a36Sopenharmony_ci}
504262306a36Sopenharmony_ci
504362306a36Sopenharmony_cistatic void maybe_recover_session(struct ceph_mds_client *mdsc)
504462306a36Sopenharmony_ci{
504562306a36Sopenharmony_ci	struct ceph_fs_client *fsc = mdsc->fsc;
504662306a36Sopenharmony_ci
504762306a36Sopenharmony_ci	if (!ceph_test_mount_opt(fsc, CLEANRECOVER))
504862306a36Sopenharmony_ci		return;
504962306a36Sopenharmony_ci
505062306a36Sopenharmony_ci	if (READ_ONCE(fsc->mount_state) != CEPH_MOUNT_MOUNTED)
505162306a36Sopenharmony_ci		return;
505262306a36Sopenharmony_ci
505362306a36Sopenharmony_ci	if (!READ_ONCE(fsc->blocklisted))
505462306a36Sopenharmony_ci		return;
505562306a36Sopenharmony_ci
505662306a36Sopenharmony_ci	pr_info("auto reconnect after blocklisted\n");
505762306a36Sopenharmony_ci	ceph_force_reconnect(fsc->sb);
505862306a36Sopenharmony_ci}
505962306a36Sopenharmony_ci
506062306a36Sopenharmony_cibool check_session_state(struct ceph_mds_session *s)
506162306a36Sopenharmony_ci{
506262306a36Sopenharmony_ci	switch (s->s_state) {
506362306a36Sopenharmony_ci	case CEPH_MDS_SESSION_OPEN:
506462306a36Sopenharmony_ci		if (s->s_ttl && time_after(jiffies, s->s_ttl)) {
506562306a36Sopenharmony_ci			s->s_state = CEPH_MDS_SESSION_HUNG;
506662306a36Sopenharmony_ci			pr_info("mds%d hung\n", s->s_mds);
506762306a36Sopenharmony_ci		}
506862306a36Sopenharmony_ci		break;
506962306a36Sopenharmony_ci	case CEPH_MDS_SESSION_CLOSING:
507062306a36Sopenharmony_ci	case CEPH_MDS_SESSION_NEW:
507162306a36Sopenharmony_ci	case CEPH_MDS_SESSION_RESTARTING:
507262306a36Sopenharmony_ci	case CEPH_MDS_SESSION_CLOSED:
507362306a36Sopenharmony_ci	case CEPH_MDS_SESSION_REJECTED:
507462306a36Sopenharmony_ci		return false;
507562306a36Sopenharmony_ci	}
507662306a36Sopenharmony_ci
507762306a36Sopenharmony_ci	return true;
507862306a36Sopenharmony_ci}
507962306a36Sopenharmony_ci
508062306a36Sopenharmony_ci/*
508162306a36Sopenharmony_ci * If the sequence is incremented while we're waiting on a REQUEST_CLOSE reply,
508262306a36Sopenharmony_ci * then we need to retransmit that request.
508362306a36Sopenharmony_ci */
508462306a36Sopenharmony_civoid inc_session_sequence(struct ceph_mds_session *s)
508562306a36Sopenharmony_ci{
508662306a36Sopenharmony_ci	lockdep_assert_held(&s->s_mutex);
508762306a36Sopenharmony_ci
508862306a36Sopenharmony_ci	s->s_seq++;
508962306a36Sopenharmony_ci
509062306a36Sopenharmony_ci	if (s->s_state == CEPH_MDS_SESSION_CLOSING) {
509162306a36Sopenharmony_ci		int ret;
509262306a36Sopenharmony_ci
509362306a36Sopenharmony_ci		dout("resending session close request for mds%d\n", s->s_mds);
509462306a36Sopenharmony_ci		ret = request_close_session(s);
509562306a36Sopenharmony_ci		if (ret < 0)
509662306a36Sopenharmony_ci			pr_err("unable to close session to mds%d: %d\n",
509762306a36Sopenharmony_ci			       s->s_mds, ret);
509862306a36Sopenharmony_ci	}
509962306a36Sopenharmony_ci}
510062306a36Sopenharmony_ci
510162306a36Sopenharmony_ci/*
510262306a36Sopenharmony_ci * delayed work -- periodically trim expired leases, renew caps with mds.  If
510362306a36Sopenharmony_ci * the @delay parameter is set to 0 or if it's more than 5 secs, the default
510462306a36Sopenharmony_ci * workqueue delay value of 5 secs will be used.
510562306a36Sopenharmony_ci */
510662306a36Sopenharmony_cistatic void schedule_delayed(struct ceph_mds_client *mdsc, unsigned long delay)
510762306a36Sopenharmony_ci{
510862306a36Sopenharmony_ci	unsigned long max_delay = HZ * 5;
510962306a36Sopenharmony_ci
511062306a36Sopenharmony_ci	/* 5 secs default delay */
511162306a36Sopenharmony_ci	if (!delay || (delay > max_delay))
511262306a36Sopenharmony_ci		delay = max_delay;
511362306a36Sopenharmony_ci	schedule_delayed_work(&mdsc->delayed_work,
511462306a36Sopenharmony_ci			      round_jiffies_relative(delay));
511562306a36Sopenharmony_ci}
511662306a36Sopenharmony_ci
511762306a36Sopenharmony_cistatic void delayed_work(struct work_struct *work)
511862306a36Sopenharmony_ci{
511962306a36Sopenharmony_ci	struct ceph_mds_client *mdsc =
512062306a36Sopenharmony_ci		container_of(work, struct ceph_mds_client, delayed_work.work);
512162306a36Sopenharmony_ci	unsigned long delay;
512262306a36Sopenharmony_ci	int renew_interval;
512362306a36Sopenharmony_ci	int renew_caps;
512462306a36Sopenharmony_ci	int i;
512562306a36Sopenharmony_ci
512662306a36Sopenharmony_ci	dout("mdsc delayed_work\n");
512762306a36Sopenharmony_ci
512862306a36Sopenharmony_ci	if (mdsc->stopping >= CEPH_MDSC_STOPPING_FLUSHED)
512962306a36Sopenharmony_ci		return;
513062306a36Sopenharmony_ci
513162306a36Sopenharmony_ci	mutex_lock(&mdsc->mutex);
513262306a36Sopenharmony_ci	renew_interval = mdsc->mdsmap->m_session_timeout >> 2;
513362306a36Sopenharmony_ci	renew_caps = time_after_eq(jiffies, HZ*renew_interval +
513462306a36Sopenharmony_ci				   mdsc->last_renew_caps);
513562306a36Sopenharmony_ci	if (renew_caps)
513662306a36Sopenharmony_ci		mdsc->last_renew_caps = jiffies;
513762306a36Sopenharmony_ci
513862306a36Sopenharmony_ci	for (i = 0; i < mdsc->max_sessions; i++) {
513962306a36Sopenharmony_ci		struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
514062306a36Sopenharmony_ci		if (!s)
514162306a36Sopenharmony_ci			continue;
514262306a36Sopenharmony_ci
514362306a36Sopenharmony_ci		if (!check_session_state(s)) {
514462306a36Sopenharmony_ci			ceph_put_mds_session(s);
514562306a36Sopenharmony_ci			continue;
514662306a36Sopenharmony_ci		}
514762306a36Sopenharmony_ci		mutex_unlock(&mdsc->mutex);
514862306a36Sopenharmony_ci
514962306a36Sopenharmony_ci		mutex_lock(&s->s_mutex);
515062306a36Sopenharmony_ci		if (renew_caps)
515162306a36Sopenharmony_ci			send_renew_caps(mdsc, s);
515262306a36Sopenharmony_ci		else
515362306a36Sopenharmony_ci			ceph_con_keepalive(&s->s_con);
515462306a36Sopenharmony_ci		if (s->s_state == CEPH_MDS_SESSION_OPEN ||
515562306a36Sopenharmony_ci		    s->s_state == CEPH_MDS_SESSION_HUNG)
515662306a36Sopenharmony_ci			ceph_send_cap_releases(mdsc, s);
515762306a36Sopenharmony_ci		mutex_unlock(&s->s_mutex);
515862306a36Sopenharmony_ci		ceph_put_mds_session(s);
515962306a36Sopenharmony_ci
516062306a36Sopenharmony_ci		mutex_lock(&mdsc->mutex);
516162306a36Sopenharmony_ci	}
516262306a36Sopenharmony_ci	mutex_unlock(&mdsc->mutex);
516362306a36Sopenharmony_ci
516462306a36Sopenharmony_ci	delay = ceph_check_delayed_caps(mdsc);
516562306a36Sopenharmony_ci
516662306a36Sopenharmony_ci	ceph_queue_cap_reclaim_work(mdsc);
516762306a36Sopenharmony_ci
516862306a36Sopenharmony_ci	ceph_trim_snapid_map(mdsc);
516962306a36Sopenharmony_ci
517062306a36Sopenharmony_ci	maybe_recover_session(mdsc);
517162306a36Sopenharmony_ci
517262306a36Sopenharmony_ci	schedule_delayed(mdsc, delay);
517362306a36Sopenharmony_ci}
517462306a36Sopenharmony_ci
517562306a36Sopenharmony_ciint ceph_mdsc_init(struct ceph_fs_client *fsc)
517662306a36Sopenharmony_ci
517762306a36Sopenharmony_ci{
517862306a36Sopenharmony_ci	struct ceph_mds_client *mdsc;
517962306a36Sopenharmony_ci	int err;
518062306a36Sopenharmony_ci
518162306a36Sopenharmony_ci	mdsc = kzalloc(sizeof(struct ceph_mds_client), GFP_NOFS);
518262306a36Sopenharmony_ci	if (!mdsc)
518362306a36Sopenharmony_ci		return -ENOMEM;
518462306a36Sopenharmony_ci	mdsc->fsc = fsc;
518562306a36Sopenharmony_ci	mutex_init(&mdsc->mutex);
518662306a36Sopenharmony_ci	mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);
518762306a36Sopenharmony_ci	if (!mdsc->mdsmap) {
518862306a36Sopenharmony_ci		err = -ENOMEM;
518962306a36Sopenharmony_ci		goto err_mdsc;
519062306a36Sopenharmony_ci	}
519162306a36Sopenharmony_ci
519262306a36Sopenharmony_ci	init_completion(&mdsc->safe_umount_waiters);
519362306a36Sopenharmony_ci	spin_lock_init(&mdsc->stopping_lock);
519462306a36Sopenharmony_ci	atomic_set(&mdsc->stopping_blockers, 0);
519562306a36Sopenharmony_ci	init_completion(&mdsc->stopping_waiter);
519662306a36Sopenharmony_ci	init_waitqueue_head(&mdsc->session_close_wq);
519762306a36Sopenharmony_ci	INIT_LIST_HEAD(&mdsc->waiting_for_map);
519862306a36Sopenharmony_ci	mdsc->quotarealms_inodes = RB_ROOT;
519962306a36Sopenharmony_ci	mutex_init(&mdsc->quotarealms_inodes_mutex);
520062306a36Sopenharmony_ci	init_rwsem(&mdsc->snap_rwsem);
520162306a36Sopenharmony_ci	mdsc->snap_realms = RB_ROOT;
520262306a36Sopenharmony_ci	INIT_LIST_HEAD(&mdsc->snap_empty);
520362306a36Sopenharmony_ci	spin_lock_init(&mdsc->snap_empty_lock);
520462306a36Sopenharmony_ci	mdsc->request_tree = RB_ROOT;
520562306a36Sopenharmony_ci	INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work);
520662306a36Sopenharmony_ci	mdsc->last_renew_caps = jiffies;
520762306a36Sopenharmony_ci	INIT_LIST_HEAD(&mdsc->cap_delay_list);
520862306a36Sopenharmony_ci	INIT_LIST_HEAD(&mdsc->cap_wait_list);
520962306a36Sopenharmony_ci	spin_lock_init(&mdsc->cap_delay_lock);
521062306a36Sopenharmony_ci	INIT_LIST_HEAD(&mdsc->snap_flush_list);
521162306a36Sopenharmony_ci	spin_lock_init(&mdsc->snap_flush_lock);
521262306a36Sopenharmony_ci	mdsc->last_cap_flush_tid = 1;
521362306a36Sopenharmony_ci	INIT_LIST_HEAD(&mdsc->cap_flush_list);
521462306a36Sopenharmony_ci	INIT_LIST_HEAD(&mdsc->cap_dirty_migrating);
521562306a36Sopenharmony_ci	spin_lock_init(&mdsc->cap_dirty_lock);
521662306a36Sopenharmony_ci	init_waitqueue_head(&mdsc->cap_flushing_wq);
521762306a36Sopenharmony_ci	INIT_WORK(&mdsc->cap_reclaim_work, ceph_cap_reclaim_work);
521862306a36Sopenharmony_ci	err = ceph_metric_init(&mdsc->metric);
521962306a36Sopenharmony_ci	if (err)
522062306a36Sopenharmony_ci		goto err_mdsmap;
522162306a36Sopenharmony_ci
522262306a36Sopenharmony_ci	spin_lock_init(&mdsc->dentry_list_lock);
522362306a36Sopenharmony_ci	INIT_LIST_HEAD(&mdsc->dentry_leases);
522462306a36Sopenharmony_ci	INIT_LIST_HEAD(&mdsc->dentry_dir_leases);
522562306a36Sopenharmony_ci
522662306a36Sopenharmony_ci	ceph_caps_init(mdsc);
522762306a36Sopenharmony_ci	ceph_adjust_caps_max_min(mdsc, fsc->mount_options);
522862306a36Sopenharmony_ci
522962306a36Sopenharmony_ci	spin_lock_init(&mdsc->snapid_map_lock);
523062306a36Sopenharmony_ci	mdsc->snapid_map_tree = RB_ROOT;
523162306a36Sopenharmony_ci	INIT_LIST_HEAD(&mdsc->snapid_map_lru);
523262306a36Sopenharmony_ci
523362306a36Sopenharmony_ci	init_rwsem(&mdsc->pool_perm_rwsem);
523462306a36Sopenharmony_ci	mdsc->pool_perm_tree = RB_ROOT;
523562306a36Sopenharmony_ci
523662306a36Sopenharmony_ci	strscpy(mdsc->nodename, utsname()->nodename,
523762306a36Sopenharmony_ci		sizeof(mdsc->nodename));
523862306a36Sopenharmony_ci
523962306a36Sopenharmony_ci	fsc->mdsc = mdsc;
524062306a36Sopenharmony_ci	return 0;
524162306a36Sopenharmony_ci
524262306a36Sopenharmony_cierr_mdsmap:
524362306a36Sopenharmony_ci	kfree(mdsc->mdsmap);
524462306a36Sopenharmony_cierr_mdsc:
524562306a36Sopenharmony_ci	kfree(mdsc);
524662306a36Sopenharmony_ci	return err;
524762306a36Sopenharmony_ci}
524862306a36Sopenharmony_ci
524962306a36Sopenharmony_ci/*
525062306a36Sopenharmony_ci * Wait for safe replies on open mds requests.  If we time out, drop
525162306a36Sopenharmony_ci * all requests from the tree to avoid dangling dentry refs.
525262306a36Sopenharmony_ci */
525362306a36Sopenharmony_cistatic void wait_requests(struct ceph_mds_client *mdsc)
525462306a36Sopenharmony_ci{
525562306a36Sopenharmony_ci	struct ceph_options *opts = mdsc->fsc->client->options;
525662306a36Sopenharmony_ci	struct ceph_mds_request *req;
525762306a36Sopenharmony_ci
525862306a36Sopenharmony_ci	mutex_lock(&mdsc->mutex);
525962306a36Sopenharmony_ci	if (__get_oldest_req(mdsc)) {
526062306a36Sopenharmony_ci		mutex_unlock(&mdsc->mutex);
526162306a36Sopenharmony_ci
526262306a36Sopenharmony_ci		dout("wait_requests waiting for requests\n");
526362306a36Sopenharmony_ci		wait_for_completion_timeout(&mdsc->safe_umount_waiters,
526462306a36Sopenharmony_ci				    ceph_timeout_jiffies(opts->mount_timeout));
526562306a36Sopenharmony_ci
526662306a36Sopenharmony_ci		/* tear down remaining requests */
526762306a36Sopenharmony_ci		mutex_lock(&mdsc->mutex);
526862306a36Sopenharmony_ci		while ((req = __get_oldest_req(mdsc))) {
526962306a36Sopenharmony_ci			dout("wait_requests timed out on tid %llu\n",
527062306a36Sopenharmony_ci			     req->r_tid);
527162306a36Sopenharmony_ci			list_del_init(&req->r_wait);
527262306a36Sopenharmony_ci			__unregister_request(mdsc, req);
527362306a36Sopenharmony_ci		}
527462306a36Sopenharmony_ci	}
527562306a36Sopenharmony_ci	mutex_unlock(&mdsc->mutex);
527662306a36Sopenharmony_ci	dout("wait_requests done\n");
527762306a36Sopenharmony_ci}
527862306a36Sopenharmony_ci
527962306a36Sopenharmony_civoid send_flush_mdlog(struct ceph_mds_session *s)
528062306a36Sopenharmony_ci{
528162306a36Sopenharmony_ci	struct ceph_msg *msg;
528262306a36Sopenharmony_ci
528362306a36Sopenharmony_ci	/*
528462306a36Sopenharmony_ci	 * Pre-luminous MDS crashes when it sees an unknown session request
528562306a36Sopenharmony_ci	 */
528662306a36Sopenharmony_ci	if (!CEPH_HAVE_FEATURE(s->s_con.peer_features, SERVER_LUMINOUS))
528762306a36Sopenharmony_ci		return;
528862306a36Sopenharmony_ci
528962306a36Sopenharmony_ci	mutex_lock(&s->s_mutex);
529062306a36Sopenharmony_ci	dout("request mdlog flush to mds%d (%s)s seq %lld\n", s->s_mds,
529162306a36Sopenharmony_ci	     ceph_session_state_name(s->s_state), s->s_seq);
529262306a36Sopenharmony_ci	msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_FLUSH_MDLOG,
529362306a36Sopenharmony_ci				      s->s_seq);
529462306a36Sopenharmony_ci	if (!msg) {
529562306a36Sopenharmony_ci		pr_err("failed to request mdlog flush to mds%d (%s) seq %lld\n",
529662306a36Sopenharmony_ci		       s->s_mds, ceph_session_state_name(s->s_state), s->s_seq);
529762306a36Sopenharmony_ci	} else {
529862306a36Sopenharmony_ci		ceph_con_send(&s->s_con, msg);
529962306a36Sopenharmony_ci	}
530062306a36Sopenharmony_ci	mutex_unlock(&s->s_mutex);
530162306a36Sopenharmony_ci}
530262306a36Sopenharmony_ci
530362306a36Sopenharmony_ci/*
530462306a36Sopenharmony_ci * called before mount is ro, and before dentries are torn down.
530562306a36Sopenharmony_ci * (hmm, does this still race with new lookups?)
530662306a36Sopenharmony_ci */
530762306a36Sopenharmony_civoid ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
530862306a36Sopenharmony_ci{
530962306a36Sopenharmony_ci	dout("pre_umount\n");
531062306a36Sopenharmony_ci	mdsc->stopping = CEPH_MDSC_STOPPING_BEGIN;
531162306a36Sopenharmony_ci
531262306a36Sopenharmony_ci	ceph_mdsc_iterate_sessions(mdsc, send_flush_mdlog, true);
531362306a36Sopenharmony_ci	ceph_mdsc_iterate_sessions(mdsc, lock_unlock_session, false);
531462306a36Sopenharmony_ci	ceph_flush_dirty_caps(mdsc);
531562306a36Sopenharmony_ci	wait_requests(mdsc);
531662306a36Sopenharmony_ci
531762306a36Sopenharmony_ci	/*
531862306a36Sopenharmony_ci	 * wait for reply handlers to drop their request refs and
531962306a36Sopenharmony_ci	 * their inode/dcache refs
532062306a36Sopenharmony_ci	 */
532162306a36Sopenharmony_ci	ceph_msgr_flush();
532262306a36Sopenharmony_ci
532362306a36Sopenharmony_ci	ceph_cleanup_quotarealms_inodes(mdsc);
532462306a36Sopenharmony_ci}
532562306a36Sopenharmony_ci
532662306a36Sopenharmony_ci/*
532762306a36Sopenharmony_ci * flush the mdlog and wait for all write mds requests to flush.
532862306a36Sopenharmony_ci */
532962306a36Sopenharmony_cistatic void flush_mdlog_and_wait_mdsc_unsafe_requests(struct ceph_mds_client *mdsc,
533062306a36Sopenharmony_ci						 u64 want_tid)
533162306a36Sopenharmony_ci{
533262306a36Sopenharmony_ci	struct ceph_mds_request *req = NULL, *nextreq;
533362306a36Sopenharmony_ci	struct ceph_mds_session *last_session = NULL;
533462306a36Sopenharmony_ci	struct rb_node *n;
533562306a36Sopenharmony_ci
533662306a36Sopenharmony_ci	mutex_lock(&mdsc->mutex);
533762306a36Sopenharmony_ci	dout("%s want %lld\n", __func__, want_tid);
533862306a36Sopenharmony_cirestart:
533962306a36Sopenharmony_ci	req = __get_oldest_req(mdsc);
534062306a36Sopenharmony_ci	while (req && req->r_tid <= want_tid) {
534162306a36Sopenharmony_ci		/* find next request */
534262306a36Sopenharmony_ci		n = rb_next(&req->r_node);
534362306a36Sopenharmony_ci		if (n)
534462306a36Sopenharmony_ci			nextreq = rb_entry(n, struct ceph_mds_request, r_node);
534562306a36Sopenharmony_ci		else
534662306a36Sopenharmony_ci			nextreq = NULL;
534762306a36Sopenharmony_ci		if (req->r_op != CEPH_MDS_OP_SETFILELOCK &&
534862306a36Sopenharmony_ci		    (req->r_op & CEPH_MDS_OP_WRITE)) {
534962306a36Sopenharmony_ci			struct ceph_mds_session *s = req->r_session;
535062306a36Sopenharmony_ci
535162306a36Sopenharmony_ci			if (!s) {
535262306a36Sopenharmony_ci				req = nextreq;
535362306a36Sopenharmony_ci				continue;
535462306a36Sopenharmony_ci			}
535562306a36Sopenharmony_ci
535662306a36Sopenharmony_ci			/* write op */
535762306a36Sopenharmony_ci			ceph_mdsc_get_request(req);
535862306a36Sopenharmony_ci			if (nextreq)
535962306a36Sopenharmony_ci				ceph_mdsc_get_request(nextreq);
536062306a36Sopenharmony_ci			s = ceph_get_mds_session(s);
536162306a36Sopenharmony_ci			mutex_unlock(&mdsc->mutex);
536262306a36Sopenharmony_ci
536362306a36Sopenharmony_ci			/* send flush mdlog request to MDS */
536462306a36Sopenharmony_ci			if (last_session != s) {
536562306a36Sopenharmony_ci				send_flush_mdlog(s);
536662306a36Sopenharmony_ci				ceph_put_mds_session(last_session);
536762306a36Sopenharmony_ci				last_session = s;
536862306a36Sopenharmony_ci			} else {
536962306a36Sopenharmony_ci				ceph_put_mds_session(s);
537062306a36Sopenharmony_ci			}
537162306a36Sopenharmony_ci			dout("%s wait on %llu (want %llu)\n", __func__,
537262306a36Sopenharmony_ci			     req->r_tid, want_tid);
537362306a36Sopenharmony_ci			wait_for_completion(&req->r_safe_completion);
537462306a36Sopenharmony_ci
537562306a36Sopenharmony_ci			mutex_lock(&mdsc->mutex);
537662306a36Sopenharmony_ci			ceph_mdsc_put_request(req);
537762306a36Sopenharmony_ci			if (!nextreq)
537862306a36Sopenharmony_ci				break;  /* next dne before, so we're done! */
537962306a36Sopenharmony_ci			if (RB_EMPTY_NODE(&nextreq->r_node)) {
538062306a36Sopenharmony_ci				/* next request was removed from tree */
538162306a36Sopenharmony_ci				ceph_mdsc_put_request(nextreq);
538262306a36Sopenharmony_ci				goto restart;
538362306a36Sopenharmony_ci			}
538462306a36Sopenharmony_ci			ceph_mdsc_put_request(nextreq);  /* won't go away */
538562306a36Sopenharmony_ci		}
538662306a36Sopenharmony_ci		req = nextreq;
538762306a36Sopenharmony_ci	}
538862306a36Sopenharmony_ci	mutex_unlock(&mdsc->mutex);
538962306a36Sopenharmony_ci	ceph_put_mds_session(last_session);
539062306a36Sopenharmony_ci	dout("%s done\n", __func__);
539162306a36Sopenharmony_ci}
539262306a36Sopenharmony_ci
539362306a36Sopenharmony_civoid ceph_mdsc_sync(struct ceph_mds_client *mdsc)
539462306a36Sopenharmony_ci{
539562306a36Sopenharmony_ci	u64 want_tid, want_flush;
539662306a36Sopenharmony_ci
539762306a36Sopenharmony_ci	if (READ_ONCE(mdsc->fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN)
539862306a36Sopenharmony_ci		return;
539962306a36Sopenharmony_ci
540062306a36Sopenharmony_ci	dout("sync\n");
540162306a36Sopenharmony_ci	mutex_lock(&mdsc->mutex);
540262306a36Sopenharmony_ci	want_tid = mdsc->last_tid;
540362306a36Sopenharmony_ci	mutex_unlock(&mdsc->mutex);
540462306a36Sopenharmony_ci
540562306a36Sopenharmony_ci	ceph_flush_dirty_caps(mdsc);
540662306a36Sopenharmony_ci	spin_lock(&mdsc->cap_dirty_lock);
540762306a36Sopenharmony_ci	want_flush = mdsc->last_cap_flush_tid;
540862306a36Sopenharmony_ci	if (!list_empty(&mdsc->cap_flush_list)) {
540962306a36Sopenharmony_ci		struct ceph_cap_flush *cf =
541062306a36Sopenharmony_ci			list_last_entry(&mdsc->cap_flush_list,
541162306a36Sopenharmony_ci					struct ceph_cap_flush, g_list);
541262306a36Sopenharmony_ci		cf->wake = true;
541362306a36Sopenharmony_ci	}
541462306a36Sopenharmony_ci	spin_unlock(&mdsc->cap_dirty_lock);
541562306a36Sopenharmony_ci
541662306a36Sopenharmony_ci	dout("sync want tid %lld flush_seq %lld\n",
541762306a36Sopenharmony_ci	     want_tid, want_flush);
541862306a36Sopenharmony_ci
541962306a36Sopenharmony_ci	flush_mdlog_and_wait_mdsc_unsafe_requests(mdsc, want_tid);
542062306a36Sopenharmony_ci	wait_caps_flush(mdsc, want_flush);
542162306a36Sopenharmony_ci}
542262306a36Sopenharmony_ci
542362306a36Sopenharmony_ci/*
542462306a36Sopenharmony_ci * true if all sessions are closed, or we force unmount
542562306a36Sopenharmony_ci */
542662306a36Sopenharmony_cistatic bool done_closing_sessions(struct ceph_mds_client *mdsc, int skipped)
542762306a36Sopenharmony_ci{
542862306a36Sopenharmony_ci	if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
542962306a36Sopenharmony_ci		return true;
543062306a36Sopenharmony_ci	return atomic_read(&mdsc->num_sessions) <= skipped;
543162306a36Sopenharmony_ci}
543262306a36Sopenharmony_ci
543362306a36Sopenharmony_ci/*
543462306a36Sopenharmony_ci * called after sb is ro or when metadata corrupted.
543562306a36Sopenharmony_ci */
543662306a36Sopenharmony_civoid ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
543762306a36Sopenharmony_ci{
543862306a36Sopenharmony_ci	struct ceph_options *opts = mdsc->fsc->client->options;
543962306a36Sopenharmony_ci	struct ceph_mds_session *session;
544062306a36Sopenharmony_ci	int i;
544162306a36Sopenharmony_ci	int skipped = 0;
544262306a36Sopenharmony_ci
544362306a36Sopenharmony_ci	dout("close_sessions\n");
544462306a36Sopenharmony_ci
544562306a36Sopenharmony_ci	/* close sessions */
544662306a36Sopenharmony_ci	mutex_lock(&mdsc->mutex);
544762306a36Sopenharmony_ci	for (i = 0; i < mdsc->max_sessions; i++) {
544862306a36Sopenharmony_ci		session = __ceph_lookup_mds_session(mdsc, i);
544962306a36Sopenharmony_ci		if (!session)
545062306a36Sopenharmony_ci			continue;
545162306a36Sopenharmony_ci		mutex_unlock(&mdsc->mutex);
545262306a36Sopenharmony_ci		mutex_lock(&session->s_mutex);
545362306a36Sopenharmony_ci		if (__close_session(mdsc, session) <= 0)
545462306a36Sopenharmony_ci			skipped++;
545562306a36Sopenharmony_ci		mutex_unlock(&session->s_mutex);
545662306a36Sopenharmony_ci		ceph_put_mds_session(session);
545762306a36Sopenharmony_ci		mutex_lock(&mdsc->mutex);
545862306a36Sopenharmony_ci	}
545962306a36Sopenharmony_ci	mutex_unlock(&mdsc->mutex);
546062306a36Sopenharmony_ci
546162306a36Sopenharmony_ci	dout("waiting for sessions to close\n");
546262306a36Sopenharmony_ci	wait_event_timeout(mdsc->session_close_wq,
546362306a36Sopenharmony_ci			   done_closing_sessions(mdsc, skipped),
546462306a36Sopenharmony_ci			   ceph_timeout_jiffies(opts->mount_timeout));
546562306a36Sopenharmony_ci
546662306a36Sopenharmony_ci	/* tear down remaining sessions */
546762306a36Sopenharmony_ci	mutex_lock(&mdsc->mutex);
546862306a36Sopenharmony_ci	for (i = 0; i < mdsc->max_sessions; i++) {
546962306a36Sopenharmony_ci		if (mdsc->sessions[i]) {
547062306a36Sopenharmony_ci			session = ceph_get_mds_session(mdsc->sessions[i]);
547162306a36Sopenharmony_ci			__unregister_session(mdsc, session);
547262306a36Sopenharmony_ci			mutex_unlock(&mdsc->mutex);
547362306a36Sopenharmony_ci			mutex_lock(&session->s_mutex);
547462306a36Sopenharmony_ci			remove_session_caps(session);
547562306a36Sopenharmony_ci			mutex_unlock(&session->s_mutex);
547662306a36Sopenharmony_ci			ceph_put_mds_session(session);
547762306a36Sopenharmony_ci			mutex_lock(&mdsc->mutex);
547862306a36Sopenharmony_ci		}
547962306a36Sopenharmony_ci	}
548062306a36Sopenharmony_ci	WARN_ON(!list_empty(&mdsc->cap_delay_list));
548162306a36Sopenharmony_ci	mutex_unlock(&mdsc->mutex);
548262306a36Sopenharmony_ci
548362306a36Sopenharmony_ci	ceph_cleanup_snapid_map(mdsc);
548462306a36Sopenharmony_ci	ceph_cleanup_global_and_empty_realms(mdsc);
548562306a36Sopenharmony_ci
548662306a36Sopenharmony_ci	cancel_work_sync(&mdsc->cap_reclaim_work);
548762306a36Sopenharmony_ci	cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
548862306a36Sopenharmony_ci
548962306a36Sopenharmony_ci	dout("stopped\n");
549062306a36Sopenharmony_ci}
549162306a36Sopenharmony_ci
549262306a36Sopenharmony_civoid ceph_mdsc_force_umount(struct ceph_mds_client *mdsc)
549362306a36Sopenharmony_ci{
549462306a36Sopenharmony_ci	struct ceph_mds_session *session;
549562306a36Sopenharmony_ci	int mds;
549662306a36Sopenharmony_ci
549762306a36Sopenharmony_ci	dout("force umount\n");
549862306a36Sopenharmony_ci
549962306a36Sopenharmony_ci	mutex_lock(&mdsc->mutex);
550062306a36Sopenharmony_ci	for (mds = 0; mds < mdsc->max_sessions; mds++) {
550162306a36Sopenharmony_ci		session = __ceph_lookup_mds_session(mdsc, mds);
550262306a36Sopenharmony_ci		if (!session)
550362306a36Sopenharmony_ci			continue;
550462306a36Sopenharmony_ci
550562306a36Sopenharmony_ci		if (session->s_state == CEPH_MDS_SESSION_REJECTED)
550662306a36Sopenharmony_ci			__unregister_session(mdsc, session);
550762306a36Sopenharmony_ci		__wake_requests(mdsc, &session->s_waiting);
550862306a36Sopenharmony_ci		mutex_unlock(&mdsc->mutex);
550962306a36Sopenharmony_ci
551062306a36Sopenharmony_ci		mutex_lock(&session->s_mutex);
551162306a36Sopenharmony_ci		__close_session(mdsc, session);
551262306a36Sopenharmony_ci		if (session->s_state == CEPH_MDS_SESSION_CLOSING) {
551362306a36Sopenharmony_ci			cleanup_session_requests(mdsc, session);
551462306a36Sopenharmony_ci			remove_session_caps(session);
551562306a36Sopenharmony_ci		}
551662306a36Sopenharmony_ci		mutex_unlock(&session->s_mutex);
551762306a36Sopenharmony_ci		ceph_put_mds_session(session);
551862306a36Sopenharmony_ci
551962306a36Sopenharmony_ci		mutex_lock(&mdsc->mutex);
552062306a36Sopenharmony_ci		kick_requests(mdsc, mds);
552162306a36Sopenharmony_ci	}
552262306a36Sopenharmony_ci	__wake_requests(mdsc, &mdsc->waiting_for_map);
552362306a36Sopenharmony_ci	mutex_unlock(&mdsc->mutex);
552462306a36Sopenharmony_ci}
552562306a36Sopenharmony_ci
552662306a36Sopenharmony_cistatic void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
552762306a36Sopenharmony_ci{
552862306a36Sopenharmony_ci	dout("stop\n");
552962306a36Sopenharmony_ci	/*
553062306a36Sopenharmony_ci	 * Make sure the delayed work stopped before releasing
553162306a36Sopenharmony_ci	 * the resources.
553262306a36Sopenharmony_ci	 *
553362306a36Sopenharmony_ci	 * Because the cancel_delayed_work_sync() will only
553462306a36Sopenharmony_ci	 * guarantee that the work finishes executing. But the
553562306a36Sopenharmony_ci	 * delayed work will re-arm itself again after that.
553662306a36Sopenharmony_ci	 */
553762306a36Sopenharmony_ci	flush_delayed_work(&mdsc->delayed_work);
553862306a36Sopenharmony_ci
553962306a36Sopenharmony_ci	if (mdsc->mdsmap)
554062306a36Sopenharmony_ci		ceph_mdsmap_destroy(mdsc->mdsmap);
554162306a36Sopenharmony_ci	kfree(mdsc->sessions);
554262306a36Sopenharmony_ci	ceph_caps_finalize(mdsc);
554362306a36Sopenharmony_ci	ceph_pool_perm_destroy(mdsc);
554462306a36Sopenharmony_ci}
554562306a36Sopenharmony_ci
554662306a36Sopenharmony_civoid ceph_mdsc_destroy(struct ceph_fs_client *fsc)
554762306a36Sopenharmony_ci{
554862306a36Sopenharmony_ci	struct ceph_mds_client *mdsc = fsc->mdsc;
554962306a36Sopenharmony_ci	dout("mdsc_destroy %p\n", mdsc);
555062306a36Sopenharmony_ci
555162306a36Sopenharmony_ci	if (!mdsc)
555262306a36Sopenharmony_ci		return;
555362306a36Sopenharmony_ci
555462306a36Sopenharmony_ci	/* flush out any connection work with references to us */
555562306a36Sopenharmony_ci	ceph_msgr_flush();
555662306a36Sopenharmony_ci
555762306a36Sopenharmony_ci	ceph_mdsc_stop(mdsc);
555862306a36Sopenharmony_ci
555962306a36Sopenharmony_ci	ceph_metric_destroy(&mdsc->metric);
556062306a36Sopenharmony_ci
556162306a36Sopenharmony_ci	fsc->mdsc = NULL;
556262306a36Sopenharmony_ci	kfree(mdsc);
556362306a36Sopenharmony_ci	dout("mdsc_destroy %p done\n", mdsc);
556462306a36Sopenharmony_ci}
556562306a36Sopenharmony_ci
556662306a36Sopenharmony_civoid ceph_mdsc_handle_fsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
556762306a36Sopenharmony_ci{
556862306a36Sopenharmony_ci	struct ceph_fs_client *fsc = mdsc->fsc;
556962306a36Sopenharmony_ci	const char *mds_namespace = fsc->mount_options->mds_namespace;
557062306a36Sopenharmony_ci	void *p = msg->front.iov_base;
557162306a36Sopenharmony_ci	void *end = p + msg->front.iov_len;
557262306a36Sopenharmony_ci	u32 epoch;
557362306a36Sopenharmony_ci	u32 num_fs;
557462306a36Sopenharmony_ci	u32 mount_fscid = (u32)-1;
557562306a36Sopenharmony_ci	int err = -EINVAL;
557662306a36Sopenharmony_ci
557762306a36Sopenharmony_ci	ceph_decode_need(&p, end, sizeof(u32), bad);
557862306a36Sopenharmony_ci	epoch = ceph_decode_32(&p);
557962306a36Sopenharmony_ci
558062306a36Sopenharmony_ci	dout("handle_fsmap epoch %u\n", epoch);
558162306a36Sopenharmony_ci
558262306a36Sopenharmony_ci	/* struct_v, struct_cv, map_len, epoch, legacy_client_fscid */
558362306a36Sopenharmony_ci	ceph_decode_skip_n(&p, end, 2 + sizeof(u32) * 3, bad);
558462306a36Sopenharmony_ci
558562306a36Sopenharmony_ci	ceph_decode_32_safe(&p, end, num_fs, bad);
558662306a36Sopenharmony_ci	while (num_fs-- > 0) {
558762306a36Sopenharmony_ci		void *info_p, *info_end;
558862306a36Sopenharmony_ci		u32 info_len;
558962306a36Sopenharmony_ci		u32 fscid, namelen;
559062306a36Sopenharmony_ci
559162306a36Sopenharmony_ci		ceph_decode_need(&p, end, 2 + sizeof(u32), bad);
559262306a36Sopenharmony_ci		p += 2;		// info_v, info_cv
559362306a36Sopenharmony_ci		info_len = ceph_decode_32(&p);
559462306a36Sopenharmony_ci		ceph_decode_need(&p, end, info_len, bad);
559562306a36Sopenharmony_ci		info_p = p;
559662306a36Sopenharmony_ci		info_end = p + info_len;
559762306a36Sopenharmony_ci		p = info_end;
559862306a36Sopenharmony_ci
559962306a36Sopenharmony_ci		ceph_decode_need(&info_p, info_end, sizeof(u32) * 2, bad);
560062306a36Sopenharmony_ci		fscid = ceph_decode_32(&info_p);
560162306a36Sopenharmony_ci		namelen = ceph_decode_32(&info_p);
560262306a36Sopenharmony_ci		ceph_decode_need(&info_p, info_end, namelen, bad);
560362306a36Sopenharmony_ci
560462306a36Sopenharmony_ci		if (mds_namespace &&
560562306a36Sopenharmony_ci		    strlen(mds_namespace) == namelen &&
560662306a36Sopenharmony_ci		    !strncmp(mds_namespace, (char *)info_p, namelen)) {
560762306a36Sopenharmony_ci			mount_fscid = fscid;
560862306a36Sopenharmony_ci			break;
560962306a36Sopenharmony_ci		}
561062306a36Sopenharmony_ci	}
561162306a36Sopenharmony_ci
561262306a36Sopenharmony_ci	ceph_monc_got_map(&fsc->client->monc, CEPH_SUB_FSMAP, epoch);
561362306a36Sopenharmony_ci	if (mount_fscid != (u32)-1) {
561462306a36Sopenharmony_ci		fsc->client->monc.fs_cluster_id = mount_fscid;
561562306a36Sopenharmony_ci		ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP,
561662306a36Sopenharmony_ci				   0, true);
561762306a36Sopenharmony_ci		ceph_monc_renew_subs(&fsc->client->monc);
561862306a36Sopenharmony_ci	} else {
561962306a36Sopenharmony_ci		err = -ENOENT;
562062306a36Sopenharmony_ci		goto err_out;
562162306a36Sopenharmony_ci	}
562262306a36Sopenharmony_ci	return;
562362306a36Sopenharmony_ci
562462306a36Sopenharmony_cibad:
562562306a36Sopenharmony_ci	pr_err("error decoding fsmap %d. Shutting down mount.\n", err);
562662306a36Sopenharmony_ci	ceph_umount_begin(mdsc->fsc->sb);
562762306a36Sopenharmony_ci	ceph_msg_dump(msg);
562862306a36Sopenharmony_cierr_out:
562962306a36Sopenharmony_ci	mutex_lock(&mdsc->mutex);
563062306a36Sopenharmony_ci	mdsc->mdsmap_err = err;
563162306a36Sopenharmony_ci	__wake_requests(mdsc, &mdsc->waiting_for_map);
563262306a36Sopenharmony_ci	mutex_unlock(&mdsc->mutex);
563362306a36Sopenharmony_ci}
563462306a36Sopenharmony_ci
563562306a36Sopenharmony_ci/*
563662306a36Sopenharmony_ci * handle mds map update.
563762306a36Sopenharmony_ci */
563862306a36Sopenharmony_civoid ceph_mdsc_handle_mdsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
563962306a36Sopenharmony_ci{
564062306a36Sopenharmony_ci	u32 epoch;
564162306a36Sopenharmony_ci	u32 maplen;
564262306a36Sopenharmony_ci	void *p = msg->front.iov_base;
564362306a36Sopenharmony_ci	void *end = p + msg->front.iov_len;
564462306a36Sopenharmony_ci	struct ceph_mdsmap *newmap, *oldmap;
564562306a36Sopenharmony_ci	struct ceph_fsid fsid;
564662306a36Sopenharmony_ci	int err = -EINVAL;
564762306a36Sopenharmony_ci
564862306a36Sopenharmony_ci	ceph_decode_need(&p, end, sizeof(fsid)+2*sizeof(u32), bad);
564962306a36Sopenharmony_ci	ceph_decode_copy(&p, &fsid, sizeof(fsid));
565062306a36Sopenharmony_ci	if (ceph_check_fsid(mdsc->fsc->client, &fsid) < 0)
565162306a36Sopenharmony_ci		return;
565262306a36Sopenharmony_ci	epoch = ceph_decode_32(&p);
565362306a36Sopenharmony_ci	maplen = ceph_decode_32(&p);
565462306a36Sopenharmony_ci	dout("handle_map epoch %u len %d\n", epoch, (int)maplen);
565562306a36Sopenharmony_ci
565662306a36Sopenharmony_ci	/* do we need it? */
565762306a36Sopenharmony_ci	mutex_lock(&mdsc->mutex);
565862306a36Sopenharmony_ci	if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) {
565962306a36Sopenharmony_ci		dout("handle_map epoch %u <= our %u\n",
566062306a36Sopenharmony_ci		     epoch, mdsc->mdsmap->m_epoch);
566162306a36Sopenharmony_ci		mutex_unlock(&mdsc->mutex);
566262306a36Sopenharmony_ci		return;
566362306a36Sopenharmony_ci	}
566462306a36Sopenharmony_ci
566562306a36Sopenharmony_ci	newmap = ceph_mdsmap_decode(&p, end, ceph_msgr2(mdsc->fsc->client));
566662306a36Sopenharmony_ci	if (IS_ERR(newmap)) {
566762306a36Sopenharmony_ci		err = PTR_ERR(newmap);
566862306a36Sopenharmony_ci		goto bad_unlock;
566962306a36Sopenharmony_ci	}
567062306a36Sopenharmony_ci
567162306a36Sopenharmony_ci	/* swap into place */
567262306a36Sopenharmony_ci	if (mdsc->mdsmap) {
567362306a36Sopenharmony_ci		oldmap = mdsc->mdsmap;
567462306a36Sopenharmony_ci		mdsc->mdsmap = newmap;
567562306a36Sopenharmony_ci		check_new_map(mdsc, newmap, oldmap);
567662306a36Sopenharmony_ci		ceph_mdsmap_destroy(oldmap);
567762306a36Sopenharmony_ci	} else {
567862306a36Sopenharmony_ci		mdsc->mdsmap = newmap;  /* first mds map */
567962306a36Sopenharmony_ci	}
568062306a36Sopenharmony_ci	mdsc->fsc->max_file_size = min((loff_t)mdsc->mdsmap->m_max_file_size,
568162306a36Sopenharmony_ci					MAX_LFS_FILESIZE);
568262306a36Sopenharmony_ci
568362306a36Sopenharmony_ci	__wake_requests(mdsc, &mdsc->waiting_for_map);
568462306a36Sopenharmony_ci	ceph_monc_got_map(&mdsc->fsc->client->monc, CEPH_SUB_MDSMAP,
568562306a36Sopenharmony_ci			  mdsc->mdsmap->m_epoch);
568662306a36Sopenharmony_ci
568762306a36Sopenharmony_ci	mutex_unlock(&mdsc->mutex);
568862306a36Sopenharmony_ci	schedule_delayed(mdsc, 0);
568962306a36Sopenharmony_ci	return;
569062306a36Sopenharmony_ci
569162306a36Sopenharmony_cibad_unlock:
569262306a36Sopenharmony_ci	mutex_unlock(&mdsc->mutex);
569362306a36Sopenharmony_cibad:
569462306a36Sopenharmony_ci	pr_err("error decoding mdsmap %d. Shutting down mount.\n", err);
569562306a36Sopenharmony_ci	ceph_umount_begin(mdsc->fsc->sb);
569662306a36Sopenharmony_ci	ceph_msg_dump(msg);
569762306a36Sopenharmony_ci	return;
569862306a36Sopenharmony_ci}
569962306a36Sopenharmony_ci
570062306a36Sopenharmony_cistatic struct ceph_connection *mds_get_con(struct ceph_connection *con)
570162306a36Sopenharmony_ci{
570262306a36Sopenharmony_ci	struct ceph_mds_session *s = con->private;
570362306a36Sopenharmony_ci
570462306a36Sopenharmony_ci	if (ceph_get_mds_session(s))
570562306a36Sopenharmony_ci		return con;
570662306a36Sopenharmony_ci	return NULL;
570762306a36Sopenharmony_ci}
570862306a36Sopenharmony_ci
570962306a36Sopenharmony_cistatic void mds_put_con(struct ceph_connection *con)
571062306a36Sopenharmony_ci{
571162306a36Sopenharmony_ci	struct ceph_mds_session *s = con->private;
571262306a36Sopenharmony_ci
571362306a36Sopenharmony_ci	ceph_put_mds_session(s);
571462306a36Sopenharmony_ci}
571562306a36Sopenharmony_ci
571662306a36Sopenharmony_ci/*
571762306a36Sopenharmony_ci * if the client is unresponsive for long enough, the mds will kill
571862306a36Sopenharmony_ci * the session entirely.
571962306a36Sopenharmony_ci */
572062306a36Sopenharmony_cistatic void mds_peer_reset(struct ceph_connection *con)
572162306a36Sopenharmony_ci{
572262306a36Sopenharmony_ci	struct ceph_mds_session *s = con->private;
572362306a36Sopenharmony_ci	struct ceph_mds_client *mdsc = s->s_mdsc;
572462306a36Sopenharmony_ci
572562306a36Sopenharmony_ci	pr_warn("mds%d closed our session\n", s->s_mds);
572662306a36Sopenharmony_ci	if (READ_ONCE(mdsc->fsc->mount_state) != CEPH_MOUNT_FENCE_IO)
572762306a36Sopenharmony_ci		send_mds_reconnect(mdsc, s);
572862306a36Sopenharmony_ci}
572962306a36Sopenharmony_ci
573062306a36Sopenharmony_cistatic void mds_dispatch(struct ceph_connection *con, struct ceph_msg *msg)
573162306a36Sopenharmony_ci{
573262306a36Sopenharmony_ci	struct ceph_mds_session *s = con->private;
573362306a36Sopenharmony_ci	struct ceph_mds_client *mdsc = s->s_mdsc;
573462306a36Sopenharmony_ci	int type = le16_to_cpu(msg->hdr.type);
573562306a36Sopenharmony_ci
573662306a36Sopenharmony_ci	mutex_lock(&mdsc->mutex);
573762306a36Sopenharmony_ci	if (__verify_registered_session(mdsc, s) < 0) {
573862306a36Sopenharmony_ci		mutex_unlock(&mdsc->mutex);
573962306a36Sopenharmony_ci		goto out;
574062306a36Sopenharmony_ci	}
574162306a36Sopenharmony_ci	mutex_unlock(&mdsc->mutex);
574262306a36Sopenharmony_ci
574362306a36Sopenharmony_ci	switch (type) {
574462306a36Sopenharmony_ci	case CEPH_MSG_MDS_MAP:
574562306a36Sopenharmony_ci		ceph_mdsc_handle_mdsmap(mdsc, msg);
574662306a36Sopenharmony_ci		break;
574762306a36Sopenharmony_ci	case CEPH_MSG_FS_MAP_USER:
574862306a36Sopenharmony_ci		ceph_mdsc_handle_fsmap(mdsc, msg);
574962306a36Sopenharmony_ci		break;
575062306a36Sopenharmony_ci	case CEPH_MSG_CLIENT_SESSION:
575162306a36Sopenharmony_ci		handle_session(s, msg);
575262306a36Sopenharmony_ci		break;
575362306a36Sopenharmony_ci	case CEPH_MSG_CLIENT_REPLY:
575462306a36Sopenharmony_ci		handle_reply(s, msg);
575562306a36Sopenharmony_ci		break;
575662306a36Sopenharmony_ci	case CEPH_MSG_CLIENT_REQUEST_FORWARD:
575762306a36Sopenharmony_ci		handle_forward(mdsc, s, msg);
575862306a36Sopenharmony_ci		break;
575962306a36Sopenharmony_ci	case CEPH_MSG_CLIENT_CAPS:
576062306a36Sopenharmony_ci		ceph_handle_caps(s, msg);
576162306a36Sopenharmony_ci		break;
576262306a36Sopenharmony_ci	case CEPH_MSG_CLIENT_SNAP:
576362306a36Sopenharmony_ci		ceph_handle_snap(mdsc, s, msg);
576462306a36Sopenharmony_ci		break;
576562306a36Sopenharmony_ci	case CEPH_MSG_CLIENT_LEASE:
576662306a36Sopenharmony_ci		handle_lease(mdsc, s, msg);
576762306a36Sopenharmony_ci		break;
576862306a36Sopenharmony_ci	case CEPH_MSG_CLIENT_QUOTA:
576962306a36Sopenharmony_ci		ceph_handle_quota(mdsc, s, msg);
577062306a36Sopenharmony_ci		break;
577162306a36Sopenharmony_ci
577262306a36Sopenharmony_ci	default:
577362306a36Sopenharmony_ci		pr_err("received unknown message type %d %s\n", type,
577462306a36Sopenharmony_ci		       ceph_msg_type_name(type));
577562306a36Sopenharmony_ci	}
577662306a36Sopenharmony_ciout:
577762306a36Sopenharmony_ci	ceph_msg_put(msg);
577862306a36Sopenharmony_ci}
577962306a36Sopenharmony_ci
578062306a36Sopenharmony_ci/*
578162306a36Sopenharmony_ci * authentication
578262306a36Sopenharmony_ci */
578362306a36Sopenharmony_ci
578462306a36Sopenharmony_ci/*
578562306a36Sopenharmony_ci * Note: returned pointer is the address of a structure that's
578662306a36Sopenharmony_ci * managed separately.  Caller must *not* attempt to free it.
578762306a36Sopenharmony_ci */
578862306a36Sopenharmony_cistatic struct ceph_auth_handshake *
578962306a36Sopenharmony_cimds_get_authorizer(struct ceph_connection *con, int *proto, int force_new)
579062306a36Sopenharmony_ci{
579162306a36Sopenharmony_ci	struct ceph_mds_session *s = con->private;
579262306a36Sopenharmony_ci	struct ceph_mds_client *mdsc = s->s_mdsc;
579362306a36Sopenharmony_ci	struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
579462306a36Sopenharmony_ci	struct ceph_auth_handshake *auth = &s->s_auth;
579562306a36Sopenharmony_ci	int ret;
579662306a36Sopenharmony_ci
579762306a36Sopenharmony_ci	ret = __ceph_auth_get_authorizer(ac, auth, CEPH_ENTITY_TYPE_MDS,
579862306a36Sopenharmony_ci					 force_new, proto, NULL, NULL);
579962306a36Sopenharmony_ci	if (ret)
580062306a36Sopenharmony_ci		return ERR_PTR(ret);
580162306a36Sopenharmony_ci
580262306a36Sopenharmony_ci	return auth;
580362306a36Sopenharmony_ci}
580462306a36Sopenharmony_ci
580562306a36Sopenharmony_cistatic int mds_add_authorizer_challenge(struct ceph_connection *con,
580662306a36Sopenharmony_ci				    void *challenge_buf, int challenge_buf_len)
580762306a36Sopenharmony_ci{
580862306a36Sopenharmony_ci	struct ceph_mds_session *s = con->private;
580962306a36Sopenharmony_ci	struct ceph_mds_client *mdsc = s->s_mdsc;
581062306a36Sopenharmony_ci	struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
581162306a36Sopenharmony_ci
581262306a36Sopenharmony_ci	return ceph_auth_add_authorizer_challenge(ac, s->s_auth.authorizer,
581362306a36Sopenharmony_ci					    challenge_buf, challenge_buf_len);
581462306a36Sopenharmony_ci}
581562306a36Sopenharmony_ci
581662306a36Sopenharmony_cistatic int mds_verify_authorizer_reply(struct ceph_connection *con)
581762306a36Sopenharmony_ci{
581862306a36Sopenharmony_ci	struct ceph_mds_session *s = con->private;
581962306a36Sopenharmony_ci	struct ceph_mds_client *mdsc = s->s_mdsc;
582062306a36Sopenharmony_ci	struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
582162306a36Sopenharmony_ci	struct ceph_auth_handshake *auth = &s->s_auth;
582262306a36Sopenharmony_ci
582362306a36Sopenharmony_ci	return ceph_auth_verify_authorizer_reply(ac, auth->authorizer,
582462306a36Sopenharmony_ci		auth->authorizer_reply_buf, auth->authorizer_reply_buf_len,
582562306a36Sopenharmony_ci		NULL, NULL, NULL, NULL);
582662306a36Sopenharmony_ci}
582762306a36Sopenharmony_ci
582862306a36Sopenharmony_cistatic int mds_invalidate_authorizer(struct ceph_connection *con)
582962306a36Sopenharmony_ci{
583062306a36Sopenharmony_ci	struct ceph_mds_session *s = con->private;
583162306a36Sopenharmony_ci	struct ceph_mds_client *mdsc = s->s_mdsc;
583262306a36Sopenharmony_ci	struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
583362306a36Sopenharmony_ci
583462306a36Sopenharmony_ci	ceph_auth_invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS);
583562306a36Sopenharmony_ci
583662306a36Sopenharmony_ci	return ceph_monc_validate_auth(&mdsc->fsc->client->monc);
583762306a36Sopenharmony_ci}
583862306a36Sopenharmony_ci
583962306a36Sopenharmony_cistatic int mds_get_auth_request(struct ceph_connection *con,
584062306a36Sopenharmony_ci				void *buf, int *buf_len,
584162306a36Sopenharmony_ci				void **authorizer, int *authorizer_len)
584262306a36Sopenharmony_ci{
584362306a36Sopenharmony_ci	struct ceph_mds_session *s = con->private;
584462306a36Sopenharmony_ci	struct ceph_auth_client *ac = s->s_mdsc->fsc->client->monc.auth;
584562306a36Sopenharmony_ci	struct ceph_auth_handshake *auth = &s->s_auth;
584662306a36Sopenharmony_ci	int ret;
584762306a36Sopenharmony_ci
584862306a36Sopenharmony_ci	ret = ceph_auth_get_authorizer(ac, auth, CEPH_ENTITY_TYPE_MDS,
584962306a36Sopenharmony_ci				       buf, buf_len);
585062306a36Sopenharmony_ci	if (ret)
585162306a36Sopenharmony_ci		return ret;
585262306a36Sopenharmony_ci
585362306a36Sopenharmony_ci	*authorizer = auth->authorizer_buf;
585462306a36Sopenharmony_ci	*authorizer_len = auth->authorizer_buf_len;
585562306a36Sopenharmony_ci	return 0;
585662306a36Sopenharmony_ci}
585762306a36Sopenharmony_ci
585862306a36Sopenharmony_cistatic int mds_handle_auth_reply_more(struct ceph_connection *con,
585962306a36Sopenharmony_ci				      void *reply, int reply_len,
586062306a36Sopenharmony_ci				      void *buf, int *buf_len,
586162306a36Sopenharmony_ci				      void **authorizer, int *authorizer_len)
586262306a36Sopenharmony_ci{
586362306a36Sopenharmony_ci	struct ceph_mds_session *s = con->private;
586462306a36Sopenharmony_ci	struct ceph_auth_client *ac = s->s_mdsc->fsc->client->monc.auth;
586562306a36Sopenharmony_ci	struct ceph_auth_handshake *auth = &s->s_auth;
586662306a36Sopenharmony_ci	int ret;
586762306a36Sopenharmony_ci
586862306a36Sopenharmony_ci	ret = ceph_auth_handle_svc_reply_more(ac, auth, reply, reply_len,
586962306a36Sopenharmony_ci					      buf, buf_len);
587062306a36Sopenharmony_ci	if (ret)
587162306a36Sopenharmony_ci		return ret;
587262306a36Sopenharmony_ci
587362306a36Sopenharmony_ci	*authorizer = auth->authorizer_buf;
587462306a36Sopenharmony_ci	*authorizer_len = auth->authorizer_buf_len;
587562306a36Sopenharmony_ci	return 0;
587662306a36Sopenharmony_ci}
587762306a36Sopenharmony_ci
587862306a36Sopenharmony_cistatic int mds_handle_auth_done(struct ceph_connection *con,
587962306a36Sopenharmony_ci				u64 global_id, void *reply, int reply_len,
588062306a36Sopenharmony_ci				u8 *session_key, int *session_key_len,
588162306a36Sopenharmony_ci				u8 *con_secret, int *con_secret_len)
588262306a36Sopenharmony_ci{
588362306a36Sopenharmony_ci	struct ceph_mds_session *s = con->private;
588462306a36Sopenharmony_ci	struct ceph_auth_client *ac = s->s_mdsc->fsc->client->monc.auth;
588562306a36Sopenharmony_ci	struct ceph_auth_handshake *auth = &s->s_auth;
588662306a36Sopenharmony_ci
588762306a36Sopenharmony_ci	return ceph_auth_handle_svc_reply_done(ac, auth, reply, reply_len,
588862306a36Sopenharmony_ci					       session_key, session_key_len,
588962306a36Sopenharmony_ci					       con_secret, con_secret_len);
589062306a36Sopenharmony_ci}
589162306a36Sopenharmony_ci
589262306a36Sopenharmony_cistatic int mds_handle_auth_bad_method(struct ceph_connection *con,
589362306a36Sopenharmony_ci				      int used_proto, int result,
589462306a36Sopenharmony_ci				      const int *allowed_protos, int proto_cnt,
589562306a36Sopenharmony_ci				      const int *allowed_modes, int mode_cnt)
589662306a36Sopenharmony_ci{
589762306a36Sopenharmony_ci	struct ceph_mds_session *s = con->private;
589862306a36Sopenharmony_ci	struct ceph_mon_client *monc = &s->s_mdsc->fsc->client->monc;
589962306a36Sopenharmony_ci	int ret;
590062306a36Sopenharmony_ci
590162306a36Sopenharmony_ci	if (ceph_auth_handle_bad_authorizer(monc->auth, CEPH_ENTITY_TYPE_MDS,
590262306a36Sopenharmony_ci					    used_proto, result,
590362306a36Sopenharmony_ci					    allowed_protos, proto_cnt,
590462306a36Sopenharmony_ci					    allowed_modes, mode_cnt)) {
590562306a36Sopenharmony_ci		ret = ceph_monc_validate_auth(monc);
590662306a36Sopenharmony_ci		if (ret)
590762306a36Sopenharmony_ci			return ret;
590862306a36Sopenharmony_ci	}
590962306a36Sopenharmony_ci
591062306a36Sopenharmony_ci	return -EACCES;
591162306a36Sopenharmony_ci}
591262306a36Sopenharmony_ci
591362306a36Sopenharmony_cistatic struct ceph_msg *mds_alloc_msg(struct ceph_connection *con,
591462306a36Sopenharmony_ci				struct ceph_msg_header *hdr, int *skip)
591562306a36Sopenharmony_ci{
591662306a36Sopenharmony_ci	struct ceph_msg *msg;
591762306a36Sopenharmony_ci	int type = (int) le16_to_cpu(hdr->type);
591862306a36Sopenharmony_ci	int front_len = (int) le32_to_cpu(hdr->front_len);
591962306a36Sopenharmony_ci
592062306a36Sopenharmony_ci	if (con->in_msg)
592162306a36Sopenharmony_ci		return con->in_msg;
592262306a36Sopenharmony_ci
592362306a36Sopenharmony_ci	*skip = 0;
592462306a36Sopenharmony_ci	msg = ceph_msg_new(type, front_len, GFP_NOFS, false);
592562306a36Sopenharmony_ci	if (!msg) {
592662306a36Sopenharmony_ci		pr_err("unable to allocate msg type %d len %d\n",
592762306a36Sopenharmony_ci		       type, front_len);
592862306a36Sopenharmony_ci		return NULL;
592962306a36Sopenharmony_ci	}
593062306a36Sopenharmony_ci
593162306a36Sopenharmony_ci	return msg;
593262306a36Sopenharmony_ci}
593362306a36Sopenharmony_ci
593462306a36Sopenharmony_cistatic int mds_sign_message(struct ceph_msg *msg)
593562306a36Sopenharmony_ci{
593662306a36Sopenharmony_ci       struct ceph_mds_session *s = msg->con->private;
593762306a36Sopenharmony_ci       struct ceph_auth_handshake *auth = &s->s_auth;
593862306a36Sopenharmony_ci
593962306a36Sopenharmony_ci       return ceph_auth_sign_message(auth, msg);
594062306a36Sopenharmony_ci}
594162306a36Sopenharmony_ci
594262306a36Sopenharmony_cistatic int mds_check_message_signature(struct ceph_msg *msg)
594362306a36Sopenharmony_ci{
594462306a36Sopenharmony_ci       struct ceph_mds_session *s = msg->con->private;
594562306a36Sopenharmony_ci       struct ceph_auth_handshake *auth = &s->s_auth;
594662306a36Sopenharmony_ci
594762306a36Sopenharmony_ci       return ceph_auth_check_message_signature(auth, msg);
594862306a36Sopenharmony_ci}
594962306a36Sopenharmony_ci
595062306a36Sopenharmony_cistatic const struct ceph_connection_operations mds_con_ops = {
595162306a36Sopenharmony_ci	.get = mds_get_con,
595262306a36Sopenharmony_ci	.put = mds_put_con,
595362306a36Sopenharmony_ci	.alloc_msg = mds_alloc_msg,
595462306a36Sopenharmony_ci	.dispatch = mds_dispatch,
595562306a36Sopenharmony_ci	.peer_reset = mds_peer_reset,
595662306a36Sopenharmony_ci	.get_authorizer = mds_get_authorizer,
595762306a36Sopenharmony_ci	.add_authorizer_challenge = mds_add_authorizer_challenge,
595862306a36Sopenharmony_ci	.verify_authorizer_reply = mds_verify_authorizer_reply,
595962306a36Sopenharmony_ci	.invalidate_authorizer = mds_invalidate_authorizer,
596062306a36Sopenharmony_ci	.sign_message = mds_sign_message,
596162306a36Sopenharmony_ci	.check_message_signature = mds_check_message_signature,
596262306a36Sopenharmony_ci	.get_auth_request = mds_get_auth_request,
596362306a36Sopenharmony_ci	.handle_auth_reply_more = mds_handle_auth_reply_more,
596462306a36Sopenharmony_ci	.handle_auth_done = mds_handle_auth_done,
596562306a36Sopenharmony_ci	.handle_auth_bad_method = mds_handle_auth_bad_method,
596662306a36Sopenharmony_ci};
596762306a36Sopenharmony_ci
596862306a36Sopenharmony_ci/* eof */
5969