18c2ecf20Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0
28c2ecf20Sopenharmony_ci#include <linux/ceph/ceph_debug.h>
38c2ecf20Sopenharmony_ci
48c2ecf20Sopenharmony_ci#include <linux/fs.h>
58c2ecf20Sopenharmony_ci#include <linux/wait.h>
68c2ecf20Sopenharmony_ci#include <linux/slab.h>
78c2ecf20Sopenharmony_ci#include <linux/gfp.h>
88c2ecf20Sopenharmony_ci#include <linux/sched.h>
98c2ecf20Sopenharmony_ci#include <linux/debugfs.h>
108c2ecf20Sopenharmony_ci#include <linux/seq_file.h>
118c2ecf20Sopenharmony_ci#include <linux/ratelimit.h>
128c2ecf20Sopenharmony_ci#include <linux/bits.h>
138c2ecf20Sopenharmony_ci#include <linux/ktime.h>
148c2ecf20Sopenharmony_ci
158c2ecf20Sopenharmony_ci#include "super.h"
168c2ecf20Sopenharmony_ci#include "mds_client.h"
178c2ecf20Sopenharmony_ci
188c2ecf20Sopenharmony_ci#include <linux/ceph/ceph_features.h>
198c2ecf20Sopenharmony_ci#include <linux/ceph/messenger.h>
208c2ecf20Sopenharmony_ci#include <linux/ceph/decode.h>
218c2ecf20Sopenharmony_ci#include <linux/ceph/pagelist.h>
228c2ecf20Sopenharmony_ci#include <linux/ceph/auth.h>
238c2ecf20Sopenharmony_ci#include <linux/ceph/debugfs.h>
248c2ecf20Sopenharmony_ci
258c2ecf20Sopenharmony_ci#define RECONNECT_MAX_SIZE (INT_MAX - PAGE_SIZE)
268c2ecf20Sopenharmony_ci
278c2ecf20Sopenharmony_ci/*
288c2ecf20Sopenharmony_ci * A cluster of MDS (metadata server) daemons is responsible for
298c2ecf20Sopenharmony_ci * managing the file system namespace (the directory hierarchy and
308c2ecf20Sopenharmony_ci * inodes) and for coordinating shared access to storage.  Metadata is
 * partitioned hierarchically across a number of servers, and that
328c2ecf20Sopenharmony_ci * partition varies over time as the cluster adjusts the distribution
338c2ecf20Sopenharmony_ci * in order to balance load.
348c2ecf20Sopenharmony_ci *
 * The MDS client is primarily responsible for managing synchronous
368c2ecf20Sopenharmony_ci * metadata requests for operations like open, unlink, and so forth.
378c2ecf20Sopenharmony_ci * If there is a MDS failure, we find out about it when we (possibly
388c2ecf20Sopenharmony_ci * request and) receive a new MDS map, and can resubmit affected
398c2ecf20Sopenharmony_ci * requests.
408c2ecf20Sopenharmony_ci *
418c2ecf20Sopenharmony_ci * For the most part, though, we take advantage of a lossless
428c2ecf20Sopenharmony_ci * communications channel to the MDS, and do not need to worry about
438c2ecf20Sopenharmony_ci * timing out or resubmitting requests.
448c2ecf20Sopenharmony_ci *
458c2ecf20Sopenharmony_ci * We maintain a stateful "session" with each MDS we interact with.
 * Within each session, we send periodic heartbeat messages to ensure
 * any capabilities or leases we have been issued remain valid.  If
488c2ecf20Sopenharmony_ci * the session times out and goes stale, our leases and capabilities
498c2ecf20Sopenharmony_ci * are no longer valid.
508c2ecf20Sopenharmony_ci */
518c2ecf20Sopenharmony_ci
/*
 * Bookkeeping carried around while a reconnect message is being built.
 */
struct ceph_reconnect_state {
	struct ceph_mds_session *session;	/* session being reconnected */
	int nr_caps;				/* cap records counted so far */
	int nr_realms;				/* realm records counted so far */
	struct ceph_pagelist *pagelist;		/* payload under construction */
	unsigned int msg_version;		/* encoding version of the message */
	bool allow_multi;			/* flag consulted by the reconnect code */
};
598c2ecf20Sopenharmony_ci
608c2ecf20Sopenharmony_cistatic void __wake_requests(struct ceph_mds_client *mdsc,
618c2ecf20Sopenharmony_ci			    struct list_head *head);
628c2ecf20Sopenharmony_cistatic void ceph_cap_release_work(struct work_struct *work);
638c2ecf20Sopenharmony_cistatic void ceph_cap_reclaim_work(struct work_struct *work);
648c2ecf20Sopenharmony_ci
658c2ecf20Sopenharmony_cistatic const struct ceph_connection_operations mds_con_ops;
668c2ecf20Sopenharmony_ci
678c2ecf20Sopenharmony_ci
688c2ecf20Sopenharmony_ci/*
698c2ecf20Sopenharmony_ci * mds reply parsing
708c2ecf20Sopenharmony_ci */
718c2ecf20Sopenharmony_ci
728c2ecf20Sopenharmony_cistatic int parse_reply_info_quota(void **p, void *end,
738c2ecf20Sopenharmony_ci				  struct ceph_mds_reply_info_in *info)
748c2ecf20Sopenharmony_ci{
758c2ecf20Sopenharmony_ci	u8 struct_v, struct_compat;
768c2ecf20Sopenharmony_ci	u32 struct_len;
778c2ecf20Sopenharmony_ci
788c2ecf20Sopenharmony_ci	ceph_decode_8_safe(p, end, struct_v, bad);
798c2ecf20Sopenharmony_ci	ceph_decode_8_safe(p, end, struct_compat, bad);
808c2ecf20Sopenharmony_ci	/* struct_v is expected to be >= 1. we only
818c2ecf20Sopenharmony_ci	 * understand encoding with struct_compat == 1. */
828c2ecf20Sopenharmony_ci	if (!struct_v || struct_compat != 1)
838c2ecf20Sopenharmony_ci		goto bad;
848c2ecf20Sopenharmony_ci	ceph_decode_32_safe(p, end, struct_len, bad);
858c2ecf20Sopenharmony_ci	ceph_decode_need(p, end, struct_len, bad);
868c2ecf20Sopenharmony_ci	end = *p + struct_len;
878c2ecf20Sopenharmony_ci	ceph_decode_64_safe(p, end, info->max_bytes, bad);
888c2ecf20Sopenharmony_ci	ceph_decode_64_safe(p, end, info->max_files, bad);
898c2ecf20Sopenharmony_ci	*p = end;
908c2ecf20Sopenharmony_ci	return 0;
918c2ecf20Sopenharmony_cibad:
928c2ecf20Sopenharmony_ci	return -EIO;
938c2ecf20Sopenharmony_ci}
948c2ecf20Sopenharmony_ci
958c2ecf20Sopenharmony_ci/*
968c2ecf20Sopenharmony_ci * parse individual inode info
978c2ecf20Sopenharmony_ci */
988c2ecf20Sopenharmony_cistatic int parse_reply_info_in(void **p, void *end,
998c2ecf20Sopenharmony_ci			       struct ceph_mds_reply_info_in *info,
1008c2ecf20Sopenharmony_ci			       u64 features)
1018c2ecf20Sopenharmony_ci{
1028c2ecf20Sopenharmony_ci	int err = 0;
1038c2ecf20Sopenharmony_ci	u8 struct_v = 0;
1048c2ecf20Sopenharmony_ci
1058c2ecf20Sopenharmony_ci	if (features == (u64)-1) {
1068c2ecf20Sopenharmony_ci		u32 struct_len;
1078c2ecf20Sopenharmony_ci		u8 struct_compat;
1088c2ecf20Sopenharmony_ci		ceph_decode_8_safe(p, end, struct_v, bad);
1098c2ecf20Sopenharmony_ci		ceph_decode_8_safe(p, end, struct_compat, bad);
1108c2ecf20Sopenharmony_ci		/* struct_v is expected to be >= 1. we only understand
1118c2ecf20Sopenharmony_ci		 * encoding with struct_compat == 1. */
1128c2ecf20Sopenharmony_ci		if (!struct_v || struct_compat != 1)
1138c2ecf20Sopenharmony_ci			goto bad;
1148c2ecf20Sopenharmony_ci		ceph_decode_32_safe(p, end, struct_len, bad);
1158c2ecf20Sopenharmony_ci		ceph_decode_need(p, end, struct_len, bad);
1168c2ecf20Sopenharmony_ci		end = *p + struct_len;
1178c2ecf20Sopenharmony_ci	}
1188c2ecf20Sopenharmony_ci
1198c2ecf20Sopenharmony_ci	ceph_decode_need(p, end, sizeof(struct ceph_mds_reply_inode), bad);
1208c2ecf20Sopenharmony_ci	info->in = *p;
1218c2ecf20Sopenharmony_ci	*p += sizeof(struct ceph_mds_reply_inode) +
1228c2ecf20Sopenharmony_ci		sizeof(*info->in->fragtree.splits) *
1238c2ecf20Sopenharmony_ci		le32_to_cpu(info->in->fragtree.nsplits);
1248c2ecf20Sopenharmony_ci
1258c2ecf20Sopenharmony_ci	ceph_decode_32_safe(p, end, info->symlink_len, bad);
1268c2ecf20Sopenharmony_ci	ceph_decode_need(p, end, info->symlink_len, bad);
1278c2ecf20Sopenharmony_ci	info->symlink = *p;
1288c2ecf20Sopenharmony_ci	*p += info->symlink_len;
1298c2ecf20Sopenharmony_ci
1308c2ecf20Sopenharmony_ci	ceph_decode_copy_safe(p, end, &info->dir_layout,
1318c2ecf20Sopenharmony_ci			      sizeof(info->dir_layout), bad);
1328c2ecf20Sopenharmony_ci	ceph_decode_32_safe(p, end, info->xattr_len, bad);
1338c2ecf20Sopenharmony_ci	ceph_decode_need(p, end, info->xattr_len, bad);
1348c2ecf20Sopenharmony_ci	info->xattr_data = *p;
1358c2ecf20Sopenharmony_ci	*p += info->xattr_len;
1368c2ecf20Sopenharmony_ci
1378c2ecf20Sopenharmony_ci	if (features == (u64)-1) {
1388c2ecf20Sopenharmony_ci		/* inline data */
1398c2ecf20Sopenharmony_ci		ceph_decode_64_safe(p, end, info->inline_version, bad);
1408c2ecf20Sopenharmony_ci		ceph_decode_32_safe(p, end, info->inline_len, bad);
1418c2ecf20Sopenharmony_ci		ceph_decode_need(p, end, info->inline_len, bad);
1428c2ecf20Sopenharmony_ci		info->inline_data = *p;
1438c2ecf20Sopenharmony_ci		*p += info->inline_len;
1448c2ecf20Sopenharmony_ci		/* quota */
1458c2ecf20Sopenharmony_ci		err = parse_reply_info_quota(p, end, info);
1468c2ecf20Sopenharmony_ci		if (err < 0)
1478c2ecf20Sopenharmony_ci			goto out_bad;
1488c2ecf20Sopenharmony_ci		/* pool namespace */
1498c2ecf20Sopenharmony_ci		ceph_decode_32_safe(p, end, info->pool_ns_len, bad);
1508c2ecf20Sopenharmony_ci		if (info->pool_ns_len > 0) {
1518c2ecf20Sopenharmony_ci			ceph_decode_need(p, end, info->pool_ns_len, bad);
1528c2ecf20Sopenharmony_ci			info->pool_ns_data = *p;
1538c2ecf20Sopenharmony_ci			*p += info->pool_ns_len;
1548c2ecf20Sopenharmony_ci		}
1558c2ecf20Sopenharmony_ci
1568c2ecf20Sopenharmony_ci		/* btime */
1578c2ecf20Sopenharmony_ci		ceph_decode_need(p, end, sizeof(info->btime), bad);
1588c2ecf20Sopenharmony_ci		ceph_decode_copy(p, &info->btime, sizeof(info->btime));
1598c2ecf20Sopenharmony_ci
1608c2ecf20Sopenharmony_ci		/* change attribute */
1618c2ecf20Sopenharmony_ci		ceph_decode_64_safe(p, end, info->change_attr, bad);
1628c2ecf20Sopenharmony_ci
1638c2ecf20Sopenharmony_ci		/* dir pin */
1648c2ecf20Sopenharmony_ci		if (struct_v >= 2) {
1658c2ecf20Sopenharmony_ci			ceph_decode_32_safe(p, end, info->dir_pin, bad);
1668c2ecf20Sopenharmony_ci		} else {
1678c2ecf20Sopenharmony_ci			info->dir_pin = -ENODATA;
1688c2ecf20Sopenharmony_ci		}
1698c2ecf20Sopenharmony_ci
1708c2ecf20Sopenharmony_ci		/* snapshot birth time, remains zero for v<=2 */
1718c2ecf20Sopenharmony_ci		if (struct_v >= 3) {
1728c2ecf20Sopenharmony_ci			ceph_decode_need(p, end, sizeof(info->snap_btime), bad);
1738c2ecf20Sopenharmony_ci			ceph_decode_copy(p, &info->snap_btime,
1748c2ecf20Sopenharmony_ci					 sizeof(info->snap_btime));
1758c2ecf20Sopenharmony_ci		} else {
1768c2ecf20Sopenharmony_ci			memset(&info->snap_btime, 0, sizeof(info->snap_btime));
1778c2ecf20Sopenharmony_ci		}
1788c2ecf20Sopenharmony_ci
1798c2ecf20Sopenharmony_ci		*p = end;
1808c2ecf20Sopenharmony_ci	} else {
1818c2ecf20Sopenharmony_ci		if (features & CEPH_FEATURE_MDS_INLINE_DATA) {
1828c2ecf20Sopenharmony_ci			ceph_decode_64_safe(p, end, info->inline_version, bad);
1838c2ecf20Sopenharmony_ci			ceph_decode_32_safe(p, end, info->inline_len, bad);
1848c2ecf20Sopenharmony_ci			ceph_decode_need(p, end, info->inline_len, bad);
1858c2ecf20Sopenharmony_ci			info->inline_data = *p;
1868c2ecf20Sopenharmony_ci			*p += info->inline_len;
1878c2ecf20Sopenharmony_ci		} else
1888c2ecf20Sopenharmony_ci			info->inline_version = CEPH_INLINE_NONE;
1898c2ecf20Sopenharmony_ci
1908c2ecf20Sopenharmony_ci		if (features & CEPH_FEATURE_MDS_QUOTA) {
1918c2ecf20Sopenharmony_ci			err = parse_reply_info_quota(p, end, info);
1928c2ecf20Sopenharmony_ci			if (err < 0)
1938c2ecf20Sopenharmony_ci				goto out_bad;
1948c2ecf20Sopenharmony_ci		} else {
1958c2ecf20Sopenharmony_ci			info->max_bytes = 0;
1968c2ecf20Sopenharmony_ci			info->max_files = 0;
1978c2ecf20Sopenharmony_ci		}
1988c2ecf20Sopenharmony_ci
1998c2ecf20Sopenharmony_ci		info->pool_ns_len = 0;
2008c2ecf20Sopenharmony_ci		info->pool_ns_data = NULL;
2018c2ecf20Sopenharmony_ci		if (features & CEPH_FEATURE_FS_FILE_LAYOUT_V2) {
2028c2ecf20Sopenharmony_ci			ceph_decode_32_safe(p, end, info->pool_ns_len, bad);
2038c2ecf20Sopenharmony_ci			if (info->pool_ns_len > 0) {
2048c2ecf20Sopenharmony_ci				ceph_decode_need(p, end, info->pool_ns_len, bad);
2058c2ecf20Sopenharmony_ci				info->pool_ns_data = *p;
2068c2ecf20Sopenharmony_ci				*p += info->pool_ns_len;
2078c2ecf20Sopenharmony_ci			}
2088c2ecf20Sopenharmony_ci		}
2098c2ecf20Sopenharmony_ci
2108c2ecf20Sopenharmony_ci		if (features & CEPH_FEATURE_FS_BTIME) {
2118c2ecf20Sopenharmony_ci			ceph_decode_need(p, end, sizeof(info->btime), bad);
2128c2ecf20Sopenharmony_ci			ceph_decode_copy(p, &info->btime, sizeof(info->btime));
2138c2ecf20Sopenharmony_ci			ceph_decode_64_safe(p, end, info->change_attr, bad);
2148c2ecf20Sopenharmony_ci		}
2158c2ecf20Sopenharmony_ci
2168c2ecf20Sopenharmony_ci		info->dir_pin = -ENODATA;
2178c2ecf20Sopenharmony_ci		/* info->snap_btime remains zero */
2188c2ecf20Sopenharmony_ci	}
2198c2ecf20Sopenharmony_ci	return 0;
2208c2ecf20Sopenharmony_cibad:
2218c2ecf20Sopenharmony_ci	err = -EIO;
2228c2ecf20Sopenharmony_ciout_bad:
2238c2ecf20Sopenharmony_ci	return err;
2248c2ecf20Sopenharmony_ci}
2258c2ecf20Sopenharmony_ci
2268c2ecf20Sopenharmony_cistatic int parse_reply_info_dir(void **p, void *end,
2278c2ecf20Sopenharmony_ci				struct ceph_mds_reply_dirfrag **dirfrag,
2288c2ecf20Sopenharmony_ci				u64 features)
2298c2ecf20Sopenharmony_ci{
2308c2ecf20Sopenharmony_ci	if (features == (u64)-1) {
2318c2ecf20Sopenharmony_ci		u8 struct_v, struct_compat;
2328c2ecf20Sopenharmony_ci		u32 struct_len;
2338c2ecf20Sopenharmony_ci		ceph_decode_8_safe(p, end, struct_v, bad);
2348c2ecf20Sopenharmony_ci		ceph_decode_8_safe(p, end, struct_compat, bad);
2358c2ecf20Sopenharmony_ci		/* struct_v is expected to be >= 1. we only understand
2368c2ecf20Sopenharmony_ci		 * encoding whose struct_compat == 1. */
2378c2ecf20Sopenharmony_ci		if (!struct_v || struct_compat != 1)
2388c2ecf20Sopenharmony_ci			goto bad;
2398c2ecf20Sopenharmony_ci		ceph_decode_32_safe(p, end, struct_len, bad);
2408c2ecf20Sopenharmony_ci		ceph_decode_need(p, end, struct_len, bad);
2418c2ecf20Sopenharmony_ci		end = *p + struct_len;
2428c2ecf20Sopenharmony_ci	}
2438c2ecf20Sopenharmony_ci
2448c2ecf20Sopenharmony_ci	ceph_decode_need(p, end, sizeof(**dirfrag), bad);
2458c2ecf20Sopenharmony_ci	*dirfrag = *p;
2468c2ecf20Sopenharmony_ci	*p += sizeof(**dirfrag) + sizeof(u32) * le32_to_cpu((*dirfrag)->ndist);
2478c2ecf20Sopenharmony_ci	if (unlikely(*p > end))
2488c2ecf20Sopenharmony_ci		goto bad;
2498c2ecf20Sopenharmony_ci	if (features == (u64)-1)
2508c2ecf20Sopenharmony_ci		*p = end;
2518c2ecf20Sopenharmony_ci	return 0;
2528c2ecf20Sopenharmony_cibad:
2538c2ecf20Sopenharmony_ci	return -EIO;
2548c2ecf20Sopenharmony_ci}
2558c2ecf20Sopenharmony_ci
2568c2ecf20Sopenharmony_cistatic int parse_reply_info_lease(void **p, void *end,
2578c2ecf20Sopenharmony_ci				  struct ceph_mds_reply_lease **lease,
2588c2ecf20Sopenharmony_ci				  u64 features)
2598c2ecf20Sopenharmony_ci{
2608c2ecf20Sopenharmony_ci	if (features == (u64)-1) {
2618c2ecf20Sopenharmony_ci		u8 struct_v, struct_compat;
2628c2ecf20Sopenharmony_ci		u32 struct_len;
2638c2ecf20Sopenharmony_ci		ceph_decode_8_safe(p, end, struct_v, bad);
2648c2ecf20Sopenharmony_ci		ceph_decode_8_safe(p, end, struct_compat, bad);
2658c2ecf20Sopenharmony_ci		/* struct_v is expected to be >= 1. we only understand
2668c2ecf20Sopenharmony_ci		 * encoding whose struct_compat == 1. */
2678c2ecf20Sopenharmony_ci		if (!struct_v || struct_compat != 1)
2688c2ecf20Sopenharmony_ci			goto bad;
2698c2ecf20Sopenharmony_ci		ceph_decode_32_safe(p, end, struct_len, bad);
2708c2ecf20Sopenharmony_ci		ceph_decode_need(p, end, struct_len, bad);
2718c2ecf20Sopenharmony_ci		end = *p + struct_len;
2728c2ecf20Sopenharmony_ci	}
2738c2ecf20Sopenharmony_ci
2748c2ecf20Sopenharmony_ci	ceph_decode_need(p, end, sizeof(**lease), bad);
2758c2ecf20Sopenharmony_ci	*lease = *p;
2768c2ecf20Sopenharmony_ci	*p += sizeof(**lease);
2778c2ecf20Sopenharmony_ci	if (features == (u64)-1)
2788c2ecf20Sopenharmony_ci		*p = end;
2798c2ecf20Sopenharmony_ci	return 0;
2808c2ecf20Sopenharmony_cibad:
2818c2ecf20Sopenharmony_ci	return -EIO;
2828c2ecf20Sopenharmony_ci}
2838c2ecf20Sopenharmony_ci
2848c2ecf20Sopenharmony_ci/*
2858c2ecf20Sopenharmony_ci * parse a normal reply, which may contain a (dir+)dentry and/or a
2868c2ecf20Sopenharmony_ci * target inode.
2878c2ecf20Sopenharmony_ci */
2888c2ecf20Sopenharmony_cistatic int parse_reply_info_trace(void **p, void *end,
2898c2ecf20Sopenharmony_ci				  struct ceph_mds_reply_info_parsed *info,
2908c2ecf20Sopenharmony_ci				  u64 features)
2918c2ecf20Sopenharmony_ci{
2928c2ecf20Sopenharmony_ci	int err;
2938c2ecf20Sopenharmony_ci
2948c2ecf20Sopenharmony_ci	if (info->head->is_dentry) {
2958c2ecf20Sopenharmony_ci		err = parse_reply_info_in(p, end, &info->diri, features);
2968c2ecf20Sopenharmony_ci		if (err < 0)
2978c2ecf20Sopenharmony_ci			goto out_bad;
2988c2ecf20Sopenharmony_ci
2998c2ecf20Sopenharmony_ci		err = parse_reply_info_dir(p, end, &info->dirfrag, features);
3008c2ecf20Sopenharmony_ci		if (err < 0)
3018c2ecf20Sopenharmony_ci			goto out_bad;
3028c2ecf20Sopenharmony_ci
3038c2ecf20Sopenharmony_ci		ceph_decode_32_safe(p, end, info->dname_len, bad);
3048c2ecf20Sopenharmony_ci		ceph_decode_need(p, end, info->dname_len, bad);
3058c2ecf20Sopenharmony_ci		info->dname = *p;
3068c2ecf20Sopenharmony_ci		*p += info->dname_len;
3078c2ecf20Sopenharmony_ci
3088c2ecf20Sopenharmony_ci		err = parse_reply_info_lease(p, end, &info->dlease, features);
3098c2ecf20Sopenharmony_ci		if (err < 0)
3108c2ecf20Sopenharmony_ci			goto out_bad;
3118c2ecf20Sopenharmony_ci	}
3128c2ecf20Sopenharmony_ci
3138c2ecf20Sopenharmony_ci	if (info->head->is_target) {
3148c2ecf20Sopenharmony_ci		err = parse_reply_info_in(p, end, &info->targeti, features);
3158c2ecf20Sopenharmony_ci		if (err < 0)
3168c2ecf20Sopenharmony_ci			goto out_bad;
3178c2ecf20Sopenharmony_ci	}
3188c2ecf20Sopenharmony_ci
3198c2ecf20Sopenharmony_ci	if (unlikely(*p != end))
3208c2ecf20Sopenharmony_ci		goto bad;
3218c2ecf20Sopenharmony_ci	return 0;
3228c2ecf20Sopenharmony_ci
3238c2ecf20Sopenharmony_cibad:
3248c2ecf20Sopenharmony_ci	err = -EIO;
3258c2ecf20Sopenharmony_ciout_bad:
3268c2ecf20Sopenharmony_ci	pr_err("problem parsing mds trace %d\n", err);
3278c2ecf20Sopenharmony_ci	return err;
3288c2ecf20Sopenharmony_ci}
3298c2ecf20Sopenharmony_ci
3308c2ecf20Sopenharmony_ci/*
3318c2ecf20Sopenharmony_ci * parse readdir results
3328c2ecf20Sopenharmony_ci */
3338c2ecf20Sopenharmony_cistatic int parse_reply_info_readdir(void **p, void *end,
3348c2ecf20Sopenharmony_ci				struct ceph_mds_reply_info_parsed *info,
3358c2ecf20Sopenharmony_ci				u64 features)
3368c2ecf20Sopenharmony_ci{
3378c2ecf20Sopenharmony_ci	u32 num, i = 0;
3388c2ecf20Sopenharmony_ci	int err;
3398c2ecf20Sopenharmony_ci
3408c2ecf20Sopenharmony_ci	err = parse_reply_info_dir(p, end, &info->dir_dir, features);
3418c2ecf20Sopenharmony_ci	if (err < 0)
3428c2ecf20Sopenharmony_ci		goto out_bad;
3438c2ecf20Sopenharmony_ci
3448c2ecf20Sopenharmony_ci	ceph_decode_need(p, end, sizeof(num) + 2, bad);
3458c2ecf20Sopenharmony_ci	num = ceph_decode_32(p);
3468c2ecf20Sopenharmony_ci	{
3478c2ecf20Sopenharmony_ci		u16 flags = ceph_decode_16(p);
3488c2ecf20Sopenharmony_ci		info->dir_end = !!(flags & CEPH_READDIR_FRAG_END);
3498c2ecf20Sopenharmony_ci		info->dir_complete = !!(flags & CEPH_READDIR_FRAG_COMPLETE);
3508c2ecf20Sopenharmony_ci		info->hash_order = !!(flags & CEPH_READDIR_HASH_ORDER);
3518c2ecf20Sopenharmony_ci		info->offset_hash = !!(flags & CEPH_READDIR_OFFSET_HASH);
3528c2ecf20Sopenharmony_ci	}
3538c2ecf20Sopenharmony_ci	if (num == 0)
3548c2ecf20Sopenharmony_ci		goto done;
3558c2ecf20Sopenharmony_ci
3568c2ecf20Sopenharmony_ci	BUG_ON(!info->dir_entries);
3578c2ecf20Sopenharmony_ci	if ((unsigned long)(info->dir_entries + num) >
3588c2ecf20Sopenharmony_ci	    (unsigned long)info->dir_entries + info->dir_buf_size) {
3598c2ecf20Sopenharmony_ci		pr_err("dir contents are larger than expected\n");
3608c2ecf20Sopenharmony_ci		WARN_ON(1);
3618c2ecf20Sopenharmony_ci		goto bad;
3628c2ecf20Sopenharmony_ci	}
3638c2ecf20Sopenharmony_ci
3648c2ecf20Sopenharmony_ci	info->dir_nr = num;
3658c2ecf20Sopenharmony_ci	while (num) {
3668c2ecf20Sopenharmony_ci		struct ceph_mds_reply_dir_entry *rde = info->dir_entries + i;
3678c2ecf20Sopenharmony_ci		/* dentry */
3688c2ecf20Sopenharmony_ci		ceph_decode_32_safe(p, end, rde->name_len, bad);
3698c2ecf20Sopenharmony_ci		ceph_decode_need(p, end, rde->name_len, bad);
3708c2ecf20Sopenharmony_ci		rde->name = *p;
3718c2ecf20Sopenharmony_ci		*p += rde->name_len;
3728c2ecf20Sopenharmony_ci		dout("parsed dir dname '%.*s'\n", rde->name_len, rde->name);
3738c2ecf20Sopenharmony_ci
3748c2ecf20Sopenharmony_ci		/* dentry lease */
3758c2ecf20Sopenharmony_ci		err = parse_reply_info_lease(p, end, &rde->lease, features);
3768c2ecf20Sopenharmony_ci		if (err)
3778c2ecf20Sopenharmony_ci			goto out_bad;
3788c2ecf20Sopenharmony_ci		/* inode */
3798c2ecf20Sopenharmony_ci		err = parse_reply_info_in(p, end, &rde->inode, features);
3808c2ecf20Sopenharmony_ci		if (err < 0)
3818c2ecf20Sopenharmony_ci			goto out_bad;
3828c2ecf20Sopenharmony_ci		/* ceph_readdir_prepopulate() will update it */
3838c2ecf20Sopenharmony_ci		rde->offset = 0;
3848c2ecf20Sopenharmony_ci		i++;
3858c2ecf20Sopenharmony_ci		num--;
3868c2ecf20Sopenharmony_ci	}
3878c2ecf20Sopenharmony_ci
3888c2ecf20Sopenharmony_cidone:
3898c2ecf20Sopenharmony_ci	/* Skip over any unrecognized fields */
3908c2ecf20Sopenharmony_ci	*p = end;
3918c2ecf20Sopenharmony_ci	return 0;
3928c2ecf20Sopenharmony_ci
3938c2ecf20Sopenharmony_cibad:
3948c2ecf20Sopenharmony_ci	err = -EIO;
3958c2ecf20Sopenharmony_ciout_bad:
3968c2ecf20Sopenharmony_ci	pr_err("problem parsing dir contents %d\n", err);
3978c2ecf20Sopenharmony_ci	return err;
3988c2ecf20Sopenharmony_ci}
3998c2ecf20Sopenharmony_ci
4008c2ecf20Sopenharmony_ci/*
4018c2ecf20Sopenharmony_ci * parse fcntl F_GETLK results
4028c2ecf20Sopenharmony_ci */
4038c2ecf20Sopenharmony_cistatic int parse_reply_info_filelock(void **p, void *end,
4048c2ecf20Sopenharmony_ci				     struct ceph_mds_reply_info_parsed *info,
4058c2ecf20Sopenharmony_ci				     u64 features)
4068c2ecf20Sopenharmony_ci{
4078c2ecf20Sopenharmony_ci	if (*p + sizeof(*info->filelock_reply) > end)
4088c2ecf20Sopenharmony_ci		goto bad;
4098c2ecf20Sopenharmony_ci
4108c2ecf20Sopenharmony_ci	info->filelock_reply = *p;
4118c2ecf20Sopenharmony_ci
4128c2ecf20Sopenharmony_ci	/* Skip over any unrecognized fields */
4138c2ecf20Sopenharmony_ci	*p = end;
4148c2ecf20Sopenharmony_ci	return 0;
4158c2ecf20Sopenharmony_cibad:
4168c2ecf20Sopenharmony_ci	return -EIO;
4178c2ecf20Sopenharmony_ci}
4188c2ecf20Sopenharmony_ci
4198c2ecf20Sopenharmony_ci
4208c2ecf20Sopenharmony_ci#if BITS_PER_LONG == 64
4218c2ecf20Sopenharmony_ci
4228c2ecf20Sopenharmony_ci#define DELEGATED_INO_AVAILABLE		xa_mk_value(1)
4238c2ecf20Sopenharmony_ci
4248c2ecf20Sopenharmony_cistatic int ceph_parse_deleg_inos(void **p, void *end,
4258c2ecf20Sopenharmony_ci				 struct ceph_mds_session *s)
4268c2ecf20Sopenharmony_ci{
4278c2ecf20Sopenharmony_ci	u32 sets;
4288c2ecf20Sopenharmony_ci
4298c2ecf20Sopenharmony_ci	ceph_decode_32_safe(p, end, sets, bad);
4308c2ecf20Sopenharmony_ci	dout("got %u sets of delegated inodes\n", sets);
4318c2ecf20Sopenharmony_ci	while (sets--) {
4328c2ecf20Sopenharmony_ci		u64 start, len, ino;
4338c2ecf20Sopenharmony_ci
4348c2ecf20Sopenharmony_ci		ceph_decode_64_safe(p, end, start, bad);
4358c2ecf20Sopenharmony_ci		ceph_decode_64_safe(p, end, len, bad);
4368c2ecf20Sopenharmony_ci
4378c2ecf20Sopenharmony_ci		/* Don't accept a delegation of system inodes */
4388c2ecf20Sopenharmony_ci		if (start < CEPH_INO_SYSTEM_BASE) {
4398c2ecf20Sopenharmony_ci			pr_warn_ratelimited("ceph: ignoring reserved inode range delegation (start=0x%llx len=0x%llx)\n",
4408c2ecf20Sopenharmony_ci					start, len);
4418c2ecf20Sopenharmony_ci			continue;
4428c2ecf20Sopenharmony_ci		}
4438c2ecf20Sopenharmony_ci		while (len--) {
4448c2ecf20Sopenharmony_ci			int err = xa_insert(&s->s_delegated_inos, ino = start++,
4458c2ecf20Sopenharmony_ci					    DELEGATED_INO_AVAILABLE,
4468c2ecf20Sopenharmony_ci					    GFP_KERNEL);
4478c2ecf20Sopenharmony_ci			if (!err) {
4488c2ecf20Sopenharmony_ci				dout("added delegated inode 0x%llx\n",
4498c2ecf20Sopenharmony_ci				     start - 1);
4508c2ecf20Sopenharmony_ci			} else if (err == -EBUSY) {
4518c2ecf20Sopenharmony_ci				pr_warn("ceph: MDS delegated inode 0x%llx more than once.\n",
4528c2ecf20Sopenharmony_ci					start - 1);
4538c2ecf20Sopenharmony_ci			} else {
4548c2ecf20Sopenharmony_ci				return err;
4558c2ecf20Sopenharmony_ci			}
4568c2ecf20Sopenharmony_ci		}
4578c2ecf20Sopenharmony_ci	}
4588c2ecf20Sopenharmony_ci	return 0;
4598c2ecf20Sopenharmony_cibad:
4608c2ecf20Sopenharmony_ci	return -EIO;
4618c2ecf20Sopenharmony_ci}
4628c2ecf20Sopenharmony_ci
4638c2ecf20Sopenharmony_ciu64 ceph_get_deleg_ino(struct ceph_mds_session *s)
4648c2ecf20Sopenharmony_ci{
4658c2ecf20Sopenharmony_ci	unsigned long ino;
4668c2ecf20Sopenharmony_ci	void *val;
4678c2ecf20Sopenharmony_ci
4688c2ecf20Sopenharmony_ci	xa_for_each(&s->s_delegated_inos, ino, val) {
4698c2ecf20Sopenharmony_ci		val = xa_erase(&s->s_delegated_inos, ino);
4708c2ecf20Sopenharmony_ci		if (val == DELEGATED_INO_AVAILABLE)
4718c2ecf20Sopenharmony_ci			return ino;
4728c2ecf20Sopenharmony_ci	}
4738c2ecf20Sopenharmony_ci	return 0;
4748c2ecf20Sopenharmony_ci}
4758c2ecf20Sopenharmony_ci
4768c2ecf20Sopenharmony_ciint ceph_restore_deleg_ino(struct ceph_mds_session *s, u64 ino)
4778c2ecf20Sopenharmony_ci{
4788c2ecf20Sopenharmony_ci	return xa_insert(&s->s_delegated_inos, ino, DELEGATED_INO_AVAILABLE,
4798c2ecf20Sopenharmony_ci			 GFP_KERNEL);
4808c2ecf20Sopenharmony_ci}
4818c2ecf20Sopenharmony_ci#else /* BITS_PER_LONG == 64 */
4828c2ecf20Sopenharmony_ci/*
4838c2ecf20Sopenharmony_ci * FIXME: xarrays can't handle 64-bit indexes on a 32-bit arch. For now, just
4848c2ecf20Sopenharmony_ci * ignore delegated_inos on 32 bit arch. Maybe eventually add xarrays for top
4858c2ecf20Sopenharmony_ci * and bottom words?
4868c2ecf20Sopenharmony_ci */
4878c2ecf20Sopenharmony_cistatic int ceph_parse_deleg_inos(void **p, void *end,
4888c2ecf20Sopenharmony_ci				 struct ceph_mds_session *s)
4898c2ecf20Sopenharmony_ci{
4908c2ecf20Sopenharmony_ci	u32 sets;
4918c2ecf20Sopenharmony_ci
4928c2ecf20Sopenharmony_ci	ceph_decode_32_safe(p, end, sets, bad);
4938c2ecf20Sopenharmony_ci	if (sets)
4948c2ecf20Sopenharmony_ci		ceph_decode_skip_n(p, end, sets * 2 * sizeof(__le64), bad);
4958c2ecf20Sopenharmony_ci	return 0;
4968c2ecf20Sopenharmony_cibad:
4978c2ecf20Sopenharmony_ci	return -EIO;
4988c2ecf20Sopenharmony_ci}
4998c2ecf20Sopenharmony_ci
5008c2ecf20Sopenharmony_ciu64 ceph_get_deleg_ino(struct ceph_mds_session *s)
5018c2ecf20Sopenharmony_ci{
5028c2ecf20Sopenharmony_ci	return 0;
5038c2ecf20Sopenharmony_ci}
5048c2ecf20Sopenharmony_ci
5058c2ecf20Sopenharmony_ciint ceph_restore_deleg_ino(struct ceph_mds_session *s, u64 ino)
5068c2ecf20Sopenharmony_ci{
5078c2ecf20Sopenharmony_ci	return 0;
5088c2ecf20Sopenharmony_ci}
5098c2ecf20Sopenharmony_ci#endif /* BITS_PER_LONG == 64 */
5108c2ecf20Sopenharmony_ci
5118c2ecf20Sopenharmony_ci/*
5128c2ecf20Sopenharmony_ci * parse create results
5138c2ecf20Sopenharmony_ci */
5148c2ecf20Sopenharmony_cistatic int parse_reply_info_create(void **p, void *end,
5158c2ecf20Sopenharmony_ci				  struct ceph_mds_reply_info_parsed *info,
5168c2ecf20Sopenharmony_ci				  u64 features, struct ceph_mds_session *s)
5178c2ecf20Sopenharmony_ci{
5188c2ecf20Sopenharmony_ci	int ret;
5198c2ecf20Sopenharmony_ci
5208c2ecf20Sopenharmony_ci	if (features == (u64)-1 ||
5218c2ecf20Sopenharmony_ci	    (features & CEPH_FEATURE_REPLY_CREATE_INODE)) {
5228c2ecf20Sopenharmony_ci		if (*p == end) {
5238c2ecf20Sopenharmony_ci			/* Malformed reply? */
5248c2ecf20Sopenharmony_ci			info->has_create_ino = false;
5258c2ecf20Sopenharmony_ci		} else if (test_bit(CEPHFS_FEATURE_DELEG_INO, &s->s_features)) {
5268c2ecf20Sopenharmony_ci			u8 struct_v, struct_compat;
5278c2ecf20Sopenharmony_ci			u32 len;
5288c2ecf20Sopenharmony_ci
5298c2ecf20Sopenharmony_ci			info->has_create_ino = true;
5308c2ecf20Sopenharmony_ci			ceph_decode_8_safe(p, end, struct_v, bad);
5318c2ecf20Sopenharmony_ci			ceph_decode_8_safe(p, end, struct_compat, bad);
5328c2ecf20Sopenharmony_ci			ceph_decode_32_safe(p, end, len, bad);
5338c2ecf20Sopenharmony_ci			ceph_decode_64_safe(p, end, info->ino, bad);
5348c2ecf20Sopenharmony_ci			ret = ceph_parse_deleg_inos(p, end, s);
5358c2ecf20Sopenharmony_ci			if (ret)
5368c2ecf20Sopenharmony_ci				return ret;
5378c2ecf20Sopenharmony_ci		} else {
5388c2ecf20Sopenharmony_ci			/* legacy */
5398c2ecf20Sopenharmony_ci			ceph_decode_64_safe(p, end, info->ino, bad);
5408c2ecf20Sopenharmony_ci			info->has_create_ino = true;
5418c2ecf20Sopenharmony_ci		}
5428c2ecf20Sopenharmony_ci	} else {
5438c2ecf20Sopenharmony_ci		if (*p != end)
5448c2ecf20Sopenharmony_ci			goto bad;
5458c2ecf20Sopenharmony_ci	}
5468c2ecf20Sopenharmony_ci
5478c2ecf20Sopenharmony_ci	/* Skip over any unrecognized fields */
5488c2ecf20Sopenharmony_ci	*p = end;
5498c2ecf20Sopenharmony_ci	return 0;
5508c2ecf20Sopenharmony_cibad:
5518c2ecf20Sopenharmony_ci	return -EIO;
5528c2ecf20Sopenharmony_ci}
5538c2ecf20Sopenharmony_ci
5548c2ecf20Sopenharmony_ci/*
5558c2ecf20Sopenharmony_ci * parse extra results
5568c2ecf20Sopenharmony_ci */
5578c2ecf20Sopenharmony_cistatic int parse_reply_info_extra(void **p, void *end,
5588c2ecf20Sopenharmony_ci				  struct ceph_mds_reply_info_parsed *info,
5598c2ecf20Sopenharmony_ci				  u64 features, struct ceph_mds_session *s)
5608c2ecf20Sopenharmony_ci{
5618c2ecf20Sopenharmony_ci	u32 op = le32_to_cpu(info->head->op);
5628c2ecf20Sopenharmony_ci
5638c2ecf20Sopenharmony_ci	if (op == CEPH_MDS_OP_GETFILELOCK)
5648c2ecf20Sopenharmony_ci		return parse_reply_info_filelock(p, end, info, features);
5658c2ecf20Sopenharmony_ci	else if (op == CEPH_MDS_OP_READDIR || op == CEPH_MDS_OP_LSSNAP)
5668c2ecf20Sopenharmony_ci		return parse_reply_info_readdir(p, end, info, features);
5678c2ecf20Sopenharmony_ci	else if (op == CEPH_MDS_OP_CREATE)
5688c2ecf20Sopenharmony_ci		return parse_reply_info_create(p, end, info, features, s);
5698c2ecf20Sopenharmony_ci	else
5708c2ecf20Sopenharmony_ci		return -EIO;
5718c2ecf20Sopenharmony_ci}
5728c2ecf20Sopenharmony_ci
5738c2ecf20Sopenharmony_ci/*
5748c2ecf20Sopenharmony_ci * parse entire mds reply
5758c2ecf20Sopenharmony_ci */
5768c2ecf20Sopenharmony_cistatic int parse_reply_info(struct ceph_mds_session *s, struct ceph_msg *msg,
5778c2ecf20Sopenharmony_ci			    struct ceph_mds_reply_info_parsed *info,
5788c2ecf20Sopenharmony_ci			    u64 features)
5798c2ecf20Sopenharmony_ci{
5808c2ecf20Sopenharmony_ci	void *p, *end;
5818c2ecf20Sopenharmony_ci	u32 len;
5828c2ecf20Sopenharmony_ci	int err;
5838c2ecf20Sopenharmony_ci
5848c2ecf20Sopenharmony_ci	info->head = msg->front.iov_base;
5858c2ecf20Sopenharmony_ci	p = msg->front.iov_base + sizeof(struct ceph_mds_reply_head);
5868c2ecf20Sopenharmony_ci	end = p + msg->front.iov_len - sizeof(struct ceph_mds_reply_head);
5878c2ecf20Sopenharmony_ci
5888c2ecf20Sopenharmony_ci	/* trace */
5898c2ecf20Sopenharmony_ci	ceph_decode_32_safe(&p, end, len, bad);
5908c2ecf20Sopenharmony_ci	if (len > 0) {
5918c2ecf20Sopenharmony_ci		ceph_decode_need(&p, end, len, bad);
5928c2ecf20Sopenharmony_ci		err = parse_reply_info_trace(&p, p+len, info, features);
5938c2ecf20Sopenharmony_ci		if (err < 0)
5948c2ecf20Sopenharmony_ci			goto out_bad;
5958c2ecf20Sopenharmony_ci	}
5968c2ecf20Sopenharmony_ci
5978c2ecf20Sopenharmony_ci	/* extra */
5988c2ecf20Sopenharmony_ci	ceph_decode_32_safe(&p, end, len, bad);
5998c2ecf20Sopenharmony_ci	if (len > 0) {
6008c2ecf20Sopenharmony_ci		ceph_decode_need(&p, end, len, bad);
6018c2ecf20Sopenharmony_ci		err = parse_reply_info_extra(&p, p+len, info, features, s);
6028c2ecf20Sopenharmony_ci		if (err < 0)
6038c2ecf20Sopenharmony_ci			goto out_bad;
6048c2ecf20Sopenharmony_ci	}
6058c2ecf20Sopenharmony_ci
6068c2ecf20Sopenharmony_ci	/* snap blob */
6078c2ecf20Sopenharmony_ci	ceph_decode_32_safe(&p, end, len, bad);
6088c2ecf20Sopenharmony_ci	info->snapblob_len = len;
6098c2ecf20Sopenharmony_ci	info->snapblob = p;
6108c2ecf20Sopenharmony_ci	p += len;
6118c2ecf20Sopenharmony_ci
6128c2ecf20Sopenharmony_ci	if (p != end)
6138c2ecf20Sopenharmony_ci		goto bad;
6148c2ecf20Sopenharmony_ci	return 0;
6158c2ecf20Sopenharmony_ci
6168c2ecf20Sopenharmony_cibad:
6178c2ecf20Sopenharmony_ci	err = -EIO;
6188c2ecf20Sopenharmony_ciout_bad:
6198c2ecf20Sopenharmony_ci	pr_err("mds parse_reply err %d\n", err);
6208c2ecf20Sopenharmony_ci	return err;
6218c2ecf20Sopenharmony_ci}
6228c2ecf20Sopenharmony_ci
6238c2ecf20Sopenharmony_cistatic void destroy_reply_info(struct ceph_mds_reply_info_parsed *info)
6248c2ecf20Sopenharmony_ci{
6258c2ecf20Sopenharmony_ci	if (!info->dir_entries)
6268c2ecf20Sopenharmony_ci		return;
6278c2ecf20Sopenharmony_ci	free_pages((unsigned long)info->dir_entries, get_order(info->dir_buf_size));
6288c2ecf20Sopenharmony_ci}
6298c2ecf20Sopenharmony_ci
6308c2ecf20Sopenharmony_ci
6318c2ecf20Sopenharmony_ci/*
6328c2ecf20Sopenharmony_ci * sessions
6338c2ecf20Sopenharmony_ci */
6348c2ecf20Sopenharmony_ciconst char *ceph_session_state_name(int s)
6358c2ecf20Sopenharmony_ci{
6368c2ecf20Sopenharmony_ci	switch (s) {
6378c2ecf20Sopenharmony_ci	case CEPH_MDS_SESSION_NEW: return "new";
6388c2ecf20Sopenharmony_ci	case CEPH_MDS_SESSION_OPENING: return "opening";
6398c2ecf20Sopenharmony_ci	case CEPH_MDS_SESSION_OPEN: return "open";
6408c2ecf20Sopenharmony_ci	case CEPH_MDS_SESSION_HUNG: return "hung";
6418c2ecf20Sopenharmony_ci	case CEPH_MDS_SESSION_CLOSING: return "closing";
6428c2ecf20Sopenharmony_ci	case CEPH_MDS_SESSION_CLOSED: return "closed";
6438c2ecf20Sopenharmony_ci	case CEPH_MDS_SESSION_RESTARTING: return "restarting";
6448c2ecf20Sopenharmony_ci	case CEPH_MDS_SESSION_RECONNECTING: return "reconnecting";
6458c2ecf20Sopenharmony_ci	case CEPH_MDS_SESSION_REJECTED: return "rejected";
6468c2ecf20Sopenharmony_ci	default: return "???";
6478c2ecf20Sopenharmony_ci	}
6488c2ecf20Sopenharmony_ci}
6498c2ecf20Sopenharmony_ci
6508c2ecf20Sopenharmony_cistruct ceph_mds_session *ceph_get_mds_session(struct ceph_mds_session *s)
6518c2ecf20Sopenharmony_ci{
6528c2ecf20Sopenharmony_ci	if (refcount_inc_not_zero(&s->s_ref)) {
6538c2ecf20Sopenharmony_ci		dout("mdsc get_session %p %d -> %d\n", s,
6548c2ecf20Sopenharmony_ci		     refcount_read(&s->s_ref)-1, refcount_read(&s->s_ref));
6558c2ecf20Sopenharmony_ci		return s;
6568c2ecf20Sopenharmony_ci	} else {
6578c2ecf20Sopenharmony_ci		dout("mdsc get_session %p 0 -- FAIL\n", s);
6588c2ecf20Sopenharmony_ci		return NULL;
6598c2ecf20Sopenharmony_ci	}
6608c2ecf20Sopenharmony_ci}
6618c2ecf20Sopenharmony_ci
6628c2ecf20Sopenharmony_civoid ceph_put_mds_session(struct ceph_mds_session *s)
6638c2ecf20Sopenharmony_ci{
6648c2ecf20Sopenharmony_ci	if (IS_ERR_OR_NULL(s))
6658c2ecf20Sopenharmony_ci		return;
6668c2ecf20Sopenharmony_ci
6678c2ecf20Sopenharmony_ci	dout("mdsc put_session %p %d -> %d\n", s,
6688c2ecf20Sopenharmony_ci	     refcount_read(&s->s_ref), refcount_read(&s->s_ref)-1);
6698c2ecf20Sopenharmony_ci	if (refcount_dec_and_test(&s->s_ref)) {
6708c2ecf20Sopenharmony_ci		if (s->s_auth.authorizer)
6718c2ecf20Sopenharmony_ci			ceph_auth_destroy_authorizer(s->s_auth.authorizer);
6728c2ecf20Sopenharmony_ci		WARN_ON(mutex_is_locked(&s->s_mutex));
6738c2ecf20Sopenharmony_ci		xa_destroy(&s->s_delegated_inos);
6748c2ecf20Sopenharmony_ci		kfree(s);
6758c2ecf20Sopenharmony_ci	}
6768c2ecf20Sopenharmony_ci}
6778c2ecf20Sopenharmony_ci
6788c2ecf20Sopenharmony_ci/*
6798c2ecf20Sopenharmony_ci * called under mdsc->mutex
6808c2ecf20Sopenharmony_ci */
6818c2ecf20Sopenharmony_cistruct ceph_mds_session *__ceph_lookup_mds_session(struct ceph_mds_client *mdsc,
6828c2ecf20Sopenharmony_ci						   int mds)
6838c2ecf20Sopenharmony_ci{
6848c2ecf20Sopenharmony_ci	if (mds >= mdsc->max_sessions || !mdsc->sessions[mds])
6858c2ecf20Sopenharmony_ci		return NULL;
6868c2ecf20Sopenharmony_ci	return ceph_get_mds_session(mdsc->sessions[mds]);
6878c2ecf20Sopenharmony_ci}
6888c2ecf20Sopenharmony_ci
6898c2ecf20Sopenharmony_cistatic bool __have_session(struct ceph_mds_client *mdsc, int mds)
6908c2ecf20Sopenharmony_ci{
6918c2ecf20Sopenharmony_ci	if (mds >= mdsc->max_sessions || !mdsc->sessions[mds])
6928c2ecf20Sopenharmony_ci		return false;
6938c2ecf20Sopenharmony_ci	else
6948c2ecf20Sopenharmony_ci		return true;
6958c2ecf20Sopenharmony_ci}
6968c2ecf20Sopenharmony_ci
6978c2ecf20Sopenharmony_cistatic int __verify_registered_session(struct ceph_mds_client *mdsc,
6988c2ecf20Sopenharmony_ci				       struct ceph_mds_session *s)
6998c2ecf20Sopenharmony_ci{
7008c2ecf20Sopenharmony_ci	if (s->s_mds >= mdsc->max_sessions ||
7018c2ecf20Sopenharmony_ci	    mdsc->sessions[s->s_mds] != s)
7028c2ecf20Sopenharmony_ci		return -ENOENT;
7038c2ecf20Sopenharmony_ci	return 0;
7048c2ecf20Sopenharmony_ci}
7058c2ecf20Sopenharmony_ci
7068c2ecf20Sopenharmony_ci/*
7078c2ecf20Sopenharmony_ci * create+register a new session for given mds.
7088c2ecf20Sopenharmony_ci * called under mdsc->mutex.
7098c2ecf20Sopenharmony_ci */
7108c2ecf20Sopenharmony_cistatic struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
7118c2ecf20Sopenharmony_ci						 int mds)
7128c2ecf20Sopenharmony_ci{
7138c2ecf20Sopenharmony_ci	struct ceph_mds_session *s;
7148c2ecf20Sopenharmony_ci
7158c2ecf20Sopenharmony_ci	if (mds >= mdsc->mdsmap->possible_max_rank)
7168c2ecf20Sopenharmony_ci		return ERR_PTR(-EINVAL);
7178c2ecf20Sopenharmony_ci
7188c2ecf20Sopenharmony_ci	s = kzalloc(sizeof(*s), GFP_NOFS);
7198c2ecf20Sopenharmony_ci	if (!s)
7208c2ecf20Sopenharmony_ci		return ERR_PTR(-ENOMEM);
7218c2ecf20Sopenharmony_ci
7228c2ecf20Sopenharmony_ci	if (mds >= mdsc->max_sessions) {
7238c2ecf20Sopenharmony_ci		int newmax = 1 << get_count_order(mds + 1);
7248c2ecf20Sopenharmony_ci		struct ceph_mds_session **sa;
7258c2ecf20Sopenharmony_ci
7268c2ecf20Sopenharmony_ci		dout("%s: realloc to %d\n", __func__, newmax);
7278c2ecf20Sopenharmony_ci		sa = kcalloc(newmax, sizeof(void *), GFP_NOFS);
7288c2ecf20Sopenharmony_ci		if (!sa)
7298c2ecf20Sopenharmony_ci			goto fail_realloc;
7308c2ecf20Sopenharmony_ci		if (mdsc->sessions) {
7318c2ecf20Sopenharmony_ci			memcpy(sa, mdsc->sessions,
7328c2ecf20Sopenharmony_ci			       mdsc->max_sessions * sizeof(void *));
7338c2ecf20Sopenharmony_ci			kfree(mdsc->sessions);
7348c2ecf20Sopenharmony_ci		}
7358c2ecf20Sopenharmony_ci		mdsc->sessions = sa;
7368c2ecf20Sopenharmony_ci		mdsc->max_sessions = newmax;
7378c2ecf20Sopenharmony_ci	}
7388c2ecf20Sopenharmony_ci
7398c2ecf20Sopenharmony_ci	dout("%s: mds%d\n", __func__, mds);
7408c2ecf20Sopenharmony_ci	s->s_mdsc = mdsc;
7418c2ecf20Sopenharmony_ci	s->s_mds = mds;
7428c2ecf20Sopenharmony_ci	s->s_state = CEPH_MDS_SESSION_NEW;
7438c2ecf20Sopenharmony_ci	s->s_ttl = 0;
7448c2ecf20Sopenharmony_ci	s->s_seq = 0;
7458c2ecf20Sopenharmony_ci	mutex_init(&s->s_mutex);
7468c2ecf20Sopenharmony_ci
7478c2ecf20Sopenharmony_ci	ceph_con_init(&s->s_con, s, &mds_con_ops, &mdsc->fsc->client->msgr);
7488c2ecf20Sopenharmony_ci
7498c2ecf20Sopenharmony_ci	spin_lock_init(&s->s_gen_ttl_lock);
7508c2ecf20Sopenharmony_ci	s->s_cap_gen = 1;
7518c2ecf20Sopenharmony_ci	s->s_cap_ttl = jiffies - 1;
7528c2ecf20Sopenharmony_ci
7538c2ecf20Sopenharmony_ci	spin_lock_init(&s->s_cap_lock);
7548c2ecf20Sopenharmony_ci	s->s_renew_requested = 0;
7558c2ecf20Sopenharmony_ci	s->s_renew_seq = 0;
7568c2ecf20Sopenharmony_ci	INIT_LIST_HEAD(&s->s_caps);
7578c2ecf20Sopenharmony_ci	s->s_nr_caps = 0;
7588c2ecf20Sopenharmony_ci	refcount_set(&s->s_ref, 1);
7598c2ecf20Sopenharmony_ci	INIT_LIST_HEAD(&s->s_waiting);
7608c2ecf20Sopenharmony_ci	INIT_LIST_HEAD(&s->s_unsafe);
7618c2ecf20Sopenharmony_ci	xa_init(&s->s_delegated_inos);
7628c2ecf20Sopenharmony_ci	s->s_num_cap_releases = 0;
7638c2ecf20Sopenharmony_ci	s->s_cap_reconnect = 0;
7648c2ecf20Sopenharmony_ci	s->s_cap_iterator = NULL;
7658c2ecf20Sopenharmony_ci	INIT_LIST_HEAD(&s->s_cap_releases);
7668c2ecf20Sopenharmony_ci	INIT_WORK(&s->s_cap_release_work, ceph_cap_release_work);
7678c2ecf20Sopenharmony_ci
7688c2ecf20Sopenharmony_ci	INIT_LIST_HEAD(&s->s_cap_dirty);
7698c2ecf20Sopenharmony_ci	INIT_LIST_HEAD(&s->s_cap_flushing);
7708c2ecf20Sopenharmony_ci
7718c2ecf20Sopenharmony_ci	mdsc->sessions[mds] = s;
7728c2ecf20Sopenharmony_ci	atomic_inc(&mdsc->num_sessions);
7738c2ecf20Sopenharmony_ci	refcount_inc(&s->s_ref);  /* one ref to sessions[], one to caller */
7748c2ecf20Sopenharmony_ci
7758c2ecf20Sopenharmony_ci	ceph_con_open(&s->s_con, CEPH_ENTITY_TYPE_MDS, mds,
7768c2ecf20Sopenharmony_ci		      ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
7778c2ecf20Sopenharmony_ci
7788c2ecf20Sopenharmony_ci	return s;
7798c2ecf20Sopenharmony_ci
7808c2ecf20Sopenharmony_cifail_realloc:
7818c2ecf20Sopenharmony_ci	kfree(s);
7828c2ecf20Sopenharmony_ci	return ERR_PTR(-ENOMEM);
7838c2ecf20Sopenharmony_ci}
7848c2ecf20Sopenharmony_ci
7858c2ecf20Sopenharmony_ci/*
7868c2ecf20Sopenharmony_ci * called under mdsc->mutex
7878c2ecf20Sopenharmony_ci */
7888c2ecf20Sopenharmony_cistatic void __unregister_session(struct ceph_mds_client *mdsc,
7898c2ecf20Sopenharmony_ci			       struct ceph_mds_session *s)
7908c2ecf20Sopenharmony_ci{
7918c2ecf20Sopenharmony_ci	dout("__unregister_session mds%d %p\n", s->s_mds, s);
7928c2ecf20Sopenharmony_ci	BUG_ON(mdsc->sessions[s->s_mds] != s);
7938c2ecf20Sopenharmony_ci	mdsc->sessions[s->s_mds] = NULL;
7948c2ecf20Sopenharmony_ci	ceph_con_close(&s->s_con);
7958c2ecf20Sopenharmony_ci	ceph_put_mds_session(s);
7968c2ecf20Sopenharmony_ci	atomic_dec(&mdsc->num_sessions);
7978c2ecf20Sopenharmony_ci}
7988c2ecf20Sopenharmony_ci
7998c2ecf20Sopenharmony_ci/*
8008c2ecf20Sopenharmony_ci * drop session refs in request.
8018c2ecf20Sopenharmony_ci *
8028c2ecf20Sopenharmony_ci * should be last request ref, or hold mdsc->mutex
8038c2ecf20Sopenharmony_ci */
8048c2ecf20Sopenharmony_cistatic void put_request_session(struct ceph_mds_request *req)
8058c2ecf20Sopenharmony_ci{
8068c2ecf20Sopenharmony_ci	if (req->r_session) {
8078c2ecf20Sopenharmony_ci		ceph_put_mds_session(req->r_session);
8088c2ecf20Sopenharmony_ci		req->r_session = NULL;
8098c2ecf20Sopenharmony_ci	}
8108c2ecf20Sopenharmony_ci}
8118c2ecf20Sopenharmony_ci
8128c2ecf20Sopenharmony_civoid ceph_mdsc_iterate_sessions(struct ceph_mds_client *mdsc,
8138c2ecf20Sopenharmony_ci				void (*cb)(struct ceph_mds_session *),
8148c2ecf20Sopenharmony_ci				bool check_state)
8158c2ecf20Sopenharmony_ci{
8168c2ecf20Sopenharmony_ci	int mds;
8178c2ecf20Sopenharmony_ci
8188c2ecf20Sopenharmony_ci	mutex_lock(&mdsc->mutex);
8198c2ecf20Sopenharmony_ci	for (mds = 0; mds < mdsc->max_sessions; ++mds) {
8208c2ecf20Sopenharmony_ci		struct ceph_mds_session *s;
8218c2ecf20Sopenharmony_ci
8228c2ecf20Sopenharmony_ci		s = __ceph_lookup_mds_session(mdsc, mds);
8238c2ecf20Sopenharmony_ci		if (!s)
8248c2ecf20Sopenharmony_ci			continue;
8258c2ecf20Sopenharmony_ci
8268c2ecf20Sopenharmony_ci		if (check_state && !check_session_state(s)) {
8278c2ecf20Sopenharmony_ci			ceph_put_mds_session(s);
8288c2ecf20Sopenharmony_ci			continue;
8298c2ecf20Sopenharmony_ci		}
8308c2ecf20Sopenharmony_ci
8318c2ecf20Sopenharmony_ci		mutex_unlock(&mdsc->mutex);
8328c2ecf20Sopenharmony_ci		cb(s);
8338c2ecf20Sopenharmony_ci		ceph_put_mds_session(s);
8348c2ecf20Sopenharmony_ci		mutex_lock(&mdsc->mutex);
8358c2ecf20Sopenharmony_ci	}
8368c2ecf20Sopenharmony_ci	mutex_unlock(&mdsc->mutex);
8378c2ecf20Sopenharmony_ci}
8388c2ecf20Sopenharmony_ci
8398c2ecf20Sopenharmony_civoid ceph_mdsc_release_request(struct kref *kref)
8408c2ecf20Sopenharmony_ci{
8418c2ecf20Sopenharmony_ci	struct ceph_mds_request *req = container_of(kref,
8428c2ecf20Sopenharmony_ci						    struct ceph_mds_request,
8438c2ecf20Sopenharmony_ci						    r_kref);
8448c2ecf20Sopenharmony_ci	ceph_mdsc_release_dir_caps_no_check(req);
8458c2ecf20Sopenharmony_ci	destroy_reply_info(&req->r_reply_info);
8468c2ecf20Sopenharmony_ci	if (req->r_request)
8478c2ecf20Sopenharmony_ci		ceph_msg_put(req->r_request);
8488c2ecf20Sopenharmony_ci	if (req->r_reply)
8498c2ecf20Sopenharmony_ci		ceph_msg_put(req->r_reply);
8508c2ecf20Sopenharmony_ci	if (req->r_inode) {
8518c2ecf20Sopenharmony_ci		ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
8528c2ecf20Sopenharmony_ci		/* avoid calling iput_final() in mds dispatch threads */
8538c2ecf20Sopenharmony_ci		ceph_async_iput(req->r_inode);
8548c2ecf20Sopenharmony_ci	}
8558c2ecf20Sopenharmony_ci	if (req->r_parent) {
8568c2ecf20Sopenharmony_ci		ceph_put_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN);
8578c2ecf20Sopenharmony_ci		ceph_async_iput(req->r_parent);
8588c2ecf20Sopenharmony_ci	}
8598c2ecf20Sopenharmony_ci	ceph_async_iput(req->r_target_inode);
8608c2ecf20Sopenharmony_ci	if (req->r_dentry)
8618c2ecf20Sopenharmony_ci		dput(req->r_dentry);
8628c2ecf20Sopenharmony_ci	if (req->r_old_dentry)
8638c2ecf20Sopenharmony_ci		dput(req->r_old_dentry);
8648c2ecf20Sopenharmony_ci	if (req->r_old_dentry_dir) {
8658c2ecf20Sopenharmony_ci		/*
8668c2ecf20Sopenharmony_ci		 * track (and drop pins for) r_old_dentry_dir
8678c2ecf20Sopenharmony_ci		 * separately, since r_old_dentry's d_parent may have
8688c2ecf20Sopenharmony_ci		 * changed between the dir mutex being dropped and
8698c2ecf20Sopenharmony_ci		 * this request being freed.
8708c2ecf20Sopenharmony_ci		 */
8718c2ecf20Sopenharmony_ci		ceph_put_cap_refs(ceph_inode(req->r_old_dentry_dir),
8728c2ecf20Sopenharmony_ci				  CEPH_CAP_PIN);
8738c2ecf20Sopenharmony_ci		ceph_async_iput(req->r_old_dentry_dir);
8748c2ecf20Sopenharmony_ci	}
8758c2ecf20Sopenharmony_ci	kfree(req->r_path1);
8768c2ecf20Sopenharmony_ci	kfree(req->r_path2);
8778c2ecf20Sopenharmony_ci	if (req->r_pagelist)
8788c2ecf20Sopenharmony_ci		ceph_pagelist_release(req->r_pagelist);
8798c2ecf20Sopenharmony_ci	put_request_session(req);
8808c2ecf20Sopenharmony_ci	ceph_unreserve_caps(req->r_mdsc, &req->r_caps_reservation);
8818c2ecf20Sopenharmony_ci	WARN_ON_ONCE(!list_empty(&req->r_wait));
8828c2ecf20Sopenharmony_ci	kmem_cache_free(ceph_mds_request_cachep, req);
8838c2ecf20Sopenharmony_ci}
8848c2ecf20Sopenharmony_ci
8858c2ecf20Sopenharmony_ciDEFINE_RB_FUNCS(request, struct ceph_mds_request, r_tid, r_node)
8868c2ecf20Sopenharmony_ci
8878c2ecf20Sopenharmony_ci/*
8888c2ecf20Sopenharmony_ci * lookup session, bump ref if found.
8898c2ecf20Sopenharmony_ci *
8908c2ecf20Sopenharmony_ci * called under mdsc->mutex.
8918c2ecf20Sopenharmony_ci */
8928c2ecf20Sopenharmony_cistatic struct ceph_mds_request *
8938c2ecf20Sopenharmony_cilookup_get_request(struct ceph_mds_client *mdsc, u64 tid)
8948c2ecf20Sopenharmony_ci{
8958c2ecf20Sopenharmony_ci	struct ceph_mds_request *req;
8968c2ecf20Sopenharmony_ci
8978c2ecf20Sopenharmony_ci	req = lookup_request(&mdsc->request_tree, tid);
8988c2ecf20Sopenharmony_ci	if (req)
8998c2ecf20Sopenharmony_ci		ceph_mdsc_get_request(req);
9008c2ecf20Sopenharmony_ci
9018c2ecf20Sopenharmony_ci	return req;
9028c2ecf20Sopenharmony_ci}
9038c2ecf20Sopenharmony_ci
9048c2ecf20Sopenharmony_ci/*
9058c2ecf20Sopenharmony_ci * Register an in-flight request, and assign a tid.  Link to directory
9068c2ecf20Sopenharmony_ci * are modifying (if any).
9078c2ecf20Sopenharmony_ci *
9088c2ecf20Sopenharmony_ci * Called under mdsc->mutex.
9098c2ecf20Sopenharmony_ci */
9108c2ecf20Sopenharmony_cistatic void __register_request(struct ceph_mds_client *mdsc,
9118c2ecf20Sopenharmony_ci			       struct ceph_mds_request *req,
9128c2ecf20Sopenharmony_ci			       struct inode *dir)
9138c2ecf20Sopenharmony_ci{
9148c2ecf20Sopenharmony_ci	int ret = 0;
9158c2ecf20Sopenharmony_ci
9168c2ecf20Sopenharmony_ci	req->r_tid = ++mdsc->last_tid;
9178c2ecf20Sopenharmony_ci	if (req->r_num_caps) {
9188c2ecf20Sopenharmony_ci		ret = ceph_reserve_caps(mdsc, &req->r_caps_reservation,
9198c2ecf20Sopenharmony_ci					req->r_num_caps);
9208c2ecf20Sopenharmony_ci		if (ret < 0) {
9218c2ecf20Sopenharmony_ci			pr_err("__register_request %p "
9228c2ecf20Sopenharmony_ci			       "failed to reserve caps: %d\n", req, ret);
9238c2ecf20Sopenharmony_ci			/* set req->r_err to fail early from __do_request */
9248c2ecf20Sopenharmony_ci			req->r_err = ret;
9258c2ecf20Sopenharmony_ci			return;
9268c2ecf20Sopenharmony_ci		}
9278c2ecf20Sopenharmony_ci	}
9288c2ecf20Sopenharmony_ci	dout("__register_request %p tid %lld\n", req, req->r_tid);
9298c2ecf20Sopenharmony_ci	ceph_mdsc_get_request(req);
9308c2ecf20Sopenharmony_ci	insert_request(&mdsc->request_tree, req);
9318c2ecf20Sopenharmony_ci
9328c2ecf20Sopenharmony_ci	req->r_uid = current_fsuid();
9338c2ecf20Sopenharmony_ci	req->r_gid = current_fsgid();
9348c2ecf20Sopenharmony_ci
9358c2ecf20Sopenharmony_ci	if (mdsc->oldest_tid == 0 && req->r_op != CEPH_MDS_OP_SETFILELOCK)
9368c2ecf20Sopenharmony_ci		mdsc->oldest_tid = req->r_tid;
9378c2ecf20Sopenharmony_ci
9388c2ecf20Sopenharmony_ci	if (dir) {
9398c2ecf20Sopenharmony_ci		struct ceph_inode_info *ci = ceph_inode(dir);
9408c2ecf20Sopenharmony_ci
9418c2ecf20Sopenharmony_ci		ihold(dir);
9428c2ecf20Sopenharmony_ci		req->r_unsafe_dir = dir;
9438c2ecf20Sopenharmony_ci		spin_lock(&ci->i_unsafe_lock);
9448c2ecf20Sopenharmony_ci		list_add_tail(&req->r_unsafe_dir_item, &ci->i_unsafe_dirops);
9458c2ecf20Sopenharmony_ci		spin_unlock(&ci->i_unsafe_lock);
9468c2ecf20Sopenharmony_ci	}
9478c2ecf20Sopenharmony_ci}
9488c2ecf20Sopenharmony_ci
9498c2ecf20Sopenharmony_cistatic void __unregister_request(struct ceph_mds_client *mdsc,
9508c2ecf20Sopenharmony_ci				 struct ceph_mds_request *req)
9518c2ecf20Sopenharmony_ci{
9528c2ecf20Sopenharmony_ci	dout("__unregister_request %p tid %lld\n", req, req->r_tid);
9538c2ecf20Sopenharmony_ci
9548c2ecf20Sopenharmony_ci	/* Never leave an unregistered request on an unsafe list! */
9558c2ecf20Sopenharmony_ci	list_del_init(&req->r_unsafe_item);
9568c2ecf20Sopenharmony_ci
9578c2ecf20Sopenharmony_ci	if (req->r_tid == mdsc->oldest_tid) {
9588c2ecf20Sopenharmony_ci		struct rb_node *p = rb_next(&req->r_node);
9598c2ecf20Sopenharmony_ci		mdsc->oldest_tid = 0;
9608c2ecf20Sopenharmony_ci		while (p) {
9618c2ecf20Sopenharmony_ci			struct ceph_mds_request *next_req =
9628c2ecf20Sopenharmony_ci				rb_entry(p, struct ceph_mds_request, r_node);
9638c2ecf20Sopenharmony_ci			if (next_req->r_op != CEPH_MDS_OP_SETFILELOCK) {
9648c2ecf20Sopenharmony_ci				mdsc->oldest_tid = next_req->r_tid;
9658c2ecf20Sopenharmony_ci				break;
9668c2ecf20Sopenharmony_ci			}
9678c2ecf20Sopenharmony_ci			p = rb_next(p);
9688c2ecf20Sopenharmony_ci		}
9698c2ecf20Sopenharmony_ci	}
9708c2ecf20Sopenharmony_ci
9718c2ecf20Sopenharmony_ci	erase_request(&mdsc->request_tree, req);
9728c2ecf20Sopenharmony_ci
9738c2ecf20Sopenharmony_ci	if (req->r_unsafe_dir) {
9748c2ecf20Sopenharmony_ci		struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir);
9758c2ecf20Sopenharmony_ci		spin_lock(&ci->i_unsafe_lock);
9768c2ecf20Sopenharmony_ci		list_del_init(&req->r_unsafe_dir_item);
9778c2ecf20Sopenharmony_ci		spin_unlock(&ci->i_unsafe_lock);
9788c2ecf20Sopenharmony_ci	}
9798c2ecf20Sopenharmony_ci	if (req->r_target_inode &&
9808c2ecf20Sopenharmony_ci	    test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
9818c2ecf20Sopenharmony_ci		struct ceph_inode_info *ci = ceph_inode(req->r_target_inode);
9828c2ecf20Sopenharmony_ci		spin_lock(&ci->i_unsafe_lock);
9838c2ecf20Sopenharmony_ci		list_del_init(&req->r_unsafe_target_item);
9848c2ecf20Sopenharmony_ci		spin_unlock(&ci->i_unsafe_lock);
9858c2ecf20Sopenharmony_ci	}
9868c2ecf20Sopenharmony_ci
9878c2ecf20Sopenharmony_ci	if (req->r_unsafe_dir) {
9888c2ecf20Sopenharmony_ci		/* avoid calling iput_final() in mds dispatch threads */
9898c2ecf20Sopenharmony_ci		ceph_async_iput(req->r_unsafe_dir);
9908c2ecf20Sopenharmony_ci		req->r_unsafe_dir = NULL;
9918c2ecf20Sopenharmony_ci	}
9928c2ecf20Sopenharmony_ci
9938c2ecf20Sopenharmony_ci	complete_all(&req->r_safe_completion);
9948c2ecf20Sopenharmony_ci
9958c2ecf20Sopenharmony_ci	ceph_mdsc_put_request(req);
9968c2ecf20Sopenharmony_ci}
9978c2ecf20Sopenharmony_ci
9988c2ecf20Sopenharmony_ci/*
9998c2ecf20Sopenharmony_ci * Walk back up the dentry tree until we hit a dentry representing a
10008c2ecf20Sopenharmony_ci * non-snapshot inode. We do this using the rcu_read_lock (which must be held
10018c2ecf20Sopenharmony_ci * when calling this) to ensure that the objects won't disappear while we're
10028c2ecf20Sopenharmony_ci * working with them. Once we hit a candidate dentry, we attempt to take a
10038c2ecf20Sopenharmony_ci * reference to it, and return that as the result.
10048c2ecf20Sopenharmony_ci */
10058c2ecf20Sopenharmony_cistatic struct inode *get_nonsnap_parent(struct dentry *dentry)
10068c2ecf20Sopenharmony_ci{
10078c2ecf20Sopenharmony_ci	struct inode *inode = NULL;
10088c2ecf20Sopenharmony_ci
10098c2ecf20Sopenharmony_ci	while (dentry && !IS_ROOT(dentry)) {
10108c2ecf20Sopenharmony_ci		inode = d_inode_rcu(dentry);
10118c2ecf20Sopenharmony_ci		if (!inode || ceph_snap(inode) == CEPH_NOSNAP)
10128c2ecf20Sopenharmony_ci			break;
10138c2ecf20Sopenharmony_ci		dentry = dentry->d_parent;
10148c2ecf20Sopenharmony_ci	}
10158c2ecf20Sopenharmony_ci	if (inode)
10168c2ecf20Sopenharmony_ci		inode = igrab(inode);
10178c2ecf20Sopenharmony_ci	return inode;
10188c2ecf20Sopenharmony_ci}
10198c2ecf20Sopenharmony_ci
10208c2ecf20Sopenharmony_ci/*
10218c2ecf20Sopenharmony_ci * Choose mds to send request to next.  If there is a hint set in the
10228c2ecf20Sopenharmony_ci * request (e.g., due to a prior forward hint from the mds), use that.
10238c2ecf20Sopenharmony_ci * Otherwise, consult frag tree and/or caps to identify the
10248c2ecf20Sopenharmony_ci * appropriate mds.  If all else fails, choose randomly.
10258c2ecf20Sopenharmony_ci *
10268c2ecf20Sopenharmony_ci * Called under mdsc->mutex.
10278c2ecf20Sopenharmony_ci */
10288c2ecf20Sopenharmony_cistatic int __choose_mds(struct ceph_mds_client *mdsc,
10298c2ecf20Sopenharmony_ci			struct ceph_mds_request *req,
10308c2ecf20Sopenharmony_ci			bool *random)
10318c2ecf20Sopenharmony_ci{
10328c2ecf20Sopenharmony_ci	struct inode *inode;
10338c2ecf20Sopenharmony_ci	struct ceph_inode_info *ci;
10348c2ecf20Sopenharmony_ci	struct ceph_cap *cap;
10358c2ecf20Sopenharmony_ci	int mode = req->r_direct_mode;
10368c2ecf20Sopenharmony_ci	int mds = -1;
10378c2ecf20Sopenharmony_ci	u32 hash = req->r_direct_hash;
10388c2ecf20Sopenharmony_ci	bool is_hash = test_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags);
10398c2ecf20Sopenharmony_ci
10408c2ecf20Sopenharmony_ci	if (random)
10418c2ecf20Sopenharmony_ci		*random = false;
10428c2ecf20Sopenharmony_ci
10438c2ecf20Sopenharmony_ci	/*
10448c2ecf20Sopenharmony_ci	 * is there a specific mds we should try?  ignore hint if we have
10458c2ecf20Sopenharmony_ci	 * no session and the mds is not up (active or recovering).
10468c2ecf20Sopenharmony_ci	 */
10478c2ecf20Sopenharmony_ci	if (req->r_resend_mds >= 0 &&
10488c2ecf20Sopenharmony_ci	    (__have_session(mdsc, req->r_resend_mds) ||
10498c2ecf20Sopenharmony_ci	     ceph_mdsmap_get_state(mdsc->mdsmap, req->r_resend_mds) > 0)) {
10508c2ecf20Sopenharmony_ci		dout("%s using resend_mds mds%d\n", __func__,
10518c2ecf20Sopenharmony_ci		     req->r_resend_mds);
10528c2ecf20Sopenharmony_ci		return req->r_resend_mds;
10538c2ecf20Sopenharmony_ci	}
10548c2ecf20Sopenharmony_ci
10558c2ecf20Sopenharmony_ci	if (mode == USE_RANDOM_MDS)
10568c2ecf20Sopenharmony_ci		goto random;
10578c2ecf20Sopenharmony_ci
10588c2ecf20Sopenharmony_ci	inode = NULL;
10598c2ecf20Sopenharmony_ci	if (req->r_inode) {
10608c2ecf20Sopenharmony_ci		if (ceph_snap(req->r_inode) != CEPH_SNAPDIR) {
10618c2ecf20Sopenharmony_ci			inode = req->r_inode;
10628c2ecf20Sopenharmony_ci			ihold(inode);
10638c2ecf20Sopenharmony_ci		} else {
10648c2ecf20Sopenharmony_ci			/* req->r_dentry is non-null for LSSNAP request */
10658c2ecf20Sopenharmony_ci			rcu_read_lock();
10668c2ecf20Sopenharmony_ci			inode = get_nonsnap_parent(req->r_dentry);
10678c2ecf20Sopenharmony_ci			rcu_read_unlock();
10688c2ecf20Sopenharmony_ci			dout("%s using snapdir's parent %p\n", __func__, inode);
10698c2ecf20Sopenharmony_ci		}
10708c2ecf20Sopenharmony_ci	} else if (req->r_dentry) {
10718c2ecf20Sopenharmony_ci		/* ignore race with rename; old or new d_parent is okay */
10728c2ecf20Sopenharmony_ci		struct dentry *parent;
10738c2ecf20Sopenharmony_ci		struct inode *dir;
10748c2ecf20Sopenharmony_ci
10758c2ecf20Sopenharmony_ci		rcu_read_lock();
10768c2ecf20Sopenharmony_ci		parent = READ_ONCE(req->r_dentry->d_parent);
10778c2ecf20Sopenharmony_ci		dir = req->r_parent ? : d_inode_rcu(parent);
10788c2ecf20Sopenharmony_ci
10798c2ecf20Sopenharmony_ci		if (!dir || dir->i_sb != mdsc->fsc->sb) {
10808c2ecf20Sopenharmony_ci			/*  not this fs or parent went negative */
10818c2ecf20Sopenharmony_ci			inode = d_inode(req->r_dentry);
10828c2ecf20Sopenharmony_ci			if (inode)
10838c2ecf20Sopenharmony_ci				ihold(inode);
10848c2ecf20Sopenharmony_ci		} else if (ceph_snap(dir) != CEPH_NOSNAP) {
10858c2ecf20Sopenharmony_ci			/* direct snapped/virtual snapdir requests
10868c2ecf20Sopenharmony_ci			 * based on parent dir inode */
10878c2ecf20Sopenharmony_ci			inode = get_nonsnap_parent(parent);
10888c2ecf20Sopenharmony_ci			dout("%s using nonsnap parent %p\n", __func__, inode);
10898c2ecf20Sopenharmony_ci		} else {
10908c2ecf20Sopenharmony_ci			/* dentry target */
10918c2ecf20Sopenharmony_ci			inode = d_inode(req->r_dentry);
10928c2ecf20Sopenharmony_ci			if (!inode || mode == USE_AUTH_MDS) {
10938c2ecf20Sopenharmony_ci				/* dir + name */
10948c2ecf20Sopenharmony_ci				inode = igrab(dir);
10958c2ecf20Sopenharmony_ci				hash = ceph_dentry_hash(dir, req->r_dentry);
10968c2ecf20Sopenharmony_ci				is_hash = true;
10978c2ecf20Sopenharmony_ci			} else {
10988c2ecf20Sopenharmony_ci				ihold(inode);
10998c2ecf20Sopenharmony_ci			}
11008c2ecf20Sopenharmony_ci		}
11018c2ecf20Sopenharmony_ci		rcu_read_unlock();
11028c2ecf20Sopenharmony_ci	}
11038c2ecf20Sopenharmony_ci
11048c2ecf20Sopenharmony_ci	dout("%s %p is_hash=%d (0x%x) mode %d\n", __func__, inode, (int)is_hash,
11058c2ecf20Sopenharmony_ci	     hash, mode);
11068c2ecf20Sopenharmony_ci	if (!inode)
11078c2ecf20Sopenharmony_ci		goto random;
11088c2ecf20Sopenharmony_ci	ci = ceph_inode(inode);
11098c2ecf20Sopenharmony_ci
11108c2ecf20Sopenharmony_ci	if (is_hash && S_ISDIR(inode->i_mode)) {
11118c2ecf20Sopenharmony_ci		struct ceph_inode_frag frag;
11128c2ecf20Sopenharmony_ci		int found;
11138c2ecf20Sopenharmony_ci
11148c2ecf20Sopenharmony_ci		ceph_choose_frag(ci, hash, &frag, &found);
11158c2ecf20Sopenharmony_ci		if (found) {
11168c2ecf20Sopenharmony_ci			if (mode == USE_ANY_MDS && frag.ndist > 0) {
11178c2ecf20Sopenharmony_ci				u8 r;
11188c2ecf20Sopenharmony_ci
11198c2ecf20Sopenharmony_ci				/* choose a random replica */
11208c2ecf20Sopenharmony_ci				get_random_bytes(&r, 1);
11218c2ecf20Sopenharmony_ci				r %= frag.ndist;
11228c2ecf20Sopenharmony_ci				mds = frag.dist[r];
11238c2ecf20Sopenharmony_ci				dout("%s %p %llx.%llx frag %u mds%d (%d/%d)\n",
11248c2ecf20Sopenharmony_ci				     __func__, inode, ceph_vinop(inode),
11258c2ecf20Sopenharmony_ci				     frag.frag, mds, (int)r, frag.ndist);
11268c2ecf20Sopenharmony_ci				if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
11278c2ecf20Sopenharmony_ci				    CEPH_MDS_STATE_ACTIVE &&
11288c2ecf20Sopenharmony_ci				    !ceph_mdsmap_is_laggy(mdsc->mdsmap, mds))
11298c2ecf20Sopenharmony_ci					goto out;
11308c2ecf20Sopenharmony_ci			}
11318c2ecf20Sopenharmony_ci
11328c2ecf20Sopenharmony_ci			/* since this file/dir wasn't known to be
11338c2ecf20Sopenharmony_ci			 * replicated, then we want to look for the
11348c2ecf20Sopenharmony_ci			 * authoritative mds. */
11358c2ecf20Sopenharmony_ci			if (frag.mds >= 0) {
11368c2ecf20Sopenharmony_ci				/* choose auth mds */
11378c2ecf20Sopenharmony_ci				mds = frag.mds;
11388c2ecf20Sopenharmony_ci				dout("%s %p %llx.%llx frag %u mds%d (auth)\n",
11398c2ecf20Sopenharmony_ci				     __func__, inode, ceph_vinop(inode),
11408c2ecf20Sopenharmony_ci				     frag.frag, mds);
11418c2ecf20Sopenharmony_ci				if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
11428c2ecf20Sopenharmony_ci				    CEPH_MDS_STATE_ACTIVE) {
11438c2ecf20Sopenharmony_ci					if (!ceph_mdsmap_is_laggy(mdsc->mdsmap,
11448c2ecf20Sopenharmony_ci								  mds))
11458c2ecf20Sopenharmony_ci						goto out;
11468c2ecf20Sopenharmony_ci				}
11478c2ecf20Sopenharmony_ci			}
11488c2ecf20Sopenharmony_ci			mode = USE_AUTH_MDS;
11498c2ecf20Sopenharmony_ci		}
11508c2ecf20Sopenharmony_ci	}
11518c2ecf20Sopenharmony_ci
11528c2ecf20Sopenharmony_ci	spin_lock(&ci->i_ceph_lock);
11538c2ecf20Sopenharmony_ci	cap = NULL;
11548c2ecf20Sopenharmony_ci	if (mode == USE_AUTH_MDS)
11558c2ecf20Sopenharmony_ci		cap = ci->i_auth_cap;
11568c2ecf20Sopenharmony_ci	if (!cap && !RB_EMPTY_ROOT(&ci->i_caps))
11578c2ecf20Sopenharmony_ci		cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node);
11588c2ecf20Sopenharmony_ci	if (!cap) {
11598c2ecf20Sopenharmony_ci		spin_unlock(&ci->i_ceph_lock);
11608c2ecf20Sopenharmony_ci		ceph_async_iput(inode);
11618c2ecf20Sopenharmony_ci		goto random;
11628c2ecf20Sopenharmony_ci	}
11638c2ecf20Sopenharmony_ci	mds = cap->session->s_mds;
11648c2ecf20Sopenharmony_ci	dout("%s %p %llx.%llx mds%d (%scap %p)\n", __func__,
11658c2ecf20Sopenharmony_ci	     inode, ceph_vinop(inode), mds,
11668c2ecf20Sopenharmony_ci	     cap == ci->i_auth_cap ? "auth " : "", cap);
11678c2ecf20Sopenharmony_ci	spin_unlock(&ci->i_ceph_lock);
11688c2ecf20Sopenharmony_ciout:
11698c2ecf20Sopenharmony_ci	/* avoid calling iput_final() while holding mdsc->mutex or
11708c2ecf20Sopenharmony_ci	 * in mds dispatch threads */
11718c2ecf20Sopenharmony_ci	ceph_async_iput(inode);
11728c2ecf20Sopenharmony_ci	return mds;
11738c2ecf20Sopenharmony_ci
11748c2ecf20Sopenharmony_cirandom:
11758c2ecf20Sopenharmony_ci	if (random)
11768c2ecf20Sopenharmony_ci		*random = true;
11778c2ecf20Sopenharmony_ci
11788c2ecf20Sopenharmony_ci	mds = ceph_mdsmap_get_random_mds(mdsc->mdsmap);
11798c2ecf20Sopenharmony_ci	dout("%s chose random mds%d\n", __func__, mds);
11808c2ecf20Sopenharmony_ci	return mds;
11818c2ecf20Sopenharmony_ci}
11828c2ecf20Sopenharmony_ci
11838c2ecf20Sopenharmony_ci
11848c2ecf20Sopenharmony_ci/*
11858c2ecf20Sopenharmony_ci * session messages
11868c2ecf20Sopenharmony_ci */
11878c2ecf20Sopenharmony_cistruct ceph_msg *ceph_create_session_msg(u32 op, u64 seq)
11888c2ecf20Sopenharmony_ci{
11898c2ecf20Sopenharmony_ci	struct ceph_msg *msg;
11908c2ecf20Sopenharmony_ci	struct ceph_mds_session_head *h;
11918c2ecf20Sopenharmony_ci
11928c2ecf20Sopenharmony_ci	msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS,
11938c2ecf20Sopenharmony_ci			   false);
11948c2ecf20Sopenharmony_ci	if (!msg) {
11958c2ecf20Sopenharmony_ci		pr_err("ENOMEM creating session %s msg\n",
11968c2ecf20Sopenharmony_ci		       ceph_session_op_name(op));
11978c2ecf20Sopenharmony_ci		return NULL;
11988c2ecf20Sopenharmony_ci	}
11998c2ecf20Sopenharmony_ci	h = msg->front.iov_base;
12008c2ecf20Sopenharmony_ci	h->op = cpu_to_le32(op);
12018c2ecf20Sopenharmony_ci	h->seq = cpu_to_le64(seq);
12028c2ecf20Sopenharmony_ci
12038c2ecf20Sopenharmony_ci	return msg;
12048c2ecf20Sopenharmony_ci}
12058c2ecf20Sopenharmony_ci
12068c2ecf20Sopenharmony_cistatic const unsigned char feature_bits[] = CEPHFS_FEATURES_CLIENT_SUPPORTED;
12078c2ecf20Sopenharmony_ci#define FEATURE_BYTES(c) (DIV_ROUND_UP((size_t)feature_bits[c - 1] + 1, 64) * 8)
12088c2ecf20Sopenharmony_cistatic int encode_supported_features(void **p, void *end)
12098c2ecf20Sopenharmony_ci{
12108c2ecf20Sopenharmony_ci	static const size_t count = ARRAY_SIZE(feature_bits);
12118c2ecf20Sopenharmony_ci
12128c2ecf20Sopenharmony_ci	if (count > 0) {
12138c2ecf20Sopenharmony_ci		size_t i;
12148c2ecf20Sopenharmony_ci		size_t size = FEATURE_BYTES(count);
12158c2ecf20Sopenharmony_ci		unsigned long bit;
12168c2ecf20Sopenharmony_ci
12178c2ecf20Sopenharmony_ci		if (WARN_ON_ONCE(*p + 4 + size > end))
12188c2ecf20Sopenharmony_ci			return -ERANGE;
12198c2ecf20Sopenharmony_ci
12208c2ecf20Sopenharmony_ci		ceph_encode_32(p, size);
12218c2ecf20Sopenharmony_ci		memset(*p, 0, size);
12228c2ecf20Sopenharmony_ci		for (i = 0; i < count; i++) {
12238c2ecf20Sopenharmony_ci			bit = feature_bits[i];
12248c2ecf20Sopenharmony_ci			((unsigned char *)(*p))[bit / 8] |= BIT(bit % 8);
12258c2ecf20Sopenharmony_ci		}
12268c2ecf20Sopenharmony_ci		*p += size;
12278c2ecf20Sopenharmony_ci	} else {
12288c2ecf20Sopenharmony_ci		if (WARN_ON_ONCE(*p + 4 > end))
12298c2ecf20Sopenharmony_ci			return -ERANGE;
12308c2ecf20Sopenharmony_ci
12318c2ecf20Sopenharmony_ci		ceph_encode_32(p, 0);
12328c2ecf20Sopenharmony_ci	}
12338c2ecf20Sopenharmony_ci
12348c2ecf20Sopenharmony_ci	return 0;
12358c2ecf20Sopenharmony_ci}
12368c2ecf20Sopenharmony_ci
12378c2ecf20Sopenharmony_cistatic const unsigned char metric_bits[] = CEPHFS_METRIC_SPEC_CLIENT_SUPPORTED;
12388c2ecf20Sopenharmony_ci#define METRIC_BYTES(cnt) (DIV_ROUND_UP((size_t)metric_bits[cnt - 1] + 1, 64) * 8)
12398c2ecf20Sopenharmony_cistatic int encode_metric_spec(void **p, void *end)
12408c2ecf20Sopenharmony_ci{
12418c2ecf20Sopenharmony_ci	static const size_t count = ARRAY_SIZE(metric_bits);
12428c2ecf20Sopenharmony_ci
12438c2ecf20Sopenharmony_ci	/* header */
12448c2ecf20Sopenharmony_ci	if (WARN_ON_ONCE(*p + 2 > end))
12458c2ecf20Sopenharmony_ci		return -ERANGE;
12468c2ecf20Sopenharmony_ci
12478c2ecf20Sopenharmony_ci	ceph_encode_8(p, 1); /* version */
12488c2ecf20Sopenharmony_ci	ceph_encode_8(p, 1); /* compat */
12498c2ecf20Sopenharmony_ci
12508c2ecf20Sopenharmony_ci	if (count > 0) {
12518c2ecf20Sopenharmony_ci		size_t i;
12528c2ecf20Sopenharmony_ci		size_t size = METRIC_BYTES(count);
12538c2ecf20Sopenharmony_ci
12548c2ecf20Sopenharmony_ci		if (WARN_ON_ONCE(*p + 4 + 4 + size > end))
12558c2ecf20Sopenharmony_ci			return -ERANGE;
12568c2ecf20Sopenharmony_ci
12578c2ecf20Sopenharmony_ci		/* metric spec info length */
12588c2ecf20Sopenharmony_ci		ceph_encode_32(p, 4 + size);
12598c2ecf20Sopenharmony_ci
12608c2ecf20Sopenharmony_ci		/* metric spec */
12618c2ecf20Sopenharmony_ci		ceph_encode_32(p, size);
12628c2ecf20Sopenharmony_ci		memset(*p, 0, size);
12638c2ecf20Sopenharmony_ci		for (i = 0; i < count; i++)
12648c2ecf20Sopenharmony_ci			((unsigned char *)(*p))[i / 8] |= BIT(metric_bits[i] % 8);
12658c2ecf20Sopenharmony_ci		*p += size;
12668c2ecf20Sopenharmony_ci	} else {
12678c2ecf20Sopenharmony_ci		if (WARN_ON_ONCE(*p + 4 + 4 > end))
12688c2ecf20Sopenharmony_ci			return -ERANGE;
12698c2ecf20Sopenharmony_ci
12708c2ecf20Sopenharmony_ci		/* metric spec info length */
12718c2ecf20Sopenharmony_ci		ceph_encode_32(p, 4);
12728c2ecf20Sopenharmony_ci		/* metric spec */
12738c2ecf20Sopenharmony_ci		ceph_encode_32(p, 0);
12748c2ecf20Sopenharmony_ci	}
12758c2ecf20Sopenharmony_ci
12768c2ecf20Sopenharmony_ci	return 0;
12778c2ecf20Sopenharmony_ci}
12788c2ecf20Sopenharmony_ci
12798c2ecf20Sopenharmony_ci/*
12808c2ecf20Sopenharmony_ci * session message, specialization for CEPH_SESSION_REQUEST_OPEN
12818c2ecf20Sopenharmony_ci * to include additional client metadata fields.
12828c2ecf20Sopenharmony_ci */
12838c2ecf20Sopenharmony_cistatic struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u64 seq)
12848c2ecf20Sopenharmony_ci{
12858c2ecf20Sopenharmony_ci	struct ceph_msg *msg;
12868c2ecf20Sopenharmony_ci	struct ceph_mds_session_head *h;
12878c2ecf20Sopenharmony_ci	int i = -1;
12888c2ecf20Sopenharmony_ci	int extra_bytes = 0;
12898c2ecf20Sopenharmony_ci	int metadata_key_count = 0;
12908c2ecf20Sopenharmony_ci	struct ceph_options *opt = mdsc->fsc->client->options;
12918c2ecf20Sopenharmony_ci	struct ceph_mount_options *fsopt = mdsc->fsc->mount_options;
12928c2ecf20Sopenharmony_ci	size_t size, count;
12938c2ecf20Sopenharmony_ci	void *p, *end;
12948c2ecf20Sopenharmony_ci	int ret;
12958c2ecf20Sopenharmony_ci
12968c2ecf20Sopenharmony_ci	const char* metadata[][2] = {
12978c2ecf20Sopenharmony_ci		{"hostname", mdsc->nodename},
12988c2ecf20Sopenharmony_ci		{"kernel_version", init_utsname()->release},
12998c2ecf20Sopenharmony_ci		{"entity_id", opt->name ? : ""},
13008c2ecf20Sopenharmony_ci		{"root", fsopt->server_path ? : "/"},
13018c2ecf20Sopenharmony_ci		{NULL, NULL}
13028c2ecf20Sopenharmony_ci	};
13038c2ecf20Sopenharmony_ci
13048c2ecf20Sopenharmony_ci	/* Calculate serialized length of metadata */
13058c2ecf20Sopenharmony_ci	extra_bytes = 4;  /* map length */
13068c2ecf20Sopenharmony_ci	for (i = 0; metadata[i][0]; ++i) {
13078c2ecf20Sopenharmony_ci		extra_bytes += 8 + strlen(metadata[i][0]) +
13088c2ecf20Sopenharmony_ci			strlen(metadata[i][1]);
13098c2ecf20Sopenharmony_ci		metadata_key_count++;
13108c2ecf20Sopenharmony_ci	}
13118c2ecf20Sopenharmony_ci
13128c2ecf20Sopenharmony_ci	/* supported feature */
13138c2ecf20Sopenharmony_ci	size = 0;
13148c2ecf20Sopenharmony_ci	count = ARRAY_SIZE(feature_bits);
13158c2ecf20Sopenharmony_ci	if (count > 0)
13168c2ecf20Sopenharmony_ci		size = FEATURE_BYTES(count);
13178c2ecf20Sopenharmony_ci	extra_bytes += 4 + size;
13188c2ecf20Sopenharmony_ci
13198c2ecf20Sopenharmony_ci	/* metric spec */
13208c2ecf20Sopenharmony_ci	size = 0;
13218c2ecf20Sopenharmony_ci	count = ARRAY_SIZE(metric_bits);
13228c2ecf20Sopenharmony_ci	if (count > 0)
13238c2ecf20Sopenharmony_ci		size = METRIC_BYTES(count);
13248c2ecf20Sopenharmony_ci	extra_bytes += 2 + 4 + 4 + size;
13258c2ecf20Sopenharmony_ci
13268c2ecf20Sopenharmony_ci	/* Allocate the message */
13278c2ecf20Sopenharmony_ci	msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h) + extra_bytes,
13288c2ecf20Sopenharmony_ci			   GFP_NOFS, false);
13298c2ecf20Sopenharmony_ci	if (!msg) {
13308c2ecf20Sopenharmony_ci		pr_err("ENOMEM creating session open msg\n");
13318c2ecf20Sopenharmony_ci		return ERR_PTR(-ENOMEM);
13328c2ecf20Sopenharmony_ci	}
13338c2ecf20Sopenharmony_ci	p = msg->front.iov_base;
13348c2ecf20Sopenharmony_ci	end = p + msg->front.iov_len;
13358c2ecf20Sopenharmony_ci
13368c2ecf20Sopenharmony_ci	h = p;
13378c2ecf20Sopenharmony_ci	h->op = cpu_to_le32(CEPH_SESSION_REQUEST_OPEN);
13388c2ecf20Sopenharmony_ci	h->seq = cpu_to_le64(seq);
13398c2ecf20Sopenharmony_ci
13408c2ecf20Sopenharmony_ci	/*
13418c2ecf20Sopenharmony_ci	 * Serialize client metadata into waiting buffer space, using
13428c2ecf20Sopenharmony_ci	 * the format that userspace expects for map<string, string>
13438c2ecf20Sopenharmony_ci	 *
13448c2ecf20Sopenharmony_ci	 * ClientSession messages with metadata are v4
13458c2ecf20Sopenharmony_ci	 */
13468c2ecf20Sopenharmony_ci	msg->hdr.version = cpu_to_le16(4);
13478c2ecf20Sopenharmony_ci	msg->hdr.compat_version = cpu_to_le16(1);
13488c2ecf20Sopenharmony_ci
13498c2ecf20Sopenharmony_ci	/* The write pointer, following the session_head structure */
13508c2ecf20Sopenharmony_ci	p += sizeof(*h);
13518c2ecf20Sopenharmony_ci
13528c2ecf20Sopenharmony_ci	/* Number of entries in the map */
13538c2ecf20Sopenharmony_ci	ceph_encode_32(&p, metadata_key_count);
13548c2ecf20Sopenharmony_ci
13558c2ecf20Sopenharmony_ci	/* Two length-prefixed strings for each entry in the map */
13568c2ecf20Sopenharmony_ci	for (i = 0; metadata[i][0]; ++i) {
13578c2ecf20Sopenharmony_ci		size_t const key_len = strlen(metadata[i][0]);
13588c2ecf20Sopenharmony_ci		size_t const val_len = strlen(metadata[i][1]);
13598c2ecf20Sopenharmony_ci
13608c2ecf20Sopenharmony_ci		ceph_encode_32(&p, key_len);
13618c2ecf20Sopenharmony_ci		memcpy(p, metadata[i][0], key_len);
13628c2ecf20Sopenharmony_ci		p += key_len;
13638c2ecf20Sopenharmony_ci		ceph_encode_32(&p, val_len);
13648c2ecf20Sopenharmony_ci		memcpy(p, metadata[i][1], val_len);
13658c2ecf20Sopenharmony_ci		p += val_len;
13668c2ecf20Sopenharmony_ci	}
13678c2ecf20Sopenharmony_ci
13688c2ecf20Sopenharmony_ci	ret = encode_supported_features(&p, end);
13698c2ecf20Sopenharmony_ci	if (ret) {
13708c2ecf20Sopenharmony_ci		pr_err("encode_supported_features failed!\n");
13718c2ecf20Sopenharmony_ci		ceph_msg_put(msg);
13728c2ecf20Sopenharmony_ci		return ERR_PTR(ret);
13738c2ecf20Sopenharmony_ci	}
13748c2ecf20Sopenharmony_ci
13758c2ecf20Sopenharmony_ci	ret = encode_metric_spec(&p, end);
13768c2ecf20Sopenharmony_ci	if (ret) {
13778c2ecf20Sopenharmony_ci		pr_err("encode_metric_spec failed!\n");
13788c2ecf20Sopenharmony_ci		ceph_msg_put(msg);
13798c2ecf20Sopenharmony_ci		return ERR_PTR(ret);
13808c2ecf20Sopenharmony_ci	}
13818c2ecf20Sopenharmony_ci
13828c2ecf20Sopenharmony_ci	msg->front.iov_len = p - msg->front.iov_base;
13838c2ecf20Sopenharmony_ci	msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
13848c2ecf20Sopenharmony_ci
13858c2ecf20Sopenharmony_ci	return msg;
13868c2ecf20Sopenharmony_ci}
13878c2ecf20Sopenharmony_ci
13888c2ecf20Sopenharmony_ci/*
13898c2ecf20Sopenharmony_ci * send session open request.
13908c2ecf20Sopenharmony_ci *
13918c2ecf20Sopenharmony_ci * called under mdsc->mutex
13928c2ecf20Sopenharmony_ci */
13938c2ecf20Sopenharmony_cistatic int __open_session(struct ceph_mds_client *mdsc,
13948c2ecf20Sopenharmony_ci			  struct ceph_mds_session *session)
13958c2ecf20Sopenharmony_ci{
13968c2ecf20Sopenharmony_ci	struct ceph_msg *msg;
13978c2ecf20Sopenharmony_ci	int mstate;
13988c2ecf20Sopenharmony_ci	int mds = session->s_mds;
13998c2ecf20Sopenharmony_ci
14008c2ecf20Sopenharmony_ci	/* wait for mds to go active? */
14018c2ecf20Sopenharmony_ci	mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds);
14028c2ecf20Sopenharmony_ci	dout("open_session to mds%d (%s)\n", mds,
14038c2ecf20Sopenharmony_ci	     ceph_mds_state_name(mstate));
14048c2ecf20Sopenharmony_ci	session->s_state = CEPH_MDS_SESSION_OPENING;
14058c2ecf20Sopenharmony_ci	session->s_renew_requested = jiffies;
14068c2ecf20Sopenharmony_ci
14078c2ecf20Sopenharmony_ci	/* send connect message */
14088c2ecf20Sopenharmony_ci	msg = create_session_open_msg(mdsc, session->s_seq);
14098c2ecf20Sopenharmony_ci	if (IS_ERR(msg))
14108c2ecf20Sopenharmony_ci		return PTR_ERR(msg);
14118c2ecf20Sopenharmony_ci	ceph_con_send(&session->s_con, msg);
14128c2ecf20Sopenharmony_ci	return 0;
14138c2ecf20Sopenharmony_ci}
14148c2ecf20Sopenharmony_ci
14158c2ecf20Sopenharmony_ci/*
14168c2ecf20Sopenharmony_ci * open sessions for any export targets for the given mds
14178c2ecf20Sopenharmony_ci *
14188c2ecf20Sopenharmony_ci * called under mdsc->mutex
14198c2ecf20Sopenharmony_ci */
14208c2ecf20Sopenharmony_cistatic struct ceph_mds_session *
14218c2ecf20Sopenharmony_ci__open_export_target_session(struct ceph_mds_client *mdsc, int target)
14228c2ecf20Sopenharmony_ci{
14238c2ecf20Sopenharmony_ci	struct ceph_mds_session *session;
14248c2ecf20Sopenharmony_ci	int ret;
14258c2ecf20Sopenharmony_ci
14268c2ecf20Sopenharmony_ci	session = __ceph_lookup_mds_session(mdsc, target);
14278c2ecf20Sopenharmony_ci	if (!session) {
14288c2ecf20Sopenharmony_ci		session = register_session(mdsc, target);
14298c2ecf20Sopenharmony_ci		if (IS_ERR(session))
14308c2ecf20Sopenharmony_ci			return session;
14318c2ecf20Sopenharmony_ci	}
14328c2ecf20Sopenharmony_ci	if (session->s_state == CEPH_MDS_SESSION_NEW ||
14338c2ecf20Sopenharmony_ci	    session->s_state == CEPH_MDS_SESSION_CLOSING) {
14348c2ecf20Sopenharmony_ci		ret = __open_session(mdsc, session);
14358c2ecf20Sopenharmony_ci		if (ret)
14368c2ecf20Sopenharmony_ci			return ERR_PTR(ret);
14378c2ecf20Sopenharmony_ci	}
14388c2ecf20Sopenharmony_ci
14398c2ecf20Sopenharmony_ci	return session;
14408c2ecf20Sopenharmony_ci}
14418c2ecf20Sopenharmony_ci
14428c2ecf20Sopenharmony_cistruct ceph_mds_session *
14438c2ecf20Sopenharmony_ciceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target)
14448c2ecf20Sopenharmony_ci{
14458c2ecf20Sopenharmony_ci	struct ceph_mds_session *session;
14468c2ecf20Sopenharmony_ci
14478c2ecf20Sopenharmony_ci	dout("open_export_target_session to mds%d\n", target);
14488c2ecf20Sopenharmony_ci
14498c2ecf20Sopenharmony_ci	mutex_lock(&mdsc->mutex);
14508c2ecf20Sopenharmony_ci	session = __open_export_target_session(mdsc, target);
14518c2ecf20Sopenharmony_ci	mutex_unlock(&mdsc->mutex);
14528c2ecf20Sopenharmony_ci
14538c2ecf20Sopenharmony_ci	return session;
14548c2ecf20Sopenharmony_ci}
14558c2ecf20Sopenharmony_ci
14568c2ecf20Sopenharmony_cistatic void __open_export_target_sessions(struct ceph_mds_client *mdsc,
14578c2ecf20Sopenharmony_ci					  struct ceph_mds_session *session)
14588c2ecf20Sopenharmony_ci{
14598c2ecf20Sopenharmony_ci	struct ceph_mds_info *mi;
14608c2ecf20Sopenharmony_ci	struct ceph_mds_session *ts;
14618c2ecf20Sopenharmony_ci	int i, mds = session->s_mds;
14628c2ecf20Sopenharmony_ci
14638c2ecf20Sopenharmony_ci	if (mds >= mdsc->mdsmap->possible_max_rank)
14648c2ecf20Sopenharmony_ci		return;
14658c2ecf20Sopenharmony_ci
14668c2ecf20Sopenharmony_ci	mi = &mdsc->mdsmap->m_info[mds];
14678c2ecf20Sopenharmony_ci	dout("open_export_target_sessions for mds%d (%d targets)\n",
14688c2ecf20Sopenharmony_ci	     session->s_mds, mi->num_export_targets);
14698c2ecf20Sopenharmony_ci
14708c2ecf20Sopenharmony_ci	for (i = 0; i < mi->num_export_targets; i++) {
14718c2ecf20Sopenharmony_ci		ts = __open_export_target_session(mdsc, mi->export_targets[i]);
14728c2ecf20Sopenharmony_ci		ceph_put_mds_session(ts);
14738c2ecf20Sopenharmony_ci	}
14748c2ecf20Sopenharmony_ci}
14758c2ecf20Sopenharmony_ci
14768c2ecf20Sopenharmony_civoid ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
14778c2ecf20Sopenharmony_ci					   struct ceph_mds_session *session)
14788c2ecf20Sopenharmony_ci{
14798c2ecf20Sopenharmony_ci	mutex_lock(&mdsc->mutex);
14808c2ecf20Sopenharmony_ci	__open_export_target_sessions(mdsc, session);
14818c2ecf20Sopenharmony_ci	mutex_unlock(&mdsc->mutex);
14828c2ecf20Sopenharmony_ci}
14838c2ecf20Sopenharmony_ci
14848c2ecf20Sopenharmony_ci/*
14858c2ecf20Sopenharmony_ci * session caps
14868c2ecf20Sopenharmony_ci */
14878c2ecf20Sopenharmony_ci
14888c2ecf20Sopenharmony_cistatic void detach_cap_releases(struct ceph_mds_session *session,
14898c2ecf20Sopenharmony_ci				struct list_head *target)
14908c2ecf20Sopenharmony_ci{
14918c2ecf20Sopenharmony_ci	lockdep_assert_held(&session->s_cap_lock);
14928c2ecf20Sopenharmony_ci
14938c2ecf20Sopenharmony_ci	list_splice_init(&session->s_cap_releases, target);
14948c2ecf20Sopenharmony_ci	session->s_num_cap_releases = 0;
14958c2ecf20Sopenharmony_ci	dout("dispose_cap_releases mds%d\n", session->s_mds);
14968c2ecf20Sopenharmony_ci}
14978c2ecf20Sopenharmony_ci
14988c2ecf20Sopenharmony_cistatic void dispose_cap_releases(struct ceph_mds_client *mdsc,
14998c2ecf20Sopenharmony_ci				 struct list_head *dispose)
15008c2ecf20Sopenharmony_ci{
15018c2ecf20Sopenharmony_ci	while (!list_empty(dispose)) {
15028c2ecf20Sopenharmony_ci		struct ceph_cap *cap;
15038c2ecf20Sopenharmony_ci		/* zero out the in-progress message */
15048c2ecf20Sopenharmony_ci		cap = list_first_entry(dispose, struct ceph_cap, session_caps);
15058c2ecf20Sopenharmony_ci		list_del(&cap->session_caps);
15068c2ecf20Sopenharmony_ci		ceph_put_cap(mdsc, cap);
15078c2ecf20Sopenharmony_ci	}
15088c2ecf20Sopenharmony_ci}
15098c2ecf20Sopenharmony_ci
15108c2ecf20Sopenharmony_cistatic void cleanup_session_requests(struct ceph_mds_client *mdsc,
15118c2ecf20Sopenharmony_ci				     struct ceph_mds_session *session)
15128c2ecf20Sopenharmony_ci{
15138c2ecf20Sopenharmony_ci	struct ceph_mds_request *req;
15148c2ecf20Sopenharmony_ci	struct rb_node *p;
15158c2ecf20Sopenharmony_ci
15168c2ecf20Sopenharmony_ci	dout("cleanup_session_requests mds%d\n", session->s_mds);
15178c2ecf20Sopenharmony_ci	mutex_lock(&mdsc->mutex);
15188c2ecf20Sopenharmony_ci	while (!list_empty(&session->s_unsafe)) {
15198c2ecf20Sopenharmony_ci		req = list_first_entry(&session->s_unsafe,
15208c2ecf20Sopenharmony_ci				       struct ceph_mds_request, r_unsafe_item);
15218c2ecf20Sopenharmony_ci		pr_warn_ratelimited(" dropping unsafe request %llu\n",
15228c2ecf20Sopenharmony_ci				    req->r_tid);
15238c2ecf20Sopenharmony_ci		if (req->r_target_inode)
15248c2ecf20Sopenharmony_ci			mapping_set_error(req->r_target_inode->i_mapping, -EIO);
15258c2ecf20Sopenharmony_ci		if (req->r_unsafe_dir)
15268c2ecf20Sopenharmony_ci			mapping_set_error(req->r_unsafe_dir->i_mapping, -EIO);
15278c2ecf20Sopenharmony_ci		__unregister_request(mdsc, req);
15288c2ecf20Sopenharmony_ci	}
15298c2ecf20Sopenharmony_ci	/* zero r_attempts, so kick_requests() will re-send requests */
15308c2ecf20Sopenharmony_ci	p = rb_first(&mdsc->request_tree);
15318c2ecf20Sopenharmony_ci	while (p) {
15328c2ecf20Sopenharmony_ci		req = rb_entry(p, struct ceph_mds_request, r_node);
15338c2ecf20Sopenharmony_ci		p = rb_next(p);
15348c2ecf20Sopenharmony_ci		if (req->r_session &&
15358c2ecf20Sopenharmony_ci		    req->r_session->s_mds == session->s_mds)
15368c2ecf20Sopenharmony_ci			req->r_attempts = 0;
15378c2ecf20Sopenharmony_ci	}
15388c2ecf20Sopenharmony_ci	mutex_unlock(&mdsc->mutex);
15398c2ecf20Sopenharmony_ci}
15408c2ecf20Sopenharmony_ci
/*
 * Helper to safely iterate over all caps associated with a session, with
 * special care taken to handle a racing __ceph_remove_cap().
 *
 * @cb is called once for every cap on session->s_caps whose inode could be
 * grabbed with igrab(); caps whose inode cannot be grabbed are skipped.
 * s_cap_lock is dropped around each call to @cb.  Iteration stops as soon
 * as @cb returns a negative value.
 *
 * Returns 0 if the whole list was walked, otherwise the negative value
 * returned by @cb.
 *
 * Caller must hold session s_mutex.
 */
int ceph_iterate_session_caps(struct ceph_mds_session *session,
			      int (*cb)(struct inode *, struct ceph_cap *,
					void *), void *arg)
{
	struct list_head *p;
	struct ceph_cap *cap;
	struct inode *inode, *last_inode = NULL;
	struct ceph_cap *old_cap = NULL;
	int ret;

	dout("iterate_session_caps %p mds%d\n", session, session->s_mds);
	spin_lock(&session->s_cap_lock);
	p = session->s_caps.next;
	while (p != &session->s_caps) {
		cap = list_entry(p, struct ceph_cap, session_caps);
		inode = igrab(&cap->ci->vfs_inode);
		if (!inode) {
			/* could not get an inode reference: skip this cap */
			p = p->next;
			continue;
		}
		/*
		 * Record the cap being visited before dropping the lock.
		 * NOTE(review): presumably a racing __ceph_remove_cap()
		 * checks s_cap_iterator and leaves the unlinking to us (see
		 * the !cap->ci handling below) -- confirm.
		 */
		session->s_cap_iterator = cap;
		spin_unlock(&session->s_cap_lock);

		/* release what the previous iteration left over, unlocked */
		if (last_inode) {
			/* avoid calling iput_final() while holding
			 * s_mutex or in mds dispatch threads */
			ceph_async_iput(last_inode);
			last_inode = NULL;
		}
		if (old_cap) {
			ceph_put_cap(session->s_mdsc, old_cap);
			old_cap = NULL;
		}

		ret = cb(inode, cap, arg);
		/* keep the inode reference until s_cap_lock is dropped again */
		last_inode = inode;

		spin_lock(&session->s_cap_lock);
		/* advance first: cap may be unlinked from the list below */
		p = p->next;
		if (!cap->ci) {
			/* cap->ci was cleared while unlocked: finish removal */
			dout("iterate_session_caps  finishing cap %p removal\n",
			     cap);
			BUG_ON(cap->session != session);
			cap->session = NULL;
			list_del_init(&cap->session_caps);
			session->s_nr_caps--;
			atomic64_dec(&session->s_mdsc->metric.total_caps);
			if (cap->queue_release)
				__ceph_queue_cap_release(session, cap);
			else
				old_cap = cap;  /* put_cap it w/o locks held */
		}
		if (ret < 0)
			goto out;
	}
	ret = 0;
out:
	session->s_cap_iterator = NULL;
	spin_unlock(&session->s_cap_lock);

	/* drop leftovers from the final iteration with no locks held */
	ceph_async_iput(last_inode);
	if (old_cap)
		ceph_put_cap(session->s_mdsc, old_cap);

	return ret;
}
16138c2ecf20Sopenharmony_ci
16148c2ecf20Sopenharmony_cistatic int remove_capsnaps(struct ceph_mds_client *mdsc, struct inode *inode)
16158c2ecf20Sopenharmony_ci{
16168c2ecf20Sopenharmony_ci	struct ceph_inode_info *ci = ceph_inode(inode);
16178c2ecf20Sopenharmony_ci	struct ceph_cap_snap *capsnap;
16188c2ecf20Sopenharmony_ci	int capsnap_release = 0;
16198c2ecf20Sopenharmony_ci
16208c2ecf20Sopenharmony_ci	lockdep_assert_held(&ci->i_ceph_lock);
16218c2ecf20Sopenharmony_ci
16228c2ecf20Sopenharmony_ci	dout("removing capsnaps, ci is %p, inode is %p\n", ci, inode);
16238c2ecf20Sopenharmony_ci
16248c2ecf20Sopenharmony_ci	while (!list_empty(&ci->i_cap_snaps)) {
16258c2ecf20Sopenharmony_ci		capsnap = list_first_entry(&ci->i_cap_snaps,
16268c2ecf20Sopenharmony_ci					   struct ceph_cap_snap, ci_item);
16278c2ecf20Sopenharmony_ci		__ceph_remove_capsnap(inode, capsnap, NULL, NULL);
16288c2ecf20Sopenharmony_ci		ceph_put_snap_context(capsnap->context);
16298c2ecf20Sopenharmony_ci		ceph_put_cap_snap(capsnap);
16308c2ecf20Sopenharmony_ci		capsnap_release++;
16318c2ecf20Sopenharmony_ci	}
16328c2ecf20Sopenharmony_ci	wake_up_all(&ci->i_cap_wq);
16338c2ecf20Sopenharmony_ci	wake_up_all(&mdsc->cap_flushing_wq);
16348c2ecf20Sopenharmony_ci	return capsnap_release;
16358c2ecf20Sopenharmony_ci}
16368c2ecf20Sopenharmony_ci
/*
 * ceph_iterate_session_caps() callback used by remove_session_caps().
 *
 * Removes @cap from @inode.  If that leaves the inode without an auth cap,
 * the inode's dirty and flushing cap state, its queued cap flushes and its
 * cap snaps are discarded as well, with -EIO recorded on the mapping when
 * dirty state is dropped.
 *
 * @arg is the struct ceph_fs_client.  Always returns 0.
 */
static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
				  void *arg)
{
	struct ceph_fs_client *fsc = (struct ceph_fs_client *)arg;
	struct ceph_mds_client *mdsc = fsc->mdsc;
	struct ceph_inode_info *ci = ceph_inode(inode);
	LIST_HEAD(to_remove);
	bool dirty_dropped = false;
	bool invalidate = false;
	int capsnap_release = 0;

	dout("removing cap %p, ci is %p, inode is %p\n",
	     cap, ci, &ci->vfs_inode);
	spin_lock(&ci->i_ceph_lock);
	__ceph_remove_cap(cap, false);
	if (!ci->i_auth_cap) {
		struct ceph_cap_flush *cf;

		/*
		 * On shutdown: request a page cache invalidate if pages are
		 * cached, and flag -EIO if i_wrbuffer_ref is still non-zero.
		 */
		if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
			if (inode->i_data.nrpages > 0)
				invalidate = true;
			if (ci->i_wrbuffer_ref > 0)
				mapping_set_error(&inode->i_data, -EIO);
		}

		/* collect every queued cap flush on a private list */
		while (!list_empty(&ci->i_cap_flush_list)) {
			cf = list_first_entry(&ci->i_cap_flush_list,
					      struct ceph_cap_flush, i_list);
			list_move(&cf->i_list, &to_remove);
		}

		spin_lock(&mdsc->cap_dirty_lock);

		/* ... and unhook them from their g_list under cap_dirty_lock */
		list_for_each_entry(cf, &to_remove, i_list)
			list_del_init(&cf->g_list);

		if (!list_empty(&ci->i_dirty_item)) {
			pr_warn_ratelimited(
				" dropping dirty %s state for %p %lld\n",
				ceph_cap_string(ci->i_dirty_caps),
				inode, ceph_ino(inode));
			ci->i_dirty_caps = 0;
			list_del_init(&ci->i_dirty_item);
			dirty_dropped = true;
		}
		if (!list_empty(&ci->i_flushing_item)) {
			pr_warn_ratelimited(
				" dropping dirty+flushing %s state for %p %lld\n",
				ceph_cap_string(ci->i_flushing_caps),
				inode, ceph_ino(inode));
			ci->i_flushing_caps = 0;
			list_del_init(&ci->i_flushing_item);
			mdsc->num_cap_flushing--;
			dirty_dropped = true;
		}
		spin_unlock(&mdsc->cap_dirty_lock);

		if (dirty_dropped) {
			/* dirty metadata was thrown away: report it as -EIO */
			mapping_set_error(inode->i_mapping, -EIO);

			/* drop the head snap context once nothing uses it */
			if (ci->i_wrbuffer_ref_head == 0 &&
			    ci->i_wr_ref == 0 &&
			    ci->i_dirty_caps == 0 &&
			    ci->i_flushing_caps == 0) {
				ceph_put_snap_context(ci->i_head_snapc);
				ci->i_head_snapc = NULL;
			}
		}

		if (atomic_read(&ci->i_filelock_ref) > 0) {
			/* make further file lock syscall return -EIO */
			ci->i_ceph_flags |= CEPH_I_ERROR_FILELOCK;
			pr_warn_ratelimited(" dropping file locks for %p %lld\n",
					    inode, ceph_ino(inode));
		}

		/* no dirty caps left: the preallocated flush goes too */
		if (!ci->i_dirty_caps && ci->i_prealloc_cap_flush) {
			list_add(&ci->i_prealloc_cap_flush->i_list, &to_remove);
			ci->i_prealloc_cap_flush = NULL;
		}

		if (!list_empty(&ci->i_cap_snaps))
			capsnap_release = remove_capsnaps(mdsc, inode);
	}
	spin_unlock(&ci->i_ceph_lock);
	/* free the collected cap flushes now that i_ceph_lock is dropped */
	while (!list_empty(&to_remove)) {
		struct ceph_cap_flush *cf;
		cf = list_first_entry(&to_remove,
				      struct ceph_cap_flush, i_list);
		list_del_init(&cf->i_list);
		/*
		 * Flushes marked is_capsnap are only unlinked, not freed.
		 * NOTE(review): presumably they are owned by their cap snap
		 * -- confirm.
		 */
		if (!cf->is_capsnap)
			ceph_free_cap_flush(cf);
	}

	wake_up_all(&ci->i_cap_wq);
	if (invalidate)
		ceph_queue_invalidate(inode);
	/*
	 * Drop one inode reference if dirty state was dropped, plus one per
	 * removed cap snap.
	 * NOTE(review): presumably these pair with references taken when the
	 * inode became dirty / a cap snap was queued -- confirm.
	 */
	if (dirty_dropped)
		iput(inode);
	while (capsnap_release--)
		iput(inode);
	return 0;
}
17408c2ecf20Sopenharmony_ci
/*
 * Remove all caps held through @session: run remove_session_caps_cb() on
 * every cap, wait out caps whose inodes were being deleted, then detach and
 * dispose of the session's queued cap releases.
 *
 * Hits BUG_ON() if the session still has caps or entries on s_cap_flushing
 * afterwards.
 *
 * caller must hold session s_mutex
 */
static void remove_session_caps(struct ceph_mds_session *session)
{
	struct ceph_fs_client *fsc = session->s_mdsc->fsc;
	struct super_block *sb = fsc->sb;
	LIST_HEAD(dispose);

	dout("remove_session_caps on %p\n", session);
	ceph_iterate_session_caps(session, remove_session_caps_cb, fsc);

	wake_up_all(&fsc->mdsc->cap_flushing_wq);

	spin_lock(&session->s_cap_lock);
	if (session->s_nr_caps > 0) {
		struct inode *inode;
		struct ceph_cap *cap, *prev = NULL;
		struct ceph_vino vino;
		/*
		 * iterate_session_caps() skips inodes that are being
		 * deleted, we need to wait until deletions are complete.
		 * __wait_on_freeing_inode() is designed for the job,
		 * but it is not exported, so use lookup inode function
		 * to access it.
		 */
		while (!list_empty(&session->s_caps)) {
			cap = list_entry(session->s_caps.next,
					 struct ceph_cap, session_caps);
			/* same cap still first after a lookup: stop looping */
			if (cap == prev)
				break;
			prev = cap;
			/* copy the vino while s_cap_lock is still held */
			vino = cap->ci->i_vino;
			spin_unlock(&session->s_cap_lock);

			inode = ceph_find_inode(sb, vino);
			/* avoid calling iput_final() while holding s_mutex */
			ceph_async_iput(inode);

			spin_lock(&session->s_cap_lock);
		}
	}

	/*
	 * move the queued cap releases onto the local dispose list;
	 * s_cap_lock stays held until after the sanity checks below
	 */
	detach_cap_releases(session, &dispose);

	BUG_ON(session->s_nr_caps > 0);
	BUG_ON(!list_empty(&session->s_cap_flushing));
	spin_unlock(&session->s_cap_lock);
	dispose_cap_releases(session->s_mdsc, &dispose);
}
17928c2ecf20Sopenharmony_ci
/* Events understood by wake_up_session_cb(). */
enum {
	RECONNECT = 0,
	RENEWCAPS = 1,
	FORCE_RO  = 2,
};
17988c2ecf20Sopenharmony_ci
17998c2ecf20Sopenharmony_ci/*
18008c2ecf20Sopenharmony_ci * wake up any threads waiting on this session's caps.  if the cap is
18018c2ecf20Sopenharmony_ci * old (didn't get renewed on the client reconnect), remove it now.
18028c2ecf20Sopenharmony_ci *
18038c2ecf20Sopenharmony_ci * caller must hold s_mutex.
18048c2ecf20Sopenharmony_ci */
18058c2ecf20Sopenharmony_cistatic int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap,
18068c2ecf20Sopenharmony_ci			      void *arg)
18078c2ecf20Sopenharmony_ci{
18088c2ecf20Sopenharmony_ci	struct ceph_inode_info *ci = ceph_inode(inode);
18098c2ecf20Sopenharmony_ci	unsigned long ev = (unsigned long)arg;
18108c2ecf20Sopenharmony_ci
18118c2ecf20Sopenharmony_ci	if (ev == RECONNECT) {
18128c2ecf20Sopenharmony_ci		spin_lock(&ci->i_ceph_lock);
18138c2ecf20Sopenharmony_ci		ci->i_wanted_max_size = 0;
18148c2ecf20Sopenharmony_ci		ci->i_requested_max_size = 0;
18158c2ecf20Sopenharmony_ci		spin_unlock(&ci->i_ceph_lock);
18168c2ecf20Sopenharmony_ci	} else if (ev == RENEWCAPS) {
18178c2ecf20Sopenharmony_ci		if (cap->cap_gen < cap->session->s_cap_gen) {
18188c2ecf20Sopenharmony_ci			/* mds did not re-issue stale cap */
18198c2ecf20Sopenharmony_ci			spin_lock(&ci->i_ceph_lock);
18208c2ecf20Sopenharmony_ci			cap->issued = cap->implemented = CEPH_CAP_PIN;
18218c2ecf20Sopenharmony_ci			spin_unlock(&ci->i_ceph_lock);
18228c2ecf20Sopenharmony_ci		}
18238c2ecf20Sopenharmony_ci	} else if (ev == FORCE_RO) {
18248c2ecf20Sopenharmony_ci	}
18258c2ecf20Sopenharmony_ci	wake_up_all(&ci->i_cap_wq);
18268c2ecf20Sopenharmony_ci	return 0;
18278c2ecf20Sopenharmony_ci}
18288c2ecf20Sopenharmony_ci
18298c2ecf20Sopenharmony_cistatic void wake_up_session_caps(struct ceph_mds_session *session, int ev)
18308c2ecf20Sopenharmony_ci{
18318c2ecf20Sopenharmony_ci	dout("wake_up_session_caps %p mds%d\n", session, session->s_mds);
18328c2ecf20Sopenharmony_ci	ceph_iterate_session_caps(session, wake_up_session_cb,
18338c2ecf20Sopenharmony_ci				  (void *)(unsigned long)ev);
18348c2ecf20Sopenharmony_ci}
18358c2ecf20Sopenharmony_ci
18368c2ecf20Sopenharmony_ci/*
18378c2ecf20Sopenharmony_ci * Send periodic message to MDS renewing all currently held caps.  The
18388c2ecf20Sopenharmony_ci * ack will reset the expiration for all caps from this session.
18398c2ecf20Sopenharmony_ci *
18408c2ecf20Sopenharmony_ci * caller holds s_mutex
18418c2ecf20Sopenharmony_ci */
18428c2ecf20Sopenharmony_cistatic int send_renew_caps(struct ceph_mds_client *mdsc,
18438c2ecf20Sopenharmony_ci			   struct ceph_mds_session *session)
18448c2ecf20Sopenharmony_ci{
18458c2ecf20Sopenharmony_ci	struct ceph_msg *msg;
18468c2ecf20Sopenharmony_ci	int state;
18478c2ecf20Sopenharmony_ci
18488c2ecf20Sopenharmony_ci	if (time_after_eq(jiffies, session->s_cap_ttl) &&
18498c2ecf20Sopenharmony_ci	    time_after_eq(session->s_cap_ttl, session->s_renew_requested))
18508c2ecf20Sopenharmony_ci		pr_info("mds%d caps stale\n", session->s_mds);
18518c2ecf20Sopenharmony_ci	session->s_renew_requested = jiffies;
18528c2ecf20Sopenharmony_ci
18538c2ecf20Sopenharmony_ci	/* do not try to renew caps until a recovering mds has reconnected
18548c2ecf20Sopenharmony_ci	 * with its clients. */
18558c2ecf20Sopenharmony_ci	state = ceph_mdsmap_get_state(mdsc->mdsmap, session->s_mds);
18568c2ecf20Sopenharmony_ci	if (state < CEPH_MDS_STATE_RECONNECT) {
18578c2ecf20Sopenharmony_ci		dout("send_renew_caps ignoring mds%d (%s)\n",
18588c2ecf20Sopenharmony_ci		     session->s_mds, ceph_mds_state_name(state));
18598c2ecf20Sopenharmony_ci		return 0;
18608c2ecf20Sopenharmony_ci	}
18618c2ecf20Sopenharmony_ci
18628c2ecf20Sopenharmony_ci	dout("send_renew_caps to mds%d (%s)\n", session->s_mds,
18638c2ecf20Sopenharmony_ci		ceph_mds_state_name(state));
18648c2ecf20Sopenharmony_ci	msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS,
18658c2ecf20Sopenharmony_ci				      ++session->s_renew_seq);
18668c2ecf20Sopenharmony_ci	if (!msg)
18678c2ecf20Sopenharmony_ci		return -ENOMEM;
18688c2ecf20Sopenharmony_ci	ceph_con_send(&session->s_con, msg);
18698c2ecf20Sopenharmony_ci	return 0;
18708c2ecf20Sopenharmony_ci}
18718c2ecf20Sopenharmony_ci
18728c2ecf20Sopenharmony_cistatic int send_flushmsg_ack(struct ceph_mds_client *mdsc,
18738c2ecf20Sopenharmony_ci			     struct ceph_mds_session *session, u64 seq)
18748c2ecf20Sopenharmony_ci{
18758c2ecf20Sopenharmony_ci	struct ceph_msg *msg;
18768c2ecf20Sopenharmony_ci
18778c2ecf20Sopenharmony_ci	dout("send_flushmsg_ack to mds%d (%s)s seq %lld\n",
18788c2ecf20Sopenharmony_ci	     session->s_mds, ceph_session_state_name(session->s_state), seq);
18798c2ecf20Sopenharmony_ci	msg = ceph_create_session_msg(CEPH_SESSION_FLUSHMSG_ACK, seq);
18808c2ecf20Sopenharmony_ci	if (!msg)
18818c2ecf20Sopenharmony_ci		return -ENOMEM;
18828c2ecf20Sopenharmony_ci	ceph_con_send(&session->s_con, msg);
18838c2ecf20Sopenharmony_ci	return 0;
18848c2ecf20Sopenharmony_ci}
18858c2ecf20Sopenharmony_ci
18868c2ecf20Sopenharmony_ci
18878c2ecf20Sopenharmony_ci/*
18888c2ecf20Sopenharmony_ci * Note new cap ttl, and any transition from stale -> not stale (fresh?).
18898c2ecf20Sopenharmony_ci *
18908c2ecf20Sopenharmony_ci * Called under session->s_mutex
18918c2ecf20Sopenharmony_ci */
18928c2ecf20Sopenharmony_cistatic void renewed_caps(struct ceph_mds_client *mdsc,
18938c2ecf20Sopenharmony_ci			 struct ceph_mds_session *session, int is_renew)
18948c2ecf20Sopenharmony_ci{
18958c2ecf20Sopenharmony_ci	int was_stale;
18968c2ecf20Sopenharmony_ci	int wake = 0;
18978c2ecf20Sopenharmony_ci
18988c2ecf20Sopenharmony_ci	spin_lock(&session->s_cap_lock);
18998c2ecf20Sopenharmony_ci	was_stale = is_renew && time_after_eq(jiffies, session->s_cap_ttl);
19008c2ecf20Sopenharmony_ci
19018c2ecf20Sopenharmony_ci	session->s_cap_ttl = session->s_renew_requested +
19028c2ecf20Sopenharmony_ci		mdsc->mdsmap->m_session_timeout*HZ;
19038c2ecf20Sopenharmony_ci
19048c2ecf20Sopenharmony_ci	if (was_stale) {
19058c2ecf20Sopenharmony_ci		if (time_before(jiffies, session->s_cap_ttl)) {
19068c2ecf20Sopenharmony_ci			pr_info("mds%d caps renewed\n", session->s_mds);
19078c2ecf20Sopenharmony_ci			wake = 1;
19088c2ecf20Sopenharmony_ci		} else {
19098c2ecf20Sopenharmony_ci			pr_info("mds%d caps still stale\n", session->s_mds);
19108c2ecf20Sopenharmony_ci		}
19118c2ecf20Sopenharmony_ci	}
19128c2ecf20Sopenharmony_ci	dout("renewed_caps mds%d ttl now %lu, was %s, now %s\n",
19138c2ecf20Sopenharmony_ci	     session->s_mds, session->s_cap_ttl, was_stale ? "stale" : "fresh",
19148c2ecf20Sopenharmony_ci	     time_before(jiffies, session->s_cap_ttl) ? "stale" : "fresh");
19158c2ecf20Sopenharmony_ci	spin_unlock(&session->s_cap_lock);
19168c2ecf20Sopenharmony_ci
19178c2ecf20Sopenharmony_ci	if (wake)
19188c2ecf20Sopenharmony_ci		wake_up_session_caps(session, RENEWCAPS);
19198c2ecf20Sopenharmony_ci}
19208c2ecf20Sopenharmony_ci
19218c2ecf20Sopenharmony_ci/*
19228c2ecf20Sopenharmony_ci * send a session close request
19238c2ecf20Sopenharmony_ci */
19248c2ecf20Sopenharmony_cistatic int request_close_session(struct ceph_mds_session *session)
19258c2ecf20Sopenharmony_ci{
19268c2ecf20Sopenharmony_ci	struct ceph_msg *msg;
19278c2ecf20Sopenharmony_ci
19288c2ecf20Sopenharmony_ci	dout("request_close_session mds%d state %s seq %lld\n",
19298c2ecf20Sopenharmony_ci	     session->s_mds, ceph_session_state_name(session->s_state),
19308c2ecf20Sopenharmony_ci	     session->s_seq);
19318c2ecf20Sopenharmony_ci	msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_CLOSE,
19328c2ecf20Sopenharmony_ci				      session->s_seq);
19338c2ecf20Sopenharmony_ci	if (!msg)
19348c2ecf20Sopenharmony_ci		return -ENOMEM;
19358c2ecf20Sopenharmony_ci	ceph_con_send(&session->s_con, msg);
19368c2ecf20Sopenharmony_ci	return 1;
19378c2ecf20Sopenharmony_ci}
19388c2ecf20Sopenharmony_ci
19398c2ecf20Sopenharmony_ci/*
19408c2ecf20Sopenharmony_ci * Called with s_mutex held.
19418c2ecf20Sopenharmony_ci */
19428c2ecf20Sopenharmony_cistatic int __close_session(struct ceph_mds_client *mdsc,
19438c2ecf20Sopenharmony_ci			 struct ceph_mds_session *session)
19448c2ecf20Sopenharmony_ci{
19458c2ecf20Sopenharmony_ci	if (session->s_state >= CEPH_MDS_SESSION_CLOSING)
19468c2ecf20Sopenharmony_ci		return 0;
19478c2ecf20Sopenharmony_ci	session->s_state = CEPH_MDS_SESSION_CLOSING;
19488c2ecf20Sopenharmony_ci	return request_close_session(session);
19498c2ecf20Sopenharmony_ci}
19508c2ecf20Sopenharmony_ci
19518c2ecf20Sopenharmony_cistatic bool drop_negative_children(struct dentry *dentry)
19528c2ecf20Sopenharmony_ci{
19538c2ecf20Sopenharmony_ci	struct dentry *child;
19548c2ecf20Sopenharmony_ci	bool all_negative = true;
19558c2ecf20Sopenharmony_ci
19568c2ecf20Sopenharmony_ci	if (!d_is_dir(dentry))
19578c2ecf20Sopenharmony_ci		goto out;
19588c2ecf20Sopenharmony_ci
19598c2ecf20Sopenharmony_ci	spin_lock(&dentry->d_lock);
19608c2ecf20Sopenharmony_ci	list_for_each_entry(child, &dentry->d_subdirs, d_child) {
19618c2ecf20Sopenharmony_ci		if (d_really_is_positive(child)) {
19628c2ecf20Sopenharmony_ci			all_negative = false;
19638c2ecf20Sopenharmony_ci			break;
19648c2ecf20Sopenharmony_ci		}
19658c2ecf20Sopenharmony_ci	}
19668c2ecf20Sopenharmony_ci	spin_unlock(&dentry->d_lock);
19678c2ecf20Sopenharmony_ci
19688c2ecf20Sopenharmony_ci	if (all_negative)
19698c2ecf20Sopenharmony_ci		shrink_dcache_parent(dentry);
19708c2ecf20Sopenharmony_ciout:
19718c2ecf20Sopenharmony_ci	return all_negative;
19728c2ecf20Sopenharmony_ci}
19738c2ecf20Sopenharmony_ci
19748c2ecf20Sopenharmony_ci/*
19758c2ecf20Sopenharmony_ci * Trim old(er) caps.
19768c2ecf20Sopenharmony_ci *
19778c2ecf20Sopenharmony_ci * Because we can't cache an inode without one or more caps, we do
19788c2ecf20Sopenharmony_ci * this indirectly: if a cap is unused, we prune its aliases, at which
19798c2ecf20Sopenharmony_ci * point the inode will hopefully get dropped to.
19808c2ecf20Sopenharmony_ci *
19818c2ecf20Sopenharmony_ci * Yes, this is a bit sloppy.  Our only real goal here is to respond to
19828c2ecf20Sopenharmony_ci * memory pressure from the MDS, though, so it needn't be perfect.
19838c2ecf20Sopenharmony_ci */
19848c2ecf20Sopenharmony_cistatic int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
19858c2ecf20Sopenharmony_ci{
19868c2ecf20Sopenharmony_ci	int *remaining = arg;
19878c2ecf20Sopenharmony_ci	struct ceph_inode_info *ci = ceph_inode(inode);
19888c2ecf20Sopenharmony_ci	int used, wanted, oissued, mine;
19898c2ecf20Sopenharmony_ci
19908c2ecf20Sopenharmony_ci	if (*remaining <= 0)
19918c2ecf20Sopenharmony_ci		return -1;
19928c2ecf20Sopenharmony_ci
19938c2ecf20Sopenharmony_ci	spin_lock(&ci->i_ceph_lock);
19948c2ecf20Sopenharmony_ci	mine = cap->issued | cap->implemented;
19958c2ecf20Sopenharmony_ci	used = __ceph_caps_used(ci);
19968c2ecf20Sopenharmony_ci	wanted = __ceph_caps_file_wanted(ci);
19978c2ecf20Sopenharmony_ci	oissued = __ceph_caps_issued_other(ci, cap);
19988c2ecf20Sopenharmony_ci
19998c2ecf20Sopenharmony_ci	dout("trim_caps_cb %p cap %p mine %s oissued %s used %s wanted %s\n",
20008c2ecf20Sopenharmony_ci	     inode, cap, ceph_cap_string(mine), ceph_cap_string(oissued),
20018c2ecf20Sopenharmony_ci	     ceph_cap_string(used), ceph_cap_string(wanted));
20028c2ecf20Sopenharmony_ci	if (cap == ci->i_auth_cap) {
20038c2ecf20Sopenharmony_ci		if (ci->i_dirty_caps || ci->i_flushing_caps ||
20048c2ecf20Sopenharmony_ci		    !list_empty(&ci->i_cap_snaps))
20058c2ecf20Sopenharmony_ci			goto out;
20068c2ecf20Sopenharmony_ci		if ((used | wanted) & CEPH_CAP_ANY_WR)
20078c2ecf20Sopenharmony_ci			goto out;
20088c2ecf20Sopenharmony_ci		/* Note: it's possible that i_filelock_ref becomes non-zero
20098c2ecf20Sopenharmony_ci		 * after dropping auth caps. It doesn't hurt because reply
20108c2ecf20Sopenharmony_ci		 * of lock mds request will re-add auth caps. */
20118c2ecf20Sopenharmony_ci		if (atomic_read(&ci->i_filelock_ref) > 0)
20128c2ecf20Sopenharmony_ci			goto out;
20138c2ecf20Sopenharmony_ci	}
20148c2ecf20Sopenharmony_ci	/* The inode has cached pages, but it's no longer used.
20158c2ecf20Sopenharmony_ci	 * we can safely drop it */
20168c2ecf20Sopenharmony_ci	if (S_ISREG(inode->i_mode) &&
20178c2ecf20Sopenharmony_ci	    wanted == 0 && used == CEPH_CAP_FILE_CACHE &&
20188c2ecf20Sopenharmony_ci	    !(oissued & CEPH_CAP_FILE_CACHE)) {
20198c2ecf20Sopenharmony_ci	  used = 0;
20208c2ecf20Sopenharmony_ci	  oissued = 0;
20218c2ecf20Sopenharmony_ci	}
20228c2ecf20Sopenharmony_ci	if ((used | wanted) & ~oissued & mine)
20238c2ecf20Sopenharmony_ci		goto out;   /* we need these caps */
20248c2ecf20Sopenharmony_ci
20258c2ecf20Sopenharmony_ci	if (oissued) {
20268c2ecf20Sopenharmony_ci		/* we aren't the only cap.. just remove us */
20278c2ecf20Sopenharmony_ci		__ceph_remove_cap(cap, true);
20288c2ecf20Sopenharmony_ci		(*remaining)--;
20298c2ecf20Sopenharmony_ci	} else {
20308c2ecf20Sopenharmony_ci		struct dentry *dentry;
20318c2ecf20Sopenharmony_ci		/* try dropping referring dentries */
20328c2ecf20Sopenharmony_ci		spin_unlock(&ci->i_ceph_lock);
20338c2ecf20Sopenharmony_ci		dentry = d_find_any_alias(inode);
20348c2ecf20Sopenharmony_ci		if (dentry && drop_negative_children(dentry)) {
20358c2ecf20Sopenharmony_ci			int count;
20368c2ecf20Sopenharmony_ci			dput(dentry);
20378c2ecf20Sopenharmony_ci			d_prune_aliases(inode);
20388c2ecf20Sopenharmony_ci			count = atomic_read(&inode->i_count);
20398c2ecf20Sopenharmony_ci			if (count == 1)
20408c2ecf20Sopenharmony_ci				(*remaining)--;
20418c2ecf20Sopenharmony_ci			dout("trim_caps_cb %p cap %p pruned, count now %d\n",
20428c2ecf20Sopenharmony_ci			     inode, cap, count);
20438c2ecf20Sopenharmony_ci		} else {
20448c2ecf20Sopenharmony_ci			dput(dentry);
20458c2ecf20Sopenharmony_ci		}
20468c2ecf20Sopenharmony_ci		return 0;
20478c2ecf20Sopenharmony_ci	}
20488c2ecf20Sopenharmony_ci
20498c2ecf20Sopenharmony_ciout:
20508c2ecf20Sopenharmony_ci	spin_unlock(&ci->i_ceph_lock);
20518c2ecf20Sopenharmony_ci	return 0;
20528c2ecf20Sopenharmony_ci}
20538c2ecf20Sopenharmony_ci
20548c2ecf20Sopenharmony_ci/*
20558c2ecf20Sopenharmony_ci * Trim session cap count down to some max number.
20568c2ecf20Sopenharmony_ci */
20578c2ecf20Sopenharmony_ciint ceph_trim_caps(struct ceph_mds_client *mdsc,
20588c2ecf20Sopenharmony_ci		   struct ceph_mds_session *session,
20598c2ecf20Sopenharmony_ci		   int max_caps)
20608c2ecf20Sopenharmony_ci{
20618c2ecf20Sopenharmony_ci	int trim_caps = session->s_nr_caps - max_caps;
20628c2ecf20Sopenharmony_ci
20638c2ecf20Sopenharmony_ci	dout("trim_caps mds%d start: %d / %d, trim %d\n",
20648c2ecf20Sopenharmony_ci	     session->s_mds, session->s_nr_caps, max_caps, trim_caps);
20658c2ecf20Sopenharmony_ci	if (trim_caps > 0) {
20668c2ecf20Sopenharmony_ci		int remaining = trim_caps;
20678c2ecf20Sopenharmony_ci
20688c2ecf20Sopenharmony_ci		ceph_iterate_session_caps(session, trim_caps_cb, &remaining);
20698c2ecf20Sopenharmony_ci		dout("trim_caps mds%d done: %d / %d, trimmed %d\n",
20708c2ecf20Sopenharmony_ci		     session->s_mds, session->s_nr_caps, max_caps,
20718c2ecf20Sopenharmony_ci			trim_caps - remaining);
20728c2ecf20Sopenharmony_ci	}
20738c2ecf20Sopenharmony_ci
20748c2ecf20Sopenharmony_ci	ceph_flush_cap_releases(mdsc, session);
20758c2ecf20Sopenharmony_ci	return 0;
20768c2ecf20Sopenharmony_ci}
20778c2ecf20Sopenharmony_ci
20788c2ecf20Sopenharmony_cistatic int check_caps_flush(struct ceph_mds_client *mdsc,
20798c2ecf20Sopenharmony_ci			    u64 want_flush_tid)
20808c2ecf20Sopenharmony_ci{
20818c2ecf20Sopenharmony_ci	int ret = 1;
20828c2ecf20Sopenharmony_ci
20838c2ecf20Sopenharmony_ci	spin_lock(&mdsc->cap_dirty_lock);
20848c2ecf20Sopenharmony_ci	if (!list_empty(&mdsc->cap_flush_list)) {
20858c2ecf20Sopenharmony_ci		struct ceph_cap_flush *cf =
20868c2ecf20Sopenharmony_ci			list_first_entry(&mdsc->cap_flush_list,
20878c2ecf20Sopenharmony_ci					 struct ceph_cap_flush, g_list);
20888c2ecf20Sopenharmony_ci		if (cf->tid <= want_flush_tid) {
20898c2ecf20Sopenharmony_ci			dout("check_caps_flush still flushing tid "
20908c2ecf20Sopenharmony_ci			     "%llu <= %llu\n", cf->tid, want_flush_tid);
20918c2ecf20Sopenharmony_ci			ret = 0;
20928c2ecf20Sopenharmony_ci		}
20938c2ecf20Sopenharmony_ci	}
20948c2ecf20Sopenharmony_ci	spin_unlock(&mdsc->cap_dirty_lock);
20958c2ecf20Sopenharmony_ci	return ret;
20968c2ecf20Sopenharmony_ci}
20978c2ecf20Sopenharmony_ci
20988c2ecf20Sopenharmony_ci/*
20998c2ecf20Sopenharmony_ci * flush all dirty inode data to disk.
21008c2ecf20Sopenharmony_ci *
21018c2ecf20Sopenharmony_ci * returns true if we've flushed through want_flush_tid
21028c2ecf20Sopenharmony_ci */
21038c2ecf20Sopenharmony_cistatic void wait_caps_flush(struct ceph_mds_client *mdsc,
21048c2ecf20Sopenharmony_ci			    u64 want_flush_tid)
21058c2ecf20Sopenharmony_ci{
21068c2ecf20Sopenharmony_ci	dout("check_caps_flush want %llu\n", want_flush_tid);
21078c2ecf20Sopenharmony_ci
21088c2ecf20Sopenharmony_ci	wait_event(mdsc->cap_flushing_wq,
21098c2ecf20Sopenharmony_ci		   check_caps_flush(mdsc, want_flush_tid));
21108c2ecf20Sopenharmony_ci
21118c2ecf20Sopenharmony_ci	dout("check_caps_flush ok, flushed thru %llu\n", want_flush_tid);
21128c2ecf20Sopenharmony_ci}
21138c2ecf20Sopenharmony_ci
21148c2ecf20Sopenharmony_ci/*
21158c2ecf20Sopenharmony_ci * called under s_mutex
21168c2ecf20Sopenharmony_ci */
21178c2ecf20Sopenharmony_cistatic void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
21188c2ecf20Sopenharmony_ci				   struct ceph_mds_session *session)
21198c2ecf20Sopenharmony_ci{
21208c2ecf20Sopenharmony_ci	struct ceph_msg *msg = NULL;
21218c2ecf20Sopenharmony_ci	struct ceph_mds_cap_release *head;
21228c2ecf20Sopenharmony_ci	struct ceph_mds_cap_item *item;
21238c2ecf20Sopenharmony_ci	struct ceph_osd_client *osdc = &mdsc->fsc->client->osdc;
21248c2ecf20Sopenharmony_ci	struct ceph_cap *cap;
21258c2ecf20Sopenharmony_ci	LIST_HEAD(tmp_list);
21268c2ecf20Sopenharmony_ci	int num_cap_releases;
21278c2ecf20Sopenharmony_ci	__le32	barrier, *cap_barrier;
21288c2ecf20Sopenharmony_ci
21298c2ecf20Sopenharmony_ci	down_read(&osdc->lock);
21308c2ecf20Sopenharmony_ci	barrier = cpu_to_le32(osdc->epoch_barrier);
21318c2ecf20Sopenharmony_ci	up_read(&osdc->lock);
21328c2ecf20Sopenharmony_ci
21338c2ecf20Sopenharmony_ci	spin_lock(&session->s_cap_lock);
21348c2ecf20Sopenharmony_ciagain:
21358c2ecf20Sopenharmony_ci	list_splice_init(&session->s_cap_releases, &tmp_list);
21368c2ecf20Sopenharmony_ci	num_cap_releases = session->s_num_cap_releases;
21378c2ecf20Sopenharmony_ci	session->s_num_cap_releases = 0;
21388c2ecf20Sopenharmony_ci	spin_unlock(&session->s_cap_lock);
21398c2ecf20Sopenharmony_ci
21408c2ecf20Sopenharmony_ci	while (!list_empty(&tmp_list)) {
21418c2ecf20Sopenharmony_ci		if (!msg) {
21428c2ecf20Sopenharmony_ci			msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE,
21438c2ecf20Sopenharmony_ci					PAGE_SIZE, GFP_NOFS, false);
21448c2ecf20Sopenharmony_ci			if (!msg)
21458c2ecf20Sopenharmony_ci				goto out_err;
21468c2ecf20Sopenharmony_ci			head = msg->front.iov_base;
21478c2ecf20Sopenharmony_ci			head->num = cpu_to_le32(0);
21488c2ecf20Sopenharmony_ci			msg->front.iov_len = sizeof(*head);
21498c2ecf20Sopenharmony_ci
21508c2ecf20Sopenharmony_ci			msg->hdr.version = cpu_to_le16(2);
21518c2ecf20Sopenharmony_ci			msg->hdr.compat_version = cpu_to_le16(1);
21528c2ecf20Sopenharmony_ci		}
21538c2ecf20Sopenharmony_ci
21548c2ecf20Sopenharmony_ci		cap = list_first_entry(&tmp_list, struct ceph_cap,
21558c2ecf20Sopenharmony_ci					session_caps);
21568c2ecf20Sopenharmony_ci		list_del(&cap->session_caps);
21578c2ecf20Sopenharmony_ci		num_cap_releases--;
21588c2ecf20Sopenharmony_ci
21598c2ecf20Sopenharmony_ci		head = msg->front.iov_base;
21608c2ecf20Sopenharmony_ci		put_unaligned_le32(get_unaligned_le32(&head->num) + 1,
21618c2ecf20Sopenharmony_ci				   &head->num);
21628c2ecf20Sopenharmony_ci		item = msg->front.iov_base + msg->front.iov_len;
21638c2ecf20Sopenharmony_ci		item->ino = cpu_to_le64(cap->cap_ino);
21648c2ecf20Sopenharmony_ci		item->cap_id = cpu_to_le64(cap->cap_id);
21658c2ecf20Sopenharmony_ci		item->migrate_seq = cpu_to_le32(cap->mseq);
21668c2ecf20Sopenharmony_ci		item->seq = cpu_to_le32(cap->issue_seq);
21678c2ecf20Sopenharmony_ci		msg->front.iov_len += sizeof(*item);
21688c2ecf20Sopenharmony_ci
21698c2ecf20Sopenharmony_ci		ceph_put_cap(mdsc, cap);
21708c2ecf20Sopenharmony_ci
21718c2ecf20Sopenharmony_ci		if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) {
21728c2ecf20Sopenharmony_ci			// Append cap_barrier field
21738c2ecf20Sopenharmony_ci			cap_barrier = msg->front.iov_base + msg->front.iov_len;
21748c2ecf20Sopenharmony_ci			*cap_barrier = barrier;
21758c2ecf20Sopenharmony_ci			msg->front.iov_len += sizeof(*cap_barrier);
21768c2ecf20Sopenharmony_ci
21778c2ecf20Sopenharmony_ci			msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
21788c2ecf20Sopenharmony_ci			dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
21798c2ecf20Sopenharmony_ci			ceph_con_send(&session->s_con, msg);
21808c2ecf20Sopenharmony_ci			msg = NULL;
21818c2ecf20Sopenharmony_ci		}
21828c2ecf20Sopenharmony_ci	}
21838c2ecf20Sopenharmony_ci
21848c2ecf20Sopenharmony_ci	BUG_ON(num_cap_releases != 0);
21858c2ecf20Sopenharmony_ci
21868c2ecf20Sopenharmony_ci	spin_lock(&session->s_cap_lock);
21878c2ecf20Sopenharmony_ci	if (!list_empty(&session->s_cap_releases))
21888c2ecf20Sopenharmony_ci		goto again;
21898c2ecf20Sopenharmony_ci	spin_unlock(&session->s_cap_lock);
21908c2ecf20Sopenharmony_ci
21918c2ecf20Sopenharmony_ci	if (msg) {
21928c2ecf20Sopenharmony_ci		// Append cap_barrier field
21938c2ecf20Sopenharmony_ci		cap_barrier = msg->front.iov_base + msg->front.iov_len;
21948c2ecf20Sopenharmony_ci		*cap_barrier = barrier;
21958c2ecf20Sopenharmony_ci		msg->front.iov_len += sizeof(*cap_barrier);
21968c2ecf20Sopenharmony_ci
21978c2ecf20Sopenharmony_ci		msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
21988c2ecf20Sopenharmony_ci		dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
21998c2ecf20Sopenharmony_ci		ceph_con_send(&session->s_con, msg);
22008c2ecf20Sopenharmony_ci	}
22018c2ecf20Sopenharmony_ci	return;
22028c2ecf20Sopenharmony_ciout_err:
22038c2ecf20Sopenharmony_ci	pr_err("send_cap_releases mds%d, failed to allocate message\n",
22048c2ecf20Sopenharmony_ci		session->s_mds);
22058c2ecf20Sopenharmony_ci	spin_lock(&session->s_cap_lock);
22068c2ecf20Sopenharmony_ci	list_splice(&tmp_list, &session->s_cap_releases);
22078c2ecf20Sopenharmony_ci	session->s_num_cap_releases += num_cap_releases;
22088c2ecf20Sopenharmony_ci	spin_unlock(&session->s_cap_lock);
22098c2ecf20Sopenharmony_ci}
22108c2ecf20Sopenharmony_ci
22118c2ecf20Sopenharmony_cistatic void ceph_cap_release_work(struct work_struct *work)
22128c2ecf20Sopenharmony_ci{
22138c2ecf20Sopenharmony_ci	struct ceph_mds_session *session =
22148c2ecf20Sopenharmony_ci		container_of(work, struct ceph_mds_session, s_cap_release_work);
22158c2ecf20Sopenharmony_ci
22168c2ecf20Sopenharmony_ci	mutex_lock(&session->s_mutex);
22178c2ecf20Sopenharmony_ci	if (session->s_state == CEPH_MDS_SESSION_OPEN ||
22188c2ecf20Sopenharmony_ci	    session->s_state == CEPH_MDS_SESSION_HUNG)
22198c2ecf20Sopenharmony_ci		ceph_send_cap_releases(session->s_mdsc, session);
22208c2ecf20Sopenharmony_ci	mutex_unlock(&session->s_mutex);
22218c2ecf20Sopenharmony_ci	ceph_put_mds_session(session);
22228c2ecf20Sopenharmony_ci}
22238c2ecf20Sopenharmony_ci
22248c2ecf20Sopenharmony_civoid ceph_flush_cap_releases(struct ceph_mds_client *mdsc,
22258c2ecf20Sopenharmony_ci		             struct ceph_mds_session *session)
22268c2ecf20Sopenharmony_ci{
22278c2ecf20Sopenharmony_ci	if (mdsc->stopping)
22288c2ecf20Sopenharmony_ci		return;
22298c2ecf20Sopenharmony_ci
22308c2ecf20Sopenharmony_ci	ceph_get_mds_session(session);
22318c2ecf20Sopenharmony_ci	if (queue_work(mdsc->fsc->cap_wq,
22328c2ecf20Sopenharmony_ci		       &session->s_cap_release_work)) {
22338c2ecf20Sopenharmony_ci		dout("cap release work queued\n");
22348c2ecf20Sopenharmony_ci	} else {
22358c2ecf20Sopenharmony_ci		ceph_put_mds_session(session);
22368c2ecf20Sopenharmony_ci		dout("failed to queue cap release work\n");
22378c2ecf20Sopenharmony_ci	}
22388c2ecf20Sopenharmony_ci}
22398c2ecf20Sopenharmony_ci
22408c2ecf20Sopenharmony_ci/*
22418c2ecf20Sopenharmony_ci * caller holds session->s_cap_lock
22428c2ecf20Sopenharmony_ci */
22438c2ecf20Sopenharmony_civoid __ceph_queue_cap_release(struct ceph_mds_session *session,
22448c2ecf20Sopenharmony_ci			      struct ceph_cap *cap)
22458c2ecf20Sopenharmony_ci{
22468c2ecf20Sopenharmony_ci	list_add_tail(&cap->session_caps, &session->s_cap_releases);
22478c2ecf20Sopenharmony_ci	session->s_num_cap_releases++;
22488c2ecf20Sopenharmony_ci
22498c2ecf20Sopenharmony_ci	if (!(session->s_num_cap_releases % CEPH_CAPS_PER_RELEASE))
22508c2ecf20Sopenharmony_ci		ceph_flush_cap_releases(session->s_mdsc, session);
22518c2ecf20Sopenharmony_ci}
22528c2ecf20Sopenharmony_ci
22538c2ecf20Sopenharmony_cistatic void ceph_cap_reclaim_work(struct work_struct *work)
22548c2ecf20Sopenharmony_ci{
22558c2ecf20Sopenharmony_ci	struct ceph_mds_client *mdsc =
22568c2ecf20Sopenharmony_ci		container_of(work, struct ceph_mds_client, cap_reclaim_work);
22578c2ecf20Sopenharmony_ci	int ret = ceph_trim_dentries(mdsc);
22588c2ecf20Sopenharmony_ci	if (ret == -EAGAIN)
22598c2ecf20Sopenharmony_ci		ceph_queue_cap_reclaim_work(mdsc);
22608c2ecf20Sopenharmony_ci}
22618c2ecf20Sopenharmony_ci
22628c2ecf20Sopenharmony_civoid ceph_queue_cap_reclaim_work(struct ceph_mds_client *mdsc)
22638c2ecf20Sopenharmony_ci{
22648c2ecf20Sopenharmony_ci	if (mdsc->stopping)
22658c2ecf20Sopenharmony_ci		return;
22668c2ecf20Sopenharmony_ci
22678c2ecf20Sopenharmony_ci        if (queue_work(mdsc->fsc->cap_wq, &mdsc->cap_reclaim_work)) {
22688c2ecf20Sopenharmony_ci                dout("caps reclaim work queued\n");
22698c2ecf20Sopenharmony_ci        } else {
22708c2ecf20Sopenharmony_ci                dout("failed to queue caps release work\n");
22718c2ecf20Sopenharmony_ci        }
22728c2ecf20Sopenharmony_ci}
22738c2ecf20Sopenharmony_ci
22748c2ecf20Sopenharmony_civoid ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr)
22758c2ecf20Sopenharmony_ci{
22768c2ecf20Sopenharmony_ci	int val;
22778c2ecf20Sopenharmony_ci	if (!nr)
22788c2ecf20Sopenharmony_ci		return;
22798c2ecf20Sopenharmony_ci	val = atomic_add_return(nr, &mdsc->cap_reclaim_pending);
22808c2ecf20Sopenharmony_ci	if ((val % CEPH_CAPS_PER_RELEASE) < nr) {
22818c2ecf20Sopenharmony_ci		atomic_set(&mdsc->cap_reclaim_pending, 0);
22828c2ecf20Sopenharmony_ci		ceph_queue_cap_reclaim_work(mdsc);
22838c2ecf20Sopenharmony_ci	}
22848c2ecf20Sopenharmony_ci}
22858c2ecf20Sopenharmony_ci
22868c2ecf20Sopenharmony_ci/*
22878c2ecf20Sopenharmony_ci * requests
22888c2ecf20Sopenharmony_ci */
22898c2ecf20Sopenharmony_ci
22908c2ecf20Sopenharmony_ciint ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
22918c2ecf20Sopenharmony_ci				    struct inode *dir)
22928c2ecf20Sopenharmony_ci{
22938c2ecf20Sopenharmony_ci	struct ceph_inode_info *ci = ceph_inode(dir);
22948c2ecf20Sopenharmony_ci	struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
22958c2ecf20Sopenharmony_ci	struct ceph_mount_options *opt = req->r_mdsc->fsc->mount_options;
22968c2ecf20Sopenharmony_ci	size_t size = sizeof(struct ceph_mds_reply_dir_entry);
22978c2ecf20Sopenharmony_ci	unsigned int num_entries;
22988c2ecf20Sopenharmony_ci	int order;
22998c2ecf20Sopenharmony_ci
23008c2ecf20Sopenharmony_ci	spin_lock(&ci->i_ceph_lock);
23018c2ecf20Sopenharmony_ci	num_entries = ci->i_files + ci->i_subdirs;
23028c2ecf20Sopenharmony_ci	spin_unlock(&ci->i_ceph_lock);
23038c2ecf20Sopenharmony_ci	num_entries = max(num_entries, 1U);
23048c2ecf20Sopenharmony_ci	num_entries = min(num_entries, opt->max_readdir);
23058c2ecf20Sopenharmony_ci
23068c2ecf20Sopenharmony_ci	order = get_order(size * num_entries);
23078c2ecf20Sopenharmony_ci	while (order >= 0) {
23088c2ecf20Sopenharmony_ci		rinfo->dir_entries = (void*)__get_free_pages(GFP_KERNEL |
23098c2ecf20Sopenharmony_ci							     __GFP_NOWARN,
23108c2ecf20Sopenharmony_ci							     order);
23118c2ecf20Sopenharmony_ci		if (rinfo->dir_entries)
23128c2ecf20Sopenharmony_ci			break;
23138c2ecf20Sopenharmony_ci		order--;
23148c2ecf20Sopenharmony_ci	}
23158c2ecf20Sopenharmony_ci	if (!rinfo->dir_entries)
23168c2ecf20Sopenharmony_ci		return -ENOMEM;
23178c2ecf20Sopenharmony_ci
23188c2ecf20Sopenharmony_ci	num_entries = (PAGE_SIZE << order) / size;
23198c2ecf20Sopenharmony_ci	num_entries = min(num_entries, opt->max_readdir);
23208c2ecf20Sopenharmony_ci
23218c2ecf20Sopenharmony_ci	rinfo->dir_buf_size = PAGE_SIZE << order;
23228c2ecf20Sopenharmony_ci	req->r_num_caps = num_entries + 1;
23238c2ecf20Sopenharmony_ci	req->r_args.readdir.max_entries = cpu_to_le32(num_entries);
23248c2ecf20Sopenharmony_ci	req->r_args.readdir.max_bytes = cpu_to_le32(opt->max_readdir_bytes);
23258c2ecf20Sopenharmony_ci	return 0;
23268c2ecf20Sopenharmony_ci}
23278c2ecf20Sopenharmony_ci
23288c2ecf20Sopenharmony_ci/*
23298c2ecf20Sopenharmony_ci * Create an mds request.
23308c2ecf20Sopenharmony_ci */
23318c2ecf20Sopenharmony_cistruct ceph_mds_request *
23328c2ecf20Sopenharmony_ciceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
23338c2ecf20Sopenharmony_ci{
23348c2ecf20Sopenharmony_ci	struct ceph_mds_request *req;
23358c2ecf20Sopenharmony_ci
23368c2ecf20Sopenharmony_ci	req = kmem_cache_zalloc(ceph_mds_request_cachep, GFP_NOFS);
23378c2ecf20Sopenharmony_ci	if (!req)
23388c2ecf20Sopenharmony_ci		return ERR_PTR(-ENOMEM);
23398c2ecf20Sopenharmony_ci
23408c2ecf20Sopenharmony_ci	mutex_init(&req->r_fill_mutex);
23418c2ecf20Sopenharmony_ci	req->r_mdsc = mdsc;
23428c2ecf20Sopenharmony_ci	req->r_started = jiffies;
23438c2ecf20Sopenharmony_ci	req->r_start_latency = ktime_get();
23448c2ecf20Sopenharmony_ci	req->r_resend_mds = -1;
23458c2ecf20Sopenharmony_ci	INIT_LIST_HEAD(&req->r_unsafe_dir_item);
23468c2ecf20Sopenharmony_ci	INIT_LIST_HEAD(&req->r_unsafe_target_item);
23478c2ecf20Sopenharmony_ci	req->r_fmode = -1;
23488c2ecf20Sopenharmony_ci	kref_init(&req->r_kref);
23498c2ecf20Sopenharmony_ci	RB_CLEAR_NODE(&req->r_node);
23508c2ecf20Sopenharmony_ci	INIT_LIST_HEAD(&req->r_wait);
23518c2ecf20Sopenharmony_ci	init_completion(&req->r_completion);
23528c2ecf20Sopenharmony_ci	init_completion(&req->r_safe_completion);
23538c2ecf20Sopenharmony_ci	INIT_LIST_HEAD(&req->r_unsafe_item);
23548c2ecf20Sopenharmony_ci
23558c2ecf20Sopenharmony_ci	ktime_get_coarse_real_ts64(&req->r_stamp);
23568c2ecf20Sopenharmony_ci
23578c2ecf20Sopenharmony_ci	req->r_op = op;
23588c2ecf20Sopenharmony_ci	req->r_direct_mode = mode;
23598c2ecf20Sopenharmony_ci	return req;
23608c2ecf20Sopenharmony_ci}
23618c2ecf20Sopenharmony_ci
23628c2ecf20Sopenharmony_ci/*
23638c2ecf20Sopenharmony_ci * return oldest (lowest) request, tid in request tree, 0 if none.
23648c2ecf20Sopenharmony_ci *
23658c2ecf20Sopenharmony_ci * called under mdsc->mutex.
23668c2ecf20Sopenharmony_ci */
23678c2ecf20Sopenharmony_cistatic struct ceph_mds_request *__get_oldest_req(struct ceph_mds_client *mdsc)
23688c2ecf20Sopenharmony_ci{
23698c2ecf20Sopenharmony_ci	if (RB_EMPTY_ROOT(&mdsc->request_tree))
23708c2ecf20Sopenharmony_ci		return NULL;
23718c2ecf20Sopenharmony_ci	return rb_entry(rb_first(&mdsc->request_tree),
23728c2ecf20Sopenharmony_ci			struct ceph_mds_request, r_node);
23738c2ecf20Sopenharmony_ci}
23748c2ecf20Sopenharmony_ci
23758c2ecf20Sopenharmony_cistatic inline  u64 __get_oldest_tid(struct ceph_mds_client *mdsc)
23768c2ecf20Sopenharmony_ci{
23778c2ecf20Sopenharmony_ci	return mdsc->oldest_tid;
23788c2ecf20Sopenharmony_ci}
23798c2ecf20Sopenharmony_ci
23808c2ecf20Sopenharmony_ci/*
23818c2ecf20Sopenharmony_ci * Build a dentry's path.  Allocate on heap; caller must kfree.  Based
23828c2ecf20Sopenharmony_ci * on build_path_from_dentry in fs/cifs/dir.c.
23838c2ecf20Sopenharmony_ci *
23848c2ecf20Sopenharmony_ci * If @stop_on_nosnap, generate path relative to the first non-snapped
23858c2ecf20Sopenharmony_ci * inode.
23868c2ecf20Sopenharmony_ci *
23878c2ecf20Sopenharmony_ci * Encode hidden .snap dirs as a double /, i.e.
23888c2ecf20Sopenharmony_ci *   foo/.snap/bar -> foo//bar
23898c2ecf20Sopenharmony_ci */
23908c2ecf20Sopenharmony_cichar *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *pbase,
23918c2ecf20Sopenharmony_ci			   int stop_on_nosnap)
23928c2ecf20Sopenharmony_ci{
23938c2ecf20Sopenharmony_ci	struct dentry *temp;
23948c2ecf20Sopenharmony_ci	char *path;
23958c2ecf20Sopenharmony_ci	int pos;
23968c2ecf20Sopenharmony_ci	unsigned seq;
23978c2ecf20Sopenharmony_ci	u64 base;
23988c2ecf20Sopenharmony_ci
23998c2ecf20Sopenharmony_ci	if (!dentry)
24008c2ecf20Sopenharmony_ci		return ERR_PTR(-EINVAL);
24018c2ecf20Sopenharmony_ci
24028c2ecf20Sopenharmony_ci	path = __getname();
24038c2ecf20Sopenharmony_ci	if (!path)
24048c2ecf20Sopenharmony_ci		return ERR_PTR(-ENOMEM);
24058c2ecf20Sopenharmony_ciretry:
24068c2ecf20Sopenharmony_ci	pos = PATH_MAX - 1;
24078c2ecf20Sopenharmony_ci	path[pos] = '\0';
24088c2ecf20Sopenharmony_ci
24098c2ecf20Sopenharmony_ci	seq = read_seqbegin(&rename_lock);
24108c2ecf20Sopenharmony_ci	rcu_read_lock();
24118c2ecf20Sopenharmony_ci	temp = dentry;
24128c2ecf20Sopenharmony_ci	for (;;) {
24138c2ecf20Sopenharmony_ci		struct inode *inode;
24148c2ecf20Sopenharmony_ci
24158c2ecf20Sopenharmony_ci		spin_lock(&temp->d_lock);
24168c2ecf20Sopenharmony_ci		inode = d_inode(temp);
24178c2ecf20Sopenharmony_ci		if (inode && ceph_snap(inode) == CEPH_SNAPDIR) {
24188c2ecf20Sopenharmony_ci			dout("build_path path+%d: %p SNAPDIR\n",
24198c2ecf20Sopenharmony_ci			     pos, temp);
24208c2ecf20Sopenharmony_ci		} else if (stop_on_nosnap && inode && dentry != temp &&
24218c2ecf20Sopenharmony_ci			   ceph_snap(inode) == CEPH_NOSNAP) {
24228c2ecf20Sopenharmony_ci			spin_unlock(&temp->d_lock);
24238c2ecf20Sopenharmony_ci			pos++; /* get rid of any prepended '/' */
24248c2ecf20Sopenharmony_ci			break;
24258c2ecf20Sopenharmony_ci		} else {
24268c2ecf20Sopenharmony_ci			pos -= temp->d_name.len;
24278c2ecf20Sopenharmony_ci			if (pos < 0) {
24288c2ecf20Sopenharmony_ci				spin_unlock(&temp->d_lock);
24298c2ecf20Sopenharmony_ci				break;
24308c2ecf20Sopenharmony_ci			}
24318c2ecf20Sopenharmony_ci			memcpy(path + pos, temp->d_name.name, temp->d_name.len);
24328c2ecf20Sopenharmony_ci		}
24338c2ecf20Sopenharmony_ci		spin_unlock(&temp->d_lock);
24348c2ecf20Sopenharmony_ci		temp = READ_ONCE(temp->d_parent);
24358c2ecf20Sopenharmony_ci
24368c2ecf20Sopenharmony_ci		/* Are we at the root? */
24378c2ecf20Sopenharmony_ci		if (IS_ROOT(temp))
24388c2ecf20Sopenharmony_ci			break;
24398c2ecf20Sopenharmony_ci
24408c2ecf20Sopenharmony_ci		/* Are we out of buffer? */
24418c2ecf20Sopenharmony_ci		if (--pos < 0)
24428c2ecf20Sopenharmony_ci			break;
24438c2ecf20Sopenharmony_ci
24448c2ecf20Sopenharmony_ci		path[pos] = '/';
24458c2ecf20Sopenharmony_ci	}
24468c2ecf20Sopenharmony_ci	base = ceph_ino(d_inode(temp));
24478c2ecf20Sopenharmony_ci	rcu_read_unlock();
24488c2ecf20Sopenharmony_ci
24498c2ecf20Sopenharmony_ci	if (read_seqretry(&rename_lock, seq))
24508c2ecf20Sopenharmony_ci		goto retry;
24518c2ecf20Sopenharmony_ci
24528c2ecf20Sopenharmony_ci	if (pos < 0) {
24538c2ecf20Sopenharmony_ci		/*
24548c2ecf20Sopenharmony_ci		 * A rename didn't occur, but somehow we didn't end up where
24558c2ecf20Sopenharmony_ci		 * we thought we would. Throw a warning and try again.
24568c2ecf20Sopenharmony_ci		 */
24578c2ecf20Sopenharmony_ci		pr_warn("build_path did not end path lookup where "
24588c2ecf20Sopenharmony_ci			"expected, pos is %d\n", pos);
24598c2ecf20Sopenharmony_ci		goto retry;
24608c2ecf20Sopenharmony_ci	}
24618c2ecf20Sopenharmony_ci
24628c2ecf20Sopenharmony_ci	*pbase = base;
24638c2ecf20Sopenharmony_ci	*plen = PATH_MAX - 1 - pos;
24648c2ecf20Sopenharmony_ci	dout("build_path on %p %d built %llx '%.*s'\n",
24658c2ecf20Sopenharmony_ci	     dentry, d_count(dentry), base, *plen, path + pos);
24668c2ecf20Sopenharmony_ci	return path + pos;
24678c2ecf20Sopenharmony_ci}
24688c2ecf20Sopenharmony_ci
24698c2ecf20Sopenharmony_cistatic int build_dentry_path(struct dentry *dentry, struct inode *dir,
24708c2ecf20Sopenharmony_ci			     const char **ppath, int *ppathlen, u64 *pino,
24718c2ecf20Sopenharmony_ci			     bool *pfreepath, bool parent_locked)
24728c2ecf20Sopenharmony_ci{
24738c2ecf20Sopenharmony_ci	char *path;
24748c2ecf20Sopenharmony_ci
24758c2ecf20Sopenharmony_ci	rcu_read_lock();
24768c2ecf20Sopenharmony_ci	if (!dir)
24778c2ecf20Sopenharmony_ci		dir = d_inode_rcu(dentry->d_parent);
24788c2ecf20Sopenharmony_ci	if (dir && parent_locked && ceph_snap(dir) == CEPH_NOSNAP) {
24798c2ecf20Sopenharmony_ci		*pino = ceph_ino(dir);
24808c2ecf20Sopenharmony_ci		rcu_read_unlock();
24818c2ecf20Sopenharmony_ci		*ppath = dentry->d_name.name;
24828c2ecf20Sopenharmony_ci		*ppathlen = dentry->d_name.len;
24838c2ecf20Sopenharmony_ci		return 0;
24848c2ecf20Sopenharmony_ci	}
24858c2ecf20Sopenharmony_ci	rcu_read_unlock();
24868c2ecf20Sopenharmony_ci	path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
24878c2ecf20Sopenharmony_ci	if (IS_ERR(path))
24888c2ecf20Sopenharmony_ci		return PTR_ERR(path);
24898c2ecf20Sopenharmony_ci	*ppath = path;
24908c2ecf20Sopenharmony_ci	*pfreepath = true;
24918c2ecf20Sopenharmony_ci	return 0;
24928c2ecf20Sopenharmony_ci}
24938c2ecf20Sopenharmony_ci
24948c2ecf20Sopenharmony_cistatic int build_inode_path(struct inode *inode,
24958c2ecf20Sopenharmony_ci			    const char **ppath, int *ppathlen, u64 *pino,
24968c2ecf20Sopenharmony_ci			    bool *pfreepath)
24978c2ecf20Sopenharmony_ci{
24988c2ecf20Sopenharmony_ci	struct dentry *dentry;
24998c2ecf20Sopenharmony_ci	char *path;
25008c2ecf20Sopenharmony_ci
25018c2ecf20Sopenharmony_ci	if (ceph_snap(inode) == CEPH_NOSNAP) {
25028c2ecf20Sopenharmony_ci		*pino = ceph_ino(inode);
25038c2ecf20Sopenharmony_ci		*ppathlen = 0;
25048c2ecf20Sopenharmony_ci		return 0;
25058c2ecf20Sopenharmony_ci	}
25068c2ecf20Sopenharmony_ci	dentry = d_find_alias(inode);
25078c2ecf20Sopenharmony_ci	path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
25088c2ecf20Sopenharmony_ci	dput(dentry);
25098c2ecf20Sopenharmony_ci	if (IS_ERR(path))
25108c2ecf20Sopenharmony_ci		return PTR_ERR(path);
25118c2ecf20Sopenharmony_ci	*ppath = path;
25128c2ecf20Sopenharmony_ci	*pfreepath = true;
25138c2ecf20Sopenharmony_ci	return 0;
25148c2ecf20Sopenharmony_ci}
25158c2ecf20Sopenharmony_ci
25168c2ecf20Sopenharmony_ci/*
25178c2ecf20Sopenharmony_ci * request arguments may be specified via an inode *, a dentry *, or
25188c2ecf20Sopenharmony_ci * an explicit ino+path.
25198c2ecf20Sopenharmony_ci */
25208c2ecf20Sopenharmony_cistatic int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
25218c2ecf20Sopenharmony_ci				  struct inode *rdiri, const char *rpath,
25228c2ecf20Sopenharmony_ci				  u64 rino, const char **ppath, int *pathlen,
25238c2ecf20Sopenharmony_ci				  u64 *ino, bool *freepath, bool parent_locked)
25248c2ecf20Sopenharmony_ci{
25258c2ecf20Sopenharmony_ci	int r = 0;
25268c2ecf20Sopenharmony_ci
25278c2ecf20Sopenharmony_ci	if (rinode) {
25288c2ecf20Sopenharmony_ci		r = build_inode_path(rinode, ppath, pathlen, ino, freepath);
25298c2ecf20Sopenharmony_ci		dout(" inode %p %llx.%llx\n", rinode, ceph_ino(rinode),
25308c2ecf20Sopenharmony_ci		     ceph_snap(rinode));
25318c2ecf20Sopenharmony_ci	} else if (rdentry) {
25328c2ecf20Sopenharmony_ci		r = build_dentry_path(rdentry, rdiri, ppath, pathlen, ino,
25338c2ecf20Sopenharmony_ci					freepath, parent_locked);
25348c2ecf20Sopenharmony_ci		dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen,
25358c2ecf20Sopenharmony_ci		     *ppath);
25368c2ecf20Sopenharmony_ci	} else if (rpath || rino) {
25378c2ecf20Sopenharmony_ci		*ino = rino;
25388c2ecf20Sopenharmony_ci		*ppath = rpath;
25398c2ecf20Sopenharmony_ci		*pathlen = rpath ? strlen(rpath) : 0;
25408c2ecf20Sopenharmony_ci		dout(" path %.*s\n", *pathlen, rpath);
25418c2ecf20Sopenharmony_ci	}
25428c2ecf20Sopenharmony_ci
25438c2ecf20Sopenharmony_ci	return r;
25448c2ecf20Sopenharmony_ci}
25458c2ecf20Sopenharmony_ci
25468c2ecf20Sopenharmony_ci/*
25478c2ecf20Sopenharmony_ci * called under mdsc->mutex
25488c2ecf20Sopenharmony_ci */
25498c2ecf20Sopenharmony_cistatic struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
25508c2ecf20Sopenharmony_ci					       struct ceph_mds_request *req,
25518c2ecf20Sopenharmony_ci					       int mds, bool drop_cap_releases)
25528c2ecf20Sopenharmony_ci{
25538c2ecf20Sopenharmony_ci	struct ceph_msg *msg;
25548c2ecf20Sopenharmony_ci	struct ceph_mds_request_head *head;
25558c2ecf20Sopenharmony_ci	const char *path1 = NULL;
25568c2ecf20Sopenharmony_ci	const char *path2 = NULL;
25578c2ecf20Sopenharmony_ci	u64 ino1 = 0, ino2 = 0;
25588c2ecf20Sopenharmony_ci	int pathlen1 = 0, pathlen2 = 0;
25598c2ecf20Sopenharmony_ci	bool freepath1 = false, freepath2 = false;
25608c2ecf20Sopenharmony_ci	int len;
25618c2ecf20Sopenharmony_ci	u16 releases;
25628c2ecf20Sopenharmony_ci	void *p, *end;
25638c2ecf20Sopenharmony_ci	int ret;
25648c2ecf20Sopenharmony_ci
25658c2ecf20Sopenharmony_ci	ret = set_request_path_attr(req->r_inode, req->r_dentry,
25668c2ecf20Sopenharmony_ci			      req->r_parent, req->r_path1, req->r_ino1.ino,
25678c2ecf20Sopenharmony_ci			      &path1, &pathlen1, &ino1, &freepath1,
25688c2ecf20Sopenharmony_ci			      test_bit(CEPH_MDS_R_PARENT_LOCKED,
25698c2ecf20Sopenharmony_ci					&req->r_req_flags));
25708c2ecf20Sopenharmony_ci	if (ret < 0) {
25718c2ecf20Sopenharmony_ci		msg = ERR_PTR(ret);
25728c2ecf20Sopenharmony_ci		goto out;
25738c2ecf20Sopenharmony_ci	}
25748c2ecf20Sopenharmony_ci
25758c2ecf20Sopenharmony_ci	/* If r_old_dentry is set, then assume that its parent is locked */
25768c2ecf20Sopenharmony_ci	ret = set_request_path_attr(NULL, req->r_old_dentry,
25778c2ecf20Sopenharmony_ci			      req->r_old_dentry_dir,
25788c2ecf20Sopenharmony_ci			      req->r_path2, req->r_ino2.ino,
25798c2ecf20Sopenharmony_ci			      &path2, &pathlen2, &ino2, &freepath2, true);
25808c2ecf20Sopenharmony_ci	if (ret < 0) {
25818c2ecf20Sopenharmony_ci		msg = ERR_PTR(ret);
25828c2ecf20Sopenharmony_ci		goto out_free1;
25838c2ecf20Sopenharmony_ci	}
25848c2ecf20Sopenharmony_ci
25858c2ecf20Sopenharmony_ci	len = sizeof(*head) +
25868c2ecf20Sopenharmony_ci		pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64)) +
25878c2ecf20Sopenharmony_ci		sizeof(struct ceph_timespec);
25888c2ecf20Sopenharmony_ci
25898c2ecf20Sopenharmony_ci	/* calculate (max) length for cap releases */
25908c2ecf20Sopenharmony_ci	len += sizeof(struct ceph_mds_request_release) *
25918c2ecf20Sopenharmony_ci		(!!req->r_inode_drop + !!req->r_dentry_drop +
25928c2ecf20Sopenharmony_ci		 !!req->r_old_inode_drop + !!req->r_old_dentry_drop);
25938c2ecf20Sopenharmony_ci	if (req->r_dentry_drop)
25948c2ecf20Sopenharmony_ci		len += pathlen1;
25958c2ecf20Sopenharmony_ci	if (req->r_old_dentry_drop)
25968c2ecf20Sopenharmony_ci		len += pathlen2;
25978c2ecf20Sopenharmony_ci
25988c2ecf20Sopenharmony_ci	msg = ceph_msg_new2(CEPH_MSG_CLIENT_REQUEST, len, 1, GFP_NOFS, false);
25998c2ecf20Sopenharmony_ci	if (!msg) {
26008c2ecf20Sopenharmony_ci		msg = ERR_PTR(-ENOMEM);
26018c2ecf20Sopenharmony_ci		goto out_free2;
26028c2ecf20Sopenharmony_ci	}
26038c2ecf20Sopenharmony_ci
26048c2ecf20Sopenharmony_ci	msg->hdr.version = cpu_to_le16(2);
26058c2ecf20Sopenharmony_ci	msg->hdr.tid = cpu_to_le64(req->r_tid);
26068c2ecf20Sopenharmony_ci
26078c2ecf20Sopenharmony_ci	head = msg->front.iov_base;
26088c2ecf20Sopenharmony_ci	p = msg->front.iov_base + sizeof(*head);
26098c2ecf20Sopenharmony_ci	end = msg->front.iov_base + msg->front.iov_len;
26108c2ecf20Sopenharmony_ci
26118c2ecf20Sopenharmony_ci	head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
26128c2ecf20Sopenharmony_ci	head->op = cpu_to_le32(req->r_op);
26138c2ecf20Sopenharmony_ci	head->caller_uid = cpu_to_le32(from_kuid(&init_user_ns, req->r_uid));
26148c2ecf20Sopenharmony_ci	head->caller_gid = cpu_to_le32(from_kgid(&init_user_ns, req->r_gid));
26158c2ecf20Sopenharmony_ci	head->ino = cpu_to_le64(req->r_deleg_ino);
26168c2ecf20Sopenharmony_ci	head->args = req->r_args;
26178c2ecf20Sopenharmony_ci
26188c2ecf20Sopenharmony_ci	ceph_encode_filepath(&p, end, ino1, path1);
26198c2ecf20Sopenharmony_ci	ceph_encode_filepath(&p, end, ino2, path2);
26208c2ecf20Sopenharmony_ci
26218c2ecf20Sopenharmony_ci	/* make note of release offset, in case we need to replay */
26228c2ecf20Sopenharmony_ci	req->r_request_release_offset = p - msg->front.iov_base;
26238c2ecf20Sopenharmony_ci
26248c2ecf20Sopenharmony_ci	/* cap releases */
26258c2ecf20Sopenharmony_ci	releases = 0;
26268c2ecf20Sopenharmony_ci	if (req->r_inode_drop)
26278c2ecf20Sopenharmony_ci		releases += ceph_encode_inode_release(&p,
26288c2ecf20Sopenharmony_ci		      req->r_inode ? req->r_inode : d_inode(req->r_dentry),
26298c2ecf20Sopenharmony_ci		      mds, req->r_inode_drop, req->r_inode_unless,
26308c2ecf20Sopenharmony_ci		      req->r_op == CEPH_MDS_OP_READDIR);
26318c2ecf20Sopenharmony_ci	if (req->r_dentry_drop)
26328c2ecf20Sopenharmony_ci		releases += ceph_encode_dentry_release(&p, req->r_dentry,
26338c2ecf20Sopenharmony_ci				req->r_parent, mds, req->r_dentry_drop,
26348c2ecf20Sopenharmony_ci				req->r_dentry_unless);
26358c2ecf20Sopenharmony_ci	if (req->r_old_dentry_drop)
26368c2ecf20Sopenharmony_ci		releases += ceph_encode_dentry_release(&p, req->r_old_dentry,
26378c2ecf20Sopenharmony_ci				req->r_old_dentry_dir, mds,
26388c2ecf20Sopenharmony_ci				req->r_old_dentry_drop,
26398c2ecf20Sopenharmony_ci				req->r_old_dentry_unless);
26408c2ecf20Sopenharmony_ci	if (req->r_old_inode_drop)
26418c2ecf20Sopenharmony_ci		releases += ceph_encode_inode_release(&p,
26428c2ecf20Sopenharmony_ci		      d_inode(req->r_old_dentry),
26438c2ecf20Sopenharmony_ci		      mds, req->r_old_inode_drop, req->r_old_inode_unless, 0);
26448c2ecf20Sopenharmony_ci
26458c2ecf20Sopenharmony_ci	if (drop_cap_releases) {
26468c2ecf20Sopenharmony_ci		releases = 0;
26478c2ecf20Sopenharmony_ci		p = msg->front.iov_base + req->r_request_release_offset;
26488c2ecf20Sopenharmony_ci	}
26498c2ecf20Sopenharmony_ci
26508c2ecf20Sopenharmony_ci	head->num_releases = cpu_to_le16(releases);
26518c2ecf20Sopenharmony_ci
26528c2ecf20Sopenharmony_ci	/* time stamp */
26538c2ecf20Sopenharmony_ci	{
26548c2ecf20Sopenharmony_ci		struct ceph_timespec ts;
26558c2ecf20Sopenharmony_ci		ceph_encode_timespec64(&ts, &req->r_stamp);
26568c2ecf20Sopenharmony_ci		ceph_encode_copy(&p, &ts, sizeof(ts));
26578c2ecf20Sopenharmony_ci	}
26588c2ecf20Sopenharmony_ci
26598c2ecf20Sopenharmony_ci	if (WARN_ON_ONCE(p > end)) {
26608c2ecf20Sopenharmony_ci		ceph_msg_put(msg);
26618c2ecf20Sopenharmony_ci		msg = ERR_PTR(-ERANGE);
26628c2ecf20Sopenharmony_ci		goto out_free2;
26638c2ecf20Sopenharmony_ci	}
26648c2ecf20Sopenharmony_ci
26658c2ecf20Sopenharmony_ci	msg->front.iov_len = p - msg->front.iov_base;
26668c2ecf20Sopenharmony_ci	msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
26678c2ecf20Sopenharmony_ci
26688c2ecf20Sopenharmony_ci	if (req->r_pagelist) {
26698c2ecf20Sopenharmony_ci		struct ceph_pagelist *pagelist = req->r_pagelist;
26708c2ecf20Sopenharmony_ci		ceph_msg_data_add_pagelist(msg, pagelist);
26718c2ecf20Sopenharmony_ci		msg->hdr.data_len = cpu_to_le32(pagelist->length);
26728c2ecf20Sopenharmony_ci	} else {
26738c2ecf20Sopenharmony_ci		msg->hdr.data_len = 0;
26748c2ecf20Sopenharmony_ci	}
26758c2ecf20Sopenharmony_ci
26768c2ecf20Sopenharmony_ci	msg->hdr.data_off = cpu_to_le16(0);
26778c2ecf20Sopenharmony_ci
26788c2ecf20Sopenharmony_ciout_free2:
26798c2ecf20Sopenharmony_ci	if (freepath2)
26808c2ecf20Sopenharmony_ci		ceph_mdsc_free_path((char *)path2, pathlen2);
26818c2ecf20Sopenharmony_ciout_free1:
26828c2ecf20Sopenharmony_ci	if (freepath1)
26838c2ecf20Sopenharmony_ci		ceph_mdsc_free_path((char *)path1, pathlen1);
26848c2ecf20Sopenharmony_ciout:
26858c2ecf20Sopenharmony_ci	return msg;
26868c2ecf20Sopenharmony_ci}
26878c2ecf20Sopenharmony_ci
26888c2ecf20Sopenharmony_ci/*
26898c2ecf20Sopenharmony_ci * called under mdsc->mutex if error, under no mutex if
26908c2ecf20Sopenharmony_ci * success.
26918c2ecf20Sopenharmony_ci */
26928c2ecf20Sopenharmony_cistatic void complete_request(struct ceph_mds_client *mdsc,
26938c2ecf20Sopenharmony_ci			     struct ceph_mds_request *req)
26948c2ecf20Sopenharmony_ci{
26958c2ecf20Sopenharmony_ci	req->r_end_latency = ktime_get();
26968c2ecf20Sopenharmony_ci
26978c2ecf20Sopenharmony_ci	if (req->r_callback)
26988c2ecf20Sopenharmony_ci		req->r_callback(mdsc, req);
26998c2ecf20Sopenharmony_ci	complete_all(&req->r_completion);
27008c2ecf20Sopenharmony_ci}
27018c2ecf20Sopenharmony_ci
27028c2ecf20Sopenharmony_ci/*
27038c2ecf20Sopenharmony_ci * called under mdsc->mutex
27048c2ecf20Sopenharmony_ci */
27058c2ecf20Sopenharmony_cistatic int __prepare_send_request(struct ceph_mds_client *mdsc,
27068c2ecf20Sopenharmony_ci				  struct ceph_mds_request *req,
27078c2ecf20Sopenharmony_ci				  int mds, bool drop_cap_releases)
27088c2ecf20Sopenharmony_ci{
27098c2ecf20Sopenharmony_ci	struct ceph_mds_request_head *rhead;
27108c2ecf20Sopenharmony_ci	struct ceph_msg *msg;
27118c2ecf20Sopenharmony_ci	int flags = 0;
27128c2ecf20Sopenharmony_ci
27138c2ecf20Sopenharmony_ci	req->r_attempts++;
27148c2ecf20Sopenharmony_ci	if (req->r_inode) {
27158c2ecf20Sopenharmony_ci		struct ceph_cap *cap =
27168c2ecf20Sopenharmony_ci			ceph_get_cap_for_mds(ceph_inode(req->r_inode), mds);
27178c2ecf20Sopenharmony_ci
27188c2ecf20Sopenharmony_ci		if (cap)
27198c2ecf20Sopenharmony_ci			req->r_sent_on_mseq = cap->mseq;
27208c2ecf20Sopenharmony_ci		else
27218c2ecf20Sopenharmony_ci			req->r_sent_on_mseq = -1;
27228c2ecf20Sopenharmony_ci	}
27238c2ecf20Sopenharmony_ci	dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req,
27248c2ecf20Sopenharmony_ci	     req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts);
27258c2ecf20Sopenharmony_ci
27268c2ecf20Sopenharmony_ci	if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
27278c2ecf20Sopenharmony_ci		void *p;
27288c2ecf20Sopenharmony_ci		/*
27298c2ecf20Sopenharmony_ci		 * Replay.  Do not regenerate message (and rebuild
27308c2ecf20Sopenharmony_ci		 * paths, etc.); just use the original message.
27318c2ecf20Sopenharmony_ci		 * Rebuilding paths will break for renames because
27328c2ecf20Sopenharmony_ci		 * d_move mangles the src name.
27338c2ecf20Sopenharmony_ci		 */
27348c2ecf20Sopenharmony_ci		msg = req->r_request;
27358c2ecf20Sopenharmony_ci		rhead = msg->front.iov_base;
27368c2ecf20Sopenharmony_ci
27378c2ecf20Sopenharmony_ci		flags = le32_to_cpu(rhead->flags);
27388c2ecf20Sopenharmony_ci		flags |= CEPH_MDS_FLAG_REPLAY;
27398c2ecf20Sopenharmony_ci		rhead->flags = cpu_to_le32(flags);
27408c2ecf20Sopenharmony_ci
27418c2ecf20Sopenharmony_ci		if (req->r_target_inode)
27428c2ecf20Sopenharmony_ci			rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));
27438c2ecf20Sopenharmony_ci
27448c2ecf20Sopenharmony_ci		rhead->num_retry = req->r_attempts - 1;
27458c2ecf20Sopenharmony_ci
27468c2ecf20Sopenharmony_ci		/* remove cap/dentry releases from message */
27478c2ecf20Sopenharmony_ci		rhead->num_releases = 0;
27488c2ecf20Sopenharmony_ci
27498c2ecf20Sopenharmony_ci		/* time stamp */
27508c2ecf20Sopenharmony_ci		p = msg->front.iov_base + req->r_request_release_offset;
27518c2ecf20Sopenharmony_ci		{
27528c2ecf20Sopenharmony_ci			struct ceph_timespec ts;
27538c2ecf20Sopenharmony_ci			ceph_encode_timespec64(&ts, &req->r_stamp);
27548c2ecf20Sopenharmony_ci			ceph_encode_copy(&p, &ts, sizeof(ts));
27558c2ecf20Sopenharmony_ci		}
27568c2ecf20Sopenharmony_ci
27578c2ecf20Sopenharmony_ci		msg->front.iov_len = p - msg->front.iov_base;
27588c2ecf20Sopenharmony_ci		msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
27598c2ecf20Sopenharmony_ci		return 0;
27608c2ecf20Sopenharmony_ci	}
27618c2ecf20Sopenharmony_ci
27628c2ecf20Sopenharmony_ci	if (req->r_request) {
27638c2ecf20Sopenharmony_ci		ceph_msg_put(req->r_request);
27648c2ecf20Sopenharmony_ci		req->r_request = NULL;
27658c2ecf20Sopenharmony_ci	}
27668c2ecf20Sopenharmony_ci	msg = create_request_message(mdsc, req, mds, drop_cap_releases);
27678c2ecf20Sopenharmony_ci	if (IS_ERR(msg)) {
27688c2ecf20Sopenharmony_ci		req->r_err = PTR_ERR(msg);
27698c2ecf20Sopenharmony_ci		return PTR_ERR(msg);
27708c2ecf20Sopenharmony_ci	}
27718c2ecf20Sopenharmony_ci	req->r_request = msg;
27728c2ecf20Sopenharmony_ci
27738c2ecf20Sopenharmony_ci	rhead = msg->front.iov_base;
27748c2ecf20Sopenharmony_ci	rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc));
27758c2ecf20Sopenharmony_ci	if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags))
27768c2ecf20Sopenharmony_ci		flags |= CEPH_MDS_FLAG_REPLAY;
27778c2ecf20Sopenharmony_ci	if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags))
27788c2ecf20Sopenharmony_ci		flags |= CEPH_MDS_FLAG_ASYNC;
27798c2ecf20Sopenharmony_ci	if (req->r_parent)
27808c2ecf20Sopenharmony_ci		flags |= CEPH_MDS_FLAG_WANT_DENTRY;
27818c2ecf20Sopenharmony_ci	rhead->flags = cpu_to_le32(flags);
27828c2ecf20Sopenharmony_ci	rhead->num_fwd = req->r_num_fwd;
27838c2ecf20Sopenharmony_ci	rhead->num_retry = req->r_attempts - 1;
27848c2ecf20Sopenharmony_ci
27858c2ecf20Sopenharmony_ci	dout(" r_parent = %p\n", req->r_parent);
27868c2ecf20Sopenharmony_ci	return 0;
27878c2ecf20Sopenharmony_ci}
27888c2ecf20Sopenharmony_ci
27898c2ecf20Sopenharmony_ci/*
27908c2ecf20Sopenharmony_ci * called under mdsc->mutex
27918c2ecf20Sopenharmony_ci */
27928c2ecf20Sopenharmony_cistatic int __send_request(struct ceph_mds_client *mdsc,
27938c2ecf20Sopenharmony_ci			  struct ceph_mds_session *session,
27948c2ecf20Sopenharmony_ci			  struct ceph_mds_request *req,
27958c2ecf20Sopenharmony_ci			  bool drop_cap_releases)
27968c2ecf20Sopenharmony_ci{
27978c2ecf20Sopenharmony_ci	int err;
27988c2ecf20Sopenharmony_ci
27998c2ecf20Sopenharmony_ci	err = __prepare_send_request(mdsc, req, session->s_mds,
28008c2ecf20Sopenharmony_ci				     drop_cap_releases);
28018c2ecf20Sopenharmony_ci	if (!err) {
28028c2ecf20Sopenharmony_ci		ceph_msg_get(req->r_request);
28038c2ecf20Sopenharmony_ci		ceph_con_send(&session->s_con, req->r_request);
28048c2ecf20Sopenharmony_ci	}
28058c2ecf20Sopenharmony_ci
28068c2ecf20Sopenharmony_ci	return err;
28078c2ecf20Sopenharmony_ci}
28088c2ecf20Sopenharmony_ci
28098c2ecf20Sopenharmony_ci/*
28108c2ecf20Sopenharmony_ci * send request, or put it on the appropriate wait list.
28118c2ecf20Sopenharmony_ci */
28128c2ecf20Sopenharmony_cistatic void __do_request(struct ceph_mds_client *mdsc,
28138c2ecf20Sopenharmony_ci			struct ceph_mds_request *req)
28148c2ecf20Sopenharmony_ci{
28158c2ecf20Sopenharmony_ci	struct ceph_mds_session *session = NULL;
28168c2ecf20Sopenharmony_ci	int mds = -1;
28178c2ecf20Sopenharmony_ci	int err = 0;
28188c2ecf20Sopenharmony_ci	bool random;
28198c2ecf20Sopenharmony_ci
28208c2ecf20Sopenharmony_ci	if (req->r_err || test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) {
28218c2ecf20Sopenharmony_ci		if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags))
28228c2ecf20Sopenharmony_ci			__unregister_request(mdsc, req);
28238c2ecf20Sopenharmony_ci		return;
28248c2ecf20Sopenharmony_ci	}
28258c2ecf20Sopenharmony_ci
28268c2ecf20Sopenharmony_ci	if (req->r_timeout &&
28278c2ecf20Sopenharmony_ci	    time_after_eq(jiffies, req->r_started + req->r_timeout)) {
28288c2ecf20Sopenharmony_ci		dout("do_request timed out\n");
28298c2ecf20Sopenharmony_ci		err = -ETIMEDOUT;
28308c2ecf20Sopenharmony_ci		goto finish;
28318c2ecf20Sopenharmony_ci	}
28328c2ecf20Sopenharmony_ci	if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
28338c2ecf20Sopenharmony_ci		dout("do_request forced umount\n");
28348c2ecf20Sopenharmony_ci		err = -EIO;
28358c2ecf20Sopenharmony_ci		goto finish;
28368c2ecf20Sopenharmony_ci	}
28378c2ecf20Sopenharmony_ci	if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_MOUNTING) {
28388c2ecf20Sopenharmony_ci		if (mdsc->mdsmap_err) {
28398c2ecf20Sopenharmony_ci			err = mdsc->mdsmap_err;
28408c2ecf20Sopenharmony_ci			dout("do_request mdsmap err %d\n", err);
28418c2ecf20Sopenharmony_ci			goto finish;
28428c2ecf20Sopenharmony_ci		}
28438c2ecf20Sopenharmony_ci		if (mdsc->mdsmap->m_epoch == 0) {
28448c2ecf20Sopenharmony_ci			dout("do_request no mdsmap, waiting for map\n");
28458c2ecf20Sopenharmony_ci			list_add(&req->r_wait, &mdsc->waiting_for_map);
28468c2ecf20Sopenharmony_ci			return;
28478c2ecf20Sopenharmony_ci		}
28488c2ecf20Sopenharmony_ci		if (!(mdsc->fsc->mount_options->flags &
28498c2ecf20Sopenharmony_ci		      CEPH_MOUNT_OPT_MOUNTWAIT) &&
28508c2ecf20Sopenharmony_ci		    !ceph_mdsmap_is_cluster_available(mdsc->mdsmap)) {
28518c2ecf20Sopenharmony_ci			err = -EHOSTUNREACH;
28528c2ecf20Sopenharmony_ci			goto finish;
28538c2ecf20Sopenharmony_ci		}
28548c2ecf20Sopenharmony_ci	}
28558c2ecf20Sopenharmony_ci
28568c2ecf20Sopenharmony_ci	put_request_session(req);
28578c2ecf20Sopenharmony_ci
28588c2ecf20Sopenharmony_ci	mds = __choose_mds(mdsc, req, &random);
28598c2ecf20Sopenharmony_ci	if (mds < 0 ||
28608c2ecf20Sopenharmony_ci	    ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) {
28618c2ecf20Sopenharmony_ci		if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags)) {
28628c2ecf20Sopenharmony_ci			err = -EJUKEBOX;
28638c2ecf20Sopenharmony_ci			goto finish;
28648c2ecf20Sopenharmony_ci		}
28658c2ecf20Sopenharmony_ci		dout("do_request no mds or not active, waiting for map\n");
28668c2ecf20Sopenharmony_ci		list_add(&req->r_wait, &mdsc->waiting_for_map);
28678c2ecf20Sopenharmony_ci		return;
28688c2ecf20Sopenharmony_ci	}
28698c2ecf20Sopenharmony_ci
28708c2ecf20Sopenharmony_ci	/* get, open session */
28718c2ecf20Sopenharmony_ci	session = __ceph_lookup_mds_session(mdsc, mds);
28728c2ecf20Sopenharmony_ci	if (!session) {
28738c2ecf20Sopenharmony_ci		session = register_session(mdsc, mds);
28748c2ecf20Sopenharmony_ci		if (IS_ERR(session)) {
28758c2ecf20Sopenharmony_ci			err = PTR_ERR(session);
28768c2ecf20Sopenharmony_ci			goto finish;
28778c2ecf20Sopenharmony_ci		}
28788c2ecf20Sopenharmony_ci	}
28798c2ecf20Sopenharmony_ci	req->r_session = ceph_get_mds_session(session);
28808c2ecf20Sopenharmony_ci
28818c2ecf20Sopenharmony_ci	dout("do_request mds%d session %p state %s\n", mds, session,
28828c2ecf20Sopenharmony_ci	     ceph_session_state_name(session->s_state));
28838c2ecf20Sopenharmony_ci	if (session->s_state != CEPH_MDS_SESSION_OPEN &&
28848c2ecf20Sopenharmony_ci	    session->s_state != CEPH_MDS_SESSION_HUNG) {
28858c2ecf20Sopenharmony_ci		if (session->s_state == CEPH_MDS_SESSION_REJECTED) {
28868c2ecf20Sopenharmony_ci			err = -EACCES;
28878c2ecf20Sopenharmony_ci			goto out_session;
28888c2ecf20Sopenharmony_ci		}
28898c2ecf20Sopenharmony_ci		/*
28908c2ecf20Sopenharmony_ci		 * We cannot queue async requests since the caps and delegated
28918c2ecf20Sopenharmony_ci		 * inodes are bound to the session. Just return -EJUKEBOX and
28928c2ecf20Sopenharmony_ci		 * let the caller retry a sync request in that case.
28938c2ecf20Sopenharmony_ci		 */
28948c2ecf20Sopenharmony_ci		if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags)) {
28958c2ecf20Sopenharmony_ci			err = -EJUKEBOX;
28968c2ecf20Sopenharmony_ci			goto out_session;
28978c2ecf20Sopenharmony_ci		}
28988c2ecf20Sopenharmony_ci		if (session->s_state == CEPH_MDS_SESSION_NEW ||
28998c2ecf20Sopenharmony_ci		    session->s_state == CEPH_MDS_SESSION_CLOSING) {
29008c2ecf20Sopenharmony_ci			err = __open_session(mdsc, session);
29018c2ecf20Sopenharmony_ci			if (err)
29028c2ecf20Sopenharmony_ci				goto out_session;
29038c2ecf20Sopenharmony_ci			/* retry the same mds later */
29048c2ecf20Sopenharmony_ci			if (random)
29058c2ecf20Sopenharmony_ci				req->r_resend_mds = mds;
29068c2ecf20Sopenharmony_ci		}
29078c2ecf20Sopenharmony_ci		list_add(&req->r_wait, &session->s_waiting);
29088c2ecf20Sopenharmony_ci		goto out_session;
29098c2ecf20Sopenharmony_ci	}
29108c2ecf20Sopenharmony_ci
29118c2ecf20Sopenharmony_ci	/* send request */
29128c2ecf20Sopenharmony_ci	req->r_resend_mds = -1;   /* forget any previous mds hint */
29138c2ecf20Sopenharmony_ci
29148c2ecf20Sopenharmony_ci	if (req->r_request_started == 0)   /* note request start time */
29158c2ecf20Sopenharmony_ci		req->r_request_started = jiffies;
29168c2ecf20Sopenharmony_ci
29178c2ecf20Sopenharmony_ci	err = __send_request(mdsc, session, req, false);
29188c2ecf20Sopenharmony_ci
29198c2ecf20Sopenharmony_ciout_session:
29208c2ecf20Sopenharmony_ci	ceph_put_mds_session(session);
29218c2ecf20Sopenharmony_cifinish:
29228c2ecf20Sopenharmony_ci	if (err) {
29238c2ecf20Sopenharmony_ci		dout("__do_request early error %d\n", err);
29248c2ecf20Sopenharmony_ci		req->r_err = err;
29258c2ecf20Sopenharmony_ci		complete_request(mdsc, req);
29268c2ecf20Sopenharmony_ci		__unregister_request(mdsc, req);
29278c2ecf20Sopenharmony_ci	}
29288c2ecf20Sopenharmony_ci	return;
29298c2ecf20Sopenharmony_ci}
29308c2ecf20Sopenharmony_ci
29318c2ecf20Sopenharmony_ci/*
29328c2ecf20Sopenharmony_ci * called under mdsc->mutex
29338c2ecf20Sopenharmony_ci */
29348c2ecf20Sopenharmony_cistatic void __wake_requests(struct ceph_mds_client *mdsc,
29358c2ecf20Sopenharmony_ci			    struct list_head *head)
29368c2ecf20Sopenharmony_ci{
29378c2ecf20Sopenharmony_ci	struct ceph_mds_request *req;
29388c2ecf20Sopenharmony_ci	LIST_HEAD(tmp_list);
29398c2ecf20Sopenharmony_ci
29408c2ecf20Sopenharmony_ci	list_splice_init(head, &tmp_list);
29418c2ecf20Sopenharmony_ci
29428c2ecf20Sopenharmony_ci	while (!list_empty(&tmp_list)) {
29438c2ecf20Sopenharmony_ci		req = list_entry(tmp_list.next,
29448c2ecf20Sopenharmony_ci				 struct ceph_mds_request, r_wait);
29458c2ecf20Sopenharmony_ci		list_del_init(&req->r_wait);
29468c2ecf20Sopenharmony_ci		dout(" wake request %p tid %llu\n", req, req->r_tid);
29478c2ecf20Sopenharmony_ci		__do_request(mdsc, req);
29488c2ecf20Sopenharmony_ci	}
29498c2ecf20Sopenharmony_ci}
29508c2ecf20Sopenharmony_ci
29518c2ecf20Sopenharmony_ci/*
29528c2ecf20Sopenharmony_ci * Wake up threads with requests pending for @mds, so that they can
29538c2ecf20Sopenharmony_ci * resubmit their requests to a possibly different mds.
29548c2ecf20Sopenharmony_ci */
29558c2ecf20Sopenharmony_cistatic void kick_requests(struct ceph_mds_client *mdsc, int mds)
29568c2ecf20Sopenharmony_ci{
29578c2ecf20Sopenharmony_ci	struct ceph_mds_request *req;
29588c2ecf20Sopenharmony_ci	struct rb_node *p = rb_first(&mdsc->request_tree);
29598c2ecf20Sopenharmony_ci
29608c2ecf20Sopenharmony_ci	dout("kick_requests mds%d\n", mds);
29618c2ecf20Sopenharmony_ci	while (p) {
29628c2ecf20Sopenharmony_ci		req = rb_entry(p, struct ceph_mds_request, r_node);
29638c2ecf20Sopenharmony_ci		p = rb_next(p);
29648c2ecf20Sopenharmony_ci		if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags))
29658c2ecf20Sopenharmony_ci			continue;
29668c2ecf20Sopenharmony_ci		if (req->r_attempts > 0)
29678c2ecf20Sopenharmony_ci			continue; /* only new requests */
29688c2ecf20Sopenharmony_ci		if (req->r_session &&
29698c2ecf20Sopenharmony_ci		    req->r_session->s_mds == mds) {
29708c2ecf20Sopenharmony_ci			dout(" kicking tid %llu\n", req->r_tid);
29718c2ecf20Sopenharmony_ci			list_del_init(&req->r_wait);
29728c2ecf20Sopenharmony_ci			__do_request(mdsc, req);
29738c2ecf20Sopenharmony_ci		}
29748c2ecf20Sopenharmony_ci	}
29758c2ecf20Sopenharmony_ci}
29768c2ecf20Sopenharmony_ci
29778c2ecf20Sopenharmony_ciint ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir,
29788c2ecf20Sopenharmony_ci			      struct ceph_mds_request *req)
29798c2ecf20Sopenharmony_ci{
29808c2ecf20Sopenharmony_ci	int err = 0;
29818c2ecf20Sopenharmony_ci
29828c2ecf20Sopenharmony_ci	/* take CAP_PIN refs for r_inode, r_parent, r_old_dentry */
29838c2ecf20Sopenharmony_ci	if (req->r_inode)
29848c2ecf20Sopenharmony_ci		ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
29858c2ecf20Sopenharmony_ci	if (req->r_parent) {
29868c2ecf20Sopenharmony_ci		struct ceph_inode_info *ci = ceph_inode(req->r_parent);
29878c2ecf20Sopenharmony_ci		int fmode = (req->r_op & CEPH_MDS_OP_WRITE) ?
29888c2ecf20Sopenharmony_ci			    CEPH_FILE_MODE_WR : CEPH_FILE_MODE_RD;
29898c2ecf20Sopenharmony_ci		spin_lock(&ci->i_ceph_lock);
29908c2ecf20Sopenharmony_ci		ceph_take_cap_refs(ci, CEPH_CAP_PIN, false);
29918c2ecf20Sopenharmony_ci		__ceph_touch_fmode(ci, mdsc, fmode);
29928c2ecf20Sopenharmony_ci		spin_unlock(&ci->i_ceph_lock);
29938c2ecf20Sopenharmony_ci		ihold(req->r_parent);
29948c2ecf20Sopenharmony_ci	}
29958c2ecf20Sopenharmony_ci	if (req->r_old_dentry_dir)
29968c2ecf20Sopenharmony_ci		ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir),
29978c2ecf20Sopenharmony_ci				  CEPH_CAP_PIN);
29988c2ecf20Sopenharmony_ci
29998c2ecf20Sopenharmony_ci	if (req->r_inode) {
30008c2ecf20Sopenharmony_ci		err = ceph_wait_on_async_create(req->r_inode);
30018c2ecf20Sopenharmony_ci		if (err) {
30028c2ecf20Sopenharmony_ci			dout("%s: wait for async create returned: %d\n",
30038c2ecf20Sopenharmony_ci			     __func__, err);
30048c2ecf20Sopenharmony_ci			return err;
30058c2ecf20Sopenharmony_ci		}
30068c2ecf20Sopenharmony_ci	}
30078c2ecf20Sopenharmony_ci
30088c2ecf20Sopenharmony_ci	if (!err && req->r_old_inode) {
30098c2ecf20Sopenharmony_ci		err = ceph_wait_on_async_create(req->r_old_inode);
30108c2ecf20Sopenharmony_ci		if (err) {
30118c2ecf20Sopenharmony_ci			dout("%s: wait for async create returned: %d\n",
30128c2ecf20Sopenharmony_ci			     __func__, err);
30138c2ecf20Sopenharmony_ci			return err;
30148c2ecf20Sopenharmony_ci		}
30158c2ecf20Sopenharmony_ci	}
30168c2ecf20Sopenharmony_ci
30178c2ecf20Sopenharmony_ci	dout("submit_request on %p for inode %p\n", req, dir);
30188c2ecf20Sopenharmony_ci	mutex_lock(&mdsc->mutex);
30198c2ecf20Sopenharmony_ci	__register_request(mdsc, req, dir);
30208c2ecf20Sopenharmony_ci	__do_request(mdsc, req);
30218c2ecf20Sopenharmony_ci	err = req->r_err;
30228c2ecf20Sopenharmony_ci	mutex_unlock(&mdsc->mutex);
30238c2ecf20Sopenharmony_ci	return err;
30248c2ecf20Sopenharmony_ci}
30258c2ecf20Sopenharmony_ci
30268c2ecf20Sopenharmony_cistatic int ceph_mdsc_wait_request(struct ceph_mds_client *mdsc,
30278c2ecf20Sopenharmony_ci				  struct ceph_mds_request *req)
30288c2ecf20Sopenharmony_ci{
30298c2ecf20Sopenharmony_ci	int err;
30308c2ecf20Sopenharmony_ci
30318c2ecf20Sopenharmony_ci	/* wait */
30328c2ecf20Sopenharmony_ci	dout("do_request waiting\n");
30338c2ecf20Sopenharmony_ci	if (!req->r_timeout && req->r_wait_for_completion) {
30348c2ecf20Sopenharmony_ci		err = req->r_wait_for_completion(mdsc, req);
30358c2ecf20Sopenharmony_ci	} else {
30368c2ecf20Sopenharmony_ci		long timeleft = wait_for_completion_killable_timeout(
30378c2ecf20Sopenharmony_ci					&req->r_completion,
30388c2ecf20Sopenharmony_ci					ceph_timeout_jiffies(req->r_timeout));
30398c2ecf20Sopenharmony_ci		if (timeleft > 0)
30408c2ecf20Sopenharmony_ci			err = 0;
30418c2ecf20Sopenharmony_ci		else if (!timeleft)
30428c2ecf20Sopenharmony_ci			err = -ETIMEDOUT;  /* timed out */
30438c2ecf20Sopenharmony_ci		else
30448c2ecf20Sopenharmony_ci			err = timeleft;  /* killed */
30458c2ecf20Sopenharmony_ci	}
30468c2ecf20Sopenharmony_ci	dout("do_request waited, got %d\n", err);
30478c2ecf20Sopenharmony_ci	mutex_lock(&mdsc->mutex);
30488c2ecf20Sopenharmony_ci
30498c2ecf20Sopenharmony_ci	/* only abort if we didn't race with a real reply */
30508c2ecf20Sopenharmony_ci	if (test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) {
30518c2ecf20Sopenharmony_ci		err = le32_to_cpu(req->r_reply_info.head->result);
30528c2ecf20Sopenharmony_ci	} else if (err < 0) {
30538c2ecf20Sopenharmony_ci		dout("aborted request %lld with %d\n", req->r_tid, err);
30548c2ecf20Sopenharmony_ci
30558c2ecf20Sopenharmony_ci		/*
30568c2ecf20Sopenharmony_ci		 * ensure we aren't running concurrently with
30578c2ecf20Sopenharmony_ci		 * ceph_fill_trace or ceph_readdir_prepopulate, which
30588c2ecf20Sopenharmony_ci		 * rely on locks (dir mutex) held by our caller.
30598c2ecf20Sopenharmony_ci		 */
30608c2ecf20Sopenharmony_ci		mutex_lock(&req->r_fill_mutex);
30618c2ecf20Sopenharmony_ci		req->r_err = err;
30628c2ecf20Sopenharmony_ci		set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
30638c2ecf20Sopenharmony_ci		mutex_unlock(&req->r_fill_mutex);
30648c2ecf20Sopenharmony_ci
30658c2ecf20Sopenharmony_ci		if (req->r_parent &&
30668c2ecf20Sopenharmony_ci		    (req->r_op & CEPH_MDS_OP_WRITE))
30678c2ecf20Sopenharmony_ci			ceph_invalidate_dir_request(req);
30688c2ecf20Sopenharmony_ci	} else {
30698c2ecf20Sopenharmony_ci		err = req->r_err;
30708c2ecf20Sopenharmony_ci	}
30718c2ecf20Sopenharmony_ci
30728c2ecf20Sopenharmony_ci	mutex_unlock(&mdsc->mutex);
30738c2ecf20Sopenharmony_ci	return err;
30748c2ecf20Sopenharmony_ci}
30758c2ecf20Sopenharmony_ci
/*
 * Synchronously perform an mds request.  Take care of all of the
 * session setup, forwarding, retry details.
 *
 * The request is submitted first; only if submission succeeds do we wait
 * for the result.  Returns the submit error, or else the value produced
 * by ceph_mdsc_wait_request().
 */
int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
			 struct inode *dir,
			 struct ceph_mds_request *req)
{
	int ret;

	dout("do_request on %p\n", req);

	/* issue */
	ret = ceph_mdsc_submit_request(mdsc, dir, req);
	if (ret)
		goto done;

	/* wait */
	ret = ceph_mdsc_wait_request(mdsc, req);
done:
	dout("do_request %p done, result %d\n", req, ret);
	return ret;
}
30958c2ecf20Sopenharmony_ci
30968c2ecf20Sopenharmony_ci/*
30978c2ecf20Sopenharmony_ci * Invalidate dir's completeness, dentry lease state on an aborted MDS
30988c2ecf20Sopenharmony_ci * namespace request.
30998c2ecf20Sopenharmony_ci */
31008c2ecf20Sopenharmony_civoid ceph_invalidate_dir_request(struct ceph_mds_request *req)
31018c2ecf20Sopenharmony_ci{
31028c2ecf20Sopenharmony_ci	struct inode *dir = req->r_parent;
31038c2ecf20Sopenharmony_ci	struct inode *old_dir = req->r_old_dentry_dir;
31048c2ecf20Sopenharmony_ci
31058c2ecf20Sopenharmony_ci	dout("invalidate_dir_request %p %p (complete, lease(s))\n", dir, old_dir);
31068c2ecf20Sopenharmony_ci
31078c2ecf20Sopenharmony_ci	ceph_dir_clear_complete(dir);
31088c2ecf20Sopenharmony_ci	if (old_dir)
31098c2ecf20Sopenharmony_ci		ceph_dir_clear_complete(old_dir);
31108c2ecf20Sopenharmony_ci	if (req->r_dentry)
31118c2ecf20Sopenharmony_ci		ceph_invalidate_dentry_lease(req->r_dentry);
31128c2ecf20Sopenharmony_ci	if (req->r_old_dentry)
31138c2ecf20Sopenharmony_ci		ceph_invalidate_dentry_lease(req->r_old_dentry);
31148c2ecf20Sopenharmony_ci}
31158c2ecf20Sopenharmony_ci
31168c2ecf20Sopenharmony_ci/*
31178c2ecf20Sopenharmony_ci * Handle mds reply.
31188c2ecf20Sopenharmony_ci *
31198c2ecf20Sopenharmony_ci * We take the session mutex and parse and process the reply immediately.
31208c2ecf20Sopenharmony_ci * This preserves the logical ordering of replies, capabilities, etc., sent
31218c2ecf20Sopenharmony_ci * by the MDS as they are applied to our local cache.
31228c2ecf20Sopenharmony_ci */
31238c2ecf20Sopenharmony_cistatic void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
31248c2ecf20Sopenharmony_ci{
31258c2ecf20Sopenharmony_ci	struct ceph_mds_client *mdsc = session->s_mdsc;
31268c2ecf20Sopenharmony_ci	struct ceph_mds_request *req;
31278c2ecf20Sopenharmony_ci	struct ceph_mds_reply_head *head = msg->front.iov_base;
31288c2ecf20Sopenharmony_ci	struct ceph_mds_reply_info_parsed *rinfo;  /* parsed reply info */
31298c2ecf20Sopenharmony_ci	struct ceph_snap_realm *realm;
31308c2ecf20Sopenharmony_ci	u64 tid;
31318c2ecf20Sopenharmony_ci	int err, result;
31328c2ecf20Sopenharmony_ci	int mds = session->s_mds;
31338c2ecf20Sopenharmony_ci
31348c2ecf20Sopenharmony_ci	if (msg->front.iov_len < sizeof(*head)) {
31358c2ecf20Sopenharmony_ci		pr_err("mdsc_handle_reply got corrupt (short) reply\n");
31368c2ecf20Sopenharmony_ci		ceph_msg_dump(msg);
31378c2ecf20Sopenharmony_ci		return;
31388c2ecf20Sopenharmony_ci	}
31398c2ecf20Sopenharmony_ci
31408c2ecf20Sopenharmony_ci	/* get request, session */
31418c2ecf20Sopenharmony_ci	tid = le64_to_cpu(msg->hdr.tid);
31428c2ecf20Sopenharmony_ci	mutex_lock(&mdsc->mutex);
31438c2ecf20Sopenharmony_ci	req = lookup_get_request(mdsc, tid);
31448c2ecf20Sopenharmony_ci	if (!req) {
31458c2ecf20Sopenharmony_ci		dout("handle_reply on unknown tid %llu\n", tid);
31468c2ecf20Sopenharmony_ci		mutex_unlock(&mdsc->mutex);
31478c2ecf20Sopenharmony_ci		return;
31488c2ecf20Sopenharmony_ci	}
31498c2ecf20Sopenharmony_ci	dout("handle_reply %p\n", req);
31508c2ecf20Sopenharmony_ci
31518c2ecf20Sopenharmony_ci	/* correct session? */
31528c2ecf20Sopenharmony_ci	if (req->r_session != session) {
31538c2ecf20Sopenharmony_ci		pr_err("mdsc_handle_reply got %llu on session mds%d"
31548c2ecf20Sopenharmony_ci		       " not mds%d\n", tid, session->s_mds,
31558c2ecf20Sopenharmony_ci		       req->r_session ? req->r_session->s_mds : -1);
31568c2ecf20Sopenharmony_ci		mutex_unlock(&mdsc->mutex);
31578c2ecf20Sopenharmony_ci		goto out;
31588c2ecf20Sopenharmony_ci	}
31598c2ecf20Sopenharmony_ci
31608c2ecf20Sopenharmony_ci	/* dup? */
31618c2ecf20Sopenharmony_ci	if ((test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags) && !head->safe) ||
31628c2ecf20Sopenharmony_ci	    (test_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags) && head->safe)) {
31638c2ecf20Sopenharmony_ci		pr_warn("got a dup %s reply on %llu from mds%d\n",
31648c2ecf20Sopenharmony_ci			   head->safe ? "safe" : "unsafe", tid, mds);
31658c2ecf20Sopenharmony_ci		mutex_unlock(&mdsc->mutex);
31668c2ecf20Sopenharmony_ci		goto out;
31678c2ecf20Sopenharmony_ci	}
31688c2ecf20Sopenharmony_ci	if (test_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags)) {
31698c2ecf20Sopenharmony_ci		pr_warn("got unsafe after safe on %llu from mds%d\n",
31708c2ecf20Sopenharmony_ci			   tid, mds);
31718c2ecf20Sopenharmony_ci		mutex_unlock(&mdsc->mutex);
31728c2ecf20Sopenharmony_ci		goto out;
31738c2ecf20Sopenharmony_ci	}
31748c2ecf20Sopenharmony_ci
31758c2ecf20Sopenharmony_ci	result = le32_to_cpu(head->result);
31768c2ecf20Sopenharmony_ci
31778c2ecf20Sopenharmony_ci	/*
31788c2ecf20Sopenharmony_ci	 * Handle an ESTALE
31798c2ecf20Sopenharmony_ci	 * if we're not talking to the authority, send to them
31808c2ecf20Sopenharmony_ci	 * if the authority has changed while we weren't looking,
31818c2ecf20Sopenharmony_ci	 * send to new authority
31828c2ecf20Sopenharmony_ci	 * Otherwise we just have to return an ESTALE
31838c2ecf20Sopenharmony_ci	 */
31848c2ecf20Sopenharmony_ci	if (result == -ESTALE) {
31858c2ecf20Sopenharmony_ci		dout("got ESTALE on request %llu\n", req->r_tid);
31868c2ecf20Sopenharmony_ci		req->r_resend_mds = -1;
31878c2ecf20Sopenharmony_ci		if (req->r_direct_mode != USE_AUTH_MDS) {
31888c2ecf20Sopenharmony_ci			dout("not using auth, setting for that now\n");
31898c2ecf20Sopenharmony_ci			req->r_direct_mode = USE_AUTH_MDS;
31908c2ecf20Sopenharmony_ci			__do_request(mdsc, req);
31918c2ecf20Sopenharmony_ci			mutex_unlock(&mdsc->mutex);
31928c2ecf20Sopenharmony_ci			goto out;
31938c2ecf20Sopenharmony_ci		} else  {
31948c2ecf20Sopenharmony_ci			int mds = __choose_mds(mdsc, req, NULL);
31958c2ecf20Sopenharmony_ci			if (mds >= 0 && mds != req->r_session->s_mds) {
31968c2ecf20Sopenharmony_ci				dout("but auth changed, so resending\n");
31978c2ecf20Sopenharmony_ci				__do_request(mdsc, req);
31988c2ecf20Sopenharmony_ci				mutex_unlock(&mdsc->mutex);
31998c2ecf20Sopenharmony_ci				goto out;
32008c2ecf20Sopenharmony_ci			}
32018c2ecf20Sopenharmony_ci		}
32028c2ecf20Sopenharmony_ci		dout("have to return ESTALE on request %llu\n", req->r_tid);
32038c2ecf20Sopenharmony_ci	}
32048c2ecf20Sopenharmony_ci
32058c2ecf20Sopenharmony_ci
32068c2ecf20Sopenharmony_ci	if (head->safe) {
32078c2ecf20Sopenharmony_ci		set_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags);
32088c2ecf20Sopenharmony_ci		__unregister_request(mdsc, req);
32098c2ecf20Sopenharmony_ci
32108c2ecf20Sopenharmony_ci		/* last request during umount? */
32118c2ecf20Sopenharmony_ci		if (mdsc->stopping && !__get_oldest_req(mdsc))
32128c2ecf20Sopenharmony_ci			complete_all(&mdsc->safe_umount_waiters);
32138c2ecf20Sopenharmony_ci
32148c2ecf20Sopenharmony_ci		if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
32158c2ecf20Sopenharmony_ci			/*
32168c2ecf20Sopenharmony_ci			 * We already handled the unsafe response, now do the
32178c2ecf20Sopenharmony_ci			 * cleanup.  No need to examine the response; the MDS
32188c2ecf20Sopenharmony_ci			 * doesn't include any result info in the safe
32198c2ecf20Sopenharmony_ci			 * response.  And even if it did, there is nothing
32208c2ecf20Sopenharmony_ci			 * useful we could do with a revised return value.
32218c2ecf20Sopenharmony_ci			 */
32228c2ecf20Sopenharmony_ci			dout("got safe reply %llu, mds%d\n", tid, mds);
32238c2ecf20Sopenharmony_ci
32248c2ecf20Sopenharmony_ci			mutex_unlock(&mdsc->mutex);
32258c2ecf20Sopenharmony_ci			goto out;
32268c2ecf20Sopenharmony_ci		}
32278c2ecf20Sopenharmony_ci	} else {
32288c2ecf20Sopenharmony_ci		set_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags);
32298c2ecf20Sopenharmony_ci		list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe);
32308c2ecf20Sopenharmony_ci	}
32318c2ecf20Sopenharmony_ci
32328c2ecf20Sopenharmony_ci	dout("handle_reply tid %lld result %d\n", tid, result);
32338c2ecf20Sopenharmony_ci	rinfo = &req->r_reply_info;
32348c2ecf20Sopenharmony_ci	if (test_bit(CEPHFS_FEATURE_REPLY_ENCODING, &session->s_features))
32358c2ecf20Sopenharmony_ci		err = parse_reply_info(session, msg, rinfo, (u64)-1);
32368c2ecf20Sopenharmony_ci	else
32378c2ecf20Sopenharmony_ci		err = parse_reply_info(session, msg, rinfo, session->s_con.peer_features);
32388c2ecf20Sopenharmony_ci	mutex_unlock(&mdsc->mutex);
32398c2ecf20Sopenharmony_ci
32408c2ecf20Sopenharmony_ci	mutex_lock(&session->s_mutex);
32418c2ecf20Sopenharmony_ci	if (err < 0) {
32428c2ecf20Sopenharmony_ci		pr_err("mdsc_handle_reply got corrupt reply mds%d(tid:%lld)\n", mds, tid);
32438c2ecf20Sopenharmony_ci		ceph_msg_dump(msg);
32448c2ecf20Sopenharmony_ci		goto out_err;
32458c2ecf20Sopenharmony_ci	}
32468c2ecf20Sopenharmony_ci
32478c2ecf20Sopenharmony_ci	/* snap trace */
32488c2ecf20Sopenharmony_ci	realm = NULL;
32498c2ecf20Sopenharmony_ci	if (rinfo->snapblob_len) {
32508c2ecf20Sopenharmony_ci		down_write(&mdsc->snap_rwsem);
32518c2ecf20Sopenharmony_ci		ceph_update_snap_trace(mdsc, rinfo->snapblob,
32528c2ecf20Sopenharmony_ci				rinfo->snapblob + rinfo->snapblob_len,
32538c2ecf20Sopenharmony_ci				le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP,
32548c2ecf20Sopenharmony_ci				&realm);
32558c2ecf20Sopenharmony_ci		downgrade_write(&mdsc->snap_rwsem);
32568c2ecf20Sopenharmony_ci	} else {
32578c2ecf20Sopenharmony_ci		down_read(&mdsc->snap_rwsem);
32588c2ecf20Sopenharmony_ci	}
32598c2ecf20Sopenharmony_ci
32608c2ecf20Sopenharmony_ci	/* insert trace into our cache */
32618c2ecf20Sopenharmony_ci	mutex_lock(&req->r_fill_mutex);
32628c2ecf20Sopenharmony_ci	current->journal_info = req;
32638c2ecf20Sopenharmony_ci	err = ceph_fill_trace(mdsc->fsc->sb, req);
32648c2ecf20Sopenharmony_ci	if (err == 0) {
32658c2ecf20Sopenharmony_ci		if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR ||
32668c2ecf20Sopenharmony_ci				    req->r_op == CEPH_MDS_OP_LSSNAP))
32678c2ecf20Sopenharmony_ci			ceph_readdir_prepopulate(req, req->r_session);
32688c2ecf20Sopenharmony_ci	}
32698c2ecf20Sopenharmony_ci	current->journal_info = NULL;
32708c2ecf20Sopenharmony_ci	mutex_unlock(&req->r_fill_mutex);
32718c2ecf20Sopenharmony_ci
32728c2ecf20Sopenharmony_ci	up_read(&mdsc->snap_rwsem);
32738c2ecf20Sopenharmony_ci	if (realm)
32748c2ecf20Sopenharmony_ci		ceph_put_snap_realm(mdsc, realm);
32758c2ecf20Sopenharmony_ci
32768c2ecf20Sopenharmony_ci	if (err == 0) {
32778c2ecf20Sopenharmony_ci		if (req->r_target_inode &&
32788c2ecf20Sopenharmony_ci		    test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
32798c2ecf20Sopenharmony_ci			struct ceph_inode_info *ci =
32808c2ecf20Sopenharmony_ci				ceph_inode(req->r_target_inode);
32818c2ecf20Sopenharmony_ci			spin_lock(&ci->i_unsafe_lock);
32828c2ecf20Sopenharmony_ci			list_add_tail(&req->r_unsafe_target_item,
32838c2ecf20Sopenharmony_ci				      &ci->i_unsafe_iops);
32848c2ecf20Sopenharmony_ci			spin_unlock(&ci->i_unsafe_lock);
32858c2ecf20Sopenharmony_ci		}
32868c2ecf20Sopenharmony_ci
32878c2ecf20Sopenharmony_ci		ceph_unreserve_caps(mdsc, &req->r_caps_reservation);
32888c2ecf20Sopenharmony_ci	}
32898c2ecf20Sopenharmony_ciout_err:
32908c2ecf20Sopenharmony_ci	mutex_lock(&mdsc->mutex);
32918c2ecf20Sopenharmony_ci	if (!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
32928c2ecf20Sopenharmony_ci		if (err) {
32938c2ecf20Sopenharmony_ci			req->r_err = err;
32948c2ecf20Sopenharmony_ci		} else {
32958c2ecf20Sopenharmony_ci			req->r_reply =  ceph_msg_get(msg);
32968c2ecf20Sopenharmony_ci			set_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags);
32978c2ecf20Sopenharmony_ci		}
32988c2ecf20Sopenharmony_ci	} else {
32998c2ecf20Sopenharmony_ci		dout("reply arrived after request %lld was aborted\n", tid);
33008c2ecf20Sopenharmony_ci	}
33018c2ecf20Sopenharmony_ci	mutex_unlock(&mdsc->mutex);
33028c2ecf20Sopenharmony_ci
33038c2ecf20Sopenharmony_ci	mutex_unlock(&session->s_mutex);
33048c2ecf20Sopenharmony_ci
33058c2ecf20Sopenharmony_ci	/* kick calling process */
33068c2ecf20Sopenharmony_ci	complete_request(mdsc, req);
33078c2ecf20Sopenharmony_ci
33088c2ecf20Sopenharmony_ci	ceph_update_metadata_latency(&mdsc->metric, req->r_start_latency,
33098c2ecf20Sopenharmony_ci				     req->r_end_latency, err);
33108c2ecf20Sopenharmony_ciout:
33118c2ecf20Sopenharmony_ci	ceph_mdsc_put_request(req);
33128c2ecf20Sopenharmony_ci	return;
33138c2ecf20Sopenharmony_ci}
33148c2ecf20Sopenharmony_ci
33158c2ecf20Sopenharmony_ci
33168c2ecf20Sopenharmony_ci
33178c2ecf20Sopenharmony_ci/*
33188c2ecf20Sopenharmony_ci * handle mds notification that our request has been forwarded.
33198c2ecf20Sopenharmony_ci */
33208c2ecf20Sopenharmony_cistatic void handle_forward(struct ceph_mds_client *mdsc,
33218c2ecf20Sopenharmony_ci			   struct ceph_mds_session *session,
33228c2ecf20Sopenharmony_ci			   struct ceph_msg *msg)
33238c2ecf20Sopenharmony_ci{
33248c2ecf20Sopenharmony_ci	struct ceph_mds_request *req;
33258c2ecf20Sopenharmony_ci	u64 tid = le64_to_cpu(msg->hdr.tid);
33268c2ecf20Sopenharmony_ci	u32 next_mds;
33278c2ecf20Sopenharmony_ci	u32 fwd_seq;
33288c2ecf20Sopenharmony_ci	int err = -EINVAL;
33298c2ecf20Sopenharmony_ci	void *p = msg->front.iov_base;
33308c2ecf20Sopenharmony_ci	void *end = p + msg->front.iov_len;
33318c2ecf20Sopenharmony_ci
33328c2ecf20Sopenharmony_ci	ceph_decode_need(&p, end, 2*sizeof(u32), bad);
33338c2ecf20Sopenharmony_ci	next_mds = ceph_decode_32(&p);
33348c2ecf20Sopenharmony_ci	fwd_seq = ceph_decode_32(&p);
33358c2ecf20Sopenharmony_ci
33368c2ecf20Sopenharmony_ci	mutex_lock(&mdsc->mutex);
33378c2ecf20Sopenharmony_ci	req = lookup_get_request(mdsc, tid);
33388c2ecf20Sopenharmony_ci	if (!req) {
33398c2ecf20Sopenharmony_ci		dout("forward tid %llu to mds%d - req dne\n", tid, next_mds);
33408c2ecf20Sopenharmony_ci		goto out;  /* dup reply? */
33418c2ecf20Sopenharmony_ci	}
33428c2ecf20Sopenharmony_ci
33438c2ecf20Sopenharmony_ci	if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
33448c2ecf20Sopenharmony_ci		dout("forward tid %llu aborted, unregistering\n", tid);
33458c2ecf20Sopenharmony_ci		__unregister_request(mdsc, req);
33468c2ecf20Sopenharmony_ci	} else if (fwd_seq <= req->r_num_fwd) {
33478c2ecf20Sopenharmony_ci		dout("forward tid %llu to mds%d - old seq %d <= %d\n",
33488c2ecf20Sopenharmony_ci		     tid, next_mds, req->r_num_fwd, fwd_seq);
33498c2ecf20Sopenharmony_ci	} else {
33508c2ecf20Sopenharmony_ci		/* resend. forward race not possible; mds would drop */
33518c2ecf20Sopenharmony_ci		dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds);
33528c2ecf20Sopenharmony_ci		BUG_ON(req->r_err);
33538c2ecf20Sopenharmony_ci		BUG_ON(test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags));
33548c2ecf20Sopenharmony_ci		req->r_attempts = 0;
33558c2ecf20Sopenharmony_ci		req->r_num_fwd = fwd_seq;
33568c2ecf20Sopenharmony_ci		req->r_resend_mds = next_mds;
33578c2ecf20Sopenharmony_ci		put_request_session(req);
33588c2ecf20Sopenharmony_ci		__do_request(mdsc, req);
33598c2ecf20Sopenharmony_ci	}
33608c2ecf20Sopenharmony_ci	ceph_mdsc_put_request(req);
33618c2ecf20Sopenharmony_ciout:
33628c2ecf20Sopenharmony_ci	mutex_unlock(&mdsc->mutex);
33638c2ecf20Sopenharmony_ci	return;
33648c2ecf20Sopenharmony_ci
33658c2ecf20Sopenharmony_cibad:
33668c2ecf20Sopenharmony_ci	pr_err("mdsc_handle_forward decode error err=%d\n", err);
33678c2ecf20Sopenharmony_ci}
33688c2ecf20Sopenharmony_ci
33698c2ecf20Sopenharmony_cistatic int __decode_session_metadata(void **p, void *end,
33708c2ecf20Sopenharmony_ci				     bool *blocklisted)
33718c2ecf20Sopenharmony_ci{
33728c2ecf20Sopenharmony_ci	/* map<string,string> */
33738c2ecf20Sopenharmony_ci	u32 n;
33748c2ecf20Sopenharmony_ci	bool err_str;
33758c2ecf20Sopenharmony_ci	ceph_decode_32_safe(p, end, n, bad);
33768c2ecf20Sopenharmony_ci	while (n-- > 0) {
33778c2ecf20Sopenharmony_ci		u32 len;
33788c2ecf20Sopenharmony_ci		ceph_decode_32_safe(p, end, len, bad);
33798c2ecf20Sopenharmony_ci		ceph_decode_need(p, end, len, bad);
33808c2ecf20Sopenharmony_ci		err_str = !strncmp(*p, "error_string", len);
33818c2ecf20Sopenharmony_ci		*p += len;
33828c2ecf20Sopenharmony_ci		ceph_decode_32_safe(p, end, len, bad);
33838c2ecf20Sopenharmony_ci		ceph_decode_need(p, end, len, bad);
33848c2ecf20Sopenharmony_ci		/*
33858c2ecf20Sopenharmony_ci		 * Match "blocklisted (blacklisted)" from newer MDSes,
33868c2ecf20Sopenharmony_ci		 * or "blacklisted" from older MDSes.
33878c2ecf20Sopenharmony_ci		 */
33888c2ecf20Sopenharmony_ci		if (err_str && strnstr(*p, "blacklisted", len))
33898c2ecf20Sopenharmony_ci			*blocklisted = true;
33908c2ecf20Sopenharmony_ci		*p += len;
33918c2ecf20Sopenharmony_ci	}
33928c2ecf20Sopenharmony_ci	return 0;
33938c2ecf20Sopenharmony_cibad:
33948c2ecf20Sopenharmony_ci	return -1;
33958c2ecf20Sopenharmony_ci}
33968c2ecf20Sopenharmony_ci
33978c2ecf20Sopenharmony_ci/*
33988c2ecf20Sopenharmony_ci * handle a mds session control message
33998c2ecf20Sopenharmony_ci */
34008c2ecf20Sopenharmony_cistatic void handle_session(struct ceph_mds_session *session,
34018c2ecf20Sopenharmony_ci			   struct ceph_msg *msg)
34028c2ecf20Sopenharmony_ci{
34038c2ecf20Sopenharmony_ci	struct ceph_mds_client *mdsc = session->s_mdsc;
34048c2ecf20Sopenharmony_ci	int mds = session->s_mds;
34058c2ecf20Sopenharmony_ci	int msg_version = le16_to_cpu(msg->hdr.version);
34068c2ecf20Sopenharmony_ci	void *p = msg->front.iov_base;
34078c2ecf20Sopenharmony_ci	void *end = p + msg->front.iov_len;
34088c2ecf20Sopenharmony_ci	struct ceph_mds_session_head *h;
34098c2ecf20Sopenharmony_ci	u32 op;
34108c2ecf20Sopenharmony_ci	u64 seq, features = 0;
34118c2ecf20Sopenharmony_ci	int wake = 0;
34128c2ecf20Sopenharmony_ci	bool blocklisted = false;
34138c2ecf20Sopenharmony_ci
34148c2ecf20Sopenharmony_ci	/* decode */
34158c2ecf20Sopenharmony_ci	ceph_decode_need(&p, end, sizeof(*h), bad);
34168c2ecf20Sopenharmony_ci	h = p;
34178c2ecf20Sopenharmony_ci	p += sizeof(*h);
34188c2ecf20Sopenharmony_ci
34198c2ecf20Sopenharmony_ci	op = le32_to_cpu(h->op);
34208c2ecf20Sopenharmony_ci	seq = le64_to_cpu(h->seq);
34218c2ecf20Sopenharmony_ci
34228c2ecf20Sopenharmony_ci	if (msg_version >= 3) {
34238c2ecf20Sopenharmony_ci		u32 len;
34248c2ecf20Sopenharmony_ci		/* version >= 2, metadata */
34258c2ecf20Sopenharmony_ci		if (__decode_session_metadata(&p, end, &blocklisted) < 0)
34268c2ecf20Sopenharmony_ci			goto bad;
34278c2ecf20Sopenharmony_ci		/* version >= 3, feature bits */
34288c2ecf20Sopenharmony_ci		ceph_decode_32_safe(&p, end, len, bad);
34298c2ecf20Sopenharmony_ci		if (len) {
34308c2ecf20Sopenharmony_ci			ceph_decode_64_safe(&p, end, features, bad);
34318c2ecf20Sopenharmony_ci			p += len - sizeof(features);
34328c2ecf20Sopenharmony_ci		}
34338c2ecf20Sopenharmony_ci	}
34348c2ecf20Sopenharmony_ci
34358c2ecf20Sopenharmony_ci	mutex_lock(&mdsc->mutex);
34368c2ecf20Sopenharmony_ci	if (op == CEPH_SESSION_CLOSE) {
34378c2ecf20Sopenharmony_ci		ceph_get_mds_session(session);
34388c2ecf20Sopenharmony_ci		__unregister_session(mdsc, session);
34398c2ecf20Sopenharmony_ci	}
34408c2ecf20Sopenharmony_ci	/* FIXME: this ttl calculation is generous */
34418c2ecf20Sopenharmony_ci	session->s_ttl = jiffies + HZ*mdsc->mdsmap->m_session_autoclose;
34428c2ecf20Sopenharmony_ci	mutex_unlock(&mdsc->mutex);
34438c2ecf20Sopenharmony_ci
34448c2ecf20Sopenharmony_ci	mutex_lock(&session->s_mutex);
34458c2ecf20Sopenharmony_ci
34468c2ecf20Sopenharmony_ci	dout("handle_session mds%d %s %p state %s seq %llu\n",
34478c2ecf20Sopenharmony_ci	     mds, ceph_session_op_name(op), session,
34488c2ecf20Sopenharmony_ci	     ceph_session_state_name(session->s_state), seq);
34498c2ecf20Sopenharmony_ci
34508c2ecf20Sopenharmony_ci	if (session->s_state == CEPH_MDS_SESSION_HUNG) {
34518c2ecf20Sopenharmony_ci		session->s_state = CEPH_MDS_SESSION_OPEN;
34528c2ecf20Sopenharmony_ci		pr_info("mds%d came back\n", session->s_mds);
34538c2ecf20Sopenharmony_ci	}
34548c2ecf20Sopenharmony_ci
34558c2ecf20Sopenharmony_ci	switch (op) {
34568c2ecf20Sopenharmony_ci	case CEPH_SESSION_OPEN:
34578c2ecf20Sopenharmony_ci		if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
34588c2ecf20Sopenharmony_ci			pr_info("mds%d reconnect success\n", session->s_mds);
34598c2ecf20Sopenharmony_ci		session->s_state = CEPH_MDS_SESSION_OPEN;
34608c2ecf20Sopenharmony_ci		session->s_features = features;
34618c2ecf20Sopenharmony_ci		renewed_caps(mdsc, session, 0);
34628c2ecf20Sopenharmony_ci		if (test_bit(CEPHFS_FEATURE_METRIC_COLLECT, &session->s_features))
34638c2ecf20Sopenharmony_ci			metric_schedule_delayed(&mdsc->metric);
34648c2ecf20Sopenharmony_ci		wake = 1;
34658c2ecf20Sopenharmony_ci		if (mdsc->stopping)
34668c2ecf20Sopenharmony_ci			__close_session(mdsc, session);
34678c2ecf20Sopenharmony_ci		break;
34688c2ecf20Sopenharmony_ci
34698c2ecf20Sopenharmony_ci	case CEPH_SESSION_RENEWCAPS:
34708c2ecf20Sopenharmony_ci		if (session->s_renew_seq == seq)
34718c2ecf20Sopenharmony_ci			renewed_caps(mdsc, session, 1);
34728c2ecf20Sopenharmony_ci		break;
34738c2ecf20Sopenharmony_ci
34748c2ecf20Sopenharmony_ci	case CEPH_SESSION_CLOSE:
34758c2ecf20Sopenharmony_ci		if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
34768c2ecf20Sopenharmony_ci			pr_info("mds%d reconnect denied\n", session->s_mds);
34778c2ecf20Sopenharmony_ci		session->s_state = CEPH_MDS_SESSION_CLOSED;
34788c2ecf20Sopenharmony_ci		cleanup_session_requests(mdsc, session);
34798c2ecf20Sopenharmony_ci		remove_session_caps(session);
34808c2ecf20Sopenharmony_ci		wake = 2; /* for good measure */
34818c2ecf20Sopenharmony_ci		wake_up_all(&mdsc->session_close_wq);
34828c2ecf20Sopenharmony_ci		break;
34838c2ecf20Sopenharmony_ci
34848c2ecf20Sopenharmony_ci	case CEPH_SESSION_STALE:
34858c2ecf20Sopenharmony_ci		pr_info("mds%d caps went stale, renewing\n",
34868c2ecf20Sopenharmony_ci			session->s_mds);
34878c2ecf20Sopenharmony_ci		spin_lock(&session->s_gen_ttl_lock);
34888c2ecf20Sopenharmony_ci		session->s_cap_gen++;
34898c2ecf20Sopenharmony_ci		session->s_cap_ttl = jiffies - 1;
34908c2ecf20Sopenharmony_ci		spin_unlock(&session->s_gen_ttl_lock);
34918c2ecf20Sopenharmony_ci		send_renew_caps(mdsc, session);
34928c2ecf20Sopenharmony_ci		break;
34938c2ecf20Sopenharmony_ci
34948c2ecf20Sopenharmony_ci	case CEPH_SESSION_RECALL_STATE:
34958c2ecf20Sopenharmony_ci		ceph_trim_caps(mdsc, session, le32_to_cpu(h->max_caps));
34968c2ecf20Sopenharmony_ci		break;
34978c2ecf20Sopenharmony_ci
34988c2ecf20Sopenharmony_ci	case CEPH_SESSION_FLUSHMSG:
34998c2ecf20Sopenharmony_ci		/* flush cap releases */
35008c2ecf20Sopenharmony_ci		spin_lock(&session->s_cap_lock);
35018c2ecf20Sopenharmony_ci		if (session->s_num_cap_releases)
35028c2ecf20Sopenharmony_ci			ceph_flush_cap_releases(mdsc, session);
35038c2ecf20Sopenharmony_ci		spin_unlock(&session->s_cap_lock);
35048c2ecf20Sopenharmony_ci
35058c2ecf20Sopenharmony_ci		send_flushmsg_ack(mdsc, session, seq);
35068c2ecf20Sopenharmony_ci		break;
35078c2ecf20Sopenharmony_ci
35088c2ecf20Sopenharmony_ci	case CEPH_SESSION_FORCE_RO:
35098c2ecf20Sopenharmony_ci		dout("force_session_readonly %p\n", session);
35108c2ecf20Sopenharmony_ci		spin_lock(&session->s_cap_lock);
35118c2ecf20Sopenharmony_ci		session->s_readonly = true;
35128c2ecf20Sopenharmony_ci		spin_unlock(&session->s_cap_lock);
35138c2ecf20Sopenharmony_ci		wake_up_session_caps(session, FORCE_RO);
35148c2ecf20Sopenharmony_ci		break;
35158c2ecf20Sopenharmony_ci
35168c2ecf20Sopenharmony_ci	case CEPH_SESSION_REJECT:
35178c2ecf20Sopenharmony_ci		WARN_ON(session->s_state != CEPH_MDS_SESSION_OPENING);
35188c2ecf20Sopenharmony_ci		pr_info("mds%d rejected session\n", session->s_mds);
35198c2ecf20Sopenharmony_ci		session->s_state = CEPH_MDS_SESSION_REJECTED;
35208c2ecf20Sopenharmony_ci		cleanup_session_requests(mdsc, session);
35218c2ecf20Sopenharmony_ci		remove_session_caps(session);
35228c2ecf20Sopenharmony_ci		if (blocklisted)
35238c2ecf20Sopenharmony_ci			mdsc->fsc->blocklisted = true;
35248c2ecf20Sopenharmony_ci		wake = 2; /* for good measure */
35258c2ecf20Sopenharmony_ci		break;
35268c2ecf20Sopenharmony_ci
35278c2ecf20Sopenharmony_ci	default:
35288c2ecf20Sopenharmony_ci		pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds);
35298c2ecf20Sopenharmony_ci		WARN_ON(1);
35308c2ecf20Sopenharmony_ci	}
35318c2ecf20Sopenharmony_ci
35328c2ecf20Sopenharmony_ci	mutex_unlock(&session->s_mutex);
35338c2ecf20Sopenharmony_ci	if (wake) {
35348c2ecf20Sopenharmony_ci		mutex_lock(&mdsc->mutex);
35358c2ecf20Sopenharmony_ci		__wake_requests(mdsc, &session->s_waiting);
35368c2ecf20Sopenharmony_ci		if (wake == 2)
35378c2ecf20Sopenharmony_ci			kick_requests(mdsc, mds);
35388c2ecf20Sopenharmony_ci		mutex_unlock(&mdsc->mutex);
35398c2ecf20Sopenharmony_ci	}
35408c2ecf20Sopenharmony_ci	if (op == CEPH_SESSION_CLOSE)
35418c2ecf20Sopenharmony_ci		ceph_put_mds_session(session);
35428c2ecf20Sopenharmony_ci	return;
35438c2ecf20Sopenharmony_ci
35448c2ecf20Sopenharmony_cibad:
35458c2ecf20Sopenharmony_ci	pr_err("mdsc_handle_session corrupt message mds%d len %d\n", mds,
35468c2ecf20Sopenharmony_ci	       (int)msg->front.iov_len);
35478c2ecf20Sopenharmony_ci	ceph_msg_dump(msg);
35488c2ecf20Sopenharmony_ci	return;
35498c2ecf20Sopenharmony_ci}
35508c2ecf20Sopenharmony_ci
35518c2ecf20Sopenharmony_civoid ceph_mdsc_release_dir_caps(struct ceph_mds_request *req)
35528c2ecf20Sopenharmony_ci{
35538c2ecf20Sopenharmony_ci	int dcaps;
35548c2ecf20Sopenharmony_ci
35558c2ecf20Sopenharmony_ci	dcaps = xchg(&req->r_dir_caps, 0);
35568c2ecf20Sopenharmony_ci	if (dcaps) {
35578c2ecf20Sopenharmony_ci		dout("releasing r_dir_caps=%s\n", ceph_cap_string(dcaps));
35588c2ecf20Sopenharmony_ci		ceph_put_cap_refs(ceph_inode(req->r_parent), dcaps);
35598c2ecf20Sopenharmony_ci	}
35608c2ecf20Sopenharmony_ci}
35618c2ecf20Sopenharmony_ci
35628c2ecf20Sopenharmony_civoid ceph_mdsc_release_dir_caps_no_check(struct ceph_mds_request *req)
35638c2ecf20Sopenharmony_ci{
35648c2ecf20Sopenharmony_ci	int dcaps;
35658c2ecf20Sopenharmony_ci
35668c2ecf20Sopenharmony_ci	dcaps = xchg(&req->r_dir_caps, 0);
35678c2ecf20Sopenharmony_ci	if (dcaps) {
35688c2ecf20Sopenharmony_ci		dout("releasing r_dir_caps=%s\n", ceph_cap_string(dcaps));
35698c2ecf20Sopenharmony_ci		ceph_put_cap_refs_no_check_caps(ceph_inode(req->r_parent),
35708c2ecf20Sopenharmony_ci						dcaps);
35718c2ecf20Sopenharmony_ci	}
35728c2ecf20Sopenharmony_ci}
35738c2ecf20Sopenharmony_ci
35748c2ecf20Sopenharmony_ci/*
35758c2ecf20Sopenharmony_ci * called under session->mutex.
35768c2ecf20Sopenharmony_ci */
35778c2ecf20Sopenharmony_cistatic void replay_unsafe_requests(struct ceph_mds_client *mdsc,
35788c2ecf20Sopenharmony_ci				   struct ceph_mds_session *session)
35798c2ecf20Sopenharmony_ci{
35808c2ecf20Sopenharmony_ci	struct ceph_mds_request *req, *nreq;
35818c2ecf20Sopenharmony_ci	struct rb_node *p;
35828c2ecf20Sopenharmony_ci
35838c2ecf20Sopenharmony_ci	dout("replay_unsafe_requests mds%d\n", session->s_mds);
35848c2ecf20Sopenharmony_ci
35858c2ecf20Sopenharmony_ci	mutex_lock(&mdsc->mutex);
35868c2ecf20Sopenharmony_ci	list_for_each_entry_safe(req, nreq, &session->s_unsafe, r_unsafe_item)
35878c2ecf20Sopenharmony_ci		__send_request(mdsc, session, req, true);
35888c2ecf20Sopenharmony_ci
35898c2ecf20Sopenharmony_ci	/*
35908c2ecf20Sopenharmony_ci	 * also re-send old requests when MDS enters reconnect stage. So that MDS
35918c2ecf20Sopenharmony_ci	 * can process completed request in clientreplay stage.
35928c2ecf20Sopenharmony_ci	 */
35938c2ecf20Sopenharmony_ci	p = rb_first(&mdsc->request_tree);
35948c2ecf20Sopenharmony_ci	while (p) {
35958c2ecf20Sopenharmony_ci		req = rb_entry(p, struct ceph_mds_request, r_node);
35968c2ecf20Sopenharmony_ci		p = rb_next(p);
35978c2ecf20Sopenharmony_ci		if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags))
35988c2ecf20Sopenharmony_ci			continue;
35998c2ecf20Sopenharmony_ci		if (req->r_attempts == 0)
36008c2ecf20Sopenharmony_ci			continue; /* only old requests */
36018c2ecf20Sopenharmony_ci		if (!req->r_session)
36028c2ecf20Sopenharmony_ci			continue;
36038c2ecf20Sopenharmony_ci		if (req->r_session->s_mds != session->s_mds)
36048c2ecf20Sopenharmony_ci			continue;
36058c2ecf20Sopenharmony_ci
36068c2ecf20Sopenharmony_ci		ceph_mdsc_release_dir_caps_no_check(req);
36078c2ecf20Sopenharmony_ci
36088c2ecf20Sopenharmony_ci		__send_request(mdsc, session, req, true);
36098c2ecf20Sopenharmony_ci	}
36108c2ecf20Sopenharmony_ci	mutex_unlock(&mdsc->mutex);
36118c2ecf20Sopenharmony_ci}
36128c2ecf20Sopenharmony_ci
36138c2ecf20Sopenharmony_cistatic int send_reconnect_partial(struct ceph_reconnect_state *recon_state)
36148c2ecf20Sopenharmony_ci{
36158c2ecf20Sopenharmony_ci	struct ceph_msg *reply;
36168c2ecf20Sopenharmony_ci	struct ceph_pagelist *_pagelist;
36178c2ecf20Sopenharmony_ci	struct page *page;
36188c2ecf20Sopenharmony_ci	__le32 *addr;
36198c2ecf20Sopenharmony_ci	int err = -ENOMEM;
36208c2ecf20Sopenharmony_ci
36218c2ecf20Sopenharmony_ci	if (!recon_state->allow_multi)
36228c2ecf20Sopenharmony_ci		return -ENOSPC;
36238c2ecf20Sopenharmony_ci
36248c2ecf20Sopenharmony_ci	/* can't handle message that contains both caps and realm */
36258c2ecf20Sopenharmony_ci	BUG_ON(!recon_state->nr_caps == !recon_state->nr_realms);
36268c2ecf20Sopenharmony_ci
36278c2ecf20Sopenharmony_ci	/* pre-allocate new pagelist */
36288c2ecf20Sopenharmony_ci	_pagelist = ceph_pagelist_alloc(GFP_NOFS);
36298c2ecf20Sopenharmony_ci	if (!_pagelist)
36308c2ecf20Sopenharmony_ci		return -ENOMEM;
36318c2ecf20Sopenharmony_ci
36328c2ecf20Sopenharmony_ci	reply = ceph_msg_new2(CEPH_MSG_CLIENT_RECONNECT, 0, 1, GFP_NOFS, false);
36338c2ecf20Sopenharmony_ci	if (!reply)
36348c2ecf20Sopenharmony_ci		goto fail_msg;
36358c2ecf20Sopenharmony_ci
36368c2ecf20Sopenharmony_ci	/* placeholder for nr_caps */
36378c2ecf20Sopenharmony_ci	err = ceph_pagelist_encode_32(_pagelist, 0);
36388c2ecf20Sopenharmony_ci	if (err < 0)
36398c2ecf20Sopenharmony_ci		goto fail;
36408c2ecf20Sopenharmony_ci
36418c2ecf20Sopenharmony_ci	if (recon_state->nr_caps) {
36428c2ecf20Sopenharmony_ci		/* currently encoding caps */
36438c2ecf20Sopenharmony_ci		err = ceph_pagelist_encode_32(recon_state->pagelist, 0);
36448c2ecf20Sopenharmony_ci		if (err)
36458c2ecf20Sopenharmony_ci			goto fail;
36468c2ecf20Sopenharmony_ci	} else {
36478c2ecf20Sopenharmony_ci		/* placeholder for nr_realms (currently encoding relams) */
36488c2ecf20Sopenharmony_ci		err = ceph_pagelist_encode_32(_pagelist, 0);
36498c2ecf20Sopenharmony_ci		if (err < 0)
36508c2ecf20Sopenharmony_ci			goto fail;
36518c2ecf20Sopenharmony_ci	}
36528c2ecf20Sopenharmony_ci
36538c2ecf20Sopenharmony_ci	err = ceph_pagelist_encode_8(recon_state->pagelist, 1);
36548c2ecf20Sopenharmony_ci	if (err)
36558c2ecf20Sopenharmony_ci		goto fail;
36568c2ecf20Sopenharmony_ci
36578c2ecf20Sopenharmony_ci	page = list_first_entry(&recon_state->pagelist->head, struct page, lru);
36588c2ecf20Sopenharmony_ci	addr = kmap_atomic(page);
36598c2ecf20Sopenharmony_ci	if (recon_state->nr_caps) {
36608c2ecf20Sopenharmony_ci		/* currently encoding caps */
36618c2ecf20Sopenharmony_ci		*addr = cpu_to_le32(recon_state->nr_caps);
36628c2ecf20Sopenharmony_ci	} else {
36638c2ecf20Sopenharmony_ci		/* currently encoding relams */
36648c2ecf20Sopenharmony_ci		*(addr + 1) = cpu_to_le32(recon_state->nr_realms);
36658c2ecf20Sopenharmony_ci	}
36668c2ecf20Sopenharmony_ci	kunmap_atomic(addr);
36678c2ecf20Sopenharmony_ci
36688c2ecf20Sopenharmony_ci	reply->hdr.version = cpu_to_le16(5);
36698c2ecf20Sopenharmony_ci	reply->hdr.compat_version = cpu_to_le16(4);
36708c2ecf20Sopenharmony_ci
36718c2ecf20Sopenharmony_ci	reply->hdr.data_len = cpu_to_le32(recon_state->pagelist->length);
36728c2ecf20Sopenharmony_ci	ceph_msg_data_add_pagelist(reply, recon_state->pagelist);
36738c2ecf20Sopenharmony_ci
36748c2ecf20Sopenharmony_ci	ceph_con_send(&recon_state->session->s_con, reply);
36758c2ecf20Sopenharmony_ci	ceph_pagelist_release(recon_state->pagelist);
36768c2ecf20Sopenharmony_ci
36778c2ecf20Sopenharmony_ci	recon_state->pagelist = _pagelist;
36788c2ecf20Sopenharmony_ci	recon_state->nr_caps = 0;
36798c2ecf20Sopenharmony_ci	recon_state->nr_realms = 0;
36808c2ecf20Sopenharmony_ci	recon_state->msg_version = 5;
36818c2ecf20Sopenharmony_ci	return 0;
36828c2ecf20Sopenharmony_cifail:
36838c2ecf20Sopenharmony_ci	ceph_msg_put(reply);
36848c2ecf20Sopenharmony_cifail_msg:
36858c2ecf20Sopenharmony_ci	ceph_pagelist_release(_pagelist);
36868c2ecf20Sopenharmony_ci	return err;
36878c2ecf20Sopenharmony_ci}
36888c2ecf20Sopenharmony_ci
36898c2ecf20Sopenharmony_cistatic struct dentry* d_find_primary(struct inode *inode)
36908c2ecf20Sopenharmony_ci{
36918c2ecf20Sopenharmony_ci	struct dentry *alias, *dn = NULL;
36928c2ecf20Sopenharmony_ci
36938c2ecf20Sopenharmony_ci	if (hlist_empty(&inode->i_dentry))
36948c2ecf20Sopenharmony_ci		return NULL;
36958c2ecf20Sopenharmony_ci
36968c2ecf20Sopenharmony_ci	spin_lock(&inode->i_lock);
36978c2ecf20Sopenharmony_ci	if (hlist_empty(&inode->i_dentry))
36988c2ecf20Sopenharmony_ci		goto out_unlock;
36998c2ecf20Sopenharmony_ci
37008c2ecf20Sopenharmony_ci	if (S_ISDIR(inode->i_mode)) {
37018c2ecf20Sopenharmony_ci		alias = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias);
37028c2ecf20Sopenharmony_ci		if (!IS_ROOT(alias))
37038c2ecf20Sopenharmony_ci			dn = dget(alias);
37048c2ecf20Sopenharmony_ci		goto out_unlock;
37058c2ecf20Sopenharmony_ci	}
37068c2ecf20Sopenharmony_ci
37078c2ecf20Sopenharmony_ci	hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
37088c2ecf20Sopenharmony_ci		spin_lock(&alias->d_lock);
37098c2ecf20Sopenharmony_ci		if (!d_unhashed(alias) &&
37108c2ecf20Sopenharmony_ci		    (ceph_dentry(alias)->flags & CEPH_DENTRY_PRIMARY_LINK)) {
37118c2ecf20Sopenharmony_ci			dn = dget_dlock(alias);
37128c2ecf20Sopenharmony_ci		}
37138c2ecf20Sopenharmony_ci		spin_unlock(&alias->d_lock);
37148c2ecf20Sopenharmony_ci		if (dn)
37158c2ecf20Sopenharmony_ci			break;
37168c2ecf20Sopenharmony_ci	}
37178c2ecf20Sopenharmony_ciout_unlock:
37188c2ecf20Sopenharmony_ci	spin_unlock(&inode->i_lock);
37198c2ecf20Sopenharmony_ci	return dn;
37208c2ecf20Sopenharmony_ci}
37218c2ecf20Sopenharmony_ci
37228c2ecf20Sopenharmony_ci/*
37238c2ecf20Sopenharmony_ci * Encode information about a cap for a reconnect with the MDS.
37248c2ecf20Sopenharmony_ci */
37258c2ecf20Sopenharmony_cistatic int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap,
37268c2ecf20Sopenharmony_ci			  void *arg)
37278c2ecf20Sopenharmony_ci{
37288c2ecf20Sopenharmony_ci	union {
37298c2ecf20Sopenharmony_ci		struct ceph_mds_cap_reconnect v2;
37308c2ecf20Sopenharmony_ci		struct ceph_mds_cap_reconnect_v1 v1;
37318c2ecf20Sopenharmony_ci	} rec;
37328c2ecf20Sopenharmony_ci	struct ceph_inode_info *ci = cap->ci;
37338c2ecf20Sopenharmony_ci	struct ceph_reconnect_state *recon_state = arg;
37348c2ecf20Sopenharmony_ci	struct ceph_pagelist *pagelist = recon_state->pagelist;
37358c2ecf20Sopenharmony_ci	struct dentry *dentry;
37368c2ecf20Sopenharmony_ci	char *path;
37378c2ecf20Sopenharmony_ci	int pathlen = 0, err;
37388c2ecf20Sopenharmony_ci	u64 pathbase;
37398c2ecf20Sopenharmony_ci	u64 snap_follows;
37408c2ecf20Sopenharmony_ci
37418c2ecf20Sopenharmony_ci	dout(" adding %p ino %llx.%llx cap %p %lld %s\n",
37428c2ecf20Sopenharmony_ci	     inode, ceph_vinop(inode), cap, cap->cap_id,
37438c2ecf20Sopenharmony_ci	     ceph_cap_string(cap->issued));
37448c2ecf20Sopenharmony_ci
37458c2ecf20Sopenharmony_ci	dentry = d_find_primary(inode);
37468c2ecf20Sopenharmony_ci	if (dentry) {
37478c2ecf20Sopenharmony_ci		/* set pathbase to parent dir when msg_version >= 2 */
37488c2ecf20Sopenharmony_ci		path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase,
37498c2ecf20Sopenharmony_ci					    recon_state->msg_version >= 2);
37508c2ecf20Sopenharmony_ci		dput(dentry);
37518c2ecf20Sopenharmony_ci		if (IS_ERR(path)) {
37528c2ecf20Sopenharmony_ci			err = PTR_ERR(path);
37538c2ecf20Sopenharmony_ci			goto out_err;
37548c2ecf20Sopenharmony_ci		}
37558c2ecf20Sopenharmony_ci	} else {
37568c2ecf20Sopenharmony_ci		path = NULL;
37578c2ecf20Sopenharmony_ci		pathbase = 0;
37588c2ecf20Sopenharmony_ci	}
37598c2ecf20Sopenharmony_ci
37608c2ecf20Sopenharmony_ci	spin_lock(&ci->i_ceph_lock);
37618c2ecf20Sopenharmony_ci	cap->seq = 0;        /* reset cap seq */
37628c2ecf20Sopenharmony_ci	cap->issue_seq = 0;  /* and issue_seq */
37638c2ecf20Sopenharmony_ci	cap->mseq = 0;       /* and migrate_seq */
37648c2ecf20Sopenharmony_ci	cap->cap_gen = cap->session->s_cap_gen;
37658c2ecf20Sopenharmony_ci
37668c2ecf20Sopenharmony_ci	/* These are lost when the session goes away */
37678c2ecf20Sopenharmony_ci	if (S_ISDIR(inode->i_mode)) {
37688c2ecf20Sopenharmony_ci		if (cap->issued & CEPH_CAP_DIR_CREATE) {
37698c2ecf20Sopenharmony_ci			ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns));
37708c2ecf20Sopenharmony_ci			memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout));
37718c2ecf20Sopenharmony_ci		}
37728c2ecf20Sopenharmony_ci		cap->issued &= ~CEPH_CAP_ANY_DIR_OPS;
37738c2ecf20Sopenharmony_ci	}
37748c2ecf20Sopenharmony_ci
37758c2ecf20Sopenharmony_ci	if (recon_state->msg_version >= 2) {
37768c2ecf20Sopenharmony_ci		rec.v2.cap_id = cpu_to_le64(cap->cap_id);
37778c2ecf20Sopenharmony_ci		rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
37788c2ecf20Sopenharmony_ci		rec.v2.issued = cpu_to_le32(cap->issued);
37798c2ecf20Sopenharmony_ci		rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
37808c2ecf20Sopenharmony_ci		rec.v2.pathbase = cpu_to_le64(pathbase);
37818c2ecf20Sopenharmony_ci		rec.v2.flock_len = (__force __le32)
37828c2ecf20Sopenharmony_ci			((ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) ? 0 : 1);
37838c2ecf20Sopenharmony_ci	} else {
37848c2ecf20Sopenharmony_ci		rec.v1.cap_id = cpu_to_le64(cap->cap_id);
37858c2ecf20Sopenharmony_ci		rec.v1.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
37868c2ecf20Sopenharmony_ci		rec.v1.issued = cpu_to_le32(cap->issued);
37878c2ecf20Sopenharmony_ci		rec.v1.size = cpu_to_le64(inode->i_size);
37888c2ecf20Sopenharmony_ci		ceph_encode_timespec64(&rec.v1.mtime, &inode->i_mtime);
37898c2ecf20Sopenharmony_ci		ceph_encode_timespec64(&rec.v1.atime, &inode->i_atime);
37908c2ecf20Sopenharmony_ci		rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
37918c2ecf20Sopenharmony_ci		rec.v1.pathbase = cpu_to_le64(pathbase);
37928c2ecf20Sopenharmony_ci	}
37938c2ecf20Sopenharmony_ci
37948c2ecf20Sopenharmony_ci	if (list_empty(&ci->i_cap_snaps)) {
37958c2ecf20Sopenharmony_ci		snap_follows = ci->i_head_snapc ? ci->i_head_snapc->seq : 0;
37968c2ecf20Sopenharmony_ci	} else {
37978c2ecf20Sopenharmony_ci		struct ceph_cap_snap *capsnap =
37988c2ecf20Sopenharmony_ci			list_first_entry(&ci->i_cap_snaps,
37998c2ecf20Sopenharmony_ci					 struct ceph_cap_snap, ci_item);
38008c2ecf20Sopenharmony_ci		snap_follows = capsnap->follows;
38018c2ecf20Sopenharmony_ci	}
38028c2ecf20Sopenharmony_ci	spin_unlock(&ci->i_ceph_lock);
38038c2ecf20Sopenharmony_ci
38048c2ecf20Sopenharmony_ci	if (recon_state->msg_version >= 2) {
38058c2ecf20Sopenharmony_ci		int num_fcntl_locks, num_flock_locks;
38068c2ecf20Sopenharmony_ci		struct ceph_filelock *flocks = NULL;
38078c2ecf20Sopenharmony_ci		size_t struct_len, total_len = sizeof(u64);
38088c2ecf20Sopenharmony_ci		u8 struct_v = 0;
38098c2ecf20Sopenharmony_ci
38108c2ecf20Sopenharmony_ciencode_again:
38118c2ecf20Sopenharmony_ci		if (rec.v2.flock_len) {
38128c2ecf20Sopenharmony_ci			ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks);
38138c2ecf20Sopenharmony_ci		} else {
38148c2ecf20Sopenharmony_ci			num_fcntl_locks = 0;
38158c2ecf20Sopenharmony_ci			num_flock_locks = 0;
38168c2ecf20Sopenharmony_ci		}
38178c2ecf20Sopenharmony_ci		if (num_fcntl_locks + num_flock_locks > 0) {
38188c2ecf20Sopenharmony_ci			flocks = kmalloc_array(num_fcntl_locks + num_flock_locks,
38198c2ecf20Sopenharmony_ci					       sizeof(struct ceph_filelock),
38208c2ecf20Sopenharmony_ci					       GFP_NOFS);
38218c2ecf20Sopenharmony_ci			if (!flocks) {
38228c2ecf20Sopenharmony_ci				err = -ENOMEM;
38238c2ecf20Sopenharmony_ci				goto out_err;
38248c2ecf20Sopenharmony_ci			}
38258c2ecf20Sopenharmony_ci			err = ceph_encode_locks_to_buffer(inode, flocks,
38268c2ecf20Sopenharmony_ci							  num_fcntl_locks,
38278c2ecf20Sopenharmony_ci							  num_flock_locks);
38288c2ecf20Sopenharmony_ci			if (err) {
38298c2ecf20Sopenharmony_ci				kfree(flocks);
38308c2ecf20Sopenharmony_ci				flocks = NULL;
38318c2ecf20Sopenharmony_ci				if (err == -ENOSPC)
38328c2ecf20Sopenharmony_ci					goto encode_again;
38338c2ecf20Sopenharmony_ci				goto out_err;
38348c2ecf20Sopenharmony_ci			}
38358c2ecf20Sopenharmony_ci		} else {
38368c2ecf20Sopenharmony_ci			kfree(flocks);
38378c2ecf20Sopenharmony_ci			flocks = NULL;
38388c2ecf20Sopenharmony_ci		}
38398c2ecf20Sopenharmony_ci
38408c2ecf20Sopenharmony_ci		if (recon_state->msg_version >= 3) {
38418c2ecf20Sopenharmony_ci			/* version, compat_version and struct_len */
38428c2ecf20Sopenharmony_ci			total_len += 2 * sizeof(u8) + sizeof(u32);
38438c2ecf20Sopenharmony_ci			struct_v = 2;
38448c2ecf20Sopenharmony_ci		}
38458c2ecf20Sopenharmony_ci		/*
38468c2ecf20Sopenharmony_ci		 * number of encoded locks is stable, so copy to pagelist
38478c2ecf20Sopenharmony_ci		 */
38488c2ecf20Sopenharmony_ci		struct_len = 2 * sizeof(u32) +
38498c2ecf20Sopenharmony_ci			    (num_fcntl_locks + num_flock_locks) *
38508c2ecf20Sopenharmony_ci			    sizeof(struct ceph_filelock);
38518c2ecf20Sopenharmony_ci		rec.v2.flock_len = cpu_to_le32(struct_len);
38528c2ecf20Sopenharmony_ci
38538c2ecf20Sopenharmony_ci		struct_len += sizeof(u32) + pathlen + sizeof(rec.v2);
38548c2ecf20Sopenharmony_ci
38558c2ecf20Sopenharmony_ci		if (struct_v >= 2)
38568c2ecf20Sopenharmony_ci			struct_len += sizeof(u64); /* snap_follows */
38578c2ecf20Sopenharmony_ci
38588c2ecf20Sopenharmony_ci		total_len += struct_len;
38598c2ecf20Sopenharmony_ci
38608c2ecf20Sopenharmony_ci		if (pagelist->length + total_len > RECONNECT_MAX_SIZE) {
38618c2ecf20Sopenharmony_ci			err = send_reconnect_partial(recon_state);
38628c2ecf20Sopenharmony_ci			if (err)
38638c2ecf20Sopenharmony_ci				goto out_freeflocks;
38648c2ecf20Sopenharmony_ci			pagelist = recon_state->pagelist;
38658c2ecf20Sopenharmony_ci		}
38668c2ecf20Sopenharmony_ci
38678c2ecf20Sopenharmony_ci		err = ceph_pagelist_reserve(pagelist, total_len);
38688c2ecf20Sopenharmony_ci		if (err)
38698c2ecf20Sopenharmony_ci			goto out_freeflocks;
38708c2ecf20Sopenharmony_ci
38718c2ecf20Sopenharmony_ci		ceph_pagelist_encode_64(pagelist, ceph_ino(inode));
38728c2ecf20Sopenharmony_ci		if (recon_state->msg_version >= 3) {
38738c2ecf20Sopenharmony_ci			ceph_pagelist_encode_8(pagelist, struct_v);
38748c2ecf20Sopenharmony_ci			ceph_pagelist_encode_8(pagelist, 1);
38758c2ecf20Sopenharmony_ci			ceph_pagelist_encode_32(pagelist, struct_len);
38768c2ecf20Sopenharmony_ci		}
38778c2ecf20Sopenharmony_ci		ceph_pagelist_encode_string(pagelist, path, pathlen);
38788c2ecf20Sopenharmony_ci		ceph_pagelist_append(pagelist, &rec, sizeof(rec.v2));
38798c2ecf20Sopenharmony_ci		ceph_locks_to_pagelist(flocks, pagelist,
38808c2ecf20Sopenharmony_ci				       num_fcntl_locks, num_flock_locks);
38818c2ecf20Sopenharmony_ci		if (struct_v >= 2)
38828c2ecf20Sopenharmony_ci			ceph_pagelist_encode_64(pagelist, snap_follows);
38838c2ecf20Sopenharmony_ciout_freeflocks:
38848c2ecf20Sopenharmony_ci		kfree(flocks);
38858c2ecf20Sopenharmony_ci	} else {
38868c2ecf20Sopenharmony_ci		err = ceph_pagelist_reserve(pagelist,
38878c2ecf20Sopenharmony_ci					    sizeof(u64) + sizeof(u32) +
38888c2ecf20Sopenharmony_ci					    pathlen + sizeof(rec.v1));
38898c2ecf20Sopenharmony_ci		if (err)
38908c2ecf20Sopenharmony_ci			goto out_err;
38918c2ecf20Sopenharmony_ci
38928c2ecf20Sopenharmony_ci		ceph_pagelist_encode_64(pagelist, ceph_ino(inode));
38938c2ecf20Sopenharmony_ci		ceph_pagelist_encode_string(pagelist, path, pathlen);
38948c2ecf20Sopenharmony_ci		ceph_pagelist_append(pagelist, &rec, sizeof(rec.v1));
38958c2ecf20Sopenharmony_ci	}
38968c2ecf20Sopenharmony_ci
38978c2ecf20Sopenharmony_ciout_err:
38988c2ecf20Sopenharmony_ci	ceph_mdsc_free_path(path, pathlen);
38998c2ecf20Sopenharmony_ci	if (!err)
39008c2ecf20Sopenharmony_ci		recon_state->nr_caps++;
39018c2ecf20Sopenharmony_ci	return err;
39028c2ecf20Sopenharmony_ci}
39038c2ecf20Sopenharmony_ci
39048c2ecf20Sopenharmony_cistatic int encode_snap_realms(struct ceph_mds_client *mdsc,
39058c2ecf20Sopenharmony_ci			      struct ceph_reconnect_state *recon_state)
39068c2ecf20Sopenharmony_ci{
39078c2ecf20Sopenharmony_ci	struct rb_node *p;
39088c2ecf20Sopenharmony_ci	struct ceph_pagelist *pagelist = recon_state->pagelist;
39098c2ecf20Sopenharmony_ci	int err = 0;
39108c2ecf20Sopenharmony_ci
39118c2ecf20Sopenharmony_ci	if (recon_state->msg_version >= 4) {
39128c2ecf20Sopenharmony_ci		err = ceph_pagelist_encode_32(pagelist, mdsc->num_snap_realms);
39138c2ecf20Sopenharmony_ci		if (err < 0)
39148c2ecf20Sopenharmony_ci			goto fail;
39158c2ecf20Sopenharmony_ci	}
39168c2ecf20Sopenharmony_ci
39178c2ecf20Sopenharmony_ci	/*
39188c2ecf20Sopenharmony_ci	 * snaprealms.  we provide mds with the ino, seq (version), and
39198c2ecf20Sopenharmony_ci	 * parent for all of our realms.  If the mds has any newer info,
39208c2ecf20Sopenharmony_ci	 * it will tell us.
39218c2ecf20Sopenharmony_ci	 */
39228c2ecf20Sopenharmony_ci	for (p = rb_first(&mdsc->snap_realms); p; p = rb_next(p)) {
39238c2ecf20Sopenharmony_ci		struct ceph_snap_realm *realm =
39248c2ecf20Sopenharmony_ci		       rb_entry(p, struct ceph_snap_realm, node);
39258c2ecf20Sopenharmony_ci		struct ceph_mds_snaprealm_reconnect sr_rec;
39268c2ecf20Sopenharmony_ci
39278c2ecf20Sopenharmony_ci		if (recon_state->msg_version >= 4) {
39288c2ecf20Sopenharmony_ci			size_t need = sizeof(u8) * 2 + sizeof(u32) +
39298c2ecf20Sopenharmony_ci				      sizeof(sr_rec);
39308c2ecf20Sopenharmony_ci
39318c2ecf20Sopenharmony_ci			if (pagelist->length + need > RECONNECT_MAX_SIZE) {
39328c2ecf20Sopenharmony_ci				err = send_reconnect_partial(recon_state);
39338c2ecf20Sopenharmony_ci				if (err)
39348c2ecf20Sopenharmony_ci					goto fail;
39358c2ecf20Sopenharmony_ci				pagelist = recon_state->pagelist;
39368c2ecf20Sopenharmony_ci			}
39378c2ecf20Sopenharmony_ci
39388c2ecf20Sopenharmony_ci			err = ceph_pagelist_reserve(pagelist, need);
39398c2ecf20Sopenharmony_ci			if (err)
39408c2ecf20Sopenharmony_ci				goto fail;
39418c2ecf20Sopenharmony_ci
39428c2ecf20Sopenharmony_ci			ceph_pagelist_encode_8(pagelist, 1);
39438c2ecf20Sopenharmony_ci			ceph_pagelist_encode_8(pagelist, 1);
39448c2ecf20Sopenharmony_ci			ceph_pagelist_encode_32(pagelist, sizeof(sr_rec));
39458c2ecf20Sopenharmony_ci		}
39468c2ecf20Sopenharmony_ci
39478c2ecf20Sopenharmony_ci		dout(" adding snap realm %llx seq %lld parent %llx\n",
39488c2ecf20Sopenharmony_ci		     realm->ino, realm->seq, realm->parent_ino);
39498c2ecf20Sopenharmony_ci		sr_rec.ino = cpu_to_le64(realm->ino);
39508c2ecf20Sopenharmony_ci		sr_rec.seq = cpu_to_le64(realm->seq);
39518c2ecf20Sopenharmony_ci		sr_rec.parent = cpu_to_le64(realm->parent_ino);
39528c2ecf20Sopenharmony_ci
39538c2ecf20Sopenharmony_ci		err = ceph_pagelist_append(pagelist, &sr_rec, sizeof(sr_rec));
39548c2ecf20Sopenharmony_ci		if (err)
39558c2ecf20Sopenharmony_ci			goto fail;
39568c2ecf20Sopenharmony_ci
39578c2ecf20Sopenharmony_ci		recon_state->nr_realms++;
39588c2ecf20Sopenharmony_ci	}
39598c2ecf20Sopenharmony_cifail:
39608c2ecf20Sopenharmony_ci	return err;
39618c2ecf20Sopenharmony_ci}
39628c2ecf20Sopenharmony_ci
39638c2ecf20Sopenharmony_ci
39648c2ecf20Sopenharmony_ci/*
39658c2ecf20Sopenharmony_ci * If an MDS fails and recovers, clients need to reconnect in order to
39668c2ecf20Sopenharmony_ci * reestablish shared state.  This includes all caps issued through
39678c2ecf20Sopenharmony_ci * this session _and_ the snap_realm hierarchy.  Because it's not
39688c2ecf20Sopenharmony_ci * clear which snap realms the mds cares about, we send everything we
39698c2ecf20Sopenharmony_ci * know about.. that ensures we'll then get any new info the
39708c2ecf20Sopenharmony_ci * recovering MDS might have.
39718c2ecf20Sopenharmony_ci *
39728c2ecf20Sopenharmony_ci * This is a relatively heavyweight operation, but it's rare.
39738c2ecf20Sopenharmony_ci */
39748c2ecf20Sopenharmony_cistatic void send_mds_reconnect(struct ceph_mds_client *mdsc,
39758c2ecf20Sopenharmony_ci			       struct ceph_mds_session *session)
39768c2ecf20Sopenharmony_ci{
39778c2ecf20Sopenharmony_ci	struct ceph_msg *reply;
39788c2ecf20Sopenharmony_ci	int mds = session->s_mds;
39798c2ecf20Sopenharmony_ci	int err = -ENOMEM;
39808c2ecf20Sopenharmony_ci	struct ceph_reconnect_state recon_state = {
39818c2ecf20Sopenharmony_ci		.session = session,
39828c2ecf20Sopenharmony_ci	};
39838c2ecf20Sopenharmony_ci	LIST_HEAD(dispose);
39848c2ecf20Sopenharmony_ci
39858c2ecf20Sopenharmony_ci	pr_info("mds%d reconnect start\n", mds);
39868c2ecf20Sopenharmony_ci
39878c2ecf20Sopenharmony_ci	recon_state.pagelist = ceph_pagelist_alloc(GFP_NOFS);
39888c2ecf20Sopenharmony_ci	if (!recon_state.pagelist)
39898c2ecf20Sopenharmony_ci		goto fail_nopagelist;
39908c2ecf20Sopenharmony_ci
39918c2ecf20Sopenharmony_ci	reply = ceph_msg_new2(CEPH_MSG_CLIENT_RECONNECT, 0, 1, GFP_NOFS, false);
39928c2ecf20Sopenharmony_ci	if (!reply)
39938c2ecf20Sopenharmony_ci		goto fail_nomsg;
39948c2ecf20Sopenharmony_ci
39958c2ecf20Sopenharmony_ci	xa_destroy(&session->s_delegated_inos);
39968c2ecf20Sopenharmony_ci
39978c2ecf20Sopenharmony_ci	mutex_lock(&session->s_mutex);
39988c2ecf20Sopenharmony_ci	session->s_state = CEPH_MDS_SESSION_RECONNECTING;
39998c2ecf20Sopenharmony_ci	session->s_seq = 0;
40008c2ecf20Sopenharmony_ci
40018c2ecf20Sopenharmony_ci	dout("session %p state %s\n", session,
40028c2ecf20Sopenharmony_ci	     ceph_session_state_name(session->s_state));
40038c2ecf20Sopenharmony_ci
40048c2ecf20Sopenharmony_ci	spin_lock(&session->s_gen_ttl_lock);
40058c2ecf20Sopenharmony_ci	session->s_cap_gen++;
40068c2ecf20Sopenharmony_ci	spin_unlock(&session->s_gen_ttl_lock);
40078c2ecf20Sopenharmony_ci
40088c2ecf20Sopenharmony_ci	spin_lock(&session->s_cap_lock);
40098c2ecf20Sopenharmony_ci	/* don't know if session is readonly */
40108c2ecf20Sopenharmony_ci	session->s_readonly = 0;
40118c2ecf20Sopenharmony_ci	/*
40128c2ecf20Sopenharmony_ci	 * notify __ceph_remove_cap() that we are composing cap reconnect.
40138c2ecf20Sopenharmony_ci	 * If a cap get released before being added to the cap reconnect,
40148c2ecf20Sopenharmony_ci	 * __ceph_remove_cap() should skip queuing cap release.
40158c2ecf20Sopenharmony_ci	 */
40168c2ecf20Sopenharmony_ci	session->s_cap_reconnect = 1;
40178c2ecf20Sopenharmony_ci	/* drop old cap expires; we're about to reestablish that state */
40188c2ecf20Sopenharmony_ci	detach_cap_releases(session, &dispose);
40198c2ecf20Sopenharmony_ci	spin_unlock(&session->s_cap_lock);
40208c2ecf20Sopenharmony_ci	dispose_cap_releases(mdsc, &dispose);
40218c2ecf20Sopenharmony_ci
40228c2ecf20Sopenharmony_ci	/* trim unused caps to reduce MDS's cache rejoin time */
40238c2ecf20Sopenharmony_ci	if (mdsc->fsc->sb->s_root)
40248c2ecf20Sopenharmony_ci		shrink_dcache_parent(mdsc->fsc->sb->s_root);
40258c2ecf20Sopenharmony_ci
40268c2ecf20Sopenharmony_ci	ceph_con_close(&session->s_con);
40278c2ecf20Sopenharmony_ci	ceph_con_open(&session->s_con,
40288c2ecf20Sopenharmony_ci		      CEPH_ENTITY_TYPE_MDS, mds,
40298c2ecf20Sopenharmony_ci		      ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
40308c2ecf20Sopenharmony_ci
40318c2ecf20Sopenharmony_ci	/* replay unsafe requests */
40328c2ecf20Sopenharmony_ci	replay_unsafe_requests(mdsc, session);
40338c2ecf20Sopenharmony_ci
40348c2ecf20Sopenharmony_ci	ceph_early_kick_flushing_caps(mdsc, session);
40358c2ecf20Sopenharmony_ci
40368c2ecf20Sopenharmony_ci	down_read(&mdsc->snap_rwsem);
40378c2ecf20Sopenharmony_ci
40388c2ecf20Sopenharmony_ci	/* placeholder for nr_caps */
40398c2ecf20Sopenharmony_ci	err = ceph_pagelist_encode_32(recon_state.pagelist, 0);
40408c2ecf20Sopenharmony_ci	if (err)
40418c2ecf20Sopenharmony_ci		goto fail;
40428c2ecf20Sopenharmony_ci
40438c2ecf20Sopenharmony_ci	if (test_bit(CEPHFS_FEATURE_MULTI_RECONNECT, &session->s_features)) {
40448c2ecf20Sopenharmony_ci		recon_state.msg_version = 3;
40458c2ecf20Sopenharmony_ci		recon_state.allow_multi = true;
40468c2ecf20Sopenharmony_ci	} else if (session->s_con.peer_features & CEPH_FEATURE_MDSENC) {
40478c2ecf20Sopenharmony_ci		recon_state.msg_version = 3;
40488c2ecf20Sopenharmony_ci	} else {
40498c2ecf20Sopenharmony_ci		recon_state.msg_version = 2;
40508c2ecf20Sopenharmony_ci	}
40518c2ecf20Sopenharmony_ci	/* trsaverse this session's caps */
40528c2ecf20Sopenharmony_ci	err = ceph_iterate_session_caps(session, reconnect_caps_cb, &recon_state);
40538c2ecf20Sopenharmony_ci
40548c2ecf20Sopenharmony_ci	spin_lock(&session->s_cap_lock);
40558c2ecf20Sopenharmony_ci	session->s_cap_reconnect = 0;
40568c2ecf20Sopenharmony_ci	spin_unlock(&session->s_cap_lock);
40578c2ecf20Sopenharmony_ci
40588c2ecf20Sopenharmony_ci	if (err < 0)
40598c2ecf20Sopenharmony_ci		goto fail;
40608c2ecf20Sopenharmony_ci
40618c2ecf20Sopenharmony_ci	/* check if all realms can be encoded into current message */
40628c2ecf20Sopenharmony_ci	if (mdsc->num_snap_realms) {
40638c2ecf20Sopenharmony_ci		size_t total_len =
40648c2ecf20Sopenharmony_ci			recon_state.pagelist->length +
40658c2ecf20Sopenharmony_ci			mdsc->num_snap_realms *
40668c2ecf20Sopenharmony_ci			sizeof(struct ceph_mds_snaprealm_reconnect);
40678c2ecf20Sopenharmony_ci		if (recon_state.msg_version >= 4) {
40688c2ecf20Sopenharmony_ci			/* number of realms */
40698c2ecf20Sopenharmony_ci			total_len += sizeof(u32);
40708c2ecf20Sopenharmony_ci			/* version, compat_version and struct_len */
40718c2ecf20Sopenharmony_ci			total_len += mdsc->num_snap_realms *
40728c2ecf20Sopenharmony_ci				     (2 * sizeof(u8) + sizeof(u32));
40738c2ecf20Sopenharmony_ci		}
40748c2ecf20Sopenharmony_ci		if (total_len > RECONNECT_MAX_SIZE) {
40758c2ecf20Sopenharmony_ci			if (!recon_state.allow_multi) {
40768c2ecf20Sopenharmony_ci				err = -ENOSPC;
40778c2ecf20Sopenharmony_ci				goto fail;
40788c2ecf20Sopenharmony_ci			}
40798c2ecf20Sopenharmony_ci			if (recon_state.nr_caps) {
40808c2ecf20Sopenharmony_ci				err = send_reconnect_partial(&recon_state);
40818c2ecf20Sopenharmony_ci				if (err)
40828c2ecf20Sopenharmony_ci					goto fail;
40838c2ecf20Sopenharmony_ci			}
40848c2ecf20Sopenharmony_ci			recon_state.msg_version = 5;
40858c2ecf20Sopenharmony_ci		}
40868c2ecf20Sopenharmony_ci	}
40878c2ecf20Sopenharmony_ci
40888c2ecf20Sopenharmony_ci	err = encode_snap_realms(mdsc, &recon_state);
40898c2ecf20Sopenharmony_ci	if (err < 0)
40908c2ecf20Sopenharmony_ci		goto fail;
40918c2ecf20Sopenharmony_ci
40928c2ecf20Sopenharmony_ci	if (recon_state.msg_version >= 5) {
40938c2ecf20Sopenharmony_ci		err = ceph_pagelist_encode_8(recon_state.pagelist, 0);
40948c2ecf20Sopenharmony_ci		if (err < 0)
40958c2ecf20Sopenharmony_ci			goto fail;
40968c2ecf20Sopenharmony_ci	}
40978c2ecf20Sopenharmony_ci
40988c2ecf20Sopenharmony_ci	if (recon_state.nr_caps || recon_state.nr_realms) {
40998c2ecf20Sopenharmony_ci		struct page *page =
41008c2ecf20Sopenharmony_ci			list_first_entry(&recon_state.pagelist->head,
41018c2ecf20Sopenharmony_ci					struct page, lru);
41028c2ecf20Sopenharmony_ci		__le32 *addr = kmap_atomic(page);
41038c2ecf20Sopenharmony_ci		if (recon_state.nr_caps) {
41048c2ecf20Sopenharmony_ci			WARN_ON(recon_state.nr_realms != mdsc->num_snap_realms);
41058c2ecf20Sopenharmony_ci			*addr = cpu_to_le32(recon_state.nr_caps);
41068c2ecf20Sopenharmony_ci		} else if (recon_state.msg_version >= 4) {
41078c2ecf20Sopenharmony_ci			*(addr + 1) = cpu_to_le32(recon_state.nr_realms);
41088c2ecf20Sopenharmony_ci		}
41098c2ecf20Sopenharmony_ci		kunmap_atomic(addr);
41108c2ecf20Sopenharmony_ci	}
41118c2ecf20Sopenharmony_ci
41128c2ecf20Sopenharmony_ci	reply->hdr.version = cpu_to_le16(recon_state.msg_version);
41138c2ecf20Sopenharmony_ci	if (recon_state.msg_version >= 4)
41148c2ecf20Sopenharmony_ci		reply->hdr.compat_version = cpu_to_le16(4);
41158c2ecf20Sopenharmony_ci
41168c2ecf20Sopenharmony_ci	reply->hdr.data_len = cpu_to_le32(recon_state.pagelist->length);
41178c2ecf20Sopenharmony_ci	ceph_msg_data_add_pagelist(reply, recon_state.pagelist);
41188c2ecf20Sopenharmony_ci
41198c2ecf20Sopenharmony_ci	ceph_con_send(&session->s_con, reply);
41208c2ecf20Sopenharmony_ci
41218c2ecf20Sopenharmony_ci	mutex_unlock(&session->s_mutex);
41228c2ecf20Sopenharmony_ci
41238c2ecf20Sopenharmony_ci	mutex_lock(&mdsc->mutex);
41248c2ecf20Sopenharmony_ci	__wake_requests(mdsc, &session->s_waiting);
41258c2ecf20Sopenharmony_ci	mutex_unlock(&mdsc->mutex);
41268c2ecf20Sopenharmony_ci
41278c2ecf20Sopenharmony_ci	up_read(&mdsc->snap_rwsem);
41288c2ecf20Sopenharmony_ci	ceph_pagelist_release(recon_state.pagelist);
41298c2ecf20Sopenharmony_ci	return;
41308c2ecf20Sopenharmony_ci
41318c2ecf20Sopenharmony_cifail:
41328c2ecf20Sopenharmony_ci	ceph_msg_put(reply);
41338c2ecf20Sopenharmony_ci	up_read(&mdsc->snap_rwsem);
41348c2ecf20Sopenharmony_ci	mutex_unlock(&session->s_mutex);
41358c2ecf20Sopenharmony_cifail_nomsg:
41368c2ecf20Sopenharmony_ci	ceph_pagelist_release(recon_state.pagelist);
41378c2ecf20Sopenharmony_cifail_nopagelist:
41388c2ecf20Sopenharmony_ci	pr_err("error %d preparing reconnect for mds%d\n", err, mds);
41398c2ecf20Sopenharmony_ci	return;
41408c2ecf20Sopenharmony_ci}
41418c2ecf20Sopenharmony_ci
41428c2ecf20Sopenharmony_ci
41438c2ecf20Sopenharmony_ci/*
41448c2ecf20Sopenharmony_ci * compare old and new mdsmaps, kicking requests
41458c2ecf20Sopenharmony_ci * and closing out old connections as necessary
41468c2ecf20Sopenharmony_ci *
41478c2ecf20Sopenharmony_ci * called under mdsc->mutex.
41488c2ecf20Sopenharmony_ci */
41498c2ecf20Sopenharmony_cistatic void check_new_map(struct ceph_mds_client *mdsc,
41508c2ecf20Sopenharmony_ci			  struct ceph_mdsmap *newmap,
41518c2ecf20Sopenharmony_ci			  struct ceph_mdsmap *oldmap)
41528c2ecf20Sopenharmony_ci{
41538c2ecf20Sopenharmony_ci	int i;
41548c2ecf20Sopenharmony_ci	int oldstate, newstate;
41558c2ecf20Sopenharmony_ci	struct ceph_mds_session *s;
41568c2ecf20Sopenharmony_ci
41578c2ecf20Sopenharmony_ci	dout("check_new_map new %u old %u\n",
41588c2ecf20Sopenharmony_ci	     newmap->m_epoch, oldmap->m_epoch);
41598c2ecf20Sopenharmony_ci
41608c2ecf20Sopenharmony_ci	for (i = 0; i < oldmap->possible_max_rank && i < mdsc->max_sessions; i++) {
41618c2ecf20Sopenharmony_ci		if (!mdsc->sessions[i])
41628c2ecf20Sopenharmony_ci			continue;
41638c2ecf20Sopenharmony_ci		s = mdsc->sessions[i];
41648c2ecf20Sopenharmony_ci		oldstate = ceph_mdsmap_get_state(oldmap, i);
41658c2ecf20Sopenharmony_ci		newstate = ceph_mdsmap_get_state(newmap, i);
41668c2ecf20Sopenharmony_ci
41678c2ecf20Sopenharmony_ci		dout("check_new_map mds%d state %s%s -> %s%s (session %s)\n",
41688c2ecf20Sopenharmony_ci		     i, ceph_mds_state_name(oldstate),
41698c2ecf20Sopenharmony_ci		     ceph_mdsmap_is_laggy(oldmap, i) ? " (laggy)" : "",
41708c2ecf20Sopenharmony_ci		     ceph_mds_state_name(newstate),
41718c2ecf20Sopenharmony_ci		     ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "",
41728c2ecf20Sopenharmony_ci		     ceph_session_state_name(s->s_state));
41738c2ecf20Sopenharmony_ci
41748c2ecf20Sopenharmony_ci		if (i >= newmap->possible_max_rank) {
41758c2ecf20Sopenharmony_ci			/* force close session for stopped mds */
41768c2ecf20Sopenharmony_ci			ceph_get_mds_session(s);
41778c2ecf20Sopenharmony_ci			__unregister_session(mdsc, s);
41788c2ecf20Sopenharmony_ci			__wake_requests(mdsc, &s->s_waiting);
41798c2ecf20Sopenharmony_ci			mutex_unlock(&mdsc->mutex);
41808c2ecf20Sopenharmony_ci
41818c2ecf20Sopenharmony_ci			mutex_lock(&s->s_mutex);
41828c2ecf20Sopenharmony_ci			cleanup_session_requests(mdsc, s);
41838c2ecf20Sopenharmony_ci			remove_session_caps(s);
41848c2ecf20Sopenharmony_ci			mutex_unlock(&s->s_mutex);
41858c2ecf20Sopenharmony_ci
41868c2ecf20Sopenharmony_ci			ceph_put_mds_session(s);
41878c2ecf20Sopenharmony_ci
41888c2ecf20Sopenharmony_ci			mutex_lock(&mdsc->mutex);
41898c2ecf20Sopenharmony_ci			kick_requests(mdsc, i);
41908c2ecf20Sopenharmony_ci			continue;
41918c2ecf20Sopenharmony_ci		}
41928c2ecf20Sopenharmony_ci
41938c2ecf20Sopenharmony_ci		if (memcmp(ceph_mdsmap_get_addr(oldmap, i),
41948c2ecf20Sopenharmony_ci			   ceph_mdsmap_get_addr(newmap, i),
41958c2ecf20Sopenharmony_ci			   sizeof(struct ceph_entity_addr))) {
41968c2ecf20Sopenharmony_ci			/* just close it */
41978c2ecf20Sopenharmony_ci			mutex_unlock(&mdsc->mutex);
41988c2ecf20Sopenharmony_ci			mutex_lock(&s->s_mutex);
41998c2ecf20Sopenharmony_ci			mutex_lock(&mdsc->mutex);
42008c2ecf20Sopenharmony_ci			ceph_con_close(&s->s_con);
42018c2ecf20Sopenharmony_ci			mutex_unlock(&s->s_mutex);
42028c2ecf20Sopenharmony_ci			s->s_state = CEPH_MDS_SESSION_RESTARTING;
42038c2ecf20Sopenharmony_ci		} else if (oldstate == newstate) {
42048c2ecf20Sopenharmony_ci			continue;  /* nothing new with this mds */
42058c2ecf20Sopenharmony_ci		}
42068c2ecf20Sopenharmony_ci
42078c2ecf20Sopenharmony_ci		/*
42088c2ecf20Sopenharmony_ci		 * send reconnect?
42098c2ecf20Sopenharmony_ci		 */
42108c2ecf20Sopenharmony_ci		if (s->s_state == CEPH_MDS_SESSION_RESTARTING &&
42118c2ecf20Sopenharmony_ci		    newstate >= CEPH_MDS_STATE_RECONNECT) {
42128c2ecf20Sopenharmony_ci			mutex_unlock(&mdsc->mutex);
42138c2ecf20Sopenharmony_ci			send_mds_reconnect(mdsc, s);
42148c2ecf20Sopenharmony_ci			mutex_lock(&mdsc->mutex);
42158c2ecf20Sopenharmony_ci		}
42168c2ecf20Sopenharmony_ci
42178c2ecf20Sopenharmony_ci		/*
42188c2ecf20Sopenharmony_ci		 * kick request on any mds that has gone active.
42198c2ecf20Sopenharmony_ci		 */
42208c2ecf20Sopenharmony_ci		if (oldstate < CEPH_MDS_STATE_ACTIVE &&
42218c2ecf20Sopenharmony_ci		    newstate >= CEPH_MDS_STATE_ACTIVE) {
42228c2ecf20Sopenharmony_ci			if (oldstate != CEPH_MDS_STATE_CREATING &&
42238c2ecf20Sopenharmony_ci			    oldstate != CEPH_MDS_STATE_STARTING)
42248c2ecf20Sopenharmony_ci				pr_info("mds%d recovery completed\n", s->s_mds);
42258c2ecf20Sopenharmony_ci			kick_requests(mdsc, i);
42268c2ecf20Sopenharmony_ci			mutex_unlock(&mdsc->mutex);
42278c2ecf20Sopenharmony_ci			mutex_lock(&s->s_mutex);
42288c2ecf20Sopenharmony_ci			mutex_lock(&mdsc->mutex);
42298c2ecf20Sopenharmony_ci			ceph_kick_flushing_caps(mdsc, s);
42308c2ecf20Sopenharmony_ci			mutex_unlock(&s->s_mutex);
42318c2ecf20Sopenharmony_ci			wake_up_session_caps(s, RECONNECT);
42328c2ecf20Sopenharmony_ci		}
42338c2ecf20Sopenharmony_ci	}
42348c2ecf20Sopenharmony_ci
42358c2ecf20Sopenharmony_ci	for (i = 0; i < newmap->possible_max_rank && i < mdsc->max_sessions; i++) {
42368c2ecf20Sopenharmony_ci		s = mdsc->sessions[i];
42378c2ecf20Sopenharmony_ci		if (!s)
42388c2ecf20Sopenharmony_ci			continue;
42398c2ecf20Sopenharmony_ci		if (!ceph_mdsmap_is_laggy(newmap, i))
42408c2ecf20Sopenharmony_ci			continue;
42418c2ecf20Sopenharmony_ci		if (s->s_state == CEPH_MDS_SESSION_OPEN ||
42428c2ecf20Sopenharmony_ci		    s->s_state == CEPH_MDS_SESSION_HUNG ||
42438c2ecf20Sopenharmony_ci		    s->s_state == CEPH_MDS_SESSION_CLOSING) {
42448c2ecf20Sopenharmony_ci			dout(" connecting to export targets of laggy mds%d\n",
42458c2ecf20Sopenharmony_ci			     i);
42468c2ecf20Sopenharmony_ci			__open_export_target_sessions(mdsc, s);
42478c2ecf20Sopenharmony_ci		}
42488c2ecf20Sopenharmony_ci	}
42498c2ecf20Sopenharmony_ci}
42508c2ecf20Sopenharmony_ci
42518c2ecf20Sopenharmony_ci
42528c2ecf20Sopenharmony_ci
42538c2ecf20Sopenharmony_ci/*
42548c2ecf20Sopenharmony_ci * leases
42558c2ecf20Sopenharmony_ci */
42568c2ecf20Sopenharmony_ci
42578c2ecf20Sopenharmony_ci/*
42588c2ecf20Sopenharmony_ci * caller must hold session s_mutex, dentry->d_lock
42598c2ecf20Sopenharmony_ci */
42608c2ecf20Sopenharmony_civoid __ceph_mdsc_drop_dentry_lease(struct dentry *dentry)
42618c2ecf20Sopenharmony_ci{
42628c2ecf20Sopenharmony_ci	struct ceph_dentry_info *di = ceph_dentry(dentry);
42638c2ecf20Sopenharmony_ci
42648c2ecf20Sopenharmony_ci	ceph_put_mds_session(di->lease_session);
42658c2ecf20Sopenharmony_ci	di->lease_session = NULL;
42668c2ecf20Sopenharmony_ci}
42678c2ecf20Sopenharmony_ci
42688c2ecf20Sopenharmony_cistatic void handle_lease(struct ceph_mds_client *mdsc,
42698c2ecf20Sopenharmony_ci			 struct ceph_mds_session *session,
42708c2ecf20Sopenharmony_ci			 struct ceph_msg *msg)
42718c2ecf20Sopenharmony_ci{
42728c2ecf20Sopenharmony_ci	struct super_block *sb = mdsc->fsc->sb;
42738c2ecf20Sopenharmony_ci	struct inode *inode;
42748c2ecf20Sopenharmony_ci	struct dentry *parent, *dentry;
42758c2ecf20Sopenharmony_ci	struct ceph_dentry_info *di;
42768c2ecf20Sopenharmony_ci	int mds = session->s_mds;
42778c2ecf20Sopenharmony_ci	struct ceph_mds_lease *h = msg->front.iov_base;
42788c2ecf20Sopenharmony_ci	u32 seq;
42798c2ecf20Sopenharmony_ci	struct ceph_vino vino;
42808c2ecf20Sopenharmony_ci	struct qstr dname;
42818c2ecf20Sopenharmony_ci	int release = 0;
42828c2ecf20Sopenharmony_ci
42838c2ecf20Sopenharmony_ci	dout("handle_lease from mds%d\n", mds);
42848c2ecf20Sopenharmony_ci
42858c2ecf20Sopenharmony_ci	/* decode */
42868c2ecf20Sopenharmony_ci	if (msg->front.iov_len < sizeof(*h) + sizeof(u32))
42878c2ecf20Sopenharmony_ci		goto bad;
42888c2ecf20Sopenharmony_ci	vino.ino = le64_to_cpu(h->ino);
42898c2ecf20Sopenharmony_ci	vino.snap = CEPH_NOSNAP;
42908c2ecf20Sopenharmony_ci	seq = le32_to_cpu(h->seq);
42918c2ecf20Sopenharmony_ci	dname.len = get_unaligned_le32(h + 1);
42928c2ecf20Sopenharmony_ci	if (msg->front.iov_len < sizeof(*h) + sizeof(u32) + dname.len)
42938c2ecf20Sopenharmony_ci		goto bad;
42948c2ecf20Sopenharmony_ci	dname.name = (void *)(h + 1) + sizeof(u32);
42958c2ecf20Sopenharmony_ci
42968c2ecf20Sopenharmony_ci	/* lookup inode */
42978c2ecf20Sopenharmony_ci	inode = ceph_find_inode(sb, vino);
42988c2ecf20Sopenharmony_ci	dout("handle_lease %s, ino %llx %p %.*s\n",
42998c2ecf20Sopenharmony_ci	     ceph_lease_op_name(h->action), vino.ino, inode,
43008c2ecf20Sopenharmony_ci	     dname.len, dname.name);
43018c2ecf20Sopenharmony_ci
43028c2ecf20Sopenharmony_ci	mutex_lock(&session->s_mutex);
43038c2ecf20Sopenharmony_ci	inc_session_sequence(session);
43048c2ecf20Sopenharmony_ci
43058c2ecf20Sopenharmony_ci	if (!inode) {
43068c2ecf20Sopenharmony_ci		dout("handle_lease no inode %llx\n", vino.ino);
43078c2ecf20Sopenharmony_ci		goto release;
43088c2ecf20Sopenharmony_ci	}
43098c2ecf20Sopenharmony_ci
43108c2ecf20Sopenharmony_ci	/* dentry */
43118c2ecf20Sopenharmony_ci	parent = d_find_alias(inode);
43128c2ecf20Sopenharmony_ci	if (!parent) {
43138c2ecf20Sopenharmony_ci		dout("no parent dentry on inode %p\n", inode);
43148c2ecf20Sopenharmony_ci		WARN_ON(1);
43158c2ecf20Sopenharmony_ci		goto release;  /* hrm... */
43168c2ecf20Sopenharmony_ci	}
43178c2ecf20Sopenharmony_ci	dname.hash = full_name_hash(parent, dname.name, dname.len);
43188c2ecf20Sopenharmony_ci	dentry = d_lookup(parent, &dname);
43198c2ecf20Sopenharmony_ci	dput(parent);
43208c2ecf20Sopenharmony_ci	if (!dentry)
43218c2ecf20Sopenharmony_ci		goto release;
43228c2ecf20Sopenharmony_ci
43238c2ecf20Sopenharmony_ci	spin_lock(&dentry->d_lock);
43248c2ecf20Sopenharmony_ci	di = ceph_dentry(dentry);
43258c2ecf20Sopenharmony_ci	switch (h->action) {
43268c2ecf20Sopenharmony_ci	case CEPH_MDS_LEASE_REVOKE:
43278c2ecf20Sopenharmony_ci		if (di->lease_session == session) {
43288c2ecf20Sopenharmony_ci			if (ceph_seq_cmp(di->lease_seq, seq) > 0)
43298c2ecf20Sopenharmony_ci				h->seq = cpu_to_le32(di->lease_seq);
43308c2ecf20Sopenharmony_ci			__ceph_mdsc_drop_dentry_lease(dentry);
43318c2ecf20Sopenharmony_ci		}
43328c2ecf20Sopenharmony_ci		release = 1;
43338c2ecf20Sopenharmony_ci		break;
43348c2ecf20Sopenharmony_ci
43358c2ecf20Sopenharmony_ci	case CEPH_MDS_LEASE_RENEW:
43368c2ecf20Sopenharmony_ci		if (di->lease_session == session &&
43378c2ecf20Sopenharmony_ci		    di->lease_gen == session->s_cap_gen &&
43388c2ecf20Sopenharmony_ci		    di->lease_renew_from &&
43398c2ecf20Sopenharmony_ci		    di->lease_renew_after == 0) {
43408c2ecf20Sopenharmony_ci			unsigned long duration =
43418c2ecf20Sopenharmony_ci				msecs_to_jiffies(le32_to_cpu(h->duration_ms));
43428c2ecf20Sopenharmony_ci
43438c2ecf20Sopenharmony_ci			di->lease_seq = seq;
43448c2ecf20Sopenharmony_ci			di->time = di->lease_renew_from + duration;
43458c2ecf20Sopenharmony_ci			di->lease_renew_after = di->lease_renew_from +
43468c2ecf20Sopenharmony_ci				(duration >> 1);
43478c2ecf20Sopenharmony_ci			di->lease_renew_from = 0;
43488c2ecf20Sopenharmony_ci		}
43498c2ecf20Sopenharmony_ci		break;
43508c2ecf20Sopenharmony_ci	}
43518c2ecf20Sopenharmony_ci	spin_unlock(&dentry->d_lock);
43528c2ecf20Sopenharmony_ci	dput(dentry);
43538c2ecf20Sopenharmony_ci
43548c2ecf20Sopenharmony_ci	if (!release)
43558c2ecf20Sopenharmony_ci		goto out;
43568c2ecf20Sopenharmony_ci
43578c2ecf20Sopenharmony_cirelease:
43588c2ecf20Sopenharmony_ci	/* let's just reuse the same message */
43598c2ecf20Sopenharmony_ci	h->action = CEPH_MDS_LEASE_REVOKE_ACK;
43608c2ecf20Sopenharmony_ci	ceph_msg_get(msg);
43618c2ecf20Sopenharmony_ci	ceph_con_send(&session->s_con, msg);
43628c2ecf20Sopenharmony_ci
43638c2ecf20Sopenharmony_ciout:
43648c2ecf20Sopenharmony_ci	mutex_unlock(&session->s_mutex);
43658c2ecf20Sopenharmony_ci	/* avoid calling iput_final() in mds dispatch threads */
43668c2ecf20Sopenharmony_ci	ceph_async_iput(inode);
43678c2ecf20Sopenharmony_ci	return;
43688c2ecf20Sopenharmony_ci
43698c2ecf20Sopenharmony_cibad:
43708c2ecf20Sopenharmony_ci	pr_err("corrupt lease message\n");
43718c2ecf20Sopenharmony_ci	ceph_msg_dump(msg);
43728c2ecf20Sopenharmony_ci}
43738c2ecf20Sopenharmony_ci
43748c2ecf20Sopenharmony_civoid ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
43758c2ecf20Sopenharmony_ci			      struct dentry *dentry, char action,
43768c2ecf20Sopenharmony_ci			      u32 seq)
43778c2ecf20Sopenharmony_ci{
43788c2ecf20Sopenharmony_ci	struct ceph_msg *msg;
43798c2ecf20Sopenharmony_ci	struct ceph_mds_lease *lease;
43808c2ecf20Sopenharmony_ci	struct inode *dir;
43818c2ecf20Sopenharmony_ci	int len = sizeof(*lease) + sizeof(u32) + NAME_MAX;
43828c2ecf20Sopenharmony_ci
43838c2ecf20Sopenharmony_ci	dout("lease_send_msg identry %p %s to mds%d\n",
43848c2ecf20Sopenharmony_ci	     dentry, ceph_lease_op_name(action), session->s_mds);
43858c2ecf20Sopenharmony_ci
43868c2ecf20Sopenharmony_ci	msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS, false);
43878c2ecf20Sopenharmony_ci	if (!msg)
43888c2ecf20Sopenharmony_ci		return;
43898c2ecf20Sopenharmony_ci	lease = msg->front.iov_base;
43908c2ecf20Sopenharmony_ci	lease->action = action;
43918c2ecf20Sopenharmony_ci	lease->seq = cpu_to_le32(seq);
43928c2ecf20Sopenharmony_ci
43938c2ecf20Sopenharmony_ci	spin_lock(&dentry->d_lock);
43948c2ecf20Sopenharmony_ci	dir = d_inode(dentry->d_parent);
43958c2ecf20Sopenharmony_ci	lease->ino = cpu_to_le64(ceph_ino(dir));
43968c2ecf20Sopenharmony_ci	lease->first = lease->last = cpu_to_le64(ceph_snap(dir));
43978c2ecf20Sopenharmony_ci
43988c2ecf20Sopenharmony_ci	put_unaligned_le32(dentry->d_name.len, lease + 1);
43998c2ecf20Sopenharmony_ci	memcpy((void *)(lease + 1) + 4,
44008c2ecf20Sopenharmony_ci	       dentry->d_name.name, dentry->d_name.len);
44018c2ecf20Sopenharmony_ci	spin_unlock(&dentry->d_lock);
44028c2ecf20Sopenharmony_ci	/*
44038c2ecf20Sopenharmony_ci	 * if this is a preemptive lease RELEASE, no need to
44048c2ecf20Sopenharmony_ci	 * flush request stream, since the actual request will
44058c2ecf20Sopenharmony_ci	 * soon follow.
44068c2ecf20Sopenharmony_ci	 */
44078c2ecf20Sopenharmony_ci	msg->more_to_follow = (action == CEPH_MDS_LEASE_RELEASE);
44088c2ecf20Sopenharmony_ci
44098c2ecf20Sopenharmony_ci	ceph_con_send(&session->s_con, msg);
44108c2ecf20Sopenharmony_ci}
44118c2ecf20Sopenharmony_ci
/*
 * Take the session's s_mutex and release it again straight away.
 *
 * The lock cannot be acquired until its current holder lets go, so by
 * the time this returns any activity that was running under s_mutex
 * when we were called has finished.
 */
static void lock_unlock_session(struct ceph_mds_session *s)
{
	mutex_lock(&s->s_mutex);
	mutex_unlock(&s->s_mutex);
}
44208c2ecf20Sopenharmony_ci
44218c2ecf20Sopenharmony_cistatic void maybe_recover_session(struct ceph_mds_client *mdsc)
44228c2ecf20Sopenharmony_ci{
44238c2ecf20Sopenharmony_ci	struct ceph_fs_client *fsc = mdsc->fsc;
44248c2ecf20Sopenharmony_ci
44258c2ecf20Sopenharmony_ci	if (!ceph_test_mount_opt(fsc, CLEANRECOVER))
44268c2ecf20Sopenharmony_ci		return;
44278c2ecf20Sopenharmony_ci
44288c2ecf20Sopenharmony_ci	if (READ_ONCE(fsc->mount_state) != CEPH_MOUNT_MOUNTED)
44298c2ecf20Sopenharmony_ci		return;
44308c2ecf20Sopenharmony_ci
44318c2ecf20Sopenharmony_ci	if (!READ_ONCE(fsc->blocklisted))
44328c2ecf20Sopenharmony_ci		return;
44338c2ecf20Sopenharmony_ci
44348c2ecf20Sopenharmony_ci	if (fsc->last_auto_reconnect &&
44358c2ecf20Sopenharmony_ci	    time_before(jiffies, fsc->last_auto_reconnect + HZ * 60 * 30))
44368c2ecf20Sopenharmony_ci		return;
44378c2ecf20Sopenharmony_ci
44388c2ecf20Sopenharmony_ci	pr_info("auto reconnect after blocklisted\n");
44398c2ecf20Sopenharmony_ci	fsc->last_auto_reconnect = jiffies;
44408c2ecf20Sopenharmony_ci	ceph_force_reconnect(fsc->sb);
44418c2ecf20Sopenharmony_ci}
44428c2ecf20Sopenharmony_ci
44438c2ecf20Sopenharmony_cibool check_session_state(struct ceph_mds_session *s)
44448c2ecf20Sopenharmony_ci{
44458c2ecf20Sopenharmony_ci	switch (s->s_state) {
44468c2ecf20Sopenharmony_ci	case CEPH_MDS_SESSION_OPEN:
44478c2ecf20Sopenharmony_ci		if (s->s_ttl && time_after(jiffies, s->s_ttl)) {
44488c2ecf20Sopenharmony_ci			s->s_state = CEPH_MDS_SESSION_HUNG;
44498c2ecf20Sopenharmony_ci			pr_info("mds%d hung\n", s->s_mds);
44508c2ecf20Sopenharmony_ci		}
44518c2ecf20Sopenharmony_ci		break;
44528c2ecf20Sopenharmony_ci	case CEPH_MDS_SESSION_CLOSING:
44538c2ecf20Sopenharmony_ci		/* Should never reach this when we're unmounting */
44548c2ecf20Sopenharmony_ci		WARN_ON_ONCE(s->s_ttl);
44558c2ecf20Sopenharmony_ci		fallthrough;
44568c2ecf20Sopenharmony_ci	case CEPH_MDS_SESSION_NEW:
44578c2ecf20Sopenharmony_ci	case CEPH_MDS_SESSION_RESTARTING:
44588c2ecf20Sopenharmony_ci	case CEPH_MDS_SESSION_CLOSED:
44598c2ecf20Sopenharmony_ci	case CEPH_MDS_SESSION_REJECTED:
44608c2ecf20Sopenharmony_ci		return false;
44618c2ecf20Sopenharmony_ci	}
44628c2ecf20Sopenharmony_ci
44638c2ecf20Sopenharmony_ci	return true;
44648c2ecf20Sopenharmony_ci}
44658c2ecf20Sopenharmony_ci
44668c2ecf20Sopenharmony_ci/*
44678c2ecf20Sopenharmony_ci * If the sequence is incremented while we're waiting on a REQUEST_CLOSE reply,
44688c2ecf20Sopenharmony_ci * then we need to retransmit that request.
44698c2ecf20Sopenharmony_ci */
44708c2ecf20Sopenharmony_civoid inc_session_sequence(struct ceph_mds_session *s)
44718c2ecf20Sopenharmony_ci{
44728c2ecf20Sopenharmony_ci	lockdep_assert_held(&s->s_mutex);
44738c2ecf20Sopenharmony_ci
44748c2ecf20Sopenharmony_ci	s->s_seq++;
44758c2ecf20Sopenharmony_ci
44768c2ecf20Sopenharmony_ci	if (s->s_state == CEPH_MDS_SESSION_CLOSING) {
44778c2ecf20Sopenharmony_ci		int ret;
44788c2ecf20Sopenharmony_ci
44798c2ecf20Sopenharmony_ci		dout("resending session close request for mds%d\n", s->s_mds);
44808c2ecf20Sopenharmony_ci		ret = request_close_session(s);
44818c2ecf20Sopenharmony_ci		if (ret < 0)
44828c2ecf20Sopenharmony_ci			pr_err("unable to close session to mds%d: %d\n",
44838c2ecf20Sopenharmony_ci			       s->s_mds, ret);
44848c2ecf20Sopenharmony_ci	}
44858c2ecf20Sopenharmony_ci}
44868c2ecf20Sopenharmony_ci
44878c2ecf20Sopenharmony_ci/*
44888c2ecf20Sopenharmony_ci * delayed work -- periodically trim expired leases, renew caps with mds.  If
44898c2ecf20Sopenharmony_ci * the @delay parameter is set to 0 or if it's more than 5 secs, the default
44908c2ecf20Sopenharmony_ci * workqueue delay value of 5 secs will be used.
44918c2ecf20Sopenharmony_ci */
44928c2ecf20Sopenharmony_cistatic void schedule_delayed(struct ceph_mds_client *mdsc, unsigned long delay)
44938c2ecf20Sopenharmony_ci{
44948c2ecf20Sopenharmony_ci	unsigned long max_delay = HZ * 5;
44958c2ecf20Sopenharmony_ci
44968c2ecf20Sopenharmony_ci	/* 5 secs default delay */
44978c2ecf20Sopenharmony_ci	if (!delay || (delay > max_delay))
44988c2ecf20Sopenharmony_ci		delay = max_delay;
44998c2ecf20Sopenharmony_ci	schedule_delayed_work(&mdsc->delayed_work,
45008c2ecf20Sopenharmony_ci			      round_jiffies_relative(delay));
45018c2ecf20Sopenharmony_ci}
45028c2ecf20Sopenharmony_ci
45038c2ecf20Sopenharmony_cistatic void delayed_work(struct work_struct *work)
45048c2ecf20Sopenharmony_ci{
45058c2ecf20Sopenharmony_ci	struct ceph_mds_client *mdsc =
45068c2ecf20Sopenharmony_ci		container_of(work, struct ceph_mds_client, delayed_work.work);
45078c2ecf20Sopenharmony_ci	unsigned long delay;
45088c2ecf20Sopenharmony_ci	int renew_interval;
45098c2ecf20Sopenharmony_ci	int renew_caps;
45108c2ecf20Sopenharmony_ci	int i;
45118c2ecf20Sopenharmony_ci
45128c2ecf20Sopenharmony_ci	dout("mdsc delayed_work\n");
45138c2ecf20Sopenharmony_ci
45148c2ecf20Sopenharmony_ci	if (mdsc->stopping >= CEPH_MDSC_STOPPING_FLUSHED)
45158c2ecf20Sopenharmony_ci		return;
45168c2ecf20Sopenharmony_ci
45178c2ecf20Sopenharmony_ci	mutex_lock(&mdsc->mutex);
45188c2ecf20Sopenharmony_ci	renew_interval = mdsc->mdsmap->m_session_timeout >> 2;
45198c2ecf20Sopenharmony_ci	renew_caps = time_after_eq(jiffies, HZ*renew_interval +
45208c2ecf20Sopenharmony_ci				   mdsc->last_renew_caps);
45218c2ecf20Sopenharmony_ci	if (renew_caps)
45228c2ecf20Sopenharmony_ci		mdsc->last_renew_caps = jiffies;
45238c2ecf20Sopenharmony_ci
45248c2ecf20Sopenharmony_ci	for (i = 0; i < mdsc->max_sessions; i++) {
45258c2ecf20Sopenharmony_ci		struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
45268c2ecf20Sopenharmony_ci		if (!s)
45278c2ecf20Sopenharmony_ci			continue;
45288c2ecf20Sopenharmony_ci
45298c2ecf20Sopenharmony_ci		if (!check_session_state(s)) {
45308c2ecf20Sopenharmony_ci			ceph_put_mds_session(s);
45318c2ecf20Sopenharmony_ci			continue;
45328c2ecf20Sopenharmony_ci		}
45338c2ecf20Sopenharmony_ci		mutex_unlock(&mdsc->mutex);
45348c2ecf20Sopenharmony_ci
45358c2ecf20Sopenharmony_ci		mutex_lock(&s->s_mutex);
45368c2ecf20Sopenharmony_ci		if (renew_caps)
45378c2ecf20Sopenharmony_ci			send_renew_caps(mdsc, s);
45388c2ecf20Sopenharmony_ci		else
45398c2ecf20Sopenharmony_ci			ceph_con_keepalive(&s->s_con);
45408c2ecf20Sopenharmony_ci		if (s->s_state == CEPH_MDS_SESSION_OPEN ||
45418c2ecf20Sopenharmony_ci		    s->s_state == CEPH_MDS_SESSION_HUNG)
45428c2ecf20Sopenharmony_ci			ceph_send_cap_releases(mdsc, s);
45438c2ecf20Sopenharmony_ci		mutex_unlock(&s->s_mutex);
45448c2ecf20Sopenharmony_ci		ceph_put_mds_session(s);
45458c2ecf20Sopenharmony_ci
45468c2ecf20Sopenharmony_ci		mutex_lock(&mdsc->mutex);
45478c2ecf20Sopenharmony_ci	}
45488c2ecf20Sopenharmony_ci	mutex_unlock(&mdsc->mutex);
45498c2ecf20Sopenharmony_ci
45508c2ecf20Sopenharmony_ci	delay = ceph_check_delayed_caps(mdsc);
45518c2ecf20Sopenharmony_ci
45528c2ecf20Sopenharmony_ci	ceph_queue_cap_reclaim_work(mdsc);
45538c2ecf20Sopenharmony_ci
45548c2ecf20Sopenharmony_ci	ceph_trim_snapid_map(mdsc);
45558c2ecf20Sopenharmony_ci
45568c2ecf20Sopenharmony_ci	maybe_recover_session(mdsc);
45578c2ecf20Sopenharmony_ci
45588c2ecf20Sopenharmony_ci	schedule_delayed(mdsc, delay);
45598c2ecf20Sopenharmony_ci}
45608c2ecf20Sopenharmony_ci
45618c2ecf20Sopenharmony_ciint ceph_mdsc_init(struct ceph_fs_client *fsc)
45628c2ecf20Sopenharmony_ci
45638c2ecf20Sopenharmony_ci{
45648c2ecf20Sopenharmony_ci	struct ceph_mds_client *mdsc;
45658c2ecf20Sopenharmony_ci	int err;
45668c2ecf20Sopenharmony_ci
45678c2ecf20Sopenharmony_ci	mdsc = kzalloc(sizeof(struct ceph_mds_client), GFP_NOFS);
45688c2ecf20Sopenharmony_ci	if (!mdsc)
45698c2ecf20Sopenharmony_ci		return -ENOMEM;
45708c2ecf20Sopenharmony_ci	mdsc->fsc = fsc;
45718c2ecf20Sopenharmony_ci	mutex_init(&mdsc->mutex);
45728c2ecf20Sopenharmony_ci	mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);
45738c2ecf20Sopenharmony_ci	if (!mdsc->mdsmap) {
45748c2ecf20Sopenharmony_ci		err = -ENOMEM;
45758c2ecf20Sopenharmony_ci		goto err_mdsc;
45768c2ecf20Sopenharmony_ci	}
45778c2ecf20Sopenharmony_ci
45788c2ecf20Sopenharmony_ci	init_completion(&mdsc->safe_umount_waiters);
45798c2ecf20Sopenharmony_ci	init_waitqueue_head(&mdsc->session_close_wq);
45808c2ecf20Sopenharmony_ci	INIT_LIST_HEAD(&mdsc->waiting_for_map);
45818c2ecf20Sopenharmony_ci	mdsc->sessions = NULL;
45828c2ecf20Sopenharmony_ci	atomic_set(&mdsc->num_sessions, 0);
45838c2ecf20Sopenharmony_ci	mdsc->max_sessions = 0;
45848c2ecf20Sopenharmony_ci	mdsc->stopping = 0;
45858c2ecf20Sopenharmony_ci	atomic64_set(&mdsc->quotarealms_count, 0);
45868c2ecf20Sopenharmony_ci	mdsc->quotarealms_inodes = RB_ROOT;
45878c2ecf20Sopenharmony_ci	mutex_init(&mdsc->quotarealms_inodes_mutex);
45888c2ecf20Sopenharmony_ci	mdsc->last_snap_seq = 0;
45898c2ecf20Sopenharmony_ci	init_rwsem(&mdsc->snap_rwsem);
45908c2ecf20Sopenharmony_ci	mdsc->snap_realms = RB_ROOT;
45918c2ecf20Sopenharmony_ci	INIT_LIST_HEAD(&mdsc->snap_empty);
45928c2ecf20Sopenharmony_ci	mdsc->num_snap_realms = 0;
45938c2ecf20Sopenharmony_ci	spin_lock_init(&mdsc->snap_empty_lock);
45948c2ecf20Sopenharmony_ci	mdsc->last_tid = 0;
45958c2ecf20Sopenharmony_ci	mdsc->oldest_tid = 0;
45968c2ecf20Sopenharmony_ci	mdsc->request_tree = RB_ROOT;
45978c2ecf20Sopenharmony_ci	INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work);
45988c2ecf20Sopenharmony_ci	mdsc->last_renew_caps = jiffies;
45998c2ecf20Sopenharmony_ci	INIT_LIST_HEAD(&mdsc->cap_delay_list);
46008c2ecf20Sopenharmony_ci	INIT_LIST_HEAD(&mdsc->cap_wait_list);
46018c2ecf20Sopenharmony_ci	spin_lock_init(&mdsc->cap_delay_lock);
46028c2ecf20Sopenharmony_ci	INIT_LIST_HEAD(&mdsc->snap_flush_list);
46038c2ecf20Sopenharmony_ci	spin_lock_init(&mdsc->snap_flush_lock);
46048c2ecf20Sopenharmony_ci	mdsc->last_cap_flush_tid = 1;
46058c2ecf20Sopenharmony_ci	INIT_LIST_HEAD(&mdsc->cap_flush_list);
46068c2ecf20Sopenharmony_ci	INIT_LIST_HEAD(&mdsc->cap_dirty_migrating);
46078c2ecf20Sopenharmony_ci	mdsc->num_cap_flushing = 0;
46088c2ecf20Sopenharmony_ci	spin_lock_init(&mdsc->cap_dirty_lock);
46098c2ecf20Sopenharmony_ci	init_waitqueue_head(&mdsc->cap_flushing_wq);
46108c2ecf20Sopenharmony_ci	INIT_WORK(&mdsc->cap_reclaim_work, ceph_cap_reclaim_work);
46118c2ecf20Sopenharmony_ci	atomic_set(&mdsc->cap_reclaim_pending, 0);
46128c2ecf20Sopenharmony_ci	err = ceph_metric_init(&mdsc->metric);
46138c2ecf20Sopenharmony_ci	if (err)
46148c2ecf20Sopenharmony_ci		goto err_mdsmap;
46158c2ecf20Sopenharmony_ci
46168c2ecf20Sopenharmony_ci	spin_lock_init(&mdsc->dentry_list_lock);
46178c2ecf20Sopenharmony_ci	INIT_LIST_HEAD(&mdsc->dentry_leases);
46188c2ecf20Sopenharmony_ci	INIT_LIST_HEAD(&mdsc->dentry_dir_leases);
46198c2ecf20Sopenharmony_ci
46208c2ecf20Sopenharmony_ci	ceph_caps_init(mdsc);
46218c2ecf20Sopenharmony_ci	ceph_adjust_caps_max_min(mdsc, fsc->mount_options);
46228c2ecf20Sopenharmony_ci
46238c2ecf20Sopenharmony_ci	spin_lock_init(&mdsc->snapid_map_lock);
46248c2ecf20Sopenharmony_ci	mdsc->snapid_map_tree = RB_ROOT;
46258c2ecf20Sopenharmony_ci	INIT_LIST_HEAD(&mdsc->snapid_map_lru);
46268c2ecf20Sopenharmony_ci
46278c2ecf20Sopenharmony_ci	init_rwsem(&mdsc->pool_perm_rwsem);
46288c2ecf20Sopenharmony_ci	mdsc->pool_perm_tree = RB_ROOT;
46298c2ecf20Sopenharmony_ci
46308c2ecf20Sopenharmony_ci	strscpy(mdsc->nodename, utsname()->nodename,
46318c2ecf20Sopenharmony_ci		sizeof(mdsc->nodename));
46328c2ecf20Sopenharmony_ci
46338c2ecf20Sopenharmony_ci	fsc->mdsc = mdsc;
46348c2ecf20Sopenharmony_ci	return 0;
46358c2ecf20Sopenharmony_ci
46368c2ecf20Sopenharmony_cierr_mdsmap:
46378c2ecf20Sopenharmony_ci	kfree(mdsc->mdsmap);
46388c2ecf20Sopenharmony_cierr_mdsc:
46398c2ecf20Sopenharmony_ci	kfree(mdsc);
46408c2ecf20Sopenharmony_ci	return err;
46418c2ecf20Sopenharmony_ci}
46428c2ecf20Sopenharmony_ci
46438c2ecf20Sopenharmony_ci/*
46448c2ecf20Sopenharmony_ci * Wait for safe replies on open mds requests.  If we time out, drop
46458c2ecf20Sopenharmony_ci * all requests from the tree to avoid dangling dentry refs.
46468c2ecf20Sopenharmony_ci */
46478c2ecf20Sopenharmony_cistatic void wait_requests(struct ceph_mds_client *mdsc)
46488c2ecf20Sopenharmony_ci{
46498c2ecf20Sopenharmony_ci	struct ceph_options *opts = mdsc->fsc->client->options;
46508c2ecf20Sopenharmony_ci	struct ceph_mds_request *req;
46518c2ecf20Sopenharmony_ci
46528c2ecf20Sopenharmony_ci	mutex_lock(&mdsc->mutex);
46538c2ecf20Sopenharmony_ci	if (__get_oldest_req(mdsc)) {
46548c2ecf20Sopenharmony_ci		mutex_unlock(&mdsc->mutex);
46558c2ecf20Sopenharmony_ci
46568c2ecf20Sopenharmony_ci		dout("wait_requests waiting for requests\n");
46578c2ecf20Sopenharmony_ci		wait_for_completion_timeout(&mdsc->safe_umount_waiters,
46588c2ecf20Sopenharmony_ci				    ceph_timeout_jiffies(opts->mount_timeout));
46598c2ecf20Sopenharmony_ci
46608c2ecf20Sopenharmony_ci		/* tear down remaining requests */
46618c2ecf20Sopenharmony_ci		mutex_lock(&mdsc->mutex);
46628c2ecf20Sopenharmony_ci		while ((req = __get_oldest_req(mdsc))) {
46638c2ecf20Sopenharmony_ci			dout("wait_requests timed out on tid %llu\n",
46648c2ecf20Sopenharmony_ci			     req->r_tid);
46658c2ecf20Sopenharmony_ci			list_del_init(&req->r_wait);
46668c2ecf20Sopenharmony_ci			__unregister_request(mdsc, req);
46678c2ecf20Sopenharmony_ci		}
46688c2ecf20Sopenharmony_ci	}
46698c2ecf20Sopenharmony_ci	mutex_unlock(&mdsc->mutex);
46708c2ecf20Sopenharmony_ci	dout("wait_requests done\n");
46718c2ecf20Sopenharmony_ci}
46728c2ecf20Sopenharmony_ci
46738c2ecf20Sopenharmony_civoid send_flush_mdlog(struct ceph_mds_session *s)
46748c2ecf20Sopenharmony_ci{
46758c2ecf20Sopenharmony_ci	struct ceph_msg *msg;
46768c2ecf20Sopenharmony_ci
46778c2ecf20Sopenharmony_ci	/*
46788c2ecf20Sopenharmony_ci	 * Pre-luminous MDS crashes when it sees an unknown session request
46798c2ecf20Sopenharmony_ci	 */
46808c2ecf20Sopenharmony_ci	if (!CEPH_HAVE_FEATURE(s->s_con.peer_features, SERVER_LUMINOUS))
46818c2ecf20Sopenharmony_ci		return;
46828c2ecf20Sopenharmony_ci
46838c2ecf20Sopenharmony_ci	mutex_lock(&s->s_mutex);
46848c2ecf20Sopenharmony_ci	dout("request mdlog flush to mds%d (%s)s seq %lld\n", s->s_mds,
46858c2ecf20Sopenharmony_ci	     ceph_session_state_name(s->s_state), s->s_seq);
46868c2ecf20Sopenharmony_ci	msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_FLUSH_MDLOG,
46878c2ecf20Sopenharmony_ci				      s->s_seq);
46888c2ecf20Sopenharmony_ci	if (!msg) {
46898c2ecf20Sopenharmony_ci		pr_err("failed to request mdlog flush to mds%d (%s) seq %lld\n",
46908c2ecf20Sopenharmony_ci		       s->s_mds, ceph_session_state_name(s->s_state), s->s_seq);
46918c2ecf20Sopenharmony_ci	} else {
46928c2ecf20Sopenharmony_ci		ceph_con_send(&s->s_con, msg);
46938c2ecf20Sopenharmony_ci	}
46948c2ecf20Sopenharmony_ci	mutex_unlock(&s->s_mutex);
46958c2ecf20Sopenharmony_ci}
46968c2ecf20Sopenharmony_ci
/*
 * First stage of unmount for the MDS client.
 *
 * Called before the mount is made read-only and before dentries are
 * torn down.
 * (hmm, does this still race with new lookups?)
 */
void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
{
	dout("pre_umount\n");
	/* record that shutdown has started */
	mdsc->stopping = CEPH_MDSC_STOPPING_BEGIN;

	/* ask each session's mds to flush its mdlog */
	ceph_mdsc_iterate_sessions(mdsc, send_flush_mdlog, true);
	/* cycle each s_mutex so work already running under it finishes */
	ceph_mdsc_iterate_sessions(mdsc, lock_unlock_session, false);
	ceph_flush_dirty_caps(mdsc);
	/* wait (bounded by the mount timeout) for outstanding requests */
	wait_requests(mdsc);

	/*
	 * wait for reply handlers to drop their request refs and
	 * their inode/dcache refs
	 */
	ceph_msgr_flush();

	ceph_cleanup_quotarealms_inodes(mdsc);
}
47198c2ecf20Sopenharmony_ci
/*
 * Wait for all write mds requests with tid <= @want_tid to become safe.
 *
 * Walks the request tree in order starting from the oldest request.
 * For every write op other than SETFILELOCK, blocks on the request's
 * r_safe_completion with mdsc->mutex dropped.  Because the tree can
 * change while the mutex is released, references are held on both the
 * current and the next request across the wait, and the walk restarts
 * from the oldest request if the next one was removed in the meantime.
 */
static void wait_unsafe_requests(struct ceph_mds_client *mdsc, u64 want_tid)
{
	struct ceph_mds_request *req = NULL, *nextreq;
	struct rb_node *n;

	mutex_lock(&mdsc->mutex);
	dout("wait_unsafe_requests want %lld\n", want_tid);
restart:
	req = __get_oldest_req(mdsc);
	while (req && req->r_tid <= want_tid) {
		/* find next request */
		n = rb_next(&req->r_node);
		if (n)
			nextreq = rb_entry(n, struct ceph_mds_request, r_node);
		else
			nextreq = NULL;
		if (req->r_op != CEPH_MDS_OP_SETFILELOCK &&
		    (req->r_op & CEPH_MDS_OP_WRITE)) {
			/* write op */
			/* pin both requests so they survive the unlocked wait */
			ceph_mdsc_get_request(req);
			if (nextreq)
				ceph_mdsc_get_request(nextreq);
			mutex_unlock(&mdsc->mutex);
			dout("wait_unsafe_requests  wait on %llu (want %llu)\n",
			     req->r_tid, want_tid);
			wait_for_completion(&req->r_safe_completion);
			mutex_lock(&mdsc->mutex);
			ceph_mdsc_put_request(req);
			if (!nextreq)
				break;  /* next dne before, so we're done! */
			if (RB_EMPTY_NODE(&nextreq->r_node)) {
				/* next request was removed from tree */
				ceph_mdsc_put_request(nextreq);
				goto restart;
			}
			ceph_mdsc_put_request(nextreq);  /* won't go away */
		}
		req = nextreq;
	}
	mutex_unlock(&mdsc->mutex);
	dout("wait_unsafe_requests done\n");
}
47658c2ecf20Sopenharmony_ci
47668c2ecf20Sopenharmony_civoid ceph_mdsc_sync(struct ceph_mds_client *mdsc)
47678c2ecf20Sopenharmony_ci{
47688c2ecf20Sopenharmony_ci	u64 want_tid, want_flush;
47698c2ecf20Sopenharmony_ci
47708c2ecf20Sopenharmony_ci	if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
47718c2ecf20Sopenharmony_ci		return;
47728c2ecf20Sopenharmony_ci
47738c2ecf20Sopenharmony_ci	dout("sync\n");
47748c2ecf20Sopenharmony_ci	mutex_lock(&mdsc->mutex);
47758c2ecf20Sopenharmony_ci	want_tid = mdsc->last_tid;
47768c2ecf20Sopenharmony_ci	mutex_unlock(&mdsc->mutex);
47778c2ecf20Sopenharmony_ci
47788c2ecf20Sopenharmony_ci	ceph_flush_dirty_caps(mdsc);
47798c2ecf20Sopenharmony_ci	spin_lock(&mdsc->cap_dirty_lock);
47808c2ecf20Sopenharmony_ci	want_flush = mdsc->last_cap_flush_tid;
47818c2ecf20Sopenharmony_ci	if (!list_empty(&mdsc->cap_flush_list)) {
47828c2ecf20Sopenharmony_ci		struct ceph_cap_flush *cf =
47838c2ecf20Sopenharmony_ci			list_last_entry(&mdsc->cap_flush_list,
47848c2ecf20Sopenharmony_ci					struct ceph_cap_flush, g_list);
47858c2ecf20Sopenharmony_ci		cf->wake = true;
47868c2ecf20Sopenharmony_ci	}
47878c2ecf20Sopenharmony_ci	spin_unlock(&mdsc->cap_dirty_lock);
47888c2ecf20Sopenharmony_ci
47898c2ecf20Sopenharmony_ci	dout("sync want tid %lld flush_seq %lld\n",
47908c2ecf20Sopenharmony_ci	     want_tid, want_flush);
47918c2ecf20Sopenharmony_ci
47928c2ecf20Sopenharmony_ci	wait_unsafe_requests(mdsc, want_tid);
47938c2ecf20Sopenharmony_ci	wait_caps_flush(mdsc, want_flush);
47948c2ecf20Sopenharmony_ci}
47958c2ecf20Sopenharmony_ci
47968c2ecf20Sopenharmony_ci/*
47978c2ecf20Sopenharmony_ci * true if all sessions are closed, or we force unmount
47988c2ecf20Sopenharmony_ci */
47998c2ecf20Sopenharmony_cistatic bool done_closing_sessions(struct ceph_mds_client *mdsc, int skipped)
48008c2ecf20Sopenharmony_ci{
48018c2ecf20Sopenharmony_ci	if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
48028c2ecf20Sopenharmony_ci		return true;
48038c2ecf20Sopenharmony_ci	return atomic_read(&mdsc->num_sessions) <= skipped;
48048c2ecf20Sopenharmony_ci}
48058c2ecf20Sopenharmony_ci
/*
 * Close every MDS session and stop the client's background work.
 *
 * Called after the superblock has been made read-only.  Runs in three
 * phases: close each session, wait (bounded by the mount timeout) for
 * the sessions to go away, then forcibly unregister whatever is left
 * and cancel the cap reclaim and delayed work items.
 */
void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
{
	struct ceph_options *opts = mdsc->fsc->client->options;
	struct ceph_mds_session *session;
	int i;
	int skipped = 0;	/* sessions for which __close_session() returned <= 0 */

	dout("close_sessions\n");

	/* close sessions */
	mutex_lock(&mdsc->mutex);
	for (i = 0; i < mdsc->max_sessions; i++) {
		session = __ceph_lookup_mds_session(mdsc, i);
		if (!session)
			continue;
		/* mdsc->mutex is released before s_mutex is taken */
		mutex_unlock(&mdsc->mutex);
		mutex_lock(&session->s_mutex);
		if (__close_session(mdsc, session) <= 0)
			skipped++;
		mutex_unlock(&session->s_mutex);
		ceph_put_mds_session(session);
		mutex_lock(&mdsc->mutex);
	}
	mutex_unlock(&mdsc->mutex);

	dout("waiting for sessions to close\n");
	/* skipped sessions are not expected to disappear; don't wait on them */
	wait_event_timeout(mdsc->session_close_wq,
			   done_closing_sessions(mdsc, skipped),
			   ceph_timeout_jiffies(opts->mount_timeout));

	/* tear down remaining sessions */
	mutex_lock(&mdsc->mutex);
	for (i = 0; i < mdsc->max_sessions; i++) {
		if (mdsc->sessions[i]) {
			/* hold our own ref across unregister and cap removal */
			session = ceph_get_mds_session(mdsc->sessions[i]);
			__unregister_session(mdsc, session);
			mutex_unlock(&mdsc->mutex);
			mutex_lock(&session->s_mutex);
			remove_session_caps(session);
			mutex_unlock(&session->s_mutex);
			ceph_put_mds_session(session);
			mutex_lock(&mdsc->mutex);
		}
	}
	WARN_ON(!list_empty(&mdsc->cap_delay_list));
	mutex_unlock(&mdsc->mutex);

	ceph_cleanup_snapid_map(mdsc);
	ceph_cleanup_empty_realms(mdsc);

	cancel_work_sync(&mdsc->cap_reclaim_work);
	cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */

	dout("stopped\n");
}
48648c2ecf20Sopenharmony_ci
48658c2ecf20Sopenharmony_civoid ceph_mdsc_force_umount(struct ceph_mds_client *mdsc)
48668c2ecf20Sopenharmony_ci{
48678c2ecf20Sopenharmony_ci	struct ceph_mds_session *session;
48688c2ecf20Sopenharmony_ci	int mds;
48698c2ecf20Sopenharmony_ci
48708c2ecf20Sopenharmony_ci	dout("force umount\n");
48718c2ecf20Sopenharmony_ci
48728c2ecf20Sopenharmony_ci	mutex_lock(&mdsc->mutex);
48738c2ecf20Sopenharmony_ci	for (mds = 0; mds < mdsc->max_sessions; mds++) {
48748c2ecf20Sopenharmony_ci		session = __ceph_lookup_mds_session(mdsc, mds);
48758c2ecf20Sopenharmony_ci		if (!session)
48768c2ecf20Sopenharmony_ci			continue;
48778c2ecf20Sopenharmony_ci
48788c2ecf20Sopenharmony_ci		if (session->s_state == CEPH_MDS_SESSION_REJECTED)
48798c2ecf20Sopenharmony_ci			__unregister_session(mdsc, session);
48808c2ecf20Sopenharmony_ci		__wake_requests(mdsc, &session->s_waiting);
48818c2ecf20Sopenharmony_ci		mutex_unlock(&mdsc->mutex);
48828c2ecf20Sopenharmony_ci
48838c2ecf20Sopenharmony_ci		mutex_lock(&session->s_mutex);
48848c2ecf20Sopenharmony_ci		__close_session(mdsc, session);
48858c2ecf20Sopenharmony_ci		if (session->s_state == CEPH_MDS_SESSION_CLOSING) {
48868c2ecf20Sopenharmony_ci			cleanup_session_requests(mdsc, session);
48878c2ecf20Sopenharmony_ci			remove_session_caps(session);
48888c2ecf20Sopenharmony_ci		}
48898c2ecf20Sopenharmony_ci		mutex_unlock(&session->s_mutex);
48908c2ecf20Sopenharmony_ci		ceph_put_mds_session(session);
48918c2ecf20Sopenharmony_ci
48928c2ecf20Sopenharmony_ci		mutex_lock(&mdsc->mutex);
48938c2ecf20Sopenharmony_ci		kick_requests(mdsc, mds);
48948c2ecf20Sopenharmony_ci	}
48958c2ecf20Sopenharmony_ci	__wake_requests(mdsc, &mdsc->waiting_for_map);
48968c2ecf20Sopenharmony_ci	mutex_unlock(&mdsc->mutex);
48978c2ecf20Sopenharmony_ci}
48988c2ecf20Sopenharmony_ci
48998c2ecf20Sopenharmony_cistatic void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
49008c2ecf20Sopenharmony_ci{
49018c2ecf20Sopenharmony_ci	dout("stop\n");
49028c2ecf20Sopenharmony_ci	/*
49038c2ecf20Sopenharmony_ci	 * Make sure the delayed work stopped before releasing
49048c2ecf20Sopenharmony_ci	 * the resources.
49058c2ecf20Sopenharmony_ci	 *
49068c2ecf20Sopenharmony_ci	 * Because the cancel_delayed_work_sync() will only
49078c2ecf20Sopenharmony_ci	 * guarantee that the work finishes executing. But the
49088c2ecf20Sopenharmony_ci	 * delayed work will re-arm itself again after that.
49098c2ecf20Sopenharmony_ci	 */
49108c2ecf20Sopenharmony_ci	flush_delayed_work(&mdsc->delayed_work);
49118c2ecf20Sopenharmony_ci
49128c2ecf20Sopenharmony_ci	if (mdsc->mdsmap)
49138c2ecf20Sopenharmony_ci		ceph_mdsmap_destroy(mdsc->mdsmap);
49148c2ecf20Sopenharmony_ci	kfree(mdsc->sessions);
49158c2ecf20Sopenharmony_ci	ceph_caps_finalize(mdsc);
49168c2ecf20Sopenharmony_ci	ceph_pool_perm_destroy(mdsc);
49178c2ecf20Sopenharmony_ci}
49188c2ecf20Sopenharmony_ci
49198c2ecf20Sopenharmony_civoid ceph_mdsc_destroy(struct ceph_fs_client *fsc)
49208c2ecf20Sopenharmony_ci{
49218c2ecf20Sopenharmony_ci	struct ceph_mds_client *mdsc = fsc->mdsc;
49228c2ecf20Sopenharmony_ci	dout("mdsc_destroy %p\n", mdsc);
49238c2ecf20Sopenharmony_ci
49248c2ecf20Sopenharmony_ci	if (!mdsc)
49258c2ecf20Sopenharmony_ci		return;
49268c2ecf20Sopenharmony_ci
49278c2ecf20Sopenharmony_ci	/* flush out any connection work with references to us */
49288c2ecf20Sopenharmony_ci	ceph_msgr_flush();
49298c2ecf20Sopenharmony_ci
49308c2ecf20Sopenharmony_ci	ceph_mdsc_stop(mdsc);
49318c2ecf20Sopenharmony_ci
49328c2ecf20Sopenharmony_ci	ceph_metric_destroy(&mdsc->metric);
49338c2ecf20Sopenharmony_ci
49348c2ecf20Sopenharmony_ci	fsc->mdsc = NULL;
49358c2ecf20Sopenharmony_ci	kfree(mdsc);
49368c2ecf20Sopenharmony_ci	dout("mdsc_destroy %p done\n", mdsc);
49378c2ecf20Sopenharmony_ci}
49388c2ecf20Sopenharmony_ci
49398c2ecf20Sopenharmony_civoid ceph_mdsc_handle_fsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
49408c2ecf20Sopenharmony_ci{
49418c2ecf20Sopenharmony_ci	struct ceph_fs_client *fsc = mdsc->fsc;
49428c2ecf20Sopenharmony_ci	const char *mds_namespace = fsc->mount_options->mds_namespace;
49438c2ecf20Sopenharmony_ci	void *p = msg->front.iov_base;
49448c2ecf20Sopenharmony_ci	void *end = p + msg->front.iov_len;
49458c2ecf20Sopenharmony_ci	u32 epoch;
49468c2ecf20Sopenharmony_ci	u32 map_len;
49478c2ecf20Sopenharmony_ci	u32 num_fs;
49488c2ecf20Sopenharmony_ci	u32 mount_fscid = (u32)-1;
49498c2ecf20Sopenharmony_ci	u8 struct_v, struct_cv;
49508c2ecf20Sopenharmony_ci	int err = -EINVAL;
49518c2ecf20Sopenharmony_ci
49528c2ecf20Sopenharmony_ci	ceph_decode_need(&p, end, sizeof(u32), bad);
49538c2ecf20Sopenharmony_ci	epoch = ceph_decode_32(&p);
49548c2ecf20Sopenharmony_ci
49558c2ecf20Sopenharmony_ci	dout("handle_fsmap epoch %u\n", epoch);
49568c2ecf20Sopenharmony_ci
49578c2ecf20Sopenharmony_ci	ceph_decode_need(&p, end, 2 + sizeof(u32), bad);
49588c2ecf20Sopenharmony_ci	struct_v = ceph_decode_8(&p);
49598c2ecf20Sopenharmony_ci	struct_cv = ceph_decode_8(&p);
49608c2ecf20Sopenharmony_ci	map_len = ceph_decode_32(&p);
49618c2ecf20Sopenharmony_ci
49628c2ecf20Sopenharmony_ci	ceph_decode_need(&p, end, sizeof(u32) * 3, bad);
49638c2ecf20Sopenharmony_ci	p += sizeof(u32) * 2; /* skip epoch and legacy_client_fscid */
49648c2ecf20Sopenharmony_ci
49658c2ecf20Sopenharmony_ci	num_fs = ceph_decode_32(&p);
49668c2ecf20Sopenharmony_ci	while (num_fs-- > 0) {
49678c2ecf20Sopenharmony_ci		void *info_p, *info_end;
49688c2ecf20Sopenharmony_ci		u32 info_len;
49698c2ecf20Sopenharmony_ci		u8 info_v, info_cv;
49708c2ecf20Sopenharmony_ci		u32 fscid, namelen;
49718c2ecf20Sopenharmony_ci
49728c2ecf20Sopenharmony_ci		ceph_decode_need(&p, end, 2 + sizeof(u32), bad);
49738c2ecf20Sopenharmony_ci		info_v = ceph_decode_8(&p);
49748c2ecf20Sopenharmony_ci		info_cv = ceph_decode_8(&p);
49758c2ecf20Sopenharmony_ci		info_len = ceph_decode_32(&p);
49768c2ecf20Sopenharmony_ci		ceph_decode_need(&p, end, info_len, bad);
49778c2ecf20Sopenharmony_ci		info_p = p;
49788c2ecf20Sopenharmony_ci		info_end = p + info_len;
49798c2ecf20Sopenharmony_ci		p = info_end;
49808c2ecf20Sopenharmony_ci
49818c2ecf20Sopenharmony_ci		ceph_decode_need(&info_p, info_end, sizeof(u32) * 2, bad);
49828c2ecf20Sopenharmony_ci		fscid = ceph_decode_32(&info_p);
49838c2ecf20Sopenharmony_ci		namelen = ceph_decode_32(&info_p);
49848c2ecf20Sopenharmony_ci		ceph_decode_need(&info_p, info_end, namelen, bad);
49858c2ecf20Sopenharmony_ci
49868c2ecf20Sopenharmony_ci		if (mds_namespace &&
49878c2ecf20Sopenharmony_ci		    strlen(mds_namespace) == namelen &&
49888c2ecf20Sopenharmony_ci		    !strncmp(mds_namespace, (char *)info_p, namelen)) {
49898c2ecf20Sopenharmony_ci			mount_fscid = fscid;
49908c2ecf20Sopenharmony_ci			break;
49918c2ecf20Sopenharmony_ci		}
49928c2ecf20Sopenharmony_ci	}
49938c2ecf20Sopenharmony_ci
49948c2ecf20Sopenharmony_ci	ceph_monc_got_map(&fsc->client->monc, CEPH_SUB_FSMAP, epoch);
49958c2ecf20Sopenharmony_ci	if (mount_fscid != (u32)-1) {
49968c2ecf20Sopenharmony_ci		fsc->client->monc.fs_cluster_id = mount_fscid;
49978c2ecf20Sopenharmony_ci		ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP,
49988c2ecf20Sopenharmony_ci				   0, true);
49998c2ecf20Sopenharmony_ci		ceph_monc_renew_subs(&fsc->client->monc);
50008c2ecf20Sopenharmony_ci	} else {
50018c2ecf20Sopenharmony_ci		err = -ENOENT;
50028c2ecf20Sopenharmony_ci		goto err_out;
50038c2ecf20Sopenharmony_ci	}
50048c2ecf20Sopenharmony_ci	return;
50058c2ecf20Sopenharmony_ci
50068c2ecf20Sopenharmony_cibad:
50078c2ecf20Sopenharmony_ci	pr_err("error decoding fsmap\n");
50088c2ecf20Sopenharmony_cierr_out:
50098c2ecf20Sopenharmony_ci	mutex_lock(&mdsc->mutex);
50108c2ecf20Sopenharmony_ci	mdsc->mdsmap_err = err;
50118c2ecf20Sopenharmony_ci	__wake_requests(mdsc, &mdsc->waiting_for_map);
50128c2ecf20Sopenharmony_ci	mutex_unlock(&mdsc->mutex);
50138c2ecf20Sopenharmony_ci}
50148c2ecf20Sopenharmony_ci
50158c2ecf20Sopenharmony_ci/*
50168c2ecf20Sopenharmony_ci * handle mds map update.
50178c2ecf20Sopenharmony_ci */
50188c2ecf20Sopenharmony_civoid ceph_mdsc_handle_mdsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
50198c2ecf20Sopenharmony_ci{
50208c2ecf20Sopenharmony_ci	u32 epoch;
50218c2ecf20Sopenharmony_ci	u32 maplen;
50228c2ecf20Sopenharmony_ci	void *p = msg->front.iov_base;
50238c2ecf20Sopenharmony_ci	void *end = p + msg->front.iov_len;
50248c2ecf20Sopenharmony_ci	struct ceph_mdsmap *newmap, *oldmap;
50258c2ecf20Sopenharmony_ci	struct ceph_fsid fsid;
50268c2ecf20Sopenharmony_ci	int err = -EINVAL;
50278c2ecf20Sopenharmony_ci
50288c2ecf20Sopenharmony_ci	ceph_decode_need(&p, end, sizeof(fsid)+2*sizeof(u32), bad);
50298c2ecf20Sopenharmony_ci	ceph_decode_copy(&p, &fsid, sizeof(fsid));
50308c2ecf20Sopenharmony_ci	if (ceph_check_fsid(mdsc->fsc->client, &fsid) < 0)
50318c2ecf20Sopenharmony_ci		return;
50328c2ecf20Sopenharmony_ci	epoch = ceph_decode_32(&p);
50338c2ecf20Sopenharmony_ci	maplen = ceph_decode_32(&p);
50348c2ecf20Sopenharmony_ci	dout("handle_map epoch %u len %d\n", epoch, (int)maplen);
50358c2ecf20Sopenharmony_ci
50368c2ecf20Sopenharmony_ci	/* do we need it? */
50378c2ecf20Sopenharmony_ci	mutex_lock(&mdsc->mutex);
50388c2ecf20Sopenharmony_ci	if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) {
50398c2ecf20Sopenharmony_ci		dout("handle_map epoch %u <= our %u\n",
50408c2ecf20Sopenharmony_ci		     epoch, mdsc->mdsmap->m_epoch);
50418c2ecf20Sopenharmony_ci		mutex_unlock(&mdsc->mutex);
50428c2ecf20Sopenharmony_ci		return;
50438c2ecf20Sopenharmony_ci	}
50448c2ecf20Sopenharmony_ci
50458c2ecf20Sopenharmony_ci	newmap = ceph_mdsmap_decode(&p, end);
50468c2ecf20Sopenharmony_ci	if (IS_ERR(newmap)) {
50478c2ecf20Sopenharmony_ci		err = PTR_ERR(newmap);
50488c2ecf20Sopenharmony_ci		goto bad_unlock;
50498c2ecf20Sopenharmony_ci	}
50508c2ecf20Sopenharmony_ci
50518c2ecf20Sopenharmony_ci	/* swap into place */
50528c2ecf20Sopenharmony_ci	if (mdsc->mdsmap) {
50538c2ecf20Sopenharmony_ci		oldmap = mdsc->mdsmap;
50548c2ecf20Sopenharmony_ci		mdsc->mdsmap = newmap;
50558c2ecf20Sopenharmony_ci		check_new_map(mdsc, newmap, oldmap);
50568c2ecf20Sopenharmony_ci		ceph_mdsmap_destroy(oldmap);
50578c2ecf20Sopenharmony_ci	} else {
50588c2ecf20Sopenharmony_ci		mdsc->mdsmap = newmap;  /* first mds map */
50598c2ecf20Sopenharmony_ci	}
50608c2ecf20Sopenharmony_ci	mdsc->fsc->max_file_size = min((loff_t)mdsc->mdsmap->m_max_file_size,
50618c2ecf20Sopenharmony_ci					MAX_LFS_FILESIZE);
50628c2ecf20Sopenharmony_ci
50638c2ecf20Sopenharmony_ci	__wake_requests(mdsc, &mdsc->waiting_for_map);
50648c2ecf20Sopenharmony_ci	ceph_monc_got_map(&mdsc->fsc->client->monc, CEPH_SUB_MDSMAP,
50658c2ecf20Sopenharmony_ci			  mdsc->mdsmap->m_epoch);
50668c2ecf20Sopenharmony_ci
50678c2ecf20Sopenharmony_ci	mutex_unlock(&mdsc->mutex);
50688c2ecf20Sopenharmony_ci	schedule_delayed(mdsc, 0);
50698c2ecf20Sopenharmony_ci	return;
50708c2ecf20Sopenharmony_ci
50718c2ecf20Sopenharmony_cibad_unlock:
50728c2ecf20Sopenharmony_ci	mutex_unlock(&mdsc->mutex);
50738c2ecf20Sopenharmony_cibad:
50748c2ecf20Sopenharmony_ci	pr_err("error decoding mdsmap %d\n", err);
50758c2ecf20Sopenharmony_ci	return;
50768c2ecf20Sopenharmony_ci}
50778c2ecf20Sopenharmony_ci
50788c2ecf20Sopenharmony_cistatic struct ceph_connection *con_get(struct ceph_connection *con)
50798c2ecf20Sopenharmony_ci{
50808c2ecf20Sopenharmony_ci	struct ceph_mds_session *s = con->private;
50818c2ecf20Sopenharmony_ci
50828c2ecf20Sopenharmony_ci	if (ceph_get_mds_session(s))
50838c2ecf20Sopenharmony_ci		return con;
50848c2ecf20Sopenharmony_ci	return NULL;
50858c2ecf20Sopenharmony_ci}
50868c2ecf20Sopenharmony_ci
50878c2ecf20Sopenharmony_cistatic void con_put(struct ceph_connection *con)
50888c2ecf20Sopenharmony_ci{
50898c2ecf20Sopenharmony_ci	struct ceph_mds_session *s = con->private;
50908c2ecf20Sopenharmony_ci
50918c2ecf20Sopenharmony_ci	ceph_put_mds_session(s);
50928c2ecf20Sopenharmony_ci}
50938c2ecf20Sopenharmony_ci
50948c2ecf20Sopenharmony_ci/*
50958c2ecf20Sopenharmony_ci * if the client is unresponsive for long enough, the mds will kill
50968c2ecf20Sopenharmony_ci * the session entirely.
50978c2ecf20Sopenharmony_ci */
50988c2ecf20Sopenharmony_cistatic void peer_reset(struct ceph_connection *con)
50998c2ecf20Sopenharmony_ci{
51008c2ecf20Sopenharmony_ci	struct ceph_mds_session *s = con->private;
51018c2ecf20Sopenharmony_ci	struct ceph_mds_client *mdsc = s->s_mdsc;
51028c2ecf20Sopenharmony_ci
51038c2ecf20Sopenharmony_ci	pr_warn("mds%d closed our session\n", s->s_mds);
51048c2ecf20Sopenharmony_ci	send_mds_reconnect(mdsc, s);
51058c2ecf20Sopenharmony_ci}
51068c2ecf20Sopenharmony_ci
51078c2ecf20Sopenharmony_cistatic void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
51088c2ecf20Sopenharmony_ci{
51098c2ecf20Sopenharmony_ci	struct ceph_mds_session *s = con->private;
51108c2ecf20Sopenharmony_ci	struct ceph_mds_client *mdsc = s->s_mdsc;
51118c2ecf20Sopenharmony_ci	int type = le16_to_cpu(msg->hdr.type);
51128c2ecf20Sopenharmony_ci
51138c2ecf20Sopenharmony_ci	mutex_lock(&mdsc->mutex);
51148c2ecf20Sopenharmony_ci	if (__verify_registered_session(mdsc, s) < 0) {
51158c2ecf20Sopenharmony_ci		mutex_unlock(&mdsc->mutex);
51168c2ecf20Sopenharmony_ci		goto out;
51178c2ecf20Sopenharmony_ci	}
51188c2ecf20Sopenharmony_ci	mutex_unlock(&mdsc->mutex);
51198c2ecf20Sopenharmony_ci
51208c2ecf20Sopenharmony_ci	switch (type) {
51218c2ecf20Sopenharmony_ci	case CEPH_MSG_MDS_MAP:
51228c2ecf20Sopenharmony_ci		ceph_mdsc_handle_mdsmap(mdsc, msg);
51238c2ecf20Sopenharmony_ci		break;
51248c2ecf20Sopenharmony_ci	case CEPH_MSG_FS_MAP_USER:
51258c2ecf20Sopenharmony_ci		ceph_mdsc_handle_fsmap(mdsc, msg);
51268c2ecf20Sopenharmony_ci		break;
51278c2ecf20Sopenharmony_ci	case CEPH_MSG_CLIENT_SESSION:
51288c2ecf20Sopenharmony_ci		handle_session(s, msg);
51298c2ecf20Sopenharmony_ci		break;
51308c2ecf20Sopenharmony_ci	case CEPH_MSG_CLIENT_REPLY:
51318c2ecf20Sopenharmony_ci		handle_reply(s, msg);
51328c2ecf20Sopenharmony_ci		break;
51338c2ecf20Sopenharmony_ci	case CEPH_MSG_CLIENT_REQUEST_FORWARD:
51348c2ecf20Sopenharmony_ci		handle_forward(mdsc, s, msg);
51358c2ecf20Sopenharmony_ci		break;
51368c2ecf20Sopenharmony_ci	case CEPH_MSG_CLIENT_CAPS:
51378c2ecf20Sopenharmony_ci		ceph_handle_caps(s, msg);
51388c2ecf20Sopenharmony_ci		break;
51398c2ecf20Sopenharmony_ci	case CEPH_MSG_CLIENT_SNAP:
51408c2ecf20Sopenharmony_ci		ceph_handle_snap(mdsc, s, msg);
51418c2ecf20Sopenharmony_ci		break;
51428c2ecf20Sopenharmony_ci	case CEPH_MSG_CLIENT_LEASE:
51438c2ecf20Sopenharmony_ci		handle_lease(mdsc, s, msg);
51448c2ecf20Sopenharmony_ci		break;
51458c2ecf20Sopenharmony_ci	case CEPH_MSG_CLIENT_QUOTA:
51468c2ecf20Sopenharmony_ci		ceph_handle_quota(mdsc, s, msg);
51478c2ecf20Sopenharmony_ci		break;
51488c2ecf20Sopenharmony_ci
51498c2ecf20Sopenharmony_ci	default:
51508c2ecf20Sopenharmony_ci		pr_err("received unknown message type %d %s\n", type,
51518c2ecf20Sopenharmony_ci		       ceph_msg_type_name(type));
51528c2ecf20Sopenharmony_ci	}
51538c2ecf20Sopenharmony_ciout:
51548c2ecf20Sopenharmony_ci	ceph_msg_put(msg);
51558c2ecf20Sopenharmony_ci}
51568c2ecf20Sopenharmony_ci
51578c2ecf20Sopenharmony_ci/*
51588c2ecf20Sopenharmony_ci * authentication
51598c2ecf20Sopenharmony_ci */
51608c2ecf20Sopenharmony_ci
51618c2ecf20Sopenharmony_ci/*
51628c2ecf20Sopenharmony_ci * Note: returned pointer is the address of a structure that's
51638c2ecf20Sopenharmony_ci * managed separately.  Caller must *not* attempt to free it.
51648c2ecf20Sopenharmony_ci */
51658c2ecf20Sopenharmony_cistatic struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con,
51668c2ecf20Sopenharmony_ci					int *proto, int force_new)
51678c2ecf20Sopenharmony_ci{
51688c2ecf20Sopenharmony_ci	struct ceph_mds_session *s = con->private;
51698c2ecf20Sopenharmony_ci	struct ceph_mds_client *mdsc = s->s_mdsc;
51708c2ecf20Sopenharmony_ci	struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
51718c2ecf20Sopenharmony_ci	struct ceph_auth_handshake *auth = &s->s_auth;
51728c2ecf20Sopenharmony_ci
51738c2ecf20Sopenharmony_ci	if (force_new && auth->authorizer) {
51748c2ecf20Sopenharmony_ci		ceph_auth_destroy_authorizer(auth->authorizer);
51758c2ecf20Sopenharmony_ci		auth->authorizer = NULL;
51768c2ecf20Sopenharmony_ci	}
51778c2ecf20Sopenharmony_ci	if (!auth->authorizer) {
51788c2ecf20Sopenharmony_ci		int ret = ceph_auth_create_authorizer(ac, CEPH_ENTITY_TYPE_MDS,
51798c2ecf20Sopenharmony_ci						      auth);
51808c2ecf20Sopenharmony_ci		if (ret)
51818c2ecf20Sopenharmony_ci			return ERR_PTR(ret);
51828c2ecf20Sopenharmony_ci	} else {
51838c2ecf20Sopenharmony_ci		int ret = ceph_auth_update_authorizer(ac, CEPH_ENTITY_TYPE_MDS,
51848c2ecf20Sopenharmony_ci						      auth);
51858c2ecf20Sopenharmony_ci		if (ret)
51868c2ecf20Sopenharmony_ci			return ERR_PTR(ret);
51878c2ecf20Sopenharmony_ci	}
51888c2ecf20Sopenharmony_ci	*proto = ac->protocol;
51898c2ecf20Sopenharmony_ci
51908c2ecf20Sopenharmony_ci	return auth;
51918c2ecf20Sopenharmony_ci}
51928c2ecf20Sopenharmony_ci
51938c2ecf20Sopenharmony_cistatic int add_authorizer_challenge(struct ceph_connection *con,
51948c2ecf20Sopenharmony_ci				    void *challenge_buf, int challenge_buf_len)
51958c2ecf20Sopenharmony_ci{
51968c2ecf20Sopenharmony_ci	struct ceph_mds_session *s = con->private;
51978c2ecf20Sopenharmony_ci	struct ceph_mds_client *mdsc = s->s_mdsc;
51988c2ecf20Sopenharmony_ci	struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
51998c2ecf20Sopenharmony_ci
52008c2ecf20Sopenharmony_ci	return ceph_auth_add_authorizer_challenge(ac, s->s_auth.authorizer,
52018c2ecf20Sopenharmony_ci					    challenge_buf, challenge_buf_len);
52028c2ecf20Sopenharmony_ci}
52038c2ecf20Sopenharmony_ci
52048c2ecf20Sopenharmony_cistatic int verify_authorizer_reply(struct ceph_connection *con)
52058c2ecf20Sopenharmony_ci{
52068c2ecf20Sopenharmony_ci	struct ceph_mds_session *s = con->private;
52078c2ecf20Sopenharmony_ci	struct ceph_mds_client *mdsc = s->s_mdsc;
52088c2ecf20Sopenharmony_ci	struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
52098c2ecf20Sopenharmony_ci
52108c2ecf20Sopenharmony_ci	return ceph_auth_verify_authorizer_reply(ac, s->s_auth.authorizer);
52118c2ecf20Sopenharmony_ci}
52128c2ecf20Sopenharmony_ci
52138c2ecf20Sopenharmony_cistatic int invalidate_authorizer(struct ceph_connection *con)
52148c2ecf20Sopenharmony_ci{
52158c2ecf20Sopenharmony_ci	struct ceph_mds_session *s = con->private;
52168c2ecf20Sopenharmony_ci	struct ceph_mds_client *mdsc = s->s_mdsc;
52178c2ecf20Sopenharmony_ci	struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
52188c2ecf20Sopenharmony_ci
52198c2ecf20Sopenharmony_ci	ceph_auth_invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS);
52208c2ecf20Sopenharmony_ci
52218c2ecf20Sopenharmony_ci	return ceph_monc_validate_auth(&mdsc->fsc->client->monc);
52228c2ecf20Sopenharmony_ci}
52238c2ecf20Sopenharmony_ci
52248c2ecf20Sopenharmony_cistatic struct ceph_msg *mds_alloc_msg(struct ceph_connection *con,
52258c2ecf20Sopenharmony_ci				struct ceph_msg_header *hdr, int *skip)
52268c2ecf20Sopenharmony_ci{
52278c2ecf20Sopenharmony_ci	struct ceph_msg *msg;
52288c2ecf20Sopenharmony_ci	int type = (int) le16_to_cpu(hdr->type);
52298c2ecf20Sopenharmony_ci	int front_len = (int) le32_to_cpu(hdr->front_len);
52308c2ecf20Sopenharmony_ci
52318c2ecf20Sopenharmony_ci	if (con->in_msg)
52328c2ecf20Sopenharmony_ci		return con->in_msg;
52338c2ecf20Sopenharmony_ci
52348c2ecf20Sopenharmony_ci	*skip = 0;
52358c2ecf20Sopenharmony_ci	msg = ceph_msg_new(type, front_len, GFP_NOFS, false);
52368c2ecf20Sopenharmony_ci	if (!msg) {
52378c2ecf20Sopenharmony_ci		pr_err("unable to allocate msg type %d len %d\n",
52388c2ecf20Sopenharmony_ci		       type, front_len);
52398c2ecf20Sopenharmony_ci		return NULL;
52408c2ecf20Sopenharmony_ci	}
52418c2ecf20Sopenharmony_ci
52428c2ecf20Sopenharmony_ci	return msg;
52438c2ecf20Sopenharmony_ci}
52448c2ecf20Sopenharmony_ci
52458c2ecf20Sopenharmony_cistatic int mds_sign_message(struct ceph_msg *msg)
52468c2ecf20Sopenharmony_ci{
52478c2ecf20Sopenharmony_ci       struct ceph_mds_session *s = msg->con->private;
52488c2ecf20Sopenharmony_ci       struct ceph_auth_handshake *auth = &s->s_auth;
52498c2ecf20Sopenharmony_ci
52508c2ecf20Sopenharmony_ci       return ceph_auth_sign_message(auth, msg);
52518c2ecf20Sopenharmony_ci}
52528c2ecf20Sopenharmony_ci
52538c2ecf20Sopenharmony_cistatic int mds_check_message_signature(struct ceph_msg *msg)
52548c2ecf20Sopenharmony_ci{
52558c2ecf20Sopenharmony_ci       struct ceph_mds_session *s = msg->con->private;
52568c2ecf20Sopenharmony_ci       struct ceph_auth_handshake *auth = &s->s_auth;
52578c2ecf20Sopenharmony_ci
52588c2ecf20Sopenharmony_ci       return ceph_auth_check_message_signature(auth, msg);
52598c2ecf20Sopenharmony_ci}
52608c2ecf20Sopenharmony_ci
52618c2ecf20Sopenharmony_cistatic const struct ceph_connection_operations mds_con_ops = {
52628c2ecf20Sopenharmony_ci	.get = con_get,
52638c2ecf20Sopenharmony_ci	.put = con_put,
52648c2ecf20Sopenharmony_ci	.dispatch = dispatch,
52658c2ecf20Sopenharmony_ci	.get_authorizer = get_authorizer,
52668c2ecf20Sopenharmony_ci	.add_authorizer_challenge = add_authorizer_challenge,
52678c2ecf20Sopenharmony_ci	.verify_authorizer_reply = verify_authorizer_reply,
52688c2ecf20Sopenharmony_ci	.invalidate_authorizer = invalidate_authorizer,
52698c2ecf20Sopenharmony_ci	.peer_reset = peer_reset,
52708c2ecf20Sopenharmony_ci	.alloc_msg = mds_alloc_msg,
52718c2ecf20Sopenharmony_ci	.sign_message = mds_sign_message,
52728c2ecf20Sopenharmony_ci	.check_message_signature = mds_check_message_signature,
52738c2ecf20Sopenharmony_ci};
52748c2ecf20Sopenharmony_ci
52758c2ecf20Sopenharmony_ci/* eof */
5276