18c2ecf20Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0
28c2ecf20Sopenharmony_ci/*
38c2ecf20Sopenharmony_ci * Device operations for the pnfs nfs4 file layout driver.
48c2ecf20Sopenharmony_ci *
58c2ecf20Sopenharmony_ci * Copyright (c) 2014, Primary Data, Inc. All rights reserved.
68c2ecf20Sopenharmony_ci *
78c2ecf20Sopenharmony_ci * Tao Peng <bergwolf@primarydata.com>
88c2ecf20Sopenharmony_ci */
98c2ecf20Sopenharmony_ci
108c2ecf20Sopenharmony_ci#include <linux/nfs_fs.h>
118c2ecf20Sopenharmony_ci#include <linux/vmalloc.h>
128c2ecf20Sopenharmony_ci#include <linux/module.h>
138c2ecf20Sopenharmony_ci#include <linux/sunrpc/addr.h>
148c2ecf20Sopenharmony_ci
158c2ecf20Sopenharmony_ci#include "../internal.h"
168c2ecf20Sopenharmony_ci#include "../nfs4session.h"
178c2ecf20Sopenharmony_ci#include "flexfilelayout.h"
188c2ecf20Sopenharmony_ci
198c2ecf20Sopenharmony_ci#define NFSDBG_FACILITY		NFSDBG_PNFS_LD
208c2ecf20Sopenharmony_ci
218c2ecf20Sopenharmony_cistatic unsigned int dataserver_timeo = NFS_DEF_TCP_TIMEO;
228c2ecf20Sopenharmony_cistatic unsigned int dataserver_retrans;
238c2ecf20Sopenharmony_ci
248c2ecf20Sopenharmony_cistatic bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg);
258c2ecf20Sopenharmony_ci
268c2ecf20Sopenharmony_civoid nfs4_ff_layout_put_deviceid(struct nfs4_ff_layout_ds *mirror_ds)
278c2ecf20Sopenharmony_ci{
288c2ecf20Sopenharmony_ci	if (!IS_ERR_OR_NULL(mirror_ds))
298c2ecf20Sopenharmony_ci		nfs4_put_deviceid_node(&mirror_ds->id_node);
308c2ecf20Sopenharmony_ci}
318c2ecf20Sopenharmony_ci
328c2ecf20Sopenharmony_civoid nfs4_ff_layout_free_deviceid(struct nfs4_ff_layout_ds *mirror_ds)
338c2ecf20Sopenharmony_ci{
348c2ecf20Sopenharmony_ci	nfs4_print_deviceid(&mirror_ds->id_node.deviceid);
358c2ecf20Sopenharmony_ci	nfs4_pnfs_ds_put(mirror_ds->ds);
368c2ecf20Sopenharmony_ci	kfree(mirror_ds->ds_versions);
378c2ecf20Sopenharmony_ci	kfree_rcu(mirror_ds, id_node.rcu);
388c2ecf20Sopenharmony_ci}
398c2ecf20Sopenharmony_ci
408c2ecf20Sopenharmony_ci/* Decode opaque device data and construct new_ds using it */
418c2ecf20Sopenharmony_cistruct nfs4_ff_layout_ds *
428c2ecf20Sopenharmony_cinfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
438c2ecf20Sopenharmony_ci			    gfp_t gfp_flags)
448c2ecf20Sopenharmony_ci{
458c2ecf20Sopenharmony_ci	struct xdr_stream stream;
468c2ecf20Sopenharmony_ci	struct xdr_buf buf;
478c2ecf20Sopenharmony_ci	struct page *scratch;
488c2ecf20Sopenharmony_ci	struct list_head dsaddrs;
498c2ecf20Sopenharmony_ci	struct nfs4_pnfs_ds_addr *da;
508c2ecf20Sopenharmony_ci	struct nfs4_ff_layout_ds *new_ds = NULL;
518c2ecf20Sopenharmony_ci	struct nfs4_ff_ds_version *ds_versions = NULL;
528c2ecf20Sopenharmony_ci	u32 mp_count;
538c2ecf20Sopenharmony_ci	u32 version_count;
548c2ecf20Sopenharmony_ci	__be32 *p;
558c2ecf20Sopenharmony_ci	int i, ret = -ENOMEM;
568c2ecf20Sopenharmony_ci
578c2ecf20Sopenharmony_ci	/* set up xdr stream */
588c2ecf20Sopenharmony_ci	scratch = alloc_page(gfp_flags);
598c2ecf20Sopenharmony_ci	if (!scratch)
608c2ecf20Sopenharmony_ci		goto out_err;
618c2ecf20Sopenharmony_ci
628c2ecf20Sopenharmony_ci	new_ds = kzalloc(sizeof(struct nfs4_ff_layout_ds), gfp_flags);
638c2ecf20Sopenharmony_ci	if (!new_ds)
648c2ecf20Sopenharmony_ci		goto out_scratch;
658c2ecf20Sopenharmony_ci
668c2ecf20Sopenharmony_ci	nfs4_init_deviceid_node(&new_ds->id_node,
678c2ecf20Sopenharmony_ci				server,
688c2ecf20Sopenharmony_ci				&pdev->dev_id);
698c2ecf20Sopenharmony_ci	INIT_LIST_HEAD(&dsaddrs);
708c2ecf20Sopenharmony_ci
718c2ecf20Sopenharmony_ci	xdr_init_decode_pages(&stream, &buf, pdev->pages, pdev->pglen);
728c2ecf20Sopenharmony_ci	xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
738c2ecf20Sopenharmony_ci
748c2ecf20Sopenharmony_ci	/* multipath count */
758c2ecf20Sopenharmony_ci	p = xdr_inline_decode(&stream, 4);
768c2ecf20Sopenharmony_ci	if (unlikely(!p))
778c2ecf20Sopenharmony_ci		goto out_err_drain_dsaddrs;
788c2ecf20Sopenharmony_ci	mp_count = be32_to_cpup(p);
798c2ecf20Sopenharmony_ci	dprintk("%s: multipath ds count %d\n", __func__, mp_count);
808c2ecf20Sopenharmony_ci
818c2ecf20Sopenharmony_ci	for (i = 0; i < mp_count; i++) {
828c2ecf20Sopenharmony_ci		/* multipath ds */
838c2ecf20Sopenharmony_ci		da = nfs4_decode_mp_ds_addr(server->nfs_client->cl_net,
848c2ecf20Sopenharmony_ci					    &stream, gfp_flags);
858c2ecf20Sopenharmony_ci		if (da)
868c2ecf20Sopenharmony_ci			list_add_tail(&da->da_node, &dsaddrs);
878c2ecf20Sopenharmony_ci	}
888c2ecf20Sopenharmony_ci	if (list_empty(&dsaddrs)) {
898c2ecf20Sopenharmony_ci		dprintk("%s: no suitable DS addresses found\n",
908c2ecf20Sopenharmony_ci			__func__);
918c2ecf20Sopenharmony_ci		ret = -ENOMEDIUM;
928c2ecf20Sopenharmony_ci		goto out_err_drain_dsaddrs;
938c2ecf20Sopenharmony_ci	}
948c2ecf20Sopenharmony_ci
958c2ecf20Sopenharmony_ci	/* version count */
968c2ecf20Sopenharmony_ci	p = xdr_inline_decode(&stream, 4);
978c2ecf20Sopenharmony_ci	if (unlikely(!p))
988c2ecf20Sopenharmony_ci		goto out_err_drain_dsaddrs;
998c2ecf20Sopenharmony_ci	version_count = be32_to_cpup(p);
1008c2ecf20Sopenharmony_ci	dprintk("%s: version count %d\n", __func__, version_count);
1018c2ecf20Sopenharmony_ci
1028c2ecf20Sopenharmony_ci	ds_versions = kcalloc(version_count,
1038c2ecf20Sopenharmony_ci			      sizeof(struct nfs4_ff_ds_version),
1048c2ecf20Sopenharmony_ci			      gfp_flags);
1058c2ecf20Sopenharmony_ci	if (!ds_versions)
1068c2ecf20Sopenharmony_ci		goto out_scratch;
1078c2ecf20Sopenharmony_ci
1088c2ecf20Sopenharmony_ci	for (i = 0; i < version_count; i++) {
1098c2ecf20Sopenharmony_ci		/* 20 = version(4) + minor_version(4) + rsize(4) + wsize(4) +
1108c2ecf20Sopenharmony_ci		 * tightly_coupled(4) */
1118c2ecf20Sopenharmony_ci		p = xdr_inline_decode(&stream, 20);
1128c2ecf20Sopenharmony_ci		if (unlikely(!p))
1138c2ecf20Sopenharmony_ci			goto out_err_drain_dsaddrs;
1148c2ecf20Sopenharmony_ci		ds_versions[i].version = be32_to_cpup(p++);
1158c2ecf20Sopenharmony_ci		ds_versions[i].minor_version = be32_to_cpup(p++);
1168c2ecf20Sopenharmony_ci		ds_versions[i].rsize = nfs_block_size(be32_to_cpup(p++), NULL);
1178c2ecf20Sopenharmony_ci		ds_versions[i].wsize = nfs_block_size(be32_to_cpup(p++), NULL);
1188c2ecf20Sopenharmony_ci		ds_versions[i].tightly_coupled = be32_to_cpup(p);
1198c2ecf20Sopenharmony_ci
1208c2ecf20Sopenharmony_ci		if (ds_versions[i].rsize > NFS_MAX_FILE_IO_SIZE)
1218c2ecf20Sopenharmony_ci			ds_versions[i].rsize = NFS_MAX_FILE_IO_SIZE;
1228c2ecf20Sopenharmony_ci		if (ds_versions[i].wsize > NFS_MAX_FILE_IO_SIZE)
1238c2ecf20Sopenharmony_ci			ds_versions[i].wsize = NFS_MAX_FILE_IO_SIZE;
1248c2ecf20Sopenharmony_ci
1258c2ecf20Sopenharmony_ci		/*
1268c2ecf20Sopenharmony_ci		 * check for valid major/minor combination.
1278c2ecf20Sopenharmony_ci		 * currently we support dataserver which talk:
1288c2ecf20Sopenharmony_ci		 *   v3, v4.0, v4.1, v4.2
1298c2ecf20Sopenharmony_ci		 */
1308c2ecf20Sopenharmony_ci		if (!((ds_versions[i].version == 3 && ds_versions[i].minor_version == 0) ||
1318c2ecf20Sopenharmony_ci			(ds_versions[i].version == 4 && ds_versions[i].minor_version < 3))) {
1328c2ecf20Sopenharmony_ci			dprintk("%s: [%d] unsupported ds version %d-%d\n", __func__,
1338c2ecf20Sopenharmony_ci				i, ds_versions[i].version,
1348c2ecf20Sopenharmony_ci				ds_versions[i].minor_version);
1358c2ecf20Sopenharmony_ci			ret = -EPROTONOSUPPORT;
1368c2ecf20Sopenharmony_ci			goto out_err_drain_dsaddrs;
1378c2ecf20Sopenharmony_ci		}
1388c2ecf20Sopenharmony_ci
1398c2ecf20Sopenharmony_ci		dprintk("%s: [%d] vers %u minor_ver %u rsize %u wsize %u coupled %d\n",
1408c2ecf20Sopenharmony_ci			__func__, i, ds_versions[i].version,
1418c2ecf20Sopenharmony_ci			ds_versions[i].minor_version,
1428c2ecf20Sopenharmony_ci			ds_versions[i].rsize,
1438c2ecf20Sopenharmony_ci			ds_versions[i].wsize,
1448c2ecf20Sopenharmony_ci			ds_versions[i].tightly_coupled);
1458c2ecf20Sopenharmony_ci	}
1468c2ecf20Sopenharmony_ci
1478c2ecf20Sopenharmony_ci	new_ds->ds_versions = ds_versions;
1488c2ecf20Sopenharmony_ci	new_ds->ds_versions_cnt = version_count;
1498c2ecf20Sopenharmony_ci
1508c2ecf20Sopenharmony_ci	new_ds->ds = nfs4_pnfs_ds_add(&dsaddrs, gfp_flags);
1518c2ecf20Sopenharmony_ci	if (!new_ds->ds)
1528c2ecf20Sopenharmony_ci		goto out_err_drain_dsaddrs;
1538c2ecf20Sopenharmony_ci
1548c2ecf20Sopenharmony_ci	/* If DS was already in cache, free ds addrs */
1558c2ecf20Sopenharmony_ci	while (!list_empty(&dsaddrs)) {
1568c2ecf20Sopenharmony_ci		da = list_first_entry(&dsaddrs,
1578c2ecf20Sopenharmony_ci				      struct nfs4_pnfs_ds_addr,
1588c2ecf20Sopenharmony_ci				      da_node);
1598c2ecf20Sopenharmony_ci		list_del_init(&da->da_node);
1608c2ecf20Sopenharmony_ci		kfree(da->da_remotestr);
1618c2ecf20Sopenharmony_ci		kfree(da);
1628c2ecf20Sopenharmony_ci	}
1638c2ecf20Sopenharmony_ci
1648c2ecf20Sopenharmony_ci	__free_page(scratch);
1658c2ecf20Sopenharmony_ci	return new_ds;
1668c2ecf20Sopenharmony_ci
1678c2ecf20Sopenharmony_ciout_err_drain_dsaddrs:
1688c2ecf20Sopenharmony_ci	while (!list_empty(&dsaddrs)) {
1698c2ecf20Sopenharmony_ci		da = list_first_entry(&dsaddrs, struct nfs4_pnfs_ds_addr,
1708c2ecf20Sopenharmony_ci				      da_node);
1718c2ecf20Sopenharmony_ci		list_del_init(&da->da_node);
1728c2ecf20Sopenharmony_ci		kfree(da->da_remotestr);
1738c2ecf20Sopenharmony_ci		kfree(da);
1748c2ecf20Sopenharmony_ci	}
1758c2ecf20Sopenharmony_ci
1768c2ecf20Sopenharmony_ci	kfree(ds_versions);
1778c2ecf20Sopenharmony_ciout_scratch:
1788c2ecf20Sopenharmony_ci	__free_page(scratch);
1798c2ecf20Sopenharmony_ciout_err:
1808c2ecf20Sopenharmony_ci	kfree(new_ds);
1818c2ecf20Sopenharmony_ci
1828c2ecf20Sopenharmony_ci	dprintk("%s ERROR: returning %d\n", __func__, ret);
1838c2ecf20Sopenharmony_ci	return NULL;
1848c2ecf20Sopenharmony_ci}
1858c2ecf20Sopenharmony_ci
1868c2ecf20Sopenharmony_cistatic void extend_ds_error(struct nfs4_ff_layout_ds_err *err,
1878c2ecf20Sopenharmony_ci			    u64 offset, u64 length)
1888c2ecf20Sopenharmony_ci{
1898c2ecf20Sopenharmony_ci	u64 end;
1908c2ecf20Sopenharmony_ci
1918c2ecf20Sopenharmony_ci	end = max_t(u64, pnfs_end_offset(err->offset, err->length),
1928c2ecf20Sopenharmony_ci		    pnfs_end_offset(offset, length));
1938c2ecf20Sopenharmony_ci	err->offset = min_t(u64, err->offset, offset);
1948c2ecf20Sopenharmony_ci	err->length = end - err->offset;
1958c2ecf20Sopenharmony_ci}
1968c2ecf20Sopenharmony_ci
1978c2ecf20Sopenharmony_cistatic int
1988c2ecf20Sopenharmony_ciff_ds_error_match(const struct nfs4_ff_layout_ds_err *e1,
1998c2ecf20Sopenharmony_ci		const struct nfs4_ff_layout_ds_err *e2)
2008c2ecf20Sopenharmony_ci{
2018c2ecf20Sopenharmony_ci	int ret;
2028c2ecf20Sopenharmony_ci
2038c2ecf20Sopenharmony_ci	if (e1->opnum != e2->opnum)
2048c2ecf20Sopenharmony_ci		return e1->opnum < e2->opnum ? -1 : 1;
2058c2ecf20Sopenharmony_ci	if (e1->status != e2->status)
2068c2ecf20Sopenharmony_ci		return e1->status < e2->status ? -1 : 1;
2078c2ecf20Sopenharmony_ci	ret = memcmp(e1->stateid.data, e2->stateid.data,
2088c2ecf20Sopenharmony_ci			sizeof(e1->stateid.data));
2098c2ecf20Sopenharmony_ci	if (ret != 0)
2108c2ecf20Sopenharmony_ci		return ret;
2118c2ecf20Sopenharmony_ci	ret = memcmp(&e1->deviceid, &e2->deviceid, sizeof(e1->deviceid));
2128c2ecf20Sopenharmony_ci	if (ret != 0)
2138c2ecf20Sopenharmony_ci		return ret;
2148c2ecf20Sopenharmony_ci	if (pnfs_end_offset(e1->offset, e1->length) < e2->offset)
2158c2ecf20Sopenharmony_ci		return -1;
2168c2ecf20Sopenharmony_ci	if (e1->offset > pnfs_end_offset(e2->offset, e2->length))
2178c2ecf20Sopenharmony_ci		return 1;
2188c2ecf20Sopenharmony_ci	/* If ranges overlap or are contiguous, they are the same */
2198c2ecf20Sopenharmony_ci	return 0;
2208c2ecf20Sopenharmony_ci}
2218c2ecf20Sopenharmony_ci
2228c2ecf20Sopenharmony_cistatic void
2238c2ecf20Sopenharmony_ciff_layout_add_ds_error_locked(struct nfs4_flexfile_layout *flo,
2248c2ecf20Sopenharmony_ci			      struct nfs4_ff_layout_ds_err *dserr)
2258c2ecf20Sopenharmony_ci{
2268c2ecf20Sopenharmony_ci	struct nfs4_ff_layout_ds_err *err, *tmp;
2278c2ecf20Sopenharmony_ci	struct list_head *head = &flo->error_list;
2288c2ecf20Sopenharmony_ci	int match;
2298c2ecf20Sopenharmony_ci
2308c2ecf20Sopenharmony_ci	/* Do insertion sort w/ merges */
2318c2ecf20Sopenharmony_ci	list_for_each_entry_safe(err, tmp, &flo->error_list, list) {
2328c2ecf20Sopenharmony_ci		match = ff_ds_error_match(err, dserr);
2338c2ecf20Sopenharmony_ci		if (match < 0)
2348c2ecf20Sopenharmony_ci			continue;
2358c2ecf20Sopenharmony_ci		if (match > 0) {
2368c2ecf20Sopenharmony_ci			/* Add entry "dserr" _before_ entry "err" */
2378c2ecf20Sopenharmony_ci			head = &err->list;
2388c2ecf20Sopenharmony_ci			break;
2398c2ecf20Sopenharmony_ci		}
2408c2ecf20Sopenharmony_ci		/* Entries match, so merge "err" into "dserr" */
2418c2ecf20Sopenharmony_ci		extend_ds_error(dserr, err->offset, err->length);
2428c2ecf20Sopenharmony_ci		list_replace(&err->list, &dserr->list);
2438c2ecf20Sopenharmony_ci		kfree(err);
2448c2ecf20Sopenharmony_ci		return;
2458c2ecf20Sopenharmony_ci	}
2468c2ecf20Sopenharmony_ci
2478c2ecf20Sopenharmony_ci	list_add_tail(&dserr->list, head);
2488c2ecf20Sopenharmony_ci}
2498c2ecf20Sopenharmony_ci
2508c2ecf20Sopenharmony_ciint ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo,
2518c2ecf20Sopenharmony_ci			     struct nfs4_ff_layout_mirror *mirror, u64 offset,
2528c2ecf20Sopenharmony_ci			     u64 length, int status, enum nfs_opnum4 opnum,
2538c2ecf20Sopenharmony_ci			     gfp_t gfp_flags)
2548c2ecf20Sopenharmony_ci{
2558c2ecf20Sopenharmony_ci	struct nfs4_ff_layout_ds_err *dserr;
2568c2ecf20Sopenharmony_ci
2578c2ecf20Sopenharmony_ci	if (status == 0)
2588c2ecf20Sopenharmony_ci		return 0;
2598c2ecf20Sopenharmony_ci
2608c2ecf20Sopenharmony_ci	if (IS_ERR_OR_NULL(mirror->mirror_ds))
2618c2ecf20Sopenharmony_ci		return -EINVAL;
2628c2ecf20Sopenharmony_ci
2638c2ecf20Sopenharmony_ci	dserr = kmalloc(sizeof(*dserr), gfp_flags);
2648c2ecf20Sopenharmony_ci	if (!dserr)
2658c2ecf20Sopenharmony_ci		return -ENOMEM;
2668c2ecf20Sopenharmony_ci
2678c2ecf20Sopenharmony_ci	INIT_LIST_HEAD(&dserr->list);
2688c2ecf20Sopenharmony_ci	dserr->offset = offset;
2698c2ecf20Sopenharmony_ci	dserr->length = length;
2708c2ecf20Sopenharmony_ci	dserr->status = status;
2718c2ecf20Sopenharmony_ci	dserr->opnum = opnum;
2728c2ecf20Sopenharmony_ci	nfs4_stateid_copy(&dserr->stateid, &mirror->stateid);
2738c2ecf20Sopenharmony_ci	memcpy(&dserr->deviceid, &mirror->mirror_ds->id_node.deviceid,
2748c2ecf20Sopenharmony_ci	       NFS4_DEVICEID4_SIZE);
2758c2ecf20Sopenharmony_ci
2768c2ecf20Sopenharmony_ci	spin_lock(&flo->generic_hdr.plh_inode->i_lock);
2778c2ecf20Sopenharmony_ci	ff_layout_add_ds_error_locked(flo, dserr);
2788c2ecf20Sopenharmony_ci	spin_unlock(&flo->generic_hdr.plh_inode->i_lock);
2798c2ecf20Sopenharmony_ci	return 0;
2808c2ecf20Sopenharmony_ci}
2818c2ecf20Sopenharmony_ci
2828c2ecf20Sopenharmony_cistatic const struct cred *
2838c2ecf20Sopenharmony_ciff_layout_get_mirror_cred(struct nfs4_ff_layout_mirror *mirror, u32 iomode)
2848c2ecf20Sopenharmony_ci{
2858c2ecf20Sopenharmony_ci	const struct cred *cred, __rcu **pcred;
2868c2ecf20Sopenharmony_ci
2878c2ecf20Sopenharmony_ci	if (iomode == IOMODE_READ)
2888c2ecf20Sopenharmony_ci		pcred = &mirror->ro_cred;
2898c2ecf20Sopenharmony_ci	else
2908c2ecf20Sopenharmony_ci		pcred = &mirror->rw_cred;
2918c2ecf20Sopenharmony_ci
2928c2ecf20Sopenharmony_ci	rcu_read_lock();
2938c2ecf20Sopenharmony_ci	do {
2948c2ecf20Sopenharmony_ci		cred = rcu_dereference(*pcred);
2958c2ecf20Sopenharmony_ci		if (!cred)
2968c2ecf20Sopenharmony_ci			break;
2978c2ecf20Sopenharmony_ci
2988c2ecf20Sopenharmony_ci		cred = get_cred_rcu(cred);
2998c2ecf20Sopenharmony_ci	} while(!cred);
3008c2ecf20Sopenharmony_ci	rcu_read_unlock();
3018c2ecf20Sopenharmony_ci	return cred;
3028c2ecf20Sopenharmony_ci}
3038c2ecf20Sopenharmony_ci
3048c2ecf20Sopenharmony_cistruct nfs_fh *
3058c2ecf20Sopenharmony_cinfs4_ff_layout_select_ds_fh(struct nfs4_ff_layout_mirror *mirror)
3068c2ecf20Sopenharmony_ci{
3078c2ecf20Sopenharmony_ci	/* FIXME: For now assume there is only 1 version available for the DS */
3088c2ecf20Sopenharmony_ci	return &mirror->fh_versions[0];
3098c2ecf20Sopenharmony_ci}
3108c2ecf20Sopenharmony_ci
3118c2ecf20Sopenharmony_civoid
3128c2ecf20Sopenharmony_cinfs4_ff_layout_select_ds_stateid(const struct nfs4_ff_layout_mirror *mirror,
3138c2ecf20Sopenharmony_ci		nfs4_stateid *stateid)
3148c2ecf20Sopenharmony_ci{
3158c2ecf20Sopenharmony_ci	if (nfs4_ff_layout_ds_version(mirror) == 4)
3168c2ecf20Sopenharmony_ci		nfs4_stateid_copy(stateid, &mirror->stateid);
3178c2ecf20Sopenharmony_ci}
3188c2ecf20Sopenharmony_ci
3198c2ecf20Sopenharmony_cistatic bool
3208c2ecf20Sopenharmony_ciff_layout_init_mirror_ds(struct pnfs_layout_hdr *lo,
3218c2ecf20Sopenharmony_ci			 struct nfs4_ff_layout_mirror *mirror)
3228c2ecf20Sopenharmony_ci{
3238c2ecf20Sopenharmony_ci	if (mirror == NULL)
3248c2ecf20Sopenharmony_ci		goto outerr;
3258c2ecf20Sopenharmony_ci	if (mirror->mirror_ds == NULL) {
3268c2ecf20Sopenharmony_ci		struct nfs4_deviceid_node *node;
3278c2ecf20Sopenharmony_ci		struct nfs4_ff_layout_ds *mirror_ds = ERR_PTR(-ENODEV);
3288c2ecf20Sopenharmony_ci
3298c2ecf20Sopenharmony_ci		node = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode),
3308c2ecf20Sopenharmony_ci				&mirror->devid, lo->plh_lc_cred,
3318c2ecf20Sopenharmony_ci				GFP_KERNEL);
3328c2ecf20Sopenharmony_ci		if (node)
3338c2ecf20Sopenharmony_ci			mirror_ds = FF_LAYOUT_MIRROR_DS(node);
3348c2ecf20Sopenharmony_ci
3358c2ecf20Sopenharmony_ci		/* check for race with another call to this function */
3368c2ecf20Sopenharmony_ci		if (cmpxchg(&mirror->mirror_ds, NULL, mirror_ds) &&
3378c2ecf20Sopenharmony_ci		    mirror_ds != ERR_PTR(-ENODEV))
3388c2ecf20Sopenharmony_ci			nfs4_put_deviceid_node(node);
3398c2ecf20Sopenharmony_ci	}
3408c2ecf20Sopenharmony_ci
3418c2ecf20Sopenharmony_ci	if (IS_ERR(mirror->mirror_ds))
3428c2ecf20Sopenharmony_ci		goto outerr;
3438c2ecf20Sopenharmony_ci
3448c2ecf20Sopenharmony_ci	return true;
3458c2ecf20Sopenharmony_ciouterr:
3468c2ecf20Sopenharmony_ci	return false;
3478c2ecf20Sopenharmony_ci}
3488c2ecf20Sopenharmony_ci
3498c2ecf20Sopenharmony_ci/**
3508c2ecf20Sopenharmony_ci * nfs4_ff_layout_prepare_ds - prepare a DS connection for an RPC call
3518c2ecf20Sopenharmony_ci * @lseg: the layout segment we're operating on
3528c2ecf20Sopenharmony_ci * @mirror: layout mirror describing the DS to use
3538c2ecf20Sopenharmony_ci * @fail_return: return layout on connect failure?
3548c2ecf20Sopenharmony_ci *
3558c2ecf20Sopenharmony_ci * Try to prepare a DS connection to accept an RPC call. This involves
3568c2ecf20Sopenharmony_ci * selecting a mirror to use and connecting the client to it if it's not
3578c2ecf20Sopenharmony_ci * already connected.
3588c2ecf20Sopenharmony_ci *
3598c2ecf20Sopenharmony_ci * Since we only need a single functioning mirror to satisfy a read, we don't
3608c2ecf20Sopenharmony_ci * want to return the layout if there is one. For writes though, any down
3618c2ecf20Sopenharmony_ci * mirror should result in a LAYOUTRETURN. @fail_return is how we distinguish
3628c2ecf20Sopenharmony_ci * between the two cases.
3638c2ecf20Sopenharmony_ci *
3648c2ecf20Sopenharmony_ci * Returns a pointer to a connected DS object on success or NULL on failure.
3658c2ecf20Sopenharmony_ci */
3668c2ecf20Sopenharmony_cistruct nfs4_pnfs_ds *
3678c2ecf20Sopenharmony_cinfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg,
3688c2ecf20Sopenharmony_ci			  struct nfs4_ff_layout_mirror *mirror,
3698c2ecf20Sopenharmony_ci			  bool fail_return)
3708c2ecf20Sopenharmony_ci{
3718c2ecf20Sopenharmony_ci	struct nfs4_pnfs_ds *ds = NULL;
3728c2ecf20Sopenharmony_ci	struct inode *ino = lseg->pls_layout->plh_inode;
3738c2ecf20Sopenharmony_ci	struct nfs_server *s = NFS_SERVER(ino);
3748c2ecf20Sopenharmony_ci	unsigned int max_payload;
3758c2ecf20Sopenharmony_ci	int status;
3768c2ecf20Sopenharmony_ci
3778c2ecf20Sopenharmony_ci	if (!ff_layout_init_mirror_ds(lseg->pls_layout, mirror))
3788c2ecf20Sopenharmony_ci		goto noconnect;
3798c2ecf20Sopenharmony_ci
3808c2ecf20Sopenharmony_ci	ds = mirror->mirror_ds->ds;
3818c2ecf20Sopenharmony_ci	if (READ_ONCE(ds->ds_clp))
3828c2ecf20Sopenharmony_ci		goto out;
3838c2ecf20Sopenharmony_ci	/* matching smp_wmb() in _nfs4_pnfs_v3/4_ds_connect */
3848c2ecf20Sopenharmony_ci	smp_rmb();
3858c2ecf20Sopenharmony_ci
3868c2ecf20Sopenharmony_ci	/* FIXME: For now we assume the server sent only one version of NFS
3878c2ecf20Sopenharmony_ci	 * to use for the DS.
3888c2ecf20Sopenharmony_ci	 */
3898c2ecf20Sopenharmony_ci	status = nfs4_pnfs_ds_connect(s, ds, &mirror->mirror_ds->id_node,
3908c2ecf20Sopenharmony_ci			     dataserver_timeo, dataserver_retrans,
3918c2ecf20Sopenharmony_ci			     mirror->mirror_ds->ds_versions[0].version,
3928c2ecf20Sopenharmony_ci			     mirror->mirror_ds->ds_versions[0].minor_version);
3938c2ecf20Sopenharmony_ci
3948c2ecf20Sopenharmony_ci	/* connect success, check rsize/wsize limit */
3958c2ecf20Sopenharmony_ci	if (!status) {
3968c2ecf20Sopenharmony_ci		max_payload =
3978c2ecf20Sopenharmony_ci			nfs_block_size(rpc_max_payload(ds->ds_clp->cl_rpcclient),
3988c2ecf20Sopenharmony_ci				       NULL);
3998c2ecf20Sopenharmony_ci		if (mirror->mirror_ds->ds_versions[0].rsize > max_payload)
4008c2ecf20Sopenharmony_ci			mirror->mirror_ds->ds_versions[0].rsize = max_payload;
4018c2ecf20Sopenharmony_ci		if (mirror->mirror_ds->ds_versions[0].wsize > max_payload)
4028c2ecf20Sopenharmony_ci			mirror->mirror_ds->ds_versions[0].wsize = max_payload;
4038c2ecf20Sopenharmony_ci		goto out;
4048c2ecf20Sopenharmony_ci	}
4058c2ecf20Sopenharmony_cinoconnect:
4068c2ecf20Sopenharmony_ci	ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
4078c2ecf20Sopenharmony_ci				 mirror, lseg->pls_range.offset,
4088c2ecf20Sopenharmony_ci				 lseg->pls_range.length, NFS4ERR_NXIO,
4098c2ecf20Sopenharmony_ci				 OP_ILLEGAL, GFP_NOIO);
4108c2ecf20Sopenharmony_ci	ff_layout_send_layouterror(lseg);
4118c2ecf20Sopenharmony_ci	if (fail_return || !ff_layout_has_available_ds(lseg))
4128c2ecf20Sopenharmony_ci		pnfs_error_mark_layout_for_return(ino, lseg);
4138c2ecf20Sopenharmony_ci	ds = NULL;
4148c2ecf20Sopenharmony_ciout:
4158c2ecf20Sopenharmony_ci	return ds;
4168c2ecf20Sopenharmony_ci}
4178c2ecf20Sopenharmony_ci
4188c2ecf20Sopenharmony_ciconst struct cred *
4198c2ecf20Sopenharmony_ciff_layout_get_ds_cred(struct nfs4_ff_layout_mirror *mirror,
4208c2ecf20Sopenharmony_ci		      const struct pnfs_layout_range *range,
4218c2ecf20Sopenharmony_ci		      const struct cred *mdscred)
4228c2ecf20Sopenharmony_ci{
4238c2ecf20Sopenharmony_ci	const struct cred *cred;
4248c2ecf20Sopenharmony_ci
4258c2ecf20Sopenharmony_ci	if (mirror && !mirror->mirror_ds->ds_versions[0].tightly_coupled) {
4268c2ecf20Sopenharmony_ci		cred = ff_layout_get_mirror_cred(mirror, range->iomode);
4278c2ecf20Sopenharmony_ci		if (!cred)
4288c2ecf20Sopenharmony_ci			cred = get_cred(mdscred);
4298c2ecf20Sopenharmony_ci	} else {
4308c2ecf20Sopenharmony_ci		cred = get_cred(mdscred);
4318c2ecf20Sopenharmony_ci	}
4328c2ecf20Sopenharmony_ci	return cred;
4338c2ecf20Sopenharmony_ci}
4348c2ecf20Sopenharmony_ci
4358c2ecf20Sopenharmony_ci/**
4368c2ecf20Sopenharmony_ci * nfs4_ff_find_or_create_ds_client - Find or create a DS rpc client
4378c2ecf20Sopenharmony_ci * @mirror: pointer to the mirror
4388c2ecf20Sopenharmony_ci * @ds_clp: nfs_client for the DS
4398c2ecf20Sopenharmony_ci * @inode: pointer to inode
4408c2ecf20Sopenharmony_ci *
4418c2ecf20Sopenharmony_ci * Find or create a DS rpc client with th MDS server rpc client auth flavor
4428c2ecf20Sopenharmony_ci * in the nfs_client cl_ds_clients list.
4438c2ecf20Sopenharmony_ci */
4448c2ecf20Sopenharmony_cistruct rpc_clnt *
4458c2ecf20Sopenharmony_cinfs4_ff_find_or_create_ds_client(struct nfs4_ff_layout_mirror *mirror,
4468c2ecf20Sopenharmony_ci				 struct nfs_client *ds_clp, struct inode *inode)
4478c2ecf20Sopenharmony_ci{
4488c2ecf20Sopenharmony_ci	switch (mirror->mirror_ds->ds_versions[0].version) {
4498c2ecf20Sopenharmony_ci	case 3:
4508c2ecf20Sopenharmony_ci		/* For NFSv3 DS, flavor is set when creating DS connections */
4518c2ecf20Sopenharmony_ci		return ds_clp->cl_rpcclient;
4528c2ecf20Sopenharmony_ci	case 4:
4538c2ecf20Sopenharmony_ci		return nfs4_find_or_create_ds_client(ds_clp, inode);
4548c2ecf20Sopenharmony_ci	default:
4558c2ecf20Sopenharmony_ci		BUG();
4568c2ecf20Sopenharmony_ci	}
4578c2ecf20Sopenharmony_ci}
4588c2ecf20Sopenharmony_ci
4598c2ecf20Sopenharmony_civoid ff_layout_free_ds_ioerr(struct list_head *head)
4608c2ecf20Sopenharmony_ci{
4618c2ecf20Sopenharmony_ci	struct nfs4_ff_layout_ds_err *err;
4628c2ecf20Sopenharmony_ci
4638c2ecf20Sopenharmony_ci	while (!list_empty(head)) {
4648c2ecf20Sopenharmony_ci		err = list_first_entry(head,
4658c2ecf20Sopenharmony_ci				struct nfs4_ff_layout_ds_err,
4668c2ecf20Sopenharmony_ci				list);
4678c2ecf20Sopenharmony_ci		list_del(&err->list);
4688c2ecf20Sopenharmony_ci		kfree(err);
4698c2ecf20Sopenharmony_ci	}
4708c2ecf20Sopenharmony_ci}
4718c2ecf20Sopenharmony_ci
4728c2ecf20Sopenharmony_ci/* called with inode i_lock held */
4738c2ecf20Sopenharmony_ciint ff_layout_encode_ds_ioerr(struct xdr_stream *xdr, const struct list_head *head)
4748c2ecf20Sopenharmony_ci{
4758c2ecf20Sopenharmony_ci	struct nfs4_ff_layout_ds_err *err;
4768c2ecf20Sopenharmony_ci	__be32 *p;
4778c2ecf20Sopenharmony_ci
4788c2ecf20Sopenharmony_ci	list_for_each_entry(err, head, list) {
4798c2ecf20Sopenharmony_ci		/* offset(8) + length(8) + stateid(NFS4_STATEID_SIZE)
4808c2ecf20Sopenharmony_ci		 * + array length + deviceid(NFS4_DEVICEID4_SIZE)
4818c2ecf20Sopenharmony_ci		 * + status(4) + opnum(4)
4828c2ecf20Sopenharmony_ci		 */
4838c2ecf20Sopenharmony_ci		p = xdr_reserve_space(xdr,
4848c2ecf20Sopenharmony_ci				28 + NFS4_STATEID_SIZE + NFS4_DEVICEID4_SIZE);
4858c2ecf20Sopenharmony_ci		if (unlikely(!p))
4868c2ecf20Sopenharmony_ci			return -ENOBUFS;
4878c2ecf20Sopenharmony_ci		p = xdr_encode_hyper(p, err->offset);
4888c2ecf20Sopenharmony_ci		p = xdr_encode_hyper(p, err->length);
4898c2ecf20Sopenharmony_ci		p = xdr_encode_opaque_fixed(p, &err->stateid,
4908c2ecf20Sopenharmony_ci					    NFS4_STATEID_SIZE);
4918c2ecf20Sopenharmony_ci		/* Encode 1 error */
4928c2ecf20Sopenharmony_ci		*p++ = cpu_to_be32(1);
4938c2ecf20Sopenharmony_ci		p = xdr_encode_opaque_fixed(p, &err->deviceid,
4948c2ecf20Sopenharmony_ci					    NFS4_DEVICEID4_SIZE);
4958c2ecf20Sopenharmony_ci		*p++ = cpu_to_be32(err->status);
4968c2ecf20Sopenharmony_ci		*p++ = cpu_to_be32(err->opnum);
4978c2ecf20Sopenharmony_ci		dprintk("%s: offset %llu length %llu status %d op %d\n",
4988c2ecf20Sopenharmony_ci			__func__, err->offset, err->length, err->status,
4998c2ecf20Sopenharmony_ci			err->opnum);
5008c2ecf20Sopenharmony_ci	}
5018c2ecf20Sopenharmony_ci
5028c2ecf20Sopenharmony_ci	return 0;
5038c2ecf20Sopenharmony_ci}
5048c2ecf20Sopenharmony_ci
5058c2ecf20Sopenharmony_cistatic
5068c2ecf20Sopenharmony_ciunsigned int do_layout_fetch_ds_ioerr(struct pnfs_layout_hdr *lo,
5078c2ecf20Sopenharmony_ci				      const struct pnfs_layout_range *range,
5088c2ecf20Sopenharmony_ci				      struct list_head *head,
5098c2ecf20Sopenharmony_ci				      unsigned int maxnum)
5108c2ecf20Sopenharmony_ci{
5118c2ecf20Sopenharmony_ci	struct nfs4_flexfile_layout *flo = FF_LAYOUT_FROM_HDR(lo);
5128c2ecf20Sopenharmony_ci	struct inode *inode = lo->plh_inode;
5138c2ecf20Sopenharmony_ci	struct nfs4_ff_layout_ds_err *err, *n;
5148c2ecf20Sopenharmony_ci	unsigned int ret = 0;
5158c2ecf20Sopenharmony_ci
5168c2ecf20Sopenharmony_ci	spin_lock(&inode->i_lock);
5178c2ecf20Sopenharmony_ci	list_for_each_entry_safe(err, n, &flo->error_list, list) {
5188c2ecf20Sopenharmony_ci		if (!pnfs_is_range_intersecting(err->offset,
5198c2ecf20Sopenharmony_ci				pnfs_end_offset(err->offset, err->length),
5208c2ecf20Sopenharmony_ci				range->offset,
5218c2ecf20Sopenharmony_ci				pnfs_end_offset(range->offset, range->length)))
5228c2ecf20Sopenharmony_ci			continue;
5238c2ecf20Sopenharmony_ci		if (!maxnum)
5248c2ecf20Sopenharmony_ci			break;
5258c2ecf20Sopenharmony_ci		list_move(&err->list, head);
5268c2ecf20Sopenharmony_ci		maxnum--;
5278c2ecf20Sopenharmony_ci		ret++;
5288c2ecf20Sopenharmony_ci	}
5298c2ecf20Sopenharmony_ci	spin_unlock(&inode->i_lock);
5308c2ecf20Sopenharmony_ci	return ret;
5318c2ecf20Sopenharmony_ci}
5328c2ecf20Sopenharmony_ci
5338c2ecf20Sopenharmony_ciunsigned int ff_layout_fetch_ds_ioerr(struct pnfs_layout_hdr *lo,
5348c2ecf20Sopenharmony_ci				      const struct pnfs_layout_range *range,
5358c2ecf20Sopenharmony_ci				      struct list_head *head,
5368c2ecf20Sopenharmony_ci				      unsigned int maxnum)
5378c2ecf20Sopenharmony_ci{
5388c2ecf20Sopenharmony_ci	unsigned int ret;
5398c2ecf20Sopenharmony_ci
5408c2ecf20Sopenharmony_ci	ret = do_layout_fetch_ds_ioerr(lo, range, head, maxnum);
5418c2ecf20Sopenharmony_ci	/* If we're over the max, discard all remaining entries */
5428c2ecf20Sopenharmony_ci	if (ret == maxnum) {
5438c2ecf20Sopenharmony_ci		LIST_HEAD(discard);
5448c2ecf20Sopenharmony_ci		do_layout_fetch_ds_ioerr(lo, range, &discard, -1);
5458c2ecf20Sopenharmony_ci		ff_layout_free_ds_ioerr(&discard);
5468c2ecf20Sopenharmony_ci	}
5478c2ecf20Sopenharmony_ci	return ret;
5488c2ecf20Sopenharmony_ci}
5498c2ecf20Sopenharmony_ci
5508c2ecf20Sopenharmony_cistatic bool ff_read_layout_has_available_ds(struct pnfs_layout_segment *lseg)
5518c2ecf20Sopenharmony_ci{
5528c2ecf20Sopenharmony_ci	struct nfs4_ff_layout_mirror *mirror;
5538c2ecf20Sopenharmony_ci	struct nfs4_deviceid_node *devid;
5548c2ecf20Sopenharmony_ci	u32 idx;
5558c2ecf20Sopenharmony_ci
5568c2ecf20Sopenharmony_ci	for (idx = 0; idx < FF_LAYOUT_MIRROR_COUNT(lseg); idx++) {
5578c2ecf20Sopenharmony_ci		mirror = FF_LAYOUT_COMP(lseg, idx);
5588c2ecf20Sopenharmony_ci		if (mirror) {
5598c2ecf20Sopenharmony_ci			if (!mirror->mirror_ds)
5608c2ecf20Sopenharmony_ci				return true;
5618c2ecf20Sopenharmony_ci			if (IS_ERR(mirror->mirror_ds))
5628c2ecf20Sopenharmony_ci				continue;
5638c2ecf20Sopenharmony_ci			devid = &mirror->mirror_ds->id_node;
5648c2ecf20Sopenharmony_ci			if (!nfs4_test_deviceid_unavailable(devid))
5658c2ecf20Sopenharmony_ci				return true;
5668c2ecf20Sopenharmony_ci		}
5678c2ecf20Sopenharmony_ci	}
5688c2ecf20Sopenharmony_ci
5698c2ecf20Sopenharmony_ci	return false;
5708c2ecf20Sopenharmony_ci}
5718c2ecf20Sopenharmony_ci
5728c2ecf20Sopenharmony_cistatic bool ff_rw_layout_has_available_ds(struct pnfs_layout_segment *lseg)
5738c2ecf20Sopenharmony_ci{
5748c2ecf20Sopenharmony_ci	struct nfs4_ff_layout_mirror *mirror;
5758c2ecf20Sopenharmony_ci	struct nfs4_deviceid_node *devid;
5768c2ecf20Sopenharmony_ci	u32 idx;
5778c2ecf20Sopenharmony_ci
5788c2ecf20Sopenharmony_ci	for (idx = 0; idx < FF_LAYOUT_MIRROR_COUNT(lseg); idx++) {
5798c2ecf20Sopenharmony_ci		mirror = FF_LAYOUT_COMP(lseg, idx);
5808c2ecf20Sopenharmony_ci		if (!mirror || IS_ERR(mirror->mirror_ds))
5818c2ecf20Sopenharmony_ci			return false;
5828c2ecf20Sopenharmony_ci		if (!mirror->mirror_ds)
5838c2ecf20Sopenharmony_ci			continue;
5848c2ecf20Sopenharmony_ci		devid = &mirror->mirror_ds->id_node;
5858c2ecf20Sopenharmony_ci		if (nfs4_test_deviceid_unavailable(devid))
5868c2ecf20Sopenharmony_ci			return false;
5878c2ecf20Sopenharmony_ci	}
5888c2ecf20Sopenharmony_ci
5898c2ecf20Sopenharmony_ci	return FF_LAYOUT_MIRROR_COUNT(lseg) != 0;
5908c2ecf20Sopenharmony_ci}
5918c2ecf20Sopenharmony_ci
5928c2ecf20Sopenharmony_cistatic bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg)
5938c2ecf20Sopenharmony_ci{
5948c2ecf20Sopenharmony_ci	if (lseg->pls_range.iomode == IOMODE_READ)
5958c2ecf20Sopenharmony_ci		return  ff_read_layout_has_available_ds(lseg);
5968c2ecf20Sopenharmony_ci	/* Note: RW layout needs all mirrors available */
5978c2ecf20Sopenharmony_ci	return ff_rw_layout_has_available_ds(lseg);
5988c2ecf20Sopenharmony_ci}
5998c2ecf20Sopenharmony_ci
6008c2ecf20Sopenharmony_cibool ff_layout_avoid_mds_available_ds(struct pnfs_layout_segment *lseg)
6018c2ecf20Sopenharmony_ci{
6028c2ecf20Sopenharmony_ci	return ff_layout_no_fallback_to_mds(lseg) ||
6038c2ecf20Sopenharmony_ci	       ff_layout_has_available_ds(lseg);
6048c2ecf20Sopenharmony_ci}
6058c2ecf20Sopenharmony_ci
6068c2ecf20Sopenharmony_cibool ff_layout_avoid_read_on_rw(struct pnfs_layout_segment *lseg)
6078c2ecf20Sopenharmony_ci{
6088c2ecf20Sopenharmony_ci	return lseg->pls_range.iomode == IOMODE_RW &&
6098c2ecf20Sopenharmony_ci	       ff_layout_no_read_on_rw(lseg);
6108c2ecf20Sopenharmony_ci}
6118c2ecf20Sopenharmony_ci
6128c2ecf20Sopenharmony_cimodule_param(dataserver_retrans, uint, 0644);
6138c2ecf20Sopenharmony_ciMODULE_PARM_DESC(dataserver_retrans, "The  number of times the NFSv4.1 client "
6148c2ecf20Sopenharmony_ci			"retries a request before it attempts further "
6158c2ecf20Sopenharmony_ci			" recovery  action.");
6168c2ecf20Sopenharmony_cimodule_param(dataserver_timeo, uint, 0644);
6178c2ecf20Sopenharmony_ciMODULE_PARM_DESC(dataserver_timeo, "The time (in tenths of a second) the "
6188c2ecf20Sopenharmony_ci			"NFSv4.1  client  waits for a response from a "
6198c2ecf20Sopenharmony_ci			" data server before it retries an NFS request.");
620