1/* 2 * pNFS functions to call and manage layout drivers. 3 * 4 * Copyright (c) 2002 [year of first publication] 5 * The Regents of the University of Michigan 6 * All Rights Reserved 7 * 8 * Dean Hildebrand <dhildebz@umich.edu> 9 * 10 * Permission is granted to use, copy, create derivative works, and 11 * redistribute this software and such derivative works for any purpose, 12 * so long as the name of the University of Michigan is not used in 13 * any advertising or publicity pertaining to the use or distribution 14 * of this software without specific, written prior authorization. If 15 * the above copyright notice or any other identification of the 16 * University of Michigan is included in any copy of any portion of 17 * this software, then the disclaimer below must also be included. 18 * 19 * This software is provided as is, without representation or warranty 20 * of any kind either express or implied, including without limitation 21 * the implied warranties of merchantability, fitness for a particular 22 * purpose, or noninfringement. The Regents of the University of 23 * Michigan shall not be liable for any damages, including special, 24 * indirect, incidental, or consequential damages, with respect to any 25 * claim arising out of or in connection with the use of the software, 26 * even if it has been or is hereafter advised of the possibility of 27 * such damages. 28 */ 29 30#include <linux/nfs_fs.h> 31#include <linux/nfs_page.h> 32#include <linux/module.h> 33#include <linux/sort.h> 34#include "internal.h" 35#include "pnfs.h" 36#include "iostat.h" 37#include "nfs4trace.h" 38#include "delegation.h" 39#include "nfs42.h" 40#include "nfs4_fs.h" 41 42#define NFSDBG_FACILITY NFSDBG_PNFS 43#define PNFS_LAYOUTGET_RETRY_TIMEOUT (120*HZ) 44 45/* Locking: 46 * 47 * pnfs_spinlock: 48 * protects pnfs_modules_tbl. 
49 */ 50static DEFINE_SPINLOCK(pnfs_spinlock); 51 52/* 53 * pnfs_modules_tbl holds all pnfs modules 54 */ 55static LIST_HEAD(pnfs_modules_tbl); 56 57static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo); 58static void pnfs_free_returned_lsegs(struct pnfs_layout_hdr *lo, 59 struct list_head *free_me, 60 const struct pnfs_layout_range *range, 61 u32 seq); 62static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg, 63 struct list_head *tmp_list); 64 65/* Return the registered pnfs layout driver module matching given id */ 66static struct pnfs_layoutdriver_type * 67find_pnfs_driver_locked(u32 id) 68{ 69 struct pnfs_layoutdriver_type *local; 70 71 list_for_each_entry(local, &pnfs_modules_tbl, pnfs_tblid) 72 if (local->id == id) 73 goto out; 74 local = NULL; 75out: 76 dprintk("%s: Searching for id %u, found %p\n", __func__, id, local); 77 return local; 78} 79 80static struct pnfs_layoutdriver_type * 81find_pnfs_driver(u32 id) 82{ 83 struct pnfs_layoutdriver_type *local; 84 85 spin_lock(&pnfs_spinlock); 86 local = find_pnfs_driver_locked(id); 87 if (local != NULL && !try_module_get(local->owner)) { 88 dprintk("%s: Could not grab reference on module\n", __func__); 89 local = NULL; 90 } 91 spin_unlock(&pnfs_spinlock); 92 return local; 93} 94 95const struct pnfs_layoutdriver_type *pnfs_find_layoutdriver(u32 id) 96{ 97 return find_pnfs_driver(id); 98} 99 100void pnfs_put_layoutdriver(const struct pnfs_layoutdriver_type *ld) 101{ 102 if (ld) 103 module_put(ld->owner); 104} 105 106void 107unset_pnfs_layoutdriver(struct nfs_server *nfss) 108{ 109 if (nfss->pnfs_curr_ld) { 110 if (nfss->pnfs_curr_ld->clear_layoutdriver) 111 nfss->pnfs_curr_ld->clear_layoutdriver(nfss); 112 /* Decrement the MDS count. Purge the deviceid cache if zero */ 113 if (atomic_dec_and_test(&nfss->nfs_client->cl_mds_count)) 114 nfs4_deviceid_purge_client(nfss->nfs_client); 115 module_put(nfss->pnfs_curr_ld->owner); 116 } 117 nfss->pnfs_curr_ld = NULL; 118} 119 120/* 121 * When the server sends a list of layout types, we choose one in the order 122 * given in the list below. 123 * 124 * FIXME: should this list be configurable in some fashion? module param? 125 * mount option? something else? 126 */ 127static const u32 ld_prefs[] = { 128 LAYOUT_SCSI, 129 LAYOUT_BLOCK_VOLUME, 130 LAYOUT_OSD2_OBJECTS, 131 LAYOUT_FLEX_FILES, 132 LAYOUT_NFSV4_1_FILES, 133 0 134}; 135 136static int 137ld_cmp(const void *e1, const void *e2) 138{ 139 u32 ld1 = *((u32 *)e1); 140 u32 ld2 = *((u32 *)e2); 141 int i; 142 143 for (i = 0; ld_prefs[i] != 0; i++) { 144 if (ld1 == ld_prefs[i]) 145 return -1; 146 147 if (ld2 == ld_prefs[i]) 148 return 1; 149 } 150 return 0; 151} 152 153/* 154 * Try to set the server's pnfs module to the pnfs layout type specified by id. 155 * Currently only one pNFS layout driver per filesystem is supported. 156 * 157 * @ids array of layout types supported by MDS. 
158 */ 159void 160set_pnfs_layoutdriver(struct nfs_server *server, const struct nfs_fh *mntfh, 161 struct nfs_fsinfo *fsinfo) 162{ 163 struct pnfs_layoutdriver_type *ld_type = NULL; 164 u32 id; 165 int i; 166 167 if (fsinfo->nlayouttypes == 0) 168 goto out_no_driver; 169 if (!(server->nfs_client->cl_exchange_flags & 170 (EXCHGID4_FLAG_USE_NON_PNFS | EXCHGID4_FLAG_USE_PNFS_MDS))) { 171 printk(KERN_ERR "NFS: %s: cl_exchange_flags 0x%x\n", 172 __func__, server->nfs_client->cl_exchange_flags); 173 goto out_no_driver; 174 } 175 176 sort(fsinfo->layouttype, fsinfo->nlayouttypes, 177 sizeof(*fsinfo->layouttype), ld_cmp, NULL); 178 179 for (i = 0; i < fsinfo->nlayouttypes; i++) { 180 id = fsinfo->layouttype[i]; 181 ld_type = find_pnfs_driver(id); 182 if (!ld_type) { 183 request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX, 184 id); 185 ld_type = find_pnfs_driver(id); 186 } 187 if (ld_type) 188 break; 189 } 190 191 if (!ld_type) { 192 dprintk("%s: No pNFS module found!\n", __func__); 193 goto out_no_driver; 194 } 195 196 server->pnfs_curr_ld = ld_type; 197 if (ld_type->set_layoutdriver 198 && ld_type->set_layoutdriver(server, mntfh)) { 199 printk(KERN_ERR "NFS: %s: Error initializing pNFS layout " 200 "driver %u.\n", __func__, id); 201 module_put(ld_type->owner); 202 goto out_no_driver; 203 } 204 /* Bump the MDS count */ 205 atomic_inc(&server->nfs_client->cl_mds_count); 206 207 dprintk("%s: pNFS module for %u set\n", __func__, id); 208 return; 209 210out_no_driver: 211 dprintk("%s: Using NFSv4 I/O\n", __func__); 212 server->pnfs_curr_ld = NULL; 213} 214 215int 216pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *ld_type) 217{ 218 int status = -EINVAL; 219 struct pnfs_layoutdriver_type *tmp; 220 221 if (ld_type->id == 0) { 222 printk(KERN_ERR "NFS: %s id 0 is reserved\n", __func__); 223 return status; 224 } 225 if (!ld_type->alloc_lseg || !ld_type->free_lseg) { 226 printk(KERN_ERR "NFS: %s Layout driver must provide " 227 "alloc_lseg and free_lseg.\n", __func__); 228 return status; 229 } 230 231 spin_lock(&pnfs_spinlock); 232 tmp = find_pnfs_driver_locked(ld_type->id); 233 if (!tmp) { 234 list_add(&ld_type->pnfs_tblid, &pnfs_modules_tbl); 235 status = 0; 236 dprintk("%s Registering id:%u name:%s\n", __func__, ld_type->id, 237 ld_type->name); 238 } else { 239 printk(KERN_ERR "NFS: %s Module with id %d already loaded!\n", 240 __func__, ld_type->id); 241 } 242 spin_unlock(&pnfs_spinlock); 243 244 return status; 245} 246EXPORT_SYMBOL_GPL(pnfs_register_layoutdriver); 247 248void 249pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *ld_type) 250{ 251 dprintk("%s Deregistering id:%u\n", __func__, ld_type->id); 252 spin_lock(&pnfs_spinlock); 253 list_del(&ld_type->pnfs_tblid); 254 spin_unlock(&pnfs_spinlock); 255} 256EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver); 257 258/* 259 * pNFS client layout cache 260 */ 261 262/* Need to hold i_lock if caller does not already hold reference */ 263void 264pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo) 265{ 266 refcount_inc(&lo->plh_refcount); 267} 268 269static struct pnfs_layout_hdr * 270pnfs_alloc_layout_hdr(struct inode *ino, gfp_t gfp_flags) 271{ 272 struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld; 273 return ld->alloc_layout_hdr(ino, gfp_flags); 274} 275 276static void 277pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo) 278{ 279 struct nfs_server *server = NFS_SERVER(lo->plh_inode); 280 struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld; 281 282 if (test_and_clear_bit(NFS_LAYOUT_HASHED, &lo->plh_flags)) { 283 struct 
nfs_client *clp = server->nfs_client; 284 285 spin_lock(&clp->cl_lock); 286 list_del_rcu(&lo->plh_layouts); 287 spin_unlock(&clp->cl_lock); 288 } 289 put_cred(lo->plh_lc_cred); 290 return ld->free_layout_hdr(lo); 291} 292 293static void 294pnfs_detach_layout_hdr(struct pnfs_layout_hdr *lo) 295{ 296 struct nfs_inode *nfsi = NFS_I(lo->plh_inode); 297 dprintk("%s: freeing layout cache %p\n", __func__, lo); 298 nfsi->layout = NULL; 299 /* Reset MDS Threshold I/O counters */ 300 nfsi->write_io = 0; 301 nfsi->read_io = 0; 302} 303 304void 305pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo) 306{ 307 struct inode *inode; 308 unsigned long i_state; 309 310 if (!lo) 311 return; 312 inode = lo->plh_inode; 313 pnfs_layoutreturn_before_put_layout_hdr(lo); 314 315 if (refcount_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) { 316 if (!list_empty(&lo->plh_segs)) 317 WARN_ONCE(1, "NFS: BUG unfreed layout segments.\n"); 318 pnfs_detach_layout_hdr(lo); 319 i_state = inode->i_state; 320 spin_unlock(&inode->i_lock); 321 pnfs_free_layout_hdr(lo); 322 /* Notify pnfs_destroy_layout_final() that we're done */ 323 if (i_state & (I_FREEING | I_CLEAR)) 324 wake_up_var(lo); 325 } 326} 327 328static struct inode * 329pnfs_grab_inode_layout_hdr(struct pnfs_layout_hdr *lo) 330{ 331 struct inode *inode = igrab(lo->plh_inode); 332 if (inode) 333 return inode; 334 set_bit(NFS_LAYOUT_INODE_FREEING, &lo->plh_flags); 335 return NULL; 336} 337 338/* 339 * Compare 2 layout stateid sequence ids, to see which is newer, 340 * taking into account wraparound issues. 341 */ 342static bool pnfs_seqid_is_newer(u32 s1, u32 s2) 343{ 344 return (s32)(s1 - s2) > 0; 345} 346 347static void pnfs_barrier_update(struct pnfs_layout_hdr *lo, u32 newseq) 348{ 349 if (pnfs_seqid_is_newer(newseq, lo->plh_barrier) || !lo->plh_barrier) 350 lo->plh_barrier = newseq; 351} 352 353static void 354pnfs_set_plh_return_info(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode, 355 u32 seq) 356{ 357 if (lo->plh_return_iomode != 0 && lo->plh_return_iomode != iomode) 358 iomode = IOMODE_ANY; 359 lo->plh_return_iomode = iomode; 360 set_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags); 361 /* 362 * We must set lo->plh_return_seq to avoid livelocks with 363 * pnfs_layout_need_return() 364 */ 365 if (seq == 0) 366 seq = be32_to_cpu(lo->plh_stateid.seqid); 367 if (!lo->plh_return_seq || pnfs_seqid_is_newer(seq, lo->plh_return_seq)) 368 lo->plh_return_seq = seq; 369 pnfs_barrier_update(lo, seq); 370} 371 372static void 373pnfs_clear_layoutreturn_info(struct pnfs_layout_hdr *lo) 374{ 375 struct pnfs_layout_segment *lseg; 376 lo->plh_return_iomode = 0; 377 lo->plh_return_seq = 0; 378 clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags); 379 list_for_each_entry(lseg, &lo->plh_segs, pls_list) { 380 if (!test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags)) 381 continue; 382 pnfs_set_plh_return_info(lo, lseg->pls_range.iomode, 0); 383 } 384} 385 386static void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo) 387{ 388 clear_bit_unlock(NFS_LAYOUT_RETURN, &lo->plh_flags); 389 clear_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags); 390 smp_mb__after_atomic(); 391 wake_up_bit(&lo->plh_flags, NFS_LAYOUT_RETURN); 392 rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq); 393} 394 395static void 396pnfs_clear_lseg_state(struct pnfs_layout_segment *lseg, 397 struct list_head *free_me) 398{ 399 clear_bit(NFS_LSEG_ROC, &lseg->pls_flags); 400 clear_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags); 401 if (test_and_clear_bit(NFS_LSEG_VALID, &lseg->pls_flags)) 402 
pnfs_lseg_dec_and_remove_zero(lseg, free_me); 403 if (test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags)) 404 pnfs_lseg_dec_and_remove_zero(lseg, free_me); 405} 406 407/* 408 * Update the seqid of a layout stateid after receiving 409 * NFS4ERR_OLD_STATEID 410 */ 411bool nfs4_layout_refresh_old_stateid(nfs4_stateid *dst, 412 struct pnfs_layout_range *dst_range, 413 struct inode *inode) 414{ 415 struct pnfs_layout_hdr *lo; 416 struct pnfs_layout_range range = { 417 .iomode = IOMODE_ANY, 418 .offset = 0, 419 .length = NFS4_MAX_UINT64, 420 }; 421 bool ret = false; 422 LIST_HEAD(head); 423 int err; 424 425 spin_lock(&inode->i_lock); 426 lo = NFS_I(inode)->layout; 427 if (lo && pnfs_layout_is_valid(lo) && 428 nfs4_stateid_match_other(dst, &lo->plh_stateid)) { 429 /* Is our call using the most recent seqid? If so, bump it */ 430 if (!nfs4_stateid_is_newer(&lo->plh_stateid, dst)) { 431 nfs4_stateid_seqid_inc(dst); 432 ret = true; 433 goto out; 434 } 435 /* Try to update the seqid to the most recent */ 436 err = pnfs_mark_matching_lsegs_return(lo, &head, &range, 0); 437 if (err != -EBUSY) { 438 dst->seqid = lo->plh_stateid.seqid; 439 *dst_range = range; 440 ret = true; 441 } 442 } 443out: 444 spin_unlock(&inode->i_lock); 445 pnfs_free_lseg_list(&head); 446 return ret; 447} 448 449/* 450 * Mark a pnfs_layout_hdr and all associated layout segments as invalid 451 * 452 * In order to continue using the pnfs_layout_hdr, a full recovery 453 * is required. 454 * Note that caller must hold inode->i_lock. 455 */ 456int 457pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo, 458 struct list_head *lseg_list) 459{ 460 struct pnfs_layout_range range = { 461 .iomode = IOMODE_ANY, 462 .offset = 0, 463 .length = NFS4_MAX_UINT64, 464 }; 465 struct pnfs_layout_segment *lseg, *next; 466 467 set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); 468 list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) 469 pnfs_clear_lseg_state(lseg, lseg_list); 470 pnfs_clear_layoutreturn_info(lo); 471 pnfs_free_returned_lsegs(lo, lseg_list, &range, 0); 472 set_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags); 473 if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags) && 474 !test_and_set_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags)) 475 pnfs_clear_layoutreturn_waitbit(lo); 476 return !list_empty(&lo->plh_segs); 477} 478 479static int 480pnfs_iomode_to_fail_bit(u32 iomode) 481{ 482 return iomode == IOMODE_RW ? 483 NFS_LAYOUT_RW_FAILED : NFS_LAYOUT_RO_FAILED; 484} 485 486static void 487pnfs_layout_set_fail_bit(struct pnfs_layout_hdr *lo, int fail_bit) 488{ 489 lo->plh_retry_timestamp = jiffies; 490 if (!test_and_set_bit(fail_bit, &lo->plh_flags)) 491 refcount_inc(&lo->plh_refcount); 492} 493 494static void 495pnfs_layout_clear_fail_bit(struct pnfs_layout_hdr *lo, int fail_bit) 496{ 497 if (test_and_clear_bit(fail_bit, &lo->plh_flags)) 498 refcount_dec(&lo->plh_refcount); 499} 500 501static void 502pnfs_layout_io_set_failed(struct pnfs_layout_hdr *lo, u32 iomode) 503{ 504 struct inode *inode = lo->plh_inode; 505 struct pnfs_layout_range range = { 506 .iomode = iomode, 507 .offset = 0, 508 .length = NFS4_MAX_UINT64, 509 }; 510 LIST_HEAD(head); 511 512 spin_lock(&inode->i_lock); 513 pnfs_layout_set_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode)); 514 pnfs_mark_matching_lsegs_invalid(lo, &head, &range, 0); 515 spin_unlock(&inode->i_lock); 516 pnfs_free_lseg_list(&head); 517 dprintk("%s Setting layout IOMODE_%s fail bit\n", __func__, 518 iomode == IOMODE_RW ? 
"RW" : "READ"); 519} 520 521static bool 522pnfs_layout_io_test_failed(struct pnfs_layout_hdr *lo, u32 iomode) 523{ 524 unsigned long start, end; 525 int fail_bit = pnfs_iomode_to_fail_bit(iomode); 526 527 if (test_bit(fail_bit, &lo->plh_flags) == 0) 528 return false; 529 end = jiffies; 530 start = end - PNFS_LAYOUTGET_RETRY_TIMEOUT; 531 if (!time_in_range(lo->plh_retry_timestamp, start, end)) { 532 /* It is time to retry the failed layoutgets */ 533 pnfs_layout_clear_fail_bit(lo, fail_bit); 534 return false; 535 } 536 return true; 537} 538 539static void 540pnfs_init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg, 541 const struct pnfs_layout_range *range, 542 const nfs4_stateid *stateid) 543{ 544 INIT_LIST_HEAD(&lseg->pls_list); 545 INIT_LIST_HEAD(&lseg->pls_lc_list); 546 INIT_LIST_HEAD(&lseg->pls_commits); 547 refcount_set(&lseg->pls_refcount, 1); 548 set_bit(NFS_LSEG_VALID, &lseg->pls_flags); 549 lseg->pls_layout = lo; 550 lseg->pls_range = *range; 551 lseg->pls_seq = be32_to_cpu(stateid->seqid); 552} 553 554static void pnfs_free_lseg(struct pnfs_layout_segment *lseg) 555{ 556 if (lseg != NULL) { 557 struct inode *inode = lseg->pls_layout->plh_inode; 558 NFS_SERVER(inode)->pnfs_curr_ld->free_lseg(lseg); 559 } 560} 561 562static void 563pnfs_layout_remove_lseg(struct pnfs_layout_hdr *lo, 564 struct pnfs_layout_segment *lseg) 565{ 566 WARN_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); 567 list_del_init(&lseg->pls_list); 568 /* Matched by pnfs_get_layout_hdr in pnfs_layout_insert_lseg */ 569 refcount_dec(&lo->plh_refcount); 570 if (test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags)) 571 return; 572 if (list_empty(&lo->plh_segs) && 573 !test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags) && 574 !test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) { 575 if (atomic_read(&lo->plh_outstanding) == 0) 576 set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); 577 clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); 578 } 579} 580 581static bool 582pnfs_cache_lseg_for_layoutreturn(struct pnfs_layout_hdr *lo, 583 struct pnfs_layout_segment *lseg) 584{ 585 if (test_and_clear_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags) && 586 pnfs_layout_is_valid(lo)) { 587 pnfs_set_plh_return_info(lo, lseg->pls_range.iomode, 0); 588 list_move_tail(&lseg->pls_list, &lo->plh_return_segs); 589 return true; 590 } 591 return false; 592} 593 594void 595pnfs_put_lseg(struct pnfs_layout_segment *lseg) 596{ 597 struct pnfs_layout_hdr *lo; 598 struct inode *inode; 599 600 if (!lseg) 601 return; 602 603 dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg, 604 refcount_read(&lseg->pls_refcount), 605 test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); 606 607 lo = lseg->pls_layout; 608 inode = lo->plh_inode; 609 610 if (refcount_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) { 611 if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags)) { 612 spin_unlock(&inode->i_lock); 613 return; 614 } 615 pnfs_get_layout_hdr(lo); 616 pnfs_layout_remove_lseg(lo, lseg); 617 if (pnfs_cache_lseg_for_layoutreturn(lo, lseg)) 618 lseg = NULL; 619 spin_unlock(&inode->i_lock); 620 pnfs_free_lseg(lseg); 621 pnfs_put_layout_hdr(lo); 622 } 623} 624EXPORT_SYMBOL_GPL(pnfs_put_lseg); 625 626/* 627 * is l2 fully contained in l1? 
628 * start1 end1 629 * [----------------------------------) 630 * start2 end2 631 * [----------------) 632 */ 633static bool 634pnfs_lseg_range_contained(const struct pnfs_layout_range *l1, 635 const struct pnfs_layout_range *l2) 636{ 637 u64 start1 = l1->offset; 638 u64 end1 = pnfs_end_offset(start1, l1->length); 639 u64 start2 = l2->offset; 640 u64 end2 = pnfs_end_offset(start2, l2->length); 641 642 return (start1 <= start2) && (end1 >= end2); 643} 644 645static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg, 646 struct list_head *tmp_list) 647{ 648 if (!refcount_dec_and_test(&lseg->pls_refcount)) 649 return false; 650 pnfs_layout_remove_lseg(lseg->pls_layout, lseg); 651 list_add(&lseg->pls_list, tmp_list); 652 return true; 653} 654 655/* Returns 1 if lseg is removed from list, 0 otherwise */ 656static int mark_lseg_invalid(struct pnfs_layout_segment *lseg, 657 struct list_head *tmp_list) 658{ 659 int rv = 0; 660 661 if (test_and_clear_bit(NFS_LSEG_VALID, &lseg->pls_flags)) { 662 /* Remove the reference keeping the lseg in the 663 * list. It will now be removed when all 664 * outstanding io is finished. 665 */ 666 dprintk("%s: lseg %p ref %d\n", __func__, lseg, 667 refcount_read(&lseg->pls_refcount)); 668 if (pnfs_lseg_dec_and_remove_zero(lseg, tmp_list)) 669 rv = 1; 670 } 671 return rv; 672} 673 674static bool 675pnfs_should_free_range(const struct pnfs_layout_range *lseg_range, 676 const struct pnfs_layout_range *recall_range) 677{ 678 return (recall_range->iomode == IOMODE_ANY || 679 lseg_range->iomode == recall_range->iomode) && 680 pnfs_lseg_range_intersecting(lseg_range, recall_range); 681} 682 683static bool 684pnfs_match_lseg_recall(const struct pnfs_layout_segment *lseg, 685 const struct pnfs_layout_range *recall_range, 686 u32 seq) 687{ 688 if (seq != 0 && pnfs_seqid_is_newer(lseg->pls_seq, seq)) 689 return false; 690 if (recall_range == NULL) 691 return true; 692 return pnfs_should_free_range(&lseg->pls_range, recall_range); 693} 694 695/** 696 * pnfs_mark_matching_lsegs_invalid - tear down lsegs or mark them for later 697 * @lo: layout header containing the lsegs 698 * @tmp_list: list head where doomed lsegs should go 699 * @recall_range: optional recall range argument to match (may be NULL) 700 * @seq: only invalidate lsegs obtained prior to this sequence (may be 0) 701 * 702 * Walk the list of lsegs in the layout header, and tear down any that should 703 * be destroyed. If "recall_range" is specified then the segment must match 704 * that range. If "seq" is non-zero, then only match segments that were handed 705 * out at or before that sequence. 706 * 707 * Returns number of matching invalid lsegs remaining in list after scanning 708 * it and purging them. 
709 */ 710int 711pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, 712 struct list_head *tmp_list, 713 const struct pnfs_layout_range *recall_range, 714 u32 seq) 715{ 716 struct pnfs_layout_segment *lseg, *next; 717 int remaining = 0; 718 719 dprintk("%s:Begin lo %p\n", __func__, lo); 720 721 if (list_empty(&lo->plh_segs)) 722 return 0; 723 list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) 724 if (pnfs_match_lseg_recall(lseg, recall_range, seq)) { 725 dprintk("%s: freeing lseg %p iomode %d seq %u " 726 "offset %llu length %llu\n", __func__, 727 lseg, lseg->pls_range.iomode, lseg->pls_seq, 728 lseg->pls_range.offset, lseg->pls_range.length); 729 if (!mark_lseg_invalid(lseg, tmp_list)) 730 remaining++; 731 } 732 dprintk("%s:Return %i\n", __func__, remaining); 733 return remaining; 734} 735 736static void 737pnfs_free_returned_lsegs(struct pnfs_layout_hdr *lo, 738 struct list_head *free_me, 739 const struct pnfs_layout_range *range, 740 u32 seq) 741{ 742 struct pnfs_layout_segment *lseg, *next; 743 744 list_for_each_entry_safe(lseg, next, &lo->plh_return_segs, pls_list) { 745 if (pnfs_match_lseg_recall(lseg, range, seq)) 746 list_move_tail(&lseg->pls_list, free_me); 747 } 748} 749 750/* note free_me must contain lsegs from a single layout_hdr */ 751void 752pnfs_free_lseg_list(struct list_head *free_me) 753{ 754 struct pnfs_layout_segment *lseg, *tmp; 755 756 if (list_empty(free_me)) 757 return; 758 759 list_for_each_entry_safe(lseg, tmp, free_me, pls_list) { 760 list_del(&lseg->pls_list); 761 pnfs_free_lseg(lseg); 762 } 763} 764 765static struct pnfs_layout_hdr *__pnfs_destroy_layout(struct nfs_inode *nfsi) 766{ 767 struct pnfs_layout_hdr *lo; 768 LIST_HEAD(tmp_list); 769 770 spin_lock(&nfsi->vfs_inode.i_lock); 771 lo = nfsi->layout; 772 if (lo) { 773 pnfs_get_layout_hdr(lo); 774 pnfs_mark_layout_stateid_invalid(lo, &tmp_list); 775 pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RO_FAILED); 776 pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RW_FAILED); 777 spin_unlock(&nfsi->vfs_inode.i_lock); 778 pnfs_free_lseg_list(&tmp_list); 779 nfs_commit_inode(&nfsi->vfs_inode, 0); 780 pnfs_put_layout_hdr(lo); 781 } else 782 spin_unlock(&nfsi->vfs_inode.i_lock); 783 return lo; 784} 785 786void pnfs_destroy_layout(struct nfs_inode *nfsi) 787{ 788 __pnfs_destroy_layout(nfsi); 789} 790EXPORT_SYMBOL_GPL(pnfs_destroy_layout); 791 792static bool pnfs_layout_removed(struct nfs_inode *nfsi, 793 struct pnfs_layout_hdr *lo) 794{ 795 bool ret; 796 797 spin_lock(&nfsi->vfs_inode.i_lock); 798 ret = nfsi->layout != lo; 799 spin_unlock(&nfsi->vfs_inode.i_lock); 800 return ret; 801} 802 803void pnfs_destroy_layout_final(struct nfs_inode *nfsi) 804{ 805 struct pnfs_layout_hdr *lo = __pnfs_destroy_layout(nfsi); 806 807 if (lo) 808 wait_var_event(lo, pnfs_layout_removed(nfsi, lo)); 809} 810 811static bool 812pnfs_layout_add_bulk_destroy_list(struct inode *inode, 813 struct list_head *layout_list) 814{ 815 struct pnfs_layout_hdr *lo; 816 bool ret = false; 817 818 spin_lock(&inode->i_lock); 819 lo = NFS_I(inode)->layout; 820 if (lo != NULL && list_empty(&lo->plh_bulk_destroy)) { 821 pnfs_get_layout_hdr(lo); 822 list_add(&lo->plh_bulk_destroy, layout_list); 823 ret = true; 824 } 825 spin_unlock(&inode->i_lock); 826 return ret; 827} 828 829/* Caller must hold rcu_read_lock and clp->cl_lock */ 830static int 831pnfs_layout_bulk_destroy_byserver_locked(struct nfs_client *clp, 832 struct nfs_server *server, 833 struct list_head *layout_list) 834 __must_hold(&clp->cl_lock) 835 __must_hold(RCU) 836{ 837 struct 
pnfs_layout_hdr *lo, *next; 838 struct inode *inode; 839 840 list_for_each_entry_safe(lo, next, &server->layouts, plh_layouts) { 841 if (test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags) || 842 test_bit(NFS_LAYOUT_INODE_FREEING, &lo->plh_flags) || 843 !list_empty(&lo->plh_bulk_destroy)) 844 continue; 845 /* If the sb is being destroyed, just bail */ 846 if (!nfs_sb_active(server->super)) 847 break; 848 inode = pnfs_grab_inode_layout_hdr(lo); 849 if (inode != NULL) { 850 if (test_and_clear_bit(NFS_LAYOUT_HASHED, &lo->plh_flags)) 851 list_del_rcu(&lo->plh_layouts); 852 if (pnfs_layout_add_bulk_destroy_list(inode, 853 layout_list)) 854 continue; 855 rcu_read_unlock(); 856 spin_unlock(&clp->cl_lock); 857 iput(inode); 858 } else { 859 rcu_read_unlock(); 860 spin_unlock(&clp->cl_lock); 861 } 862 nfs_sb_deactive(server->super); 863 spin_lock(&clp->cl_lock); 864 rcu_read_lock(); 865 return -EAGAIN; 866 } 867 return 0; 868} 869 870static int 871pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list, 872 bool is_bulk_recall) 873{ 874 struct pnfs_layout_hdr *lo; 875 struct inode *inode; 876 LIST_HEAD(lseg_list); 877 int ret = 0; 878 879 while (!list_empty(layout_list)) { 880 lo = list_entry(layout_list->next, struct pnfs_layout_hdr, 881 plh_bulk_destroy); 882 dprintk("%s freeing layout for inode %lu\n", __func__, 883 lo->plh_inode->i_ino); 884 inode = lo->plh_inode; 885 886 pnfs_layoutcommit_inode(inode, false); 887 888 spin_lock(&inode->i_lock); 889 list_del_init(&lo->plh_bulk_destroy); 890 if (pnfs_mark_layout_stateid_invalid(lo, &lseg_list)) { 891 if (is_bulk_recall) 892 set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); 893 ret = -EAGAIN; 894 } 895 spin_unlock(&inode->i_lock); 896 pnfs_free_lseg_list(&lseg_list); 897 /* Free all lsegs that are attached to commit buckets */ 898 nfs_commit_inode(inode, 0); 899 pnfs_put_layout_hdr(lo); 900 nfs_iput_and_deactive(inode); 901 } 902 return ret; 903} 904 905int 906pnfs_destroy_layouts_byfsid(struct nfs_client *clp, 907 struct nfs_fsid *fsid, 908 bool is_recall) 909{ 910 struct nfs_server *server; 911 LIST_HEAD(layout_list); 912 913 spin_lock(&clp->cl_lock); 914 rcu_read_lock(); 915restart: 916 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { 917 if (memcmp(&server->fsid, fsid, sizeof(*fsid)) != 0) 918 continue; 919 if (pnfs_layout_bulk_destroy_byserver_locked(clp, 920 server, 921 &layout_list) != 0) 922 goto restart; 923 } 924 rcu_read_unlock(); 925 spin_unlock(&clp->cl_lock); 926 927 if (list_empty(&layout_list)) 928 return 0; 929 return pnfs_layout_free_bulk_destroy_list(&layout_list, is_recall); 930} 931 932int 933pnfs_destroy_layouts_byclid(struct nfs_client *clp, 934 bool is_recall) 935{ 936 struct nfs_server *server; 937 LIST_HEAD(layout_list); 938 939 spin_lock(&clp->cl_lock); 940 rcu_read_lock(); 941restart: 942 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { 943 if (pnfs_layout_bulk_destroy_byserver_locked(clp, 944 server, 945 &layout_list) != 0) 946 goto restart; 947 } 948 rcu_read_unlock(); 949 spin_unlock(&clp->cl_lock); 950 951 if (list_empty(&layout_list)) 952 return 0; 953 return pnfs_layout_free_bulk_destroy_list(&layout_list, is_recall); 954} 955 956/* 957 * Called by the state manager to remove all layouts established under an 958 * expired lease. 
959 */ 960void 961pnfs_destroy_all_layouts(struct nfs_client *clp) 962{ 963 nfs4_deviceid_mark_client_invalid(clp); 964 nfs4_deviceid_purge_client(clp); 965 966 pnfs_destroy_layouts_byclid(clp, false); 967} 968 969static void 970pnfs_set_layout_cred(struct pnfs_layout_hdr *lo, const struct cred *cred) 971{ 972 const struct cred *old; 973 974 if (cred && cred_fscmp(lo->plh_lc_cred, cred) != 0) { 975 old = xchg(&lo->plh_lc_cred, get_cred(cred)); 976 put_cred(old); 977 } 978} 979 980/* update lo->plh_stateid with new if is more recent */ 981void 982pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, 983 const struct cred *cred, bool update_barrier) 984{ 985 u32 oldseq = be32_to_cpu(lo->plh_stateid.seqid); 986 u32 newseq = be32_to_cpu(new->seqid); 987 988 if (!pnfs_layout_is_valid(lo)) { 989 pnfs_set_layout_cred(lo, cred); 990 nfs4_stateid_copy(&lo->plh_stateid, new); 991 lo->plh_barrier = newseq; 992 pnfs_clear_layoutreturn_info(lo); 993 clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); 994 return; 995 } 996 997 if (pnfs_seqid_is_newer(newseq, oldseq)) 998 nfs4_stateid_copy(&lo->plh_stateid, new); 999 1000 if (update_barrier) { 1001 pnfs_barrier_update(lo, newseq); 1002 return; 1003 } 1004 /* 1005 * Because of wraparound, we want to keep the barrier 1006 * "close" to the current seqids. We really only want to 1007 * get here from a layoutget call. 1008 */ 1009 if (atomic_read(&lo->plh_outstanding) == 1) 1010 pnfs_barrier_update(lo, be32_to_cpu(lo->plh_stateid.seqid)); 1011} 1012 1013static bool 1014pnfs_layout_stateid_blocked(const struct pnfs_layout_hdr *lo, 1015 const nfs4_stateid *stateid) 1016{ 1017 u32 seqid = be32_to_cpu(stateid->seqid); 1018 1019 return lo->plh_barrier && pnfs_seqid_is_newer(lo->plh_barrier, seqid); 1020} 1021 1022/* lget is set to 1 if called from inside send_layoutget call chain */ 1023static bool 1024pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo) 1025{ 1026 return lo->plh_block_lgets || 1027 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); 1028} 1029 1030static struct nfs_server * 1031pnfs_find_server(struct inode *inode, struct nfs_open_context *ctx) 1032{ 1033 struct nfs_server *server; 1034 1035 if (inode) { 1036 server = NFS_SERVER(inode); 1037 } else { 1038 struct dentry *parent_dir = dget_parent(ctx->dentry); 1039 server = NFS_SERVER(parent_dir->d_inode); 1040 dput(parent_dir); 1041 } 1042 return server; 1043} 1044 1045static void nfs4_free_pages(struct page **pages, size_t size) 1046{ 1047 int i; 1048 1049 if (!pages) 1050 return; 1051 1052 for (i = 0; i < size; i++) { 1053 if (!pages[i]) 1054 break; 1055 __free_page(pages[i]); 1056 } 1057 kfree(pages); 1058} 1059 1060static struct page **nfs4_alloc_pages(size_t size, gfp_t gfp_flags) 1061{ 1062 struct page **pages; 1063 int i; 1064 1065 pages = kmalloc_array(size, sizeof(struct page *), gfp_flags); 1066 if (!pages) { 1067 dprintk("%s: can't alloc array of %zu pages\n", __func__, size); 1068 return NULL; 1069 } 1070 1071 for (i = 0; i < size; i++) { 1072 pages[i] = alloc_page(gfp_flags); 1073 if (!pages[i]) { 1074 dprintk("%s: failed to allocate page\n", __func__); 1075 nfs4_free_pages(pages, i); 1076 return NULL; 1077 } 1078 } 1079 1080 return pages; 1081} 1082 1083static struct nfs4_layoutget * 1084pnfs_alloc_init_layoutget_args(struct inode *ino, 1085 struct nfs_open_context *ctx, 1086 const nfs4_stateid *stateid, 1087 const struct pnfs_layout_range *range, 1088 gfp_t gfp_flags) 1089{ 1090 struct nfs_server *server = pnfs_find_server(ino, ctx); 1091 size_t 
max_reply_sz = server->pnfs_curr_ld->max_layoutget_response; 1092 size_t max_pages = max_response_pages(server); 1093 struct nfs4_layoutget *lgp; 1094 1095 dprintk("--> %s\n", __func__); 1096 1097 lgp = kzalloc(sizeof(*lgp), gfp_flags); 1098 if (lgp == NULL) 1099 return NULL; 1100 1101 if (max_reply_sz) { 1102 size_t npages = (max_reply_sz + PAGE_SIZE - 1) >> PAGE_SHIFT; 1103 if (npages < max_pages) 1104 max_pages = npages; 1105 } 1106 1107 lgp->args.layout.pages = nfs4_alloc_pages(max_pages, gfp_flags); 1108 if (!lgp->args.layout.pages) { 1109 kfree(lgp); 1110 return NULL; 1111 } 1112 lgp->args.layout.pglen = max_pages * PAGE_SIZE; 1113 lgp->res.layoutp = &lgp->args.layout; 1114 1115 /* Don't confuse uninitialised result and success */ 1116 lgp->res.status = -NFS4ERR_DELAY; 1117 1118 lgp->args.minlength = PAGE_SIZE; 1119 if (lgp->args.minlength > range->length) 1120 lgp->args.minlength = range->length; 1121 if (ino) { 1122 loff_t i_size = i_size_read(ino); 1123 1124 if (range->iomode == IOMODE_READ) { 1125 if (range->offset >= i_size) 1126 lgp->args.minlength = 0; 1127 else if (i_size - range->offset < lgp->args.minlength) 1128 lgp->args.minlength = i_size - range->offset; 1129 } 1130 } 1131 lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE; 1132 pnfs_copy_range(&lgp->args.range, range); 1133 lgp->args.type = server->pnfs_curr_ld->id; 1134 lgp->args.inode = ino; 1135 lgp->args.ctx = get_nfs_open_context(ctx); 1136 nfs4_stateid_copy(&lgp->args.stateid, stateid); 1137 lgp->gfp_flags = gfp_flags; 1138 lgp->cred = ctx->cred; 1139 return lgp; 1140} 1141 1142void pnfs_layoutget_free(struct nfs4_layoutget *lgp) 1143{ 1144 size_t max_pages = lgp->args.layout.pglen / PAGE_SIZE; 1145 1146 nfs4_free_pages(lgp->args.layout.pages, max_pages); 1147 if (lgp->args.inode) 1148 pnfs_put_layout_hdr(NFS_I(lgp->args.inode)->layout); 1149 put_nfs_open_context(lgp->args.ctx); 1150 kfree(lgp); 1151} 1152 1153static void pnfs_clear_layoutcommit(struct inode *inode, 1154 struct list_head *head) 1155{ 1156 struct nfs_inode *nfsi = NFS_I(inode); 1157 struct pnfs_layout_segment *lseg, *tmp; 1158 1159 if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) 1160 return; 1161 list_for_each_entry_safe(lseg, tmp, &nfsi->layout->plh_segs, pls_list) { 1162 if (!test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags)) 1163 continue; 1164 pnfs_lseg_dec_and_remove_zero(lseg, head); 1165 } 1166} 1167 1168void pnfs_layoutreturn_free_lsegs(struct pnfs_layout_hdr *lo, 1169 const nfs4_stateid *arg_stateid, 1170 const struct pnfs_layout_range *range, 1171 const nfs4_stateid *stateid) 1172{ 1173 struct inode *inode = lo->plh_inode; 1174 LIST_HEAD(freeme); 1175 1176 spin_lock(&inode->i_lock); 1177 if (!pnfs_layout_is_valid(lo) || !arg_stateid || 1178 !nfs4_stateid_match_other(&lo->plh_stateid, arg_stateid)) 1179 goto out_unlock; 1180 if (stateid) { 1181 u32 seq = be32_to_cpu(arg_stateid->seqid); 1182 1183 pnfs_mark_matching_lsegs_invalid(lo, &freeme, range, seq); 1184 pnfs_free_returned_lsegs(lo, &freeme, range, seq); 1185 pnfs_set_layout_stateid(lo, stateid, NULL, true); 1186 } else 1187 pnfs_mark_layout_stateid_invalid(lo, &freeme); 1188out_unlock: 1189 pnfs_clear_layoutreturn_waitbit(lo); 1190 spin_unlock(&inode->i_lock); 1191 pnfs_free_lseg_list(&freeme); 1192 1193} 1194 1195static bool 1196pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo, 1197 nfs4_stateid *stateid, 1198 const struct cred **cred, 1199 enum pnfs_iomode *iomode) 1200{ 1201 /* Serialise LAYOUTGET/LAYOUTRETURN */ 1202 if (atomic_read(&lo->plh_outstanding) != 0) 
1203 return false; 1204 if (test_and_set_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags)) 1205 return false; 1206 set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags); 1207 pnfs_get_layout_hdr(lo); 1208 nfs4_stateid_copy(stateid, &lo->plh_stateid); 1209 *cred = get_cred(lo->plh_lc_cred); 1210 if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) { 1211 if (lo->plh_return_seq != 0) 1212 stateid->seqid = cpu_to_be32(lo->plh_return_seq); 1213 if (iomode != NULL) 1214 *iomode = lo->plh_return_iomode; 1215 pnfs_clear_layoutreturn_info(lo); 1216 } else if (iomode != NULL) 1217 *iomode = IOMODE_ANY; 1218 pnfs_barrier_update(lo, be32_to_cpu(stateid->seqid)); 1219 return true; 1220} 1221 1222static void 1223pnfs_init_layoutreturn_args(struct nfs4_layoutreturn_args *args, 1224 struct pnfs_layout_hdr *lo, 1225 const nfs4_stateid *stateid, 1226 enum pnfs_iomode iomode) 1227{ 1228 struct inode *inode = lo->plh_inode; 1229 1230 args->layout_type = NFS_SERVER(inode)->pnfs_curr_ld->id; 1231 args->inode = inode; 1232 args->range.iomode = iomode; 1233 args->range.offset = 0; 1234 args->range.length = NFS4_MAX_UINT64; 1235 args->layout = lo; 1236 nfs4_stateid_copy(&args->stateid, stateid); 1237} 1238 1239static int 1240pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, 1241 const nfs4_stateid *stateid, 1242 const struct cred **pcred, 1243 enum pnfs_iomode iomode, 1244 bool sync) 1245{ 1246 struct inode *ino = lo->plh_inode; 1247 struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld; 1248 struct nfs4_layoutreturn *lrp; 1249 const struct cred *cred = *pcred; 1250 int status = 0; 1251 1252 *pcred = NULL; 1253 lrp = kzalloc(sizeof(*lrp), GFP_NOFS); 1254 if (unlikely(lrp == NULL)) { 1255 status = -ENOMEM; 1256 spin_lock(&ino->i_lock); 1257 pnfs_clear_layoutreturn_waitbit(lo); 1258 spin_unlock(&ino->i_lock); 1259 put_cred(cred); 1260 pnfs_put_layout_hdr(lo); 1261 goto out; 1262 } 1263 1264 pnfs_init_layoutreturn_args(&lrp->args, lo, stateid, iomode); 1265 lrp->args.ld_private = &lrp->ld_private; 1266 lrp->clp = NFS_SERVER(ino)->nfs_client; 1267 lrp->cred = cred; 1268 if (ld->prepare_layoutreturn) 1269 ld->prepare_layoutreturn(&lrp->args); 1270 1271 status = nfs4_proc_layoutreturn(lrp, sync); 1272out: 1273 dprintk("<-- %s status: %d\n", __func__, status); 1274 return status; 1275} 1276 1277static bool 1278pnfs_layout_segments_returnable(struct pnfs_layout_hdr *lo, 1279 enum pnfs_iomode iomode, 1280 u32 seq) 1281{ 1282 struct pnfs_layout_range recall_range = { 1283 .length = NFS4_MAX_UINT64, 1284 .iomode = iomode, 1285 }; 1286 return pnfs_mark_matching_lsegs_return(lo, &lo->plh_return_segs, 1287 &recall_range, seq) != -EBUSY; 1288} 1289 1290/* Return true if layoutreturn is needed */ 1291static bool 1292pnfs_layout_need_return(struct pnfs_layout_hdr *lo) 1293{ 1294 if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) 1295 return false; 1296 return pnfs_layout_segments_returnable(lo, lo->plh_return_iomode, 1297 lo->plh_return_seq); 1298} 1299 1300static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo) 1301{ 1302 struct inode *inode= lo->plh_inode; 1303 1304 if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) 1305 return; 1306 spin_lock(&inode->i_lock); 1307 if (pnfs_layout_need_return(lo)) { 1308 const struct cred *cred; 1309 nfs4_stateid stateid; 1310 enum pnfs_iomode iomode; 1311 bool send; 1312 1313 send = pnfs_prepare_layoutreturn(lo, &stateid, &cred, &iomode); 1314 spin_unlock(&inode->i_lock); 1315 if (send) { 1316 /* Send an async layoutreturn so we dont deadlock */ 
1317 pnfs_send_layoutreturn(lo, &stateid, &cred, iomode, false); 1318 } 1319 } else 1320 spin_unlock(&inode->i_lock); 1321} 1322 1323/* 1324 * Initiates a LAYOUTRETURN(FILE), and removes the pnfs_layout_hdr 1325 * when the layout segment list is empty. 1326 * 1327 * Note that a pnfs_layout_hdr can exist with an empty layout segment 1328 * list when LAYOUTGET has failed, or when LAYOUTGET succeeded, but the 1329 * deviceid is marked invalid. 1330 */ 1331int 1332_pnfs_return_layout(struct inode *ino) 1333{ 1334 struct pnfs_layout_hdr *lo = NULL; 1335 struct nfs_inode *nfsi = NFS_I(ino); 1336 struct pnfs_layout_range range = { 1337 .iomode = IOMODE_ANY, 1338 .offset = 0, 1339 .length = NFS4_MAX_UINT64, 1340 }; 1341 LIST_HEAD(tmp_list); 1342 const struct cred *cred; 1343 nfs4_stateid stateid; 1344 int status = 0; 1345 bool send, valid_layout; 1346 1347 dprintk("NFS: %s for inode %lu\n", __func__, ino->i_ino); 1348 1349 spin_lock(&ino->i_lock); 1350 lo = nfsi->layout; 1351 if (!lo) { 1352 spin_unlock(&ino->i_lock); 1353 dprintk("NFS: %s no layout to return\n", __func__); 1354 goto out; 1355 } 1356 /* Reference matched in nfs4_layoutreturn_release */ 1357 pnfs_get_layout_hdr(lo); 1358 /* Is there an outstanding layoutreturn ? */ 1359 if (test_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags)) { 1360 spin_unlock(&ino->i_lock); 1361 if (wait_on_bit(&lo->plh_flags, NFS_LAYOUT_RETURN, 1362 TASK_UNINTERRUPTIBLE)) 1363 goto out_put_layout_hdr; 1364 spin_lock(&ino->i_lock); 1365 } 1366 valid_layout = pnfs_layout_is_valid(lo); 1367 pnfs_clear_layoutcommit(ino, &tmp_list); 1368 pnfs_mark_matching_lsegs_return(lo, &tmp_list, &range, 0); 1369 1370 if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) 1371 NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo, &range); 1372 1373 /* Don't send a LAYOUTRETURN if list was initially empty */ 1374 if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags) || 1375 !valid_layout) { 1376 spin_unlock(&ino->i_lock); 1377 dprintk("NFS: %s no layout segments to return\n", __func__); 1378 goto out_wait_layoutreturn; 1379 } 1380 1381 send = pnfs_prepare_layoutreturn(lo, &stateid, &cred, NULL); 1382 spin_unlock(&ino->i_lock); 1383 if (send) 1384 status = pnfs_send_layoutreturn(lo, &stateid, &cred, IOMODE_ANY, true); 1385out_wait_layoutreturn: 1386 wait_on_bit(&lo->plh_flags, NFS_LAYOUT_RETURN, TASK_UNINTERRUPTIBLE); 1387out_put_layout_hdr: 1388 pnfs_free_lseg_list(&tmp_list); 1389 pnfs_put_layout_hdr(lo); 1390out: 1391 dprintk("<-- %s status: %d\n", __func__, status); 1392 return status; 1393} 1394 1395int 1396pnfs_commit_and_return_layout(struct inode *inode) 1397{ 1398 struct pnfs_layout_hdr *lo; 1399 int ret; 1400 1401 spin_lock(&inode->i_lock); 1402 lo = NFS_I(inode)->layout; 1403 if (lo == NULL) { 1404 spin_unlock(&inode->i_lock); 1405 return 0; 1406 } 1407 pnfs_get_layout_hdr(lo); 1408 /* Block new layoutgets and read/write to ds */ 1409 lo->plh_block_lgets++; 1410 spin_unlock(&inode->i_lock); 1411 filemap_fdatawait(inode->i_mapping); 1412 ret = pnfs_layoutcommit_inode(inode, true); 1413 if (ret == 0) 1414 ret = _pnfs_return_layout(inode); 1415 spin_lock(&inode->i_lock); 1416 lo->plh_block_lgets--; 1417 spin_unlock(&inode->i_lock); 1418 pnfs_put_layout_hdr(lo); 1419 return ret; 1420} 1421 1422bool pnfs_roc(struct inode *ino, 1423 struct nfs4_layoutreturn_args *args, 1424 struct nfs4_layoutreturn_res *res, 1425 const struct cred *cred) 1426{ 1427 struct nfs_inode *nfsi = NFS_I(ino); 1428 struct nfs_open_context *ctx; 1429 struct nfs4_state *state; 1430 struct pnfs_layout_hdr *lo; 1431 
struct pnfs_layout_segment *lseg, *next; 1432 const struct cred *lc_cred; 1433 nfs4_stateid stateid; 1434 enum pnfs_iomode iomode = 0; 1435 bool layoutreturn = false, roc = false; 1436 bool skip_read = false; 1437 1438 if (!nfs_have_layout(ino)) 1439 return false; 1440retry: 1441 rcu_read_lock(); 1442 spin_lock(&ino->i_lock); 1443 lo = nfsi->layout; 1444 if (!lo || !pnfs_layout_is_valid(lo) || 1445 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) { 1446 lo = NULL; 1447 goto out_noroc; 1448 } 1449 pnfs_get_layout_hdr(lo); 1450 if (test_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags)) { 1451 spin_unlock(&ino->i_lock); 1452 rcu_read_unlock(); 1453 wait_on_bit(&lo->plh_flags, NFS_LAYOUT_RETURN, 1454 TASK_UNINTERRUPTIBLE); 1455 pnfs_put_layout_hdr(lo); 1456 goto retry; 1457 } 1458 1459 /* no roc if we hold a delegation */ 1460 if (nfs4_check_delegation(ino, FMODE_READ)) { 1461 if (nfs4_check_delegation(ino, FMODE_WRITE)) 1462 goto out_noroc; 1463 skip_read = true; 1464 } 1465 1466 list_for_each_entry_rcu(ctx, &nfsi->open_files, list) { 1467 state = ctx->state; 1468 if (state == NULL) 1469 continue; 1470 /* Don't return layout if there is open file state */ 1471 if (state->state & FMODE_WRITE) 1472 goto out_noroc; 1473 if (state->state & FMODE_READ) 1474 skip_read = true; 1475 } 1476 1477 1478 list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) { 1479 if (skip_read && lseg->pls_range.iomode == IOMODE_READ) 1480 continue; 1481 /* If we are sending layoutreturn, invalidate all valid lsegs */ 1482 if (!test_and_clear_bit(NFS_LSEG_ROC, &lseg->pls_flags)) 1483 continue; 1484 /* 1485 * Note: mark lseg for return so pnfs_layout_remove_lseg 1486 * doesn't invalidate the layout for us. 1487 */ 1488 set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags); 1489 if (!mark_lseg_invalid(lseg, &lo->plh_return_segs)) 1490 continue; 1491 pnfs_set_plh_return_info(lo, lseg->pls_range.iomode, 0); 1492 } 1493 1494 if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) 1495 goto out_noroc; 1496 1497 /* ROC in two conditions: 1498 * 1. there are ROC lsegs 1499 * 2. we don't send layoutreturn 1500 */ 1501 /* lo ref dropped in pnfs_roc_release() */ 1502 layoutreturn = pnfs_prepare_layoutreturn(lo, &stateid, &lc_cred, &iomode); 1503 /* If the creds don't match, we can't compound the layoutreturn */ 1504 if (!layoutreturn || cred_fscmp(cred, lc_cred) != 0) 1505 goto out_noroc; 1506 1507 roc = layoutreturn; 1508 pnfs_init_layoutreturn_args(args, lo, &stateid, iomode); 1509 res->lrs_present = 0; 1510 layoutreturn = false; 1511 put_cred(lc_cred); 1512 1513out_noroc: 1514 spin_unlock(&ino->i_lock); 1515 rcu_read_unlock(); 1516 pnfs_layoutcommit_inode(ino, true); 1517 if (roc) { 1518 struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld; 1519 if (ld->prepare_layoutreturn) 1520 ld->prepare_layoutreturn(args); 1521 pnfs_put_layout_hdr(lo); 1522 return true; 1523 } 1524 if (layoutreturn) 1525 pnfs_send_layoutreturn(lo, &stateid, &lc_cred, iomode, true); 1526 pnfs_put_layout_hdr(lo); 1527 return false; 1528} 1529 1530int pnfs_roc_done(struct rpc_task *task, struct nfs4_layoutreturn_args **argpp, 1531 struct nfs4_layoutreturn_res **respp, int *ret) 1532{ 1533 struct nfs4_layoutreturn_args *arg = *argpp; 1534 int retval = -EAGAIN; 1535 1536 if (!arg) 1537 return 0; 1538 /* Handle Layoutreturn errors */ 1539 switch (*ret) { 1540 case 0: 1541 retval = 0; 1542 break; 1543 case -NFS4ERR_NOMATCHING_LAYOUT: 1544 /* Was there an RPC level error? 
If not, retry */ 1545 if (task->tk_rpc_status == 0) 1546 break; 1547 /* If the call was not sent, let caller handle it */ 1548 if (!RPC_WAS_SENT(task)) 1549 return 0; 1550 /* 1551 * Otherwise, assume the call succeeded and 1552 * that we need to release the layout 1553 */ 1554 *ret = 0; 1555 (*respp)->lrs_present = 0; 1556 retval = 0; 1557 break; 1558 case -NFS4ERR_DELAY: 1559 /* Let the caller handle the retry */ 1560 *ret = -NFS4ERR_NOMATCHING_LAYOUT; 1561 return 0; 1562 case -NFS4ERR_OLD_STATEID: 1563 if (!nfs4_layout_refresh_old_stateid(&arg->stateid, 1564 &arg->range, arg->inode)) 1565 break; 1566 *ret = -NFS4ERR_NOMATCHING_LAYOUT; 1567 return -EAGAIN; 1568 } 1569 *argpp = NULL; 1570 *respp = NULL; 1571 return retval; 1572} 1573 1574void pnfs_roc_release(struct nfs4_layoutreturn_args *args, 1575 struct nfs4_layoutreturn_res *res, 1576 int ret) 1577{ 1578 struct pnfs_layout_hdr *lo = args->layout; 1579 struct inode *inode = args->inode; 1580 const nfs4_stateid *arg_stateid = NULL; 1581 const nfs4_stateid *res_stateid = NULL; 1582 struct nfs4_xdr_opaque_data *ld_private = args->ld_private; 1583 1584 switch (ret) { 1585 case -NFS4ERR_NOMATCHING_LAYOUT: 1586 spin_lock(&inode->i_lock); 1587 if (pnfs_layout_is_valid(lo) && 1588 nfs4_stateid_match_other(&args->stateid, &lo->plh_stateid)) 1589 pnfs_set_plh_return_info(lo, args->range.iomode, 0); 1590 spin_unlock(&inode->i_lock); 1591 break; 1592 case 0: 1593 if (res->lrs_present) 1594 res_stateid = &res->stateid; 1595 fallthrough; 1596 default: 1597 arg_stateid = &args->stateid; 1598 } 1599 trace_nfs4_layoutreturn_on_close(args->inode, &args->stateid, ret); 1600 pnfs_layoutreturn_free_lsegs(lo, arg_stateid, &args->range, 1601 res_stateid); 1602 if (ld_private && ld_private->ops && ld_private->ops->free) 1603 ld_private->ops->free(ld_private); 1604 pnfs_put_layout_hdr(lo); 1605} 1606 1607bool pnfs_wait_on_layoutreturn(struct inode *ino, struct rpc_task *task) 1608{ 1609 struct nfs_inode *nfsi = NFS_I(ino); 1610 struct pnfs_layout_hdr *lo; 1611 bool sleep = false; 1612 1613 /* we might not have grabbed lo reference. so need to check under 1614 * i_lock */ 1615 spin_lock(&ino->i_lock); 1616 lo = nfsi->layout; 1617 if (lo && test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) { 1618 rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL); 1619 sleep = true; 1620 } 1621 spin_unlock(&ino->i_lock); 1622 return sleep; 1623} 1624 1625/* 1626 * Compare two layout segments for sorting into layout cache. 1627 * We want to preferentially return RW over RO layouts, so ensure those 1628 * are seen first. 
1629 */ 1630static s64 1631pnfs_lseg_range_cmp(const struct pnfs_layout_range *l1, 1632 const struct pnfs_layout_range *l2) 1633{ 1634 s64 d; 1635 1636 /* high offset > low offset */ 1637 d = l1->offset - l2->offset; 1638 if (d) 1639 return d; 1640 1641 /* short length > long length */ 1642 d = l2->length - l1->length; 1643 if (d) 1644 return d; 1645 1646 /* read > read/write */ 1647 return (int)(l1->iomode == IOMODE_READ) - (int)(l2->iomode == IOMODE_READ); 1648} 1649 1650static bool 1651pnfs_lseg_range_is_after(const struct pnfs_layout_range *l1, 1652 const struct pnfs_layout_range *l2) 1653{ 1654 return pnfs_lseg_range_cmp(l1, l2) > 0; 1655} 1656 1657static bool 1658pnfs_lseg_no_merge(struct pnfs_layout_segment *lseg, 1659 struct pnfs_layout_segment *old) 1660{ 1661 return false; 1662} 1663 1664void 1665pnfs_generic_layout_insert_lseg(struct pnfs_layout_hdr *lo, 1666 struct pnfs_layout_segment *lseg, 1667 bool (*is_after)(const struct pnfs_layout_range *, 1668 const struct pnfs_layout_range *), 1669 bool (*do_merge)(struct pnfs_layout_segment *, 1670 struct pnfs_layout_segment *), 1671 struct list_head *free_me) 1672{ 1673 struct pnfs_layout_segment *lp, *tmp; 1674 1675 dprintk("%s:Begin\n", __func__); 1676 1677 list_for_each_entry_safe(lp, tmp, &lo->plh_segs, pls_list) { 1678 if (test_bit(NFS_LSEG_VALID, &lp->pls_flags) == 0) 1679 continue; 1680 if (do_merge(lseg, lp)) { 1681 mark_lseg_invalid(lp, free_me); 1682 continue; 1683 } 1684 if (is_after(&lseg->pls_range, &lp->pls_range)) 1685 continue; 1686 list_add_tail(&lseg->pls_list, &lp->pls_list); 1687 dprintk("%s: inserted lseg %p " 1688 "iomode %d offset %llu length %llu before " 1689 "lp %p iomode %d offset %llu length %llu\n", 1690 __func__, lseg, lseg->pls_range.iomode, 1691 lseg->pls_range.offset, lseg->pls_range.length, 1692 lp, lp->pls_range.iomode, lp->pls_range.offset, 1693 lp->pls_range.length); 1694 goto out; 1695 } 1696 list_add_tail(&lseg->pls_list, &lo->plh_segs); 1697 dprintk("%s: inserted lseg %p " 1698 "iomode %d offset %llu length %llu at tail\n", 1699 __func__, lseg, lseg->pls_range.iomode, 1700 lseg->pls_range.offset, lseg->pls_range.length); 1701out: 1702 pnfs_get_layout_hdr(lo); 1703 1704 dprintk("%s:Return\n", __func__); 1705} 1706EXPORT_SYMBOL_GPL(pnfs_generic_layout_insert_lseg); 1707 1708static void 1709pnfs_layout_insert_lseg(struct pnfs_layout_hdr *lo, 1710 struct pnfs_layout_segment *lseg, 1711 struct list_head *free_me) 1712{ 1713 struct inode *inode = lo->plh_inode; 1714 struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; 1715 1716 if (ld->add_lseg != NULL) 1717 ld->add_lseg(lo, lseg, free_me); 1718 else 1719 pnfs_generic_layout_insert_lseg(lo, lseg, 1720 pnfs_lseg_range_is_after, 1721 pnfs_lseg_no_merge, 1722 free_me); 1723} 1724 1725static struct pnfs_layout_hdr * 1726alloc_init_layout_hdr(struct inode *ino, 1727 struct nfs_open_context *ctx, 1728 gfp_t gfp_flags) 1729{ 1730 struct pnfs_layout_hdr *lo; 1731 1732 lo = pnfs_alloc_layout_hdr(ino, gfp_flags); 1733 if (!lo) 1734 return NULL; 1735 refcount_set(&lo->plh_refcount, 1); 1736 INIT_LIST_HEAD(&lo->plh_layouts); 1737 INIT_LIST_HEAD(&lo->plh_segs); 1738 INIT_LIST_HEAD(&lo->plh_return_segs); 1739 INIT_LIST_HEAD(&lo->plh_bulk_destroy); 1740 lo->plh_inode = ino; 1741 lo->plh_lc_cred = get_cred(ctx->cred); 1742 lo->plh_flags |= 1 << NFS_LAYOUT_INVALID_STID; 1743 return lo; 1744} 1745 1746static struct pnfs_layout_hdr * 1747pnfs_find_alloc_layout(struct inode *ino, 1748 struct nfs_open_context *ctx, 1749 gfp_t gfp_flags) 1750 
__releases(&ino->i_lock) 1751 __acquires(&ino->i_lock) 1752{ 1753 struct nfs_inode *nfsi = NFS_I(ino); 1754 struct pnfs_layout_hdr *new = NULL; 1755 1756 dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout); 1757 1758 if (nfsi->layout != NULL) 1759 goto out_existing; 1760 spin_unlock(&ino->i_lock); 1761 new = alloc_init_layout_hdr(ino, ctx, gfp_flags); 1762 spin_lock(&ino->i_lock); 1763 1764 if (likely(nfsi->layout == NULL)) { /* Won the race? */ 1765 nfsi->layout = new; 1766 return new; 1767 } else if (new != NULL) 1768 pnfs_free_layout_hdr(new); 1769out_existing: 1770 pnfs_get_layout_hdr(nfsi->layout); 1771 return nfsi->layout; 1772} 1773 1774/* 1775 * iomode matching rules: 1776 * iomode lseg strict match 1777 * iomode 1778 * ----- ----- ------ ----- 1779 * ANY READ N/A true 1780 * ANY RW N/A true 1781 * RW READ N/A false 1782 * RW RW N/A true 1783 * READ READ N/A true 1784 * READ RW true false 1785 * READ RW false true 1786 */ 1787static bool 1788pnfs_lseg_range_match(const struct pnfs_layout_range *ls_range, 1789 const struct pnfs_layout_range *range, 1790 bool strict_iomode) 1791{ 1792 struct pnfs_layout_range range1; 1793 1794 if ((range->iomode == IOMODE_RW && 1795 ls_range->iomode != IOMODE_RW) || 1796 (range->iomode != ls_range->iomode && 1797 strict_iomode) || 1798 !pnfs_lseg_range_intersecting(ls_range, range)) 1799 return false; 1800 1801 /* range1 covers only the first byte in the range */ 1802 range1 = *range; 1803 range1.length = 1; 1804 return pnfs_lseg_range_contained(ls_range, &range1); 1805} 1806 1807/* 1808 * lookup range in layout 1809 */ 1810static struct pnfs_layout_segment * 1811pnfs_find_lseg(struct pnfs_layout_hdr *lo, 1812 struct pnfs_layout_range *range, 1813 bool strict_iomode) 1814{ 1815 struct pnfs_layout_segment *lseg, *ret = NULL; 1816 1817 dprintk("%s:Begin\n", __func__); 1818 1819 list_for_each_entry(lseg, &lo->plh_segs, pls_list) { 1820 if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) && 1821 pnfs_lseg_range_match(&lseg->pls_range, range, 1822 strict_iomode)) { 1823 ret = pnfs_get_lseg(lseg); 1824 break; 1825 } 1826 } 1827 1828 dprintk("%s:Return lseg %p ref %d\n", 1829 __func__, ret, ret ? refcount_read(&ret->pls_refcount) : 0); 1830 return ret; 1831} 1832 1833/* 1834 * Use mdsthreshold hints set at each OPEN to determine if I/O should go 1835 * to the MDS or over pNFS 1836 * 1837 * The nfs_inode read_io and write_io fields are cumulative counters reset 1838 * when there are no layout segments. Note that in pnfs_update_layout iomode 1839 * is set to IOMODE_READ for a READ request, and set to IOMODE_RW for a 1840 * WRITE request. 1841 * 1842 * A return of true means use MDS I/O. 1843 * 1844 * From rfc 5661: 1845 * If a file's size is smaller than the file size threshold, data accesses 1846 * SHOULD be sent to the metadata server. If an I/O request has a length that 1847 * is below the I/O size threshold, the I/O SHOULD be sent to the metadata 1848 * server. If both file size and I/O size are provided, the client SHOULD 1849 * reach or exceed both thresholds before sending its read or write 1850 * requests to the data server. 
1851 */ 1852static bool pnfs_within_mdsthreshold(struct nfs_open_context *ctx, 1853 struct inode *ino, int iomode) 1854{ 1855 struct nfs4_threshold *t = ctx->mdsthreshold; 1856 struct nfs_inode *nfsi = NFS_I(ino); 1857 loff_t fsize = i_size_read(ino); 1858 bool size = false, size_set = false, io = false, io_set = false, ret = false; 1859 1860 if (t == NULL) 1861 return ret; 1862 1863 dprintk("%s bm=0x%x rd_sz=%llu wr_sz=%llu rd_io=%llu wr_io=%llu\n", 1864 __func__, t->bm, t->rd_sz, t->wr_sz, t->rd_io_sz, t->wr_io_sz); 1865 1866 switch (iomode) { 1867 case IOMODE_READ: 1868 if (t->bm & THRESHOLD_RD) { 1869 dprintk("%s fsize %llu\n", __func__, fsize); 1870 size_set = true; 1871 if (fsize < t->rd_sz) 1872 size = true; 1873 } 1874 if (t->bm & THRESHOLD_RD_IO) { 1875 dprintk("%s nfsi->read_io %llu\n", __func__, 1876 nfsi->read_io); 1877 io_set = true; 1878 if (nfsi->read_io < t->rd_io_sz) 1879 io = true; 1880 } 1881 break; 1882 case IOMODE_RW: 1883 if (t->bm & THRESHOLD_WR) { 1884 dprintk("%s fsize %llu\n", __func__, fsize); 1885 size_set = true; 1886 if (fsize < t->wr_sz) 1887 size = true; 1888 } 1889 if (t->bm & THRESHOLD_WR_IO) { 1890 dprintk("%s nfsi->write_io %llu\n", __func__, 1891 nfsi->write_io); 1892 io_set = true; 1893 if (nfsi->write_io < t->wr_io_sz) 1894 io = true; 1895 } 1896 break; 1897 } 1898 if (size_set && io_set) { 1899 if (size && io) 1900 ret = true; 1901 } else if (size || io) 1902 ret = true; 1903 1904 dprintk("<-- %s size %d io %d ret %d\n", __func__, size, io, ret); 1905 return ret; 1906} 1907 1908static int pnfs_prepare_to_retry_layoutget(struct pnfs_layout_hdr *lo) 1909{ 1910 /* 1911 * send layoutcommit as it can hold up layoutreturn due to lseg 1912 * reference 1913 */ 1914 pnfs_layoutcommit_inode(lo->plh_inode, false); 1915 return wait_on_bit_action(&lo->plh_flags, NFS_LAYOUT_RETURN, 1916 nfs_wait_bit_killable, 1917 TASK_KILLABLE); 1918} 1919 1920static void nfs_layoutget_begin(struct pnfs_layout_hdr *lo) 1921{ 1922 atomic_inc(&lo->plh_outstanding); 1923} 1924 1925static void nfs_layoutget_end(struct pnfs_layout_hdr *lo) 1926{ 1927 if (atomic_dec_and_test(&lo->plh_outstanding) && 1928 test_and_clear_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags)) 1929 wake_up_bit(&lo->plh_flags, NFS_LAYOUT_DRAIN); 1930} 1931 1932static bool pnfs_is_first_layoutget(struct pnfs_layout_hdr *lo) 1933{ 1934 return test_bit(NFS_LAYOUT_FIRST_LAYOUTGET, &lo->plh_flags); 1935} 1936 1937static void pnfs_clear_first_layoutget(struct pnfs_layout_hdr *lo) 1938{ 1939 unsigned long *bitlock = &lo->plh_flags; 1940 1941 clear_bit_unlock(NFS_LAYOUT_FIRST_LAYOUTGET, bitlock); 1942 smp_mb__after_atomic(); 1943 wake_up_bit(bitlock, NFS_LAYOUT_FIRST_LAYOUTGET); 1944} 1945 1946static void _add_to_server_list(struct pnfs_layout_hdr *lo, 1947 struct nfs_server *server) 1948{ 1949 if (!test_and_set_bit(NFS_LAYOUT_HASHED, &lo->plh_flags)) { 1950 struct nfs_client *clp = server->nfs_client; 1951 1952 /* The lo must be on the clp list if there is any 1953 * chance of a CB_LAYOUTRECALL(FILE) coming in. 1954 */ 1955 spin_lock(&clp->cl_lock); 1956 list_add_tail_rcu(&lo->plh_layouts, &server->layouts); 1957 spin_unlock(&clp->cl_lock); 1958 } 1959} 1960 1961/* 1962 * Layout segment is retreived from the server if not cached. 1963 * The appropriate layout segment is referenced and returned to the caller. 
1964 */ 1965struct pnfs_layout_segment * 1966pnfs_update_layout(struct inode *ino, 1967 struct nfs_open_context *ctx, 1968 loff_t pos, 1969 u64 count, 1970 enum pnfs_iomode iomode, 1971 bool strict_iomode, 1972 gfp_t gfp_flags) 1973{ 1974 struct pnfs_layout_range arg = { 1975 .iomode = iomode, 1976 .offset = pos, 1977 .length = count, 1978 }; 1979 unsigned pg_offset; 1980 struct nfs_server *server = NFS_SERVER(ino); 1981 struct nfs_client *clp = server->nfs_client; 1982 struct pnfs_layout_hdr *lo = NULL; 1983 struct pnfs_layout_segment *lseg = NULL; 1984 struct nfs4_layoutget *lgp; 1985 nfs4_stateid stateid; 1986 long timeout = 0; 1987 unsigned long giveup = jiffies + (clp->cl_lease_time << 1); 1988 bool first; 1989 1990 if (!pnfs_enabled_sb(NFS_SERVER(ino))) { 1991 trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, 1992 PNFS_UPDATE_LAYOUT_NO_PNFS); 1993 goto out; 1994 } 1995 1996 if (pnfs_within_mdsthreshold(ctx, ino, iomode)) { 1997 trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, 1998 PNFS_UPDATE_LAYOUT_MDSTHRESH); 1999 goto out; 2000 } 2001 2002lookup_again: 2003 lseg = ERR_PTR(nfs4_client_recover_expired_lease(clp)); 2004 if (IS_ERR(lseg)) 2005 goto out; 2006 first = false; 2007 spin_lock(&ino->i_lock); 2008 lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags); 2009 if (lo == NULL) { 2010 spin_unlock(&ino->i_lock); 2011 lseg = ERR_PTR(-ENOMEM); 2012 trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, 2013 PNFS_UPDATE_LAYOUT_NOMEM); 2014 goto out; 2015 } 2016 2017 /* Do we even need to bother with this? */ 2018 if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) { 2019 trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, 2020 PNFS_UPDATE_LAYOUT_BULK_RECALL); 2021 dprintk("%s matches recall, use MDS\n", __func__); 2022 goto out_unlock; 2023 } 2024 2025 /* if LAYOUTGET already failed once we don't try again */ 2026 if (pnfs_layout_io_test_failed(lo, iomode)) { 2027 trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, 2028 PNFS_UPDATE_LAYOUT_IO_TEST_FAIL); 2029 goto out_unlock; 2030 } 2031 2032 /* 2033 * If the layout segment list is empty, but there are outstanding 2034 * layoutget calls, then they might be subject to a layoutrecall. 2035 */ 2036 if (test_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags) && 2037 atomic_read(&lo->plh_outstanding) != 0) { 2038 spin_unlock(&ino->i_lock); 2039 lseg = ERR_PTR(wait_on_bit(&lo->plh_flags, NFS_LAYOUT_DRAIN, 2040 TASK_KILLABLE)); 2041 if (IS_ERR(lseg)) 2042 goto out_put_layout_hdr; 2043 pnfs_put_layout_hdr(lo); 2044 goto lookup_again; 2045 } 2046 2047 /* 2048 * Because we free lsegs when sending LAYOUTRETURN, we need to wait 2049 * for LAYOUTRETURN. 2050 */ 2051 if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) { 2052 spin_unlock(&ino->i_lock); 2053 dprintk("%s wait for layoutreturn\n", __func__); 2054 lseg = ERR_PTR(pnfs_prepare_to_retry_layoutget(lo)); 2055 if (!IS_ERR(lseg)) { 2056 pnfs_put_layout_hdr(lo); 2057 dprintk("%s retrying\n", __func__); 2058 trace_pnfs_update_layout(ino, pos, count, iomode, lo, 2059 lseg, 2060 PNFS_UPDATE_LAYOUT_RETRY); 2061 goto lookup_again; 2062 } 2063 trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, 2064 PNFS_UPDATE_LAYOUT_RETURN); 2065 goto out_put_layout_hdr; 2066 } 2067 2068 lseg = pnfs_find_lseg(lo, &arg, strict_iomode); 2069 if (lseg) { 2070 trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, 2071 PNFS_UPDATE_LAYOUT_FOUND_CACHED); 2072 goto out_unlock; 2073 } 2074 2075 /* 2076 * Choose a stateid for the LAYOUTGET. 
If we don't have a layout 2077 * stateid, or it has been invalidated, then we must use the open 2078 * stateid. 2079 */ 2080 if (test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) { 2081 int status; 2082 2083 /* 2084 * The first layoutget for the file. Need to serialize per 2085 * RFC 5661 Errata 3208. 2086 */ 2087 if (test_and_set_bit(NFS_LAYOUT_FIRST_LAYOUTGET, 2088 &lo->plh_flags)) { 2089 spin_unlock(&ino->i_lock); 2090 lseg = ERR_PTR(wait_on_bit(&lo->plh_flags, 2091 NFS_LAYOUT_FIRST_LAYOUTGET, 2092 TASK_KILLABLE)); 2093 if (IS_ERR(lseg)) 2094 goto out_put_layout_hdr; 2095 pnfs_put_layout_hdr(lo); 2096 dprintk("%s retrying\n", __func__); 2097 goto lookup_again; 2098 } 2099 2100 spin_unlock(&ino->i_lock); 2101 first = true; 2102 status = nfs4_select_rw_stateid(ctx->state, 2103 iomode == IOMODE_RW ? FMODE_WRITE : FMODE_READ, 2104 NULL, &stateid, NULL); 2105 if (status != 0) { 2106 lseg = ERR_PTR(status); 2107 trace_pnfs_update_layout(ino, pos, count, 2108 iomode, lo, lseg, 2109 PNFS_UPDATE_LAYOUT_INVALID_OPEN); 2110 nfs4_schedule_stateid_recovery(server, ctx->state); 2111 pnfs_clear_first_layoutget(lo); 2112 pnfs_put_layout_hdr(lo); 2113 goto lookup_again; 2114 } 2115 spin_lock(&ino->i_lock); 2116 } else { 2117 nfs4_stateid_copy(&stateid, &lo->plh_stateid); 2118 } 2119 2120 if (pnfs_layoutgets_blocked(lo)) { 2121 trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, 2122 PNFS_UPDATE_LAYOUT_BLOCKED); 2123 goto out_unlock; 2124 } 2125 nfs_layoutget_begin(lo); 2126 spin_unlock(&ino->i_lock); 2127 2128 _add_to_server_list(lo, server); 2129 2130 pg_offset = arg.offset & ~PAGE_MASK; 2131 if (pg_offset) { 2132 arg.offset -= pg_offset; 2133 arg.length += pg_offset; 2134 } 2135 if (arg.length != NFS4_MAX_UINT64) 2136 arg.length = PAGE_ALIGN(arg.length); 2137 2138 lgp = pnfs_alloc_init_layoutget_args(ino, ctx, &stateid, &arg, gfp_flags); 2139 if (!lgp) { 2140 lseg = ERR_PTR(-ENOMEM); 2141 trace_pnfs_update_layout(ino, pos, count, iomode, lo, NULL, 2142 PNFS_UPDATE_LAYOUT_NOMEM); 2143 nfs_layoutget_end(lo); 2144 goto out_put_layout_hdr; 2145 } 2146 2147 lseg = nfs4_proc_layoutget(lgp, &timeout); 2148 trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, 2149 PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET); 2150 nfs_layoutget_end(lo); 2151 if (IS_ERR(lseg)) { 2152 switch(PTR_ERR(lseg)) { 2153 case -EBUSY: 2154 if (time_after(jiffies, giveup)) 2155 lseg = NULL; 2156 break; 2157 case -ERECALLCONFLICT: 2158 case -EAGAIN: 2159 break; 2160 case -ENODATA: 2161 /* The server returned NFS4ERR_LAYOUTUNAVAILABLE */ 2162 pnfs_layout_set_fail_bit( 2163 lo, pnfs_iomode_to_fail_bit(iomode)); 2164 lseg = NULL; 2165 goto out_put_layout_hdr; 2166 default: 2167 if (!nfs_error_is_fatal(PTR_ERR(lseg))) { 2168 pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode)); 2169 lseg = NULL; 2170 } 2171 goto out_put_layout_hdr; 2172 } 2173 if (lseg) { 2174 if (first) 2175 pnfs_clear_first_layoutget(lo); 2176 trace_pnfs_update_layout(ino, pos, count, 2177 iomode, lo, lseg, PNFS_UPDATE_LAYOUT_RETRY); 2178 pnfs_put_layout_hdr(lo); 2179 goto lookup_again; 2180 } 2181 } else { 2182 pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode)); 2183 } 2184 2185out_put_layout_hdr: 2186 if (first) 2187 pnfs_clear_first_layoutget(lo); 2188 trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, 2189 PNFS_UPDATE_LAYOUT_EXIT); 2190 pnfs_put_layout_hdr(lo); 2191out: 2192 dprintk("%s: inode %s/%llu pNFS layout segment %s for " 2193 "(%s, offset: %llu, length: %llu)\n", 2194 __func__, ino->i_sb->s_id, 2195 (unsigned long 
long)NFS_FILEID(ino), 2196 IS_ERR_OR_NULL(lseg) ? "not found" : "found", 2197 iomode==IOMODE_RW ? "read/write" : "read-only", 2198 (unsigned long long)pos, 2199 (unsigned long long)count); 2200 return lseg; 2201out_unlock: 2202 spin_unlock(&ino->i_lock); 2203 goto out_put_layout_hdr; 2204} 2205EXPORT_SYMBOL_GPL(pnfs_update_layout); 2206 2207static bool 2208pnfs_sanity_check_layout_range(struct pnfs_layout_range *range) 2209{ 2210 switch (range->iomode) { 2211 case IOMODE_READ: 2212 case IOMODE_RW: 2213 break; 2214 default: 2215 return false; 2216 } 2217 if (range->offset == NFS4_MAX_UINT64) 2218 return false; 2219 if (range->length == 0) 2220 return false; 2221 if (range->length != NFS4_MAX_UINT64 && 2222 range->length > NFS4_MAX_UINT64 - range->offset) 2223 return false; 2224 return true; 2225} 2226 2227static struct pnfs_layout_hdr * 2228_pnfs_grab_empty_layout(struct inode *ino, struct nfs_open_context *ctx) 2229{ 2230 struct pnfs_layout_hdr *lo; 2231 2232 spin_lock(&ino->i_lock); 2233 lo = pnfs_find_alloc_layout(ino, ctx, GFP_KERNEL); 2234 if (!lo) 2235 goto out_unlock; 2236 if (!test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) 2237 goto out_unlock; 2238 if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) 2239 goto out_unlock; 2240 if (pnfs_layoutgets_blocked(lo)) 2241 goto out_unlock; 2242 if (test_and_set_bit(NFS_LAYOUT_FIRST_LAYOUTGET, &lo->plh_flags)) 2243 goto out_unlock; 2244 nfs_layoutget_begin(lo); 2245 spin_unlock(&ino->i_lock); 2246 _add_to_server_list(lo, NFS_SERVER(ino)); 2247 return lo; 2248 2249out_unlock: 2250 spin_unlock(&ino->i_lock); 2251 pnfs_put_layout_hdr(lo); 2252 return NULL; 2253} 2254 2255static void _lgopen_prepare_attached(struct nfs4_opendata *data, 2256 struct nfs_open_context *ctx) 2257{ 2258 struct inode *ino = data->dentry->d_inode; 2259 struct pnfs_layout_range rng = { 2260 .iomode = (data->o_arg.fmode & FMODE_WRITE) ? 2261 IOMODE_RW: IOMODE_READ, 2262 .offset = 0, 2263 .length = NFS4_MAX_UINT64, 2264 }; 2265 struct nfs4_layoutget *lgp; 2266 struct pnfs_layout_hdr *lo; 2267 2268 /* Heuristic: don't send layoutget if we have cached data */ 2269 if (rng.iomode == IOMODE_READ && 2270 (i_size_read(ino) == 0 || ino->i_mapping->nrpages != 0)) 2271 return; 2272 2273 lo = _pnfs_grab_empty_layout(ino, ctx); 2274 if (!lo) 2275 return; 2276 lgp = pnfs_alloc_init_layoutget_args(ino, ctx, &current_stateid, 2277 &rng, GFP_KERNEL); 2278 if (!lgp) { 2279 pnfs_clear_first_layoutget(lo); 2280 nfs_layoutget_end(lo); 2281 pnfs_put_layout_hdr(lo); 2282 return; 2283 } 2284 data->lgp = lgp; 2285 data->o_arg.lg_args = &lgp->args; 2286 data->o_res.lg_res = &lgp->res; 2287} 2288 2289static void _lgopen_prepare_floating(struct nfs4_opendata *data, 2290 struct nfs_open_context *ctx) 2291{ 2292 struct pnfs_layout_range rng = { 2293 .iomode = (data->o_arg.fmode & FMODE_WRITE) ?
IOMODE_RW: IOMODE_READ, 2295 .offset = 0, 2296 .length = NFS4_MAX_UINT64, 2297 }; 2298 struct nfs4_layoutget *lgp; 2299 2300 lgp = pnfs_alloc_init_layoutget_args(NULL, ctx, &current_stateid, 2301 &rng, GFP_KERNEL); 2302 if (!lgp) 2303 return; 2304 data->lgp = lgp; 2305 data->o_arg.lg_args = &lgp->args; 2306 data->o_res.lg_res = &lgp->res; 2307} 2308 2309void pnfs_lgopen_prepare(struct nfs4_opendata *data, 2310 struct nfs_open_context *ctx) 2311{ 2312 struct nfs_server *server = NFS_SERVER(data->dir->d_inode); 2313 2314 if (!(pnfs_enabled_sb(server) && 2315 server->pnfs_curr_ld->flags & PNFS_LAYOUTGET_ON_OPEN)) 2316 return; 2317 /* Could check on max_ops, but currently hardcoded high enough */ 2318 if (!nfs_server_capable(data->dir->d_inode, NFS_CAP_LGOPEN)) 2319 return; 2320 if (data->state) 2321 _lgopen_prepare_attached(data, ctx); 2322 else 2323 _lgopen_prepare_floating(data, ctx); 2324} 2325 2326void pnfs_parse_lgopen(struct inode *ino, struct nfs4_layoutget *lgp, 2327 struct nfs_open_context *ctx) 2328{ 2329 struct pnfs_layout_hdr *lo; 2330 struct pnfs_layout_segment *lseg; 2331 struct nfs_server *srv = NFS_SERVER(ino); 2332 u32 iomode; 2333 2334 if (!lgp) 2335 return; 2336 dprintk("%s: entered with status %i\n", __func__, lgp->res.status); 2337 if (lgp->res.status) { 2338 switch (lgp->res.status) { 2339 default: 2340 break; 2341 /* 2342 * Halt lgopen attempts if the server doesn't recognise 2343 * the "current stateid" value, the layout type, or the 2344 * layoutget operation as being valid. 2345 * Also if it complains about too many ops in the compound 2346 * or of the request/reply being too big. 2347 */ 2348 case -NFS4ERR_BAD_STATEID: 2349 case -NFS4ERR_NOTSUPP: 2350 case -NFS4ERR_REP_TOO_BIG: 2351 case -NFS4ERR_REP_TOO_BIG_TO_CACHE: 2352 case -NFS4ERR_REQ_TOO_BIG: 2353 case -NFS4ERR_TOO_MANY_OPS: 2354 case -NFS4ERR_UNKNOWN_LAYOUTTYPE: 2355 srv->caps &= ~NFS_CAP_LGOPEN; 2356 } 2357 return; 2358 } 2359 if (!lgp->args.inode) { 2360 lo = _pnfs_grab_empty_layout(ino, ctx); 2361 if (!lo) 2362 return; 2363 lgp->args.inode = ino; 2364 } else 2365 lo = NFS_I(lgp->args.inode)->layout; 2366 2367 lseg = pnfs_layout_process(lgp); 2368 if (!IS_ERR(lseg)) { 2369 iomode = lgp->args.range.iomode; 2370 pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode)); 2371 pnfs_put_lseg(lseg); 2372 } 2373} 2374 2375void nfs4_lgopen_release(struct nfs4_layoutget *lgp) 2376{ 2377 if (lgp != NULL) { 2378 struct inode *inode = lgp->args.inode; 2379 if (inode) { 2380 struct pnfs_layout_hdr *lo = NFS_I(inode)->layout; 2381 pnfs_clear_first_layoutget(lo); 2382 nfs_layoutget_end(lo); 2383 } 2384 pnfs_layoutget_free(lgp); 2385 } 2386} 2387 2388struct pnfs_layout_segment * 2389pnfs_layout_process(struct nfs4_layoutget *lgp) 2390{ 2391 struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout; 2392 struct nfs4_layoutget_res *res = &lgp->res; 2393 struct pnfs_layout_segment *lseg; 2394 struct inode *ino = lo->plh_inode; 2395 LIST_HEAD(free_me); 2396 2397 if (!pnfs_sanity_check_layout_range(&res->range)) 2398 return ERR_PTR(-EINVAL); 2399 2400 /* Inject layout blob into I/O device driver */ 2401 lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res, lgp->gfp_flags); 2402 if (IS_ERR_OR_NULL(lseg)) { 2403 if (!lseg) 2404 lseg = ERR_PTR(-ENOMEM); 2405 2406 dprintk("%s: Could not allocate layout: error %ld\n", 2407 __func__, PTR_ERR(lseg)); 2408 return lseg; 2409 } 2410 2411 pnfs_init_lseg(lo, lseg, &res->range, &res->stateid); 2412 2413 spin_lock(&ino->i_lock); 2414 if (pnfs_layoutgets_blocked(lo)) { 2415
dprintk("%s forget reply due to state\n", __func__); 2416 goto out_forget; 2417 } 2418 2419 if (test_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags) && 2420 !pnfs_is_first_layoutget(lo)) 2421 goto out_forget; 2422 2423 if (nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) { 2424 /* existing state ID, make sure the sequence number matches. */ 2425 if (pnfs_layout_stateid_blocked(lo, &res->stateid)) { 2426 if (!pnfs_layout_is_valid(lo)) 2427 lo->plh_barrier = 0; 2428 dprintk("%s forget reply due to sequence\n", __func__); 2429 goto out_forget; 2430 } 2431 pnfs_set_layout_stateid(lo, &res->stateid, lgp->cred, false); 2432 } else if (pnfs_layout_is_valid(lo)) { 2433 /* 2434 * We got an entirely new state ID. Mark all segments for the 2435 * inode invalid, and retry the layoutget 2436 */ 2437 struct pnfs_layout_range range = { 2438 .iomode = IOMODE_ANY, 2439 .length = NFS4_MAX_UINT64, 2440 }; 2441 pnfs_set_plh_return_info(lo, IOMODE_ANY, 0); 2442 pnfs_mark_matching_lsegs_return(lo, &lo->plh_return_segs, 2443 &range, 0); 2444 goto out_forget; 2445 } else { 2446 /* We have a completely new layout */ 2447 pnfs_set_layout_stateid(lo, &res->stateid, lgp->cred, true); 2448 } 2449 2450 pnfs_get_lseg(lseg); 2451 pnfs_layout_insert_lseg(lo, lseg, &free_me); 2452 2453 2454 if (res->return_on_close) 2455 set_bit(NFS_LSEG_ROC, &lseg->pls_flags); 2456 2457 spin_unlock(&ino->i_lock); 2458 pnfs_free_lseg_list(&free_me); 2459 return lseg; 2460 2461out_forget: 2462 spin_unlock(&ino->i_lock); 2463 lseg->pls_layout = lo; 2464 NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg); 2465 pnfs_free_lseg_list(&free_me); 2466 return ERR_PTR(-EAGAIN); 2467} 2468 2469/** 2470 * pnfs_mark_matching_lsegs_return - Free or return matching layout segments 2471 * @lo: pointer to layout header 2472 * @tmp_list: list header to be used with pnfs_free_lseg_list() 2473 * @return_range: describe layout segment ranges to be returned 2474 * @seq: stateid seqid to match 2475 * 2476 * This function is mainly intended for use by layoutrecall. It attempts 2477 * to free the layout segment immediately, or else to mark it for return 2478 * as soon as its reference count drops to zero. 2479 * 2480 * Returns 2481 * - 0: a layoutreturn needs to be scheduled. 2482 * - EBUSY: there are layout segment that are still in use. 2483 * - ENOENT: there are no layout segments that need to be returned. 
2484 */ 2485int 2486pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, 2487 struct list_head *tmp_list, 2488 const struct pnfs_layout_range *return_range, 2489 u32 seq) 2490{ 2491 struct pnfs_layout_segment *lseg, *next; 2492 int remaining = 0; 2493 2494 dprintk("%s:Begin lo %p\n", __func__, lo); 2495 2496 assert_spin_locked(&lo->plh_inode->i_lock); 2497 2498 if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) 2499 tmp_list = &lo->plh_return_segs; 2500 2501 list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) 2502 if (pnfs_match_lseg_recall(lseg, return_range, seq)) { 2503 dprintk("%s: marking lseg %p iomode %d " 2504 "offset %llu length %llu\n", __func__, 2505 lseg, lseg->pls_range.iomode, 2506 lseg->pls_range.offset, 2507 lseg->pls_range.length); 2508 if (test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags)) 2509 tmp_list = &lo->plh_return_segs; 2510 if (mark_lseg_invalid(lseg, tmp_list)) 2511 continue; 2512 remaining++; 2513 set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags); 2514 } 2515 2516 if (remaining) { 2517 pnfs_set_plh_return_info(lo, return_range->iomode, seq); 2518 return -EBUSY; 2519 } 2520 2521 if (!list_empty(&lo->plh_return_segs)) { 2522 pnfs_set_plh_return_info(lo, return_range->iomode, seq); 2523 return 0; 2524 } 2525 2526 return -ENOENT; 2527} 2528 2529static void 2530pnfs_mark_layout_for_return(struct inode *inode, 2531 const struct pnfs_layout_range *range) 2532{ 2533 struct pnfs_layout_hdr *lo; 2534 bool return_now = false; 2535 2536 spin_lock(&inode->i_lock); 2537 lo = NFS_I(inode)->layout; 2538 if (!pnfs_layout_is_valid(lo)) { 2539 spin_unlock(&inode->i_lock); 2540 return; 2541 } 2542 pnfs_set_plh_return_info(lo, range->iomode, 0); 2543 /* 2544 * mark all matching lsegs so that we are sure to have no live 2545 * segments at hand when sending layoutreturn. See pnfs_put_lseg() 2546 * for how it works. 
2547 */ 2548 if (pnfs_mark_matching_lsegs_return(lo, &lo->plh_return_segs, range, 0) != -EBUSY) { 2549 const struct cred *cred; 2550 nfs4_stateid stateid; 2551 enum pnfs_iomode iomode; 2552 2553 return_now = pnfs_prepare_layoutreturn(lo, &stateid, &cred, &iomode); 2554 spin_unlock(&inode->i_lock); 2555 if (return_now) 2556 pnfs_send_layoutreturn(lo, &stateid, &cred, iomode, false); 2557 } else { 2558 spin_unlock(&inode->i_lock); 2559 nfs_commit_inode(inode, 0); 2560 } 2561} 2562 2563void pnfs_error_mark_layout_for_return(struct inode *inode, 2564 struct pnfs_layout_segment *lseg) 2565{ 2566 struct pnfs_layout_range range = { 2567 .iomode = lseg->pls_range.iomode, 2568 .offset = 0, 2569 .length = NFS4_MAX_UINT64, 2570 }; 2571 2572 pnfs_mark_layout_for_return(inode, &range); 2573} 2574EXPORT_SYMBOL_GPL(pnfs_error_mark_layout_for_return); 2575 2576static bool 2577pnfs_layout_can_be_returned(struct pnfs_layout_hdr *lo) 2578{ 2579 return pnfs_layout_is_valid(lo) && 2580 !test_bit(NFS_LAYOUT_INODE_FREEING, &lo->plh_flags) && 2581 !test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags); 2582} 2583 2584static struct pnfs_layout_segment * 2585pnfs_find_first_lseg(struct pnfs_layout_hdr *lo, 2586 const struct pnfs_layout_range *range, 2587 enum pnfs_iomode iomode) 2588{ 2589 struct pnfs_layout_segment *lseg; 2590 2591 list_for_each_entry(lseg, &lo->plh_segs, pls_list) { 2592 if (!test_bit(NFS_LSEG_VALID, &lseg->pls_flags)) 2593 continue; 2594 if (test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags)) 2595 continue; 2596 if (lseg->pls_range.iomode != iomode && iomode != IOMODE_ANY) 2597 continue; 2598 if (pnfs_lseg_range_intersecting(&lseg->pls_range, range)) 2599 return lseg; 2600 } 2601 return NULL; 2602} 2603 2604/* Find open file states whose mode matches that of the range */ 2605static bool 2606pnfs_should_return_unused_layout(struct pnfs_layout_hdr *lo, 2607 const struct pnfs_layout_range *range) 2608{ 2609 struct list_head *head; 2610 struct nfs_open_context *ctx; 2611 fmode_t mode = 0; 2612 2613 if (!pnfs_layout_can_be_returned(lo) || 2614 !pnfs_find_first_lseg(lo, range, range->iomode)) 2615 return false; 2616 2617 head = &NFS_I(lo->plh_inode)->open_files; 2618 list_for_each_entry_rcu(ctx, head, list) { 2619 if (ctx->state) 2620 mode |= ctx->state->state & (FMODE_READ|FMODE_WRITE); 2621 } 2622 2623 switch (range->iomode) { 2624 default: 2625 break; 2626 case IOMODE_READ: 2627 mode &= ~FMODE_WRITE; 2628 break; 2629 case IOMODE_RW: 2630 if (pnfs_find_first_lseg(lo, range, IOMODE_READ)) 2631 mode &= ~FMODE_READ; 2632 } 2633 return mode == 0; 2634} 2635 2636static int pnfs_layout_return_unused_byserver(struct nfs_server *server, 2637 void *data) 2638{ 2639 const struct pnfs_layout_range *range = data; 2640 const struct cred *cred; 2641 struct pnfs_layout_hdr *lo; 2642 struct inode *inode; 2643 nfs4_stateid stateid; 2644 enum pnfs_iomode iomode; 2645 2646restart: 2647 rcu_read_lock(); 2648 list_for_each_entry_rcu(lo, &server->layouts, plh_layouts) { 2649 inode = lo->plh_inode; 2650 if (!inode || !pnfs_layout_can_be_returned(lo) || 2651 test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) 2652 continue; 2653 spin_lock(&inode->i_lock); 2654 if (!lo->plh_inode || 2655 !pnfs_should_return_unused_layout(lo, range)) { 2656 spin_unlock(&inode->i_lock); 2657 continue; 2658 } 2659 pnfs_get_layout_hdr(lo); 2660 pnfs_set_plh_return_info(lo, range->iomode, 0); 2661 if (pnfs_mark_matching_lsegs_return(lo, &lo->plh_return_segs, 2662 range, 0) != 0 || 2663 !pnfs_prepare_layoutreturn(lo, &stateid, &cred, &iomode)) { 2664 
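/* Could not mark anything for return, or a layoutreturn could not be prepared: drop the locks and the layout reference, then rescan the list. */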
spin_unlock(&inode->i_lock); 2665 rcu_read_unlock(); 2666 pnfs_put_layout_hdr(lo); 2667 cond_resched(); 2668 goto restart; 2669 } 2670 spin_unlock(&inode->i_lock); 2671 rcu_read_unlock(); 2672 pnfs_send_layoutreturn(lo, &stateid, &cred, iomode, false); 2673 pnfs_put_layout_hdr(lo); 2674 cond_resched(); 2675 goto restart; 2676 } 2677 rcu_read_unlock(); 2678 return 0; 2679} 2680 2681void 2682pnfs_layout_return_unused_byclid(struct nfs_client *clp, 2683 enum pnfs_iomode iomode) 2684{ 2685 struct pnfs_layout_range range = { 2686 .iomode = iomode, 2687 .offset = 0, 2688 .length = NFS4_MAX_UINT64, 2689 }; 2690 2691 nfs_client_for_each_server(clp, pnfs_layout_return_unused_byserver, 2692 &range); 2693} 2694 2695void 2696pnfs_generic_pg_check_layout(struct nfs_pageio_descriptor *pgio) 2697{ 2698 if (pgio->pg_lseg == NULL || 2699 test_bit(NFS_LSEG_VALID, &pgio->pg_lseg->pls_flags)) 2700 return; 2701 pnfs_put_lseg(pgio->pg_lseg); 2702 pgio->pg_lseg = NULL; 2703} 2704EXPORT_SYMBOL_GPL(pnfs_generic_pg_check_layout); 2705 2706/* 2707 * Check for any intersection between the request and the pgio->pg_lseg, 2708 * and if none, put this pgio->pg_lseg away. 2709 */ 2710void 2711pnfs_generic_pg_check_range(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) 2712{ 2713 if (pgio->pg_lseg && !pnfs_lseg_request_intersecting(pgio->pg_lseg, req)) { 2714 pnfs_put_lseg(pgio->pg_lseg); 2715 pgio->pg_lseg = NULL; 2716 } 2717} 2718EXPORT_SYMBOL_GPL(pnfs_generic_pg_check_range); 2719 2720void 2721pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) 2722{ 2723 u64 rd_size = req->wb_bytes; 2724 2725 pnfs_generic_pg_check_layout(pgio); 2726 pnfs_generic_pg_check_range(pgio, req); 2727 if (pgio->pg_lseg == NULL) { 2728 if (pgio->pg_dreq == NULL) 2729 rd_size = i_size_read(pgio->pg_inode) - req_offset(req); 2730 else 2731 rd_size = nfs_dreq_bytes_left(pgio->pg_dreq); 2732 2733 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, 2734 nfs_req_openctx(req), 2735 req_offset(req), 2736 rd_size, 2737 IOMODE_READ, 2738 false, 2739 GFP_KERNEL); 2740 if (IS_ERR(pgio->pg_lseg)) { 2741 pgio->pg_error = PTR_ERR(pgio->pg_lseg); 2742 pgio->pg_lseg = NULL; 2743 return; 2744 } 2745 } 2746 /* If no lseg, fall back to read through mds */ 2747 if (pgio->pg_lseg == NULL) 2748 nfs_pageio_reset_read_mds(pgio); 2749 2750} 2751EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_read); 2752 2753void 2754pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, 2755 struct nfs_page *req, u64 wb_size) 2756{ 2757 pnfs_generic_pg_check_layout(pgio); 2758 pnfs_generic_pg_check_range(pgio, req); 2759 if (pgio->pg_lseg == NULL) { 2760 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, 2761 nfs_req_openctx(req), 2762 req_offset(req), 2763 wb_size, 2764 IOMODE_RW, 2765 false, 2766 GFP_KERNEL); 2767 if (IS_ERR(pgio->pg_lseg)) { 2768 pgio->pg_error = PTR_ERR(pgio->pg_lseg); 2769 pgio->pg_lseg = NULL; 2770 return; 2771 } 2772 } 2773 /* If no lseg, fall back to write through mds */ 2774 if (pgio->pg_lseg == NULL) 2775 nfs_pageio_reset_write_mds(pgio); 2776} 2777EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_write); 2778 2779void 2780pnfs_generic_pg_cleanup(struct nfs_pageio_descriptor *desc) 2781{ 2782 if (desc->pg_lseg) { 2783 pnfs_put_lseg(desc->pg_lseg); 2784 desc->pg_lseg = NULL; 2785 } 2786} 2787EXPORT_SYMBOL_GPL(pnfs_generic_pg_cleanup); 2788 2789/* 2790 * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number 2791 * of bytes (maximum @req->wb_bytes) that can be coalesced. 
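 * The result is further clamped to the number of bytes remaining in the
 * layout segment attached to @pgio, if there is one.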
2792 */ 2793size_t 2794pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, 2795 struct nfs_page *prev, struct nfs_page *req) 2796{ 2797 unsigned int size; 2798 u64 seg_end, req_start, seg_left; 2799 2800 size = nfs_generic_pg_test(pgio, prev, req); 2801 if (!size) 2802 return 0; 2803 2804 /* 2805 * 'size' contains the number of bytes left in the current page (up 2806 * to the original size asked for in @req->wb_bytes). 2807 * 2808 * Calculate how many bytes are left in the layout segment 2809 * and if there are less bytes than 'size', return that instead. 2810 * 2811 * Please also note that 'end_offset' is actually the offset of the 2812 * first byte that lies outside the pnfs_layout_range. FIXME? 2813 * 2814 */ 2815 if (pgio->pg_lseg) { 2816 seg_end = pnfs_end_offset(pgio->pg_lseg->pls_range.offset, 2817 pgio->pg_lseg->pls_range.length); 2818 req_start = req_offset(req); 2819 2820 /* start of request is past the last byte of this segment */ 2821 if (req_start >= seg_end) 2822 return 0; 2823 2824 /* adjust 'size' iff there are fewer bytes left in the 2825 * segment than what nfs_generic_pg_test returned */ 2826 seg_left = seg_end - req_start; 2827 if (seg_left < size) 2828 size = (unsigned int)seg_left; 2829 } 2830 2831 return size; 2832} 2833EXPORT_SYMBOL_GPL(pnfs_generic_pg_test); 2834 2835int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *hdr) 2836{ 2837 struct nfs_pageio_descriptor pgio; 2838 2839 /* Resend all requests through the MDS */ 2840 nfs_pageio_init_write(&pgio, hdr->inode, FLUSH_STABLE, true, 2841 hdr->completion_ops); 2842 set_bit(NFS_CONTEXT_RESEND_WRITES, &hdr->args.context->flags); 2843 return nfs_pageio_resend(&pgio, hdr); 2844} 2845EXPORT_SYMBOL_GPL(pnfs_write_done_resend_to_mds); 2846 2847static void pnfs_ld_handle_write_error(struct nfs_pgio_header *hdr) 2848{ 2849 2850 dprintk("pnfs write error = %d\n", hdr->pnfs_error); 2851 if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags & 2852 PNFS_LAYOUTRET_ON_ERROR) { 2853 pnfs_return_layout(hdr->inode); 2854 } 2855 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) 2856 hdr->task.tk_status = pnfs_write_done_resend_to_mds(hdr); 2857} 2858 2859/* 2860 * Called by non rpc-based layout drivers 2861 */ 2862void pnfs_ld_write_done(struct nfs_pgio_header *hdr) 2863{ 2864 if (likely(!hdr->pnfs_error)) { 2865 pnfs_set_layoutcommit(hdr->inode, hdr->lseg, 2866 hdr->mds_offset + hdr->res.count); 2867 hdr->mds_ops->rpc_call_done(&hdr->task, hdr); 2868 } 2869 trace_nfs4_pnfs_write(hdr, hdr->pnfs_error); 2870 if (unlikely(hdr->pnfs_error)) 2871 pnfs_ld_handle_write_error(hdr); 2872 hdr->mds_ops->rpc_release(hdr); 2873} 2874EXPORT_SYMBOL_GPL(pnfs_ld_write_done); 2875 2876static void 2877pnfs_write_through_mds(struct nfs_pageio_descriptor *desc, 2878 struct nfs_pgio_header *hdr) 2879{ 2880 struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc); 2881 2882 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { 2883 list_splice_tail_init(&hdr->pages, &mirror->pg_list); 2884 nfs_pageio_reset_write_mds(desc); 2885 mirror->pg_recoalesce = 1; 2886 } 2887 hdr->completion_ops->completion(hdr); 2888} 2889 2890static enum pnfs_try_status 2891pnfs_try_to_write_data(struct nfs_pgio_header *hdr, 2892 const struct rpc_call_ops *call_ops, 2893 struct pnfs_layout_segment *lseg, 2894 int how) 2895{ 2896 struct inode *inode = hdr->inode; 2897 enum pnfs_try_status trypnfs; 2898 struct nfs_server *nfss = NFS_SERVER(inode); 2899 2900 hdr->mds_ops = call_ops; 2901 2902 dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__, 2903 inode->i_ino, 
hdr->args.count, hdr->args.offset, how); 2904 trypnfs = nfss->pnfs_curr_ld->write_pagelist(hdr, how); 2905 if (trypnfs != PNFS_NOT_ATTEMPTED) 2906 nfs_inc_stats(inode, NFSIOS_PNFS_WRITE); 2907 dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs); 2908 return trypnfs; 2909} 2910 2911static void 2912pnfs_do_write(struct nfs_pageio_descriptor *desc, 2913 struct nfs_pgio_header *hdr, int how) 2914{ 2915 const struct rpc_call_ops *call_ops = desc->pg_rpc_callops; 2916 struct pnfs_layout_segment *lseg = desc->pg_lseg; 2917 enum pnfs_try_status trypnfs; 2918 2919 trypnfs = pnfs_try_to_write_data(hdr, call_ops, lseg, how); 2920 switch (trypnfs) { 2921 case PNFS_NOT_ATTEMPTED: 2922 pnfs_write_through_mds(desc, hdr); 2923 case PNFS_ATTEMPTED: 2924 break; 2925 case PNFS_TRY_AGAIN: 2926 /* cleanup hdr and prepare to redo pnfs */ 2927 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { 2928 struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc); 2929 list_splice_init(&hdr->pages, &mirror->pg_list); 2930 mirror->pg_recoalesce = 1; 2931 } 2932 hdr->mds_ops->rpc_release(hdr); 2933 } 2934} 2935 2936static void pnfs_writehdr_free(struct nfs_pgio_header *hdr) 2937{ 2938 pnfs_put_lseg(hdr->lseg); 2939 nfs_pgio_header_free(hdr); 2940} 2941 2942int 2943pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc) 2944{ 2945 struct nfs_pgio_header *hdr; 2946 int ret; 2947 2948 hdr = nfs_pgio_header_alloc(desc->pg_rw_ops); 2949 if (!hdr) { 2950 desc->pg_error = -ENOMEM; 2951 return desc->pg_error; 2952 } 2953 nfs_pgheader_init(desc, hdr, pnfs_writehdr_free); 2954 2955 hdr->lseg = pnfs_get_lseg(desc->pg_lseg); 2956 ret = nfs_generic_pgio(desc, hdr); 2957 if (!ret) 2958 pnfs_do_write(desc, hdr, desc->pg_ioflags); 2959 2960 return ret; 2961} 2962EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages); 2963 2964int pnfs_read_done_resend_to_mds(struct nfs_pgio_header *hdr) 2965{ 2966 struct nfs_pageio_descriptor pgio; 2967 2968 /* Resend all requests through the MDS */ 2969 nfs_pageio_init_read(&pgio, hdr->inode, true, hdr->completion_ops); 2970 return nfs_pageio_resend(&pgio, hdr); 2971} 2972EXPORT_SYMBOL_GPL(pnfs_read_done_resend_to_mds); 2973 2974static void pnfs_ld_handle_read_error(struct nfs_pgio_header *hdr) 2975{ 2976 dprintk("pnfs read error = %d\n", hdr->pnfs_error); 2977 if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags & 2978 PNFS_LAYOUTRET_ON_ERROR) { 2979 pnfs_return_layout(hdr->inode); 2980 } 2981 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) 2982 hdr->task.tk_status = pnfs_read_done_resend_to_mds(hdr); 2983} 2984 2985/* 2986 * Called by non rpc-based layout drivers 2987 */ 2988void pnfs_ld_read_done(struct nfs_pgio_header *hdr) 2989{ 2990 if (likely(!hdr->pnfs_error)) 2991 hdr->mds_ops->rpc_call_done(&hdr->task, hdr); 2992 trace_nfs4_pnfs_read(hdr, hdr->pnfs_error); 2993 if (unlikely(hdr->pnfs_error)) 2994 pnfs_ld_handle_read_error(hdr); 2995 hdr->mds_ops->rpc_release(hdr); 2996} 2997EXPORT_SYMBOL_GPL(pnfs_ld_read_done); 2998 2999static void 3000pnfs_read_through_mds(struct nfs_pageio_descriptor *desc, 3001 struct nfs_pgio_header *hdr) 3002{ 3003 struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc); 3004 3005 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { 3006 list_splice_tail_init(&hdr->pages, &mirror->pg_list); 3007 nfs_pageio_reset_read_mds(desc); 3008 mirror->pg_recoalesce = 1; 3009 } 3010 hdr->completion_ops->completion(hdr); 3011} 3012 3013/* 3014 * Call the appropriate parallel I/O subsystem read function. 
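 * A return value of PNFS_NOT_ATTEMPTED tells the caller to fall back to
 * reading through the MDS.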
3015 */ 3016static enum pnfs_try_status 3017pnfs_try_to_read_data(struct nfs_pgio_header *hdr, 3018 const struct rpc_call_ops *call_ops, 3019 struct pnfs_layout_segment *lseg) 3020{ 3021 struct inode *inode = hdr->inode; 3022 struct nfs_server *nfss = NFS_SERVER(inode); 3023 enum pnfs_try_status trypnfs; 3024 3025 hdr->mds_ops = call_ops; 3026 3027 dprintk("%s: Reading ino:%lu %u@%llu\n", 3028 __func__, inode->i_ino, hdr->args.count, hdr->args.offset); 3029 3030 trypnfs = nfss->pnfs_curr_ld->read_pagelist(hdr); 3031 if (trypnfs != PNFS_NOT_ATTEMPTED) 3032 nfs_inc_stats(inode, NFSIOS_PNFS_READ); 3033 dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs); 3034 return trypnfs; 3035} 3036 3037/* Resend all requests through pnfs. */ 3038void pnfs_read_resend_pnfs(struct nfs_pgio_header *hdr, 3039 unsigned int mirror_idx) 3040{ 3041 struct nfs_pageio_descriptor pgio; 3042 3043 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { 3044 /* Prevent deadlocks with layoutreturn! */ 3045 pnfs_put_lseg(hdr->lseg); 3046 hdr->lseg = NULL; 3047 3048 nfs_pageio_init_read(&pgio, hdr->inode, false, 3049 hdr->completion_ops); 3050 pgio.pg_mirror_idx = mirror_idx; 3051 hdr->task.tk_status = nfs_pageio_resend(&pgio, hdr); 3052 } 3053} 3054EXPORT_SYMBOL_GPL(pnfs_read_resend_pnfs); 3055 3056static void 3057pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr) 3058{ 3059 const struct rpc_call_ops *call_ops = desc->pg_rpc_callops; 3060 struct pnfs_layout_segment *lseg = desc->pg_lseg; 3061 enum pnfs_try_status trypnfs; 3062 3063 trypnfs = pnfs_try_to_read_data(hdr, call_ops, lseg); 3064 switch (trypnfs) { 3065 case PNFS_NOT_ATTEMPTED: 3066 pnfs_read_through_mds(desc, hdr); 3067 case PNFS_ATTEMPTED: 3068 break; 3069 case PNFS_TRY_AGAIN: 3070 /* cleanup hdr and prepare to redo pnfs */ 3071 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { 3072 struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc); 3073 list_splice_init(&hdr->pages, &mirror->pg_list); 3074 mirror->pg_recoalesce = 1; 3075 } 3076 hdr->mds_ops->rpc_release(hdr); 3077 } 3078} 3079 3080static void pnfs_readhdr_free(struct nfs_pgio_header *hdr) 3081{ 3082 pnfs_put_lseg(hdr->lseg); 3083 nfs_pgio_header_free(hdr); 3084} 3085 3086int 3087pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc) 3088{ 3089 struct nfs_pgio_header *hdr; 3090 int ret; 3091 3092 hdr = nfs_pgio_header_alloc(desc->pg_rw_ops); 3093 if (!hdr) { 3094 desc->pg_error = -ENOMEM; 3095 return desc->pg_error; 3096 } 3097 nfs_pgheader_init(desc, hdr, pnfs_readhdr_free); 3098 hdr->lseg = pnfs_get_lseg(desc->pg_lseg); 3099 ret = nfs_generic_pgio(desc, hdr); 3100 if (!ret) 3101 pnfs_do_read(desc, hdr); 3102 return ret; 3103} 3104EXPORT_SYMBOL_GPL(pnfs_generic_pg_readpages); 3105 3106static void pnfs_clear_layoutcommitting(struct inode *inode) 3107{ 3108 unsigned long *bitlock = &NFS_I(inode)->flags; 3109 3110 clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock); 3111 smp_mb__after_atomic(); 3112 wake_up_bit(bitlock, NFS_INO_LAYOUTCOMMITTING); 3113} 3114 3115/* 3116 * There can be multiple RW segments. 
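 * Collect every RW segment with NFS_LSEG_LAYOUTCOMMIT set so that the
 * LAYOUTCOMMIT covers all ranges written through pNFS.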
3117 */ 3118static void pnfs_list_write_lseg(struct inode *inode, struct list_head *listp) 3119{ 3120 struct pnfs_layout_segment *lseg; 3121 3122 list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list) { 3123 if (lseg->pls_range.iomode == IOMODE_RW && 3124 test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags)) 3125 list_add(&lseg->pls_lc_list, listp); 3126 } 3127} 3128 3129static void pnfs_list_write_lseg_done(struct inode *inode, struct list_head *listp) 3130{ 3131 struct pnfs_layout_segment *lseg, *tmp; 3132 3133 /* Matched by references in pnfs_set_layoutcommit */ 3134 list_for_each_entry_safe(lseg, tmp, listp, pls_lc_list) { 3135 list_del_init(&lseg->pls_lc_list); 3136 pnfs_put_lseg(lseg); 3137 } 3138 3139 pnfs_clear_layoutcommitting(inode); 3140} 3141 3142void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg) 3143{ 3144 pnfs_layout_io_set_failed(lseg->pls_layout, lseg->pls_range.iomode); 3145} 3146EXPORT_SYMBOL_GPL(pnfs_set_lo_fail); 3147 3148void 3149pnfs_set_layoutcommit(struct inode *inode, struct pnfs_layout_segment *lseg, 3150 loff_t end_pos) 3151{ 3152 struct nfs_inode *nfsi = NFS_I(inode); 3153 bool mark_as_dirty = false; 3154 3155 spin_lock(&inode->i_lock); 3156 if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) { 3157 nfsi->layout->plh_lwb = end_pos; 3158 mark_as_dirty = true; 3159 dprintk("%s: Set layoutcommit for inode %lu ", 3160 __func__, inode->i_ino); 3161 } else if (end_pos > nfsi->layout->plh_lwb) 3162 nfsi->layout->plh_lwb = end_pos; 3163 if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags)) { 3164 /* references matched in nfs4_layoutcommit_release */ 3165 pnfs_get_lseg(lseg); 3166 } 3167 spin_unlock(&inode->i_lock); 3168 dprintk("%s: lseg %p end_pos %llu\n", 3169 __func__, lseg, nfsi->layout->plh_lwb); 3170 3171 /* if pnfs_layoutcommit_inode() runs between inode locks, the next one 3172 * will be a noop because NFS_INO_LAYOUTCOMMIT will not be set */ 3173 if (mark_as_dirty) 3174 mark_inode_dirty_sync(inode); 3175} 3176EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit); 3177 3178void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data) 3179{ 3180 struct nfs_server *nfss = NFS_SERVER(data->args.inode); 3181 3182 if (nfss->pnfs_curr_ld->cleanup_layoutcommit) 3183 nfss->pnfs_curr_ld->cleanup_layoutcommit(data); 3184 pnfs_list_write_lseg_done(data->args.inode, &data->lseg_list); 3185} 3186 3187/* 3188 * For the LAYOUT4_NFSV4_1_FILES layout type, NFS_DATA_SYNC WRITEs and 3189 * NFS_UNSTABLE WRITEs with a COMMIT to data servers must store enough 3190 * data to disk to allow the server to recover the data if it crashes. 3191 * LAYOUTCOMMIT is only needed when the NFL4_UFLG_COMMIT_THRU_MDS flag 3192 * is off, and a COMMIT is sent to a data server, or 3193 * if WRITEs to a data server return NFS_DATA_SYNC. 
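 * The call carries the highest byte written through pNFS (lastbytewritten)
 * so that the MDS can update the file's size and change attribute.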
3194 */ 3195int 3196pnfs_layoutcommit_inode(struct inode *inode, bool sync) 3197{ 3198 struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; 3199 struct nfs4_layoutcommit_data *data; 3200 struct nfs_inode *nfsi = NFS_I(inode); 3201 loff_t end_pos; 3202 int status; 3203 3204 if (!pnfs_layoutcommit_outstanding(inode)) 3205 return 0; 3206 3207 dprintk("--> %s inode %lu\n", __func__, inode->i_ino); 3208 3209 status = -EAGAIN; 3210 if (test_and_set_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags)) { 3211 if (!sync) 3212 goto out; 3213 status = wait_on_bit_lock_action(&nfsi->flags, 3214 NFS_INO_LAYOUTCOMMITTING, 3215 nfs_wait_bit_killable, 3216 TASK_KILLABLE); 3217 if (status) 3218 goto out; 3219 } 3220 3221 status = -ENOMEM; 3222 /* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */ 3223 data = kzalloc(sizeof(*data), GFP_NOFS); 3224 if (!data) 3225 goto clear_layoutcommitting; 3226 3227 status = 0; 3228 spin_lock(&inode->i_lock); 3229 if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) 3230 goto out_unlock; 3231 3232 INIT_LIST_HEAD(&data->lseg_list); 3233 pnfs_list_write_lseg(inode, &data->lseg_list); 3234 3235 end_pos = nfsi->layout->plh_lwb; 3236 3237 nfs4_stateid_copy(&data->args.stateid, &nfsi->layout->plh_stateid); 3238 data->cred = get_cred(nfsi->layout->plh_lc_cred); 3239 spin_unlock(&inode->i_lock); 3240 3241 data->args.inode = inode; 3242 nfs_fattr_init(&data->fattr); 3243 data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask; 3244 data->res.fattr = &data->fattr; 3245 if (end_pos != 0) 3246 data->args.lastbytewritten = end_pos - 1; 3247 else 3248 data->args.lastbytewritten = U64_MAX; 3249 data->res.server = NFS_SERVER(inode); 3250 3251 if (ld->prepare_layoutcommit) { 3252 status = ld->prepare_layoutcommit(&data->args); 3253 if (status) { 3254 put_cred(data->cred); 3255 spin_lock(&inode->i_lock); 3256 set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags); 3257 if (end_pos > nfsi->layout->plh_lwb) 3258 nfsi->layout->plh_lwb = end_pos; 3259 goto out_unlock; 3260 } 3261 } 3262 3263 3264 status = nfs4_proc_layoutcommit(data, sync); 3265out: 3266 if (status) 3267 mark_inode_dirty_sync(inode); 3268 dprintk("<-- %s status %d\n", __func__, status); 3269 return status; 3270out_unlock: 3271 spin_unlock(&inode->i_lock); 3272 kfree(data); 3273clear_layoutcommitting: 3274 pnfs_clear_layoutcommitting(inode); 3275 goto out; 3276} 3277EXPORT_SYMBOL_GPL(pnfs_layoutcommit_inode); 3278 3279int 3280pnfs_generic_sync(struct inode *inode, bool datasync) 3281{ 3282 return pnfs_layoutcommit_inode(inode, true); 3283} 3284EXPORT_SYMBOL_GPL(pnfs_generic_sync); 3285 3286struct nfs4_threshold *pnfs_mdsthreshold_alloc(void) 3287{ 3288 struct nfs4_threshold *thp; 3289 3290 thp = kzalloc(sizeof(*thp), GFP_NOFS); 3291 if (!thp) { 3292 dprintk("%s mdsthreshold allocation failed\n", __func__); 3293 return NULL; 3294 } 3295 return thp; 3296} 3297 3298#if IS_ENABLED(CONFIG_NFS_V4_2) 3299int 3300pnfs_report_layoutstat(struct inode *inode, gfp_t gfp_flags) 3301{ 3302 struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; 3303 struct nfs_server *server = NFS_SERVER(inode); 3304 struct nfs_inode *nfsi = NFS_I(inode); 3305 struct nfs42_layoutstat_data *data; 3306 struct pnfs_layout_hdr *hdr; 3307 int status = 0; 3308 3309 if (!pnfs_enabled_sb(server) || !ld->prepare_layoutstats) 3310 goto out; 3311 3312 if (!nfs_server_capable(inode, NFS_CAP_LAYOUTSTATS)) 3313 goto out; 3314 3315 if (test_and_set_bit(NFS_INO_LAYOUTSTATS, &nfsi->flags)) 3316 goto out; 3317 3318 spin_lock(&inode->i_lock); 
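/* Pin the layout header so it cannot be freed while the layoutstats call is being prepared and sent. */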
3319 if (!NFS_I(inode)->layout) { 3320 spin_unlock(&inode->i_lock); 3321 goto out_clear_layoutstats; 3322 } 3323 hdr = NFS_I(inode)->layout; 3324 pnfs_get_layout_hdr(hdr); 3325 spin_unlock(&inode->i_lock); 3326 3327 data = kzalloc(sizeof(*data), gfp_flags); 3328 if (!data) { 3329 status = -ENOMEM; 3330 goto out_put; 3331 } 3332 3333 data->args.fh = NFS_FH(inode); 3334 data->args.inode = inode; 3335 status = ld->prepare_layoutstats(&data->args); 3336 if (status) 3337 goto out_free; 3338 3339 status = nfs42_proc_layoutstats_generic(NFS_SERVER(inode), data); 3340 3341out: 3342 dprintk("%s returns %d\n", __func__, status); 3343 return status; 3344 3345out_free: 3346 kfree(data); 3347out_put: 3348 pnfs_put_layout_hdr(hdr); 3349out_clear_layoutstats: 3350 smp_mb__before_atomic(); 3351 clear_bit(NFS_INO_LAYOUTSTATS, &nfsi->flags); 3352 smp_mb__after_atomic(); 3353 goto out; 3354} 3355EXPORT_SYMBOL_GPL(pnfs_report_layoutstat); 3356#endif 3357 3358unsigned int layoutstats_timer; 3359module_param(layoutstats_timer, uint, 0644); 3360EXPORT_SYMBOL_GPL(layoutstats_timer); 3361