1// SPDX-License-Identifier: GPL-2.0 2#include <linux/ceph/ceph_debug.h> 3#include <linux/ceph/striper.h> 4 5#include <linux/module.h> 6#include <linux/sched.h> 7#include <linux/slab.h> 8#include <linux/file.h> 9#include <linux/mount.h> 10#include <linux/namei.h> 11#include <linux/writeback.h> 12#include <linux/falloc.h> 13#include <linux/iversion.h> 14#include <linux/ktime.h> 15 16#include "super.h" 17#include "mds_client.h" 18#include "cache.h" 19#include "io.h" 20#include "metric.h" 21 22static __le32 ceph_flags_sys2wire(u32 flags) 23{ 24 u32 wire_flags = 0; 25 26 switch (flags & O_ACCMODE) { 27 case O_RDONLY: 28 wire_flags |= CEPH_O_RDONLY; 29 break; 30 case O_WRONLY: 31 wire_flags |= CEPH_O_WRONLY; 32 break; 33 case O_RDWR: 34 wire_flags |= CEPH_O_RDWR; 35 break; 36 } 37 38 flags &= ~O_ACCMODE; 39 40#define ceph_sys2wire(a) if (flags & a) { wire_flags |= CEPH_##a; flags &= ~a; } 41 42 ceph_sys2wire(O_CREAT); 43 ceph_sys2wire(O_EXCL); 44 ceph_sys2wire(O_TRUNC); 45 ceph_sys2wire(O_DIRECTORY); 46 ceph_sys2wire(O_NOFOLLOW); 47 48#undef ceph_sys2wire 49 50 if (flags) 51 dout("unused open flags: %x\n", flags); 52 53 return cpu_to_le32(wire_flags); 54} 55 56/* 57 * Ceph file operations 58 * 59 * Implement basic open/close functionality, and implement 60 * read/write. 61 * 62 * We implement three modes of file I/O: 63 * - buffered uses the generic_file_aio_{read,write} helpers 64 * 65 * - synchronous is used when there is multi-client read/write 66 * sharing, avoids the page cache, and synchronously waits for an 67 * ack from the OSD. 68 * 69 * - direct io takes the variant of the sync path that references 70 * user pages directly. 71 * 72 * fsync() flushes and waits on dirty pages, but just queues metadata 73 * for writeback: since the MDS can recover size and mtime there is no 74 * need to wait for MDS acknowledgement. 75 */ 76 77/* 78 * How many pages to get in one call to iov_iter_get_pages(). This 79 * determines the size of the on-stack array used as a buffer. 80 */ 81#define ITER_GET_BVECS_PAGES 64 82 83static ssize_t __iter_get_bvecs(struct iov_iter *iter, size_t maxsize, 84 struct bio_vec *bvecs) 85{ 86 size_t size = 0; 87 int bvec_idx = 0; 88 89 if (maxsize > iov_iter_count(iter)) 90 maxsize = iov_iter_count(iter); 91 92 while (size < maxsize) { 93 struct page *pages[ITER_GET_BVECS_PAGES]; 94 ssize_t bytes; 95 size_t start; 96 int idx = 0; 97 98 bytes = iov_iter_get_pages(iter, pages, maxsize - size, 99 ITER_GET_BVECS_PAGES, &start); 100 if (bytes < 0) 101 return size ?: bytes; 102 103 iov_iter_advance(iter, bytes); 104 size += bytes; 105 106 for ( ; bytes; idx++, bvec_idx++) { 107 struct bio_vec bv = { 108 .bv_page = pages[idx], 109 .bv_len = min_t(int, bytes, PAGE_SIZE - start), 110 .bv_offset = start, 111 }; 112 113 bvecs[bvec_idx] = bv; 114 bytes -= bv.bv_len; 115 start = 0; 116 } 117 } 118 119 return size; 120} 121 122/* 123 * iov_iter_get_pages() only considers one iov_iter segment, no matter 124 * what maxsize or maxpages are given. For ITER_BVEC that is a single 125 * page. 126 * 127 * Attempt to get up to @maxsize bytes worth of pages from @iter. 128 * Return the number of bytes in the created bio_vec array, or an error. 129 */ 130static ssize_t iter_get_bvecs_alloc(struct iov_iter *iter, size_t maxsize, 131 struct bio_vec **bvecs, int *num_bvecs) 132{ 133 struct bio_vec *bv; 134 size_t orig_count = iov_iter_count(iter); 135 ssize_t bytes; 136 int npages; 137 138 iov_iter_truncate(iter, maxsize); 139 npages = iov_iter_npages(iter, INT_MAX); 140 iov_iter_reexpand(iter, orig_count); 141 142 /* 143 * __iter_get_bvecs() may populate only part of the array -- zero it 144 * out. 145 */ 146 bv = kvmalloc_array(npages, sizeof(*bv), GFP_KERNEL | __GFP_ZERO); 147 if (!bv) 148 return -ENOMEM; 149 150 bytes = __iter_get_bvecs(iter, maxsize, bv); 151 if (bytes < 0) { 152 /* 153 * No pages were pinned -- just free the array. 154 */ 155 kvfree(bv); 156 return bytes; 157 } 158 159 *bvecs = bv; 160 *num_bvecs = npages; 161 return bytes; 162} 163 164static void put_bvecs(struct bio_vec *bvecs, int num_bvecs, bool should_dirty) 165{ 166 int i; 167 168 for (i = 0; i < num_bvecs; i++) { 169 if (bvecs[i].bv_page) { 170 if (should_dirty) 171 set_page_dirty_lock(bvecs[i].bv_page); 172 put_page(bvecs[i].bv_page); 173 } 174 } 175 kvfree(bvecs); 176} 177 178/* 179 * Prepare an open request. Preallocate ceph_cap to avoid an 180 * inopportune ENOMEM later. 181 */ 182static struct ceph_mds_request * 183prepare_open_request(struct super_block *sb, int flags, int create_mode) 184{ 185 struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(sb); 186 struct ceph_mds_request *req; 187 int want_auth = USE_ANY_MDS; 188 int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN; 189 190 if (flags & (O_WRONLY|O_RDWR|O_CREAT|O_TRUNC)) 191 want_auth = USE_AUTH_MDS; 192 193 req = ceph_mdsc_create_request(mdsc, op, want_auth); 194 if (IS_ERR(req)) 195 goto out; 196 req->r_fmode = ceph_flags_to_mode(flags); 197 req->r_args.open.flags = ceph_flags_sys2wire(flags); 198 req->r_args.open.mode = cpu_to_le32(create_mode); 199out: 200 return req; 201} 202 203static int ceph_init_file_info(struct inode *inode, struct file *file, 204 int fmode, bool isdir) 205{ 206 struct ceph_inode_info *ci = ceph_inode(inode); 207 struct ceph_file_info *fi; 208 209 dout("%s %p %p 0%o (%s)\n", __func__, inode, file, 210 inode->i_mode, isdir ? "dir" : "regular"); 211 BUG_ON(inode->i_fop->release != ceph_release); 212 213 if (isdir) { 214 struct ceph_dir_file_info *dfi = 215 kmem_cache_zalloc(ceph_dir_file_cachep, GFP_KERNEL); 216 if (!dfi) 217 return -ENOMEM; 218 219 file->private_data = dfi; 220 fi = &dfi->file_info; 221 dfi->next_offset = 2; 222 dfi->readdir_cache_idx = -1; 223 } else { 224 fi = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL); 225 if (!fi) 226 return -ENOMEM; 227 228 file->private_data = fi; 229 } 230 231 ceph_get_fmode(ci, fmode, 1); 232 fi->fmode = fmode; 233 234 spin_lock_init(&fi->rw_contexts_lock); 235 INIT_LIST_HEAD(&fi->rw_contexts); 236 fi->filp_gen = READ_ONCE(ceph_inode_to_client(inode)->filp_gen); 237 238 return 0; 239} 240 241/* 242 * initialize private struct file data. 243 * if we fail, clean up by dropping fmode reference on the ceph_inode 244 */ 245static int ceph_init_file(struct inode *inode, struct file *file, int fmode) 246{ 247 int ret = 0; 248 249 switch (inode->i_mode & S_IFMT) { 250 case S_IFREG: 251 ceph_fscache_register_inode_cookie(inode); 252 ceph_fscache_file_set_cookie(inode, file); 253 fallthrough; 254 case S_IFDIR: 255 ret = ceph_init_file_info(inode, file, fmode, 256 S_ISDIR(inode->i_mode)); 257 break; 258 259 case S_IFLNK: 260 dout("init_file %p %p 0%o (symlink)\n", inode, file, 261 inode->i_mode); 262 break; 263 264 default: 265 dout("init_file %p %p 0%o (special)\n", inode, file, 266 inode->i_mode); 267 /* 268 * we need to drop the open ref now, since we don't 269 * have .release set to ceph_release. 270 */ 271 BUG_ON(inode->i_fop->release == ceph_release); 272 273 /* call the proper open fop */ 274 ret = inode->i_fop->open(inode, file); 275 } 276 return ret; 277} 278 279/* 280 * try renew caps after session gets killed. 281 */ 282int ceph_renew_caps(struct inode *inode, int fmode) 283{ 284 struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); 285 struct ceph_inode_info *ci = ceph_inode(inode); 286 struct ceph_mds_request *req; 287 int err, flags, wanted; 288 289 spin_lock(&ci->i_ceph_lock); 290 __ceph_touch_fmode(ci, mdsc, fmode); 291 wanted = __ceph_caps_file_wanted(ci); 292 if (__ceph_is_any_real_caps(ci) && 293 (!(wanted & CEPH_CAP_ANY_WR) || ci->i_auth_cap)) { 294 int issued = __ceph_caps_issued(ci, NULL); 295 spin_unlock(&ci->i_ceph_lock); 296 dout("renew caps %p want %s issued %s updating mds_wanted\n", 297 inode, ceph_cap_string(wanted), ceph_cap_string(issued)); 298 ceph_check_caps(ci, 0, NULL); 299 return 0; 300 } 301 spin_unlock(&ci->i_ceph_lock); 302 303 flags = 0; 304 if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR)) 305 flags = O_RDWR; 306 else if (wanted & CEPH_CAP_FILE_RD) 307 flags = O_RDONLY; 308 else if (wanted & CEPH_CAP_FILE_WR) 309 flags = O_WRONLY; 310#ifdef O_LAZY 311 if (wanted & CEPH_CAP_FILE_LAZYIO) 312 flags |= O_LAZY; 313#endif 314 315 req = prepare_open_request(inode->i_sb, flags, 0); 316 if (IS_ERR(req)) { 317 err = PTR_ERR(req); 318 goto out; 319 } 320 321 req->r_inode = inode; 322 ihold(inode); 323 req->r_num_caps = 1; 324 325 err = ceph_mdsc_do_request(mdsc, NULL, req); 326 ceph_mdsc_put_request(req); 327out: 328 dout("renew caps %p open result=%d\n", inode, err); 329 return err < 0 ? err : 0; 330} 331 332/* 333 * If we already have the requisite capabilities, we can satisfy 334 * the open request locally (no need to request new caps from the 335 * MDS). We do, however, need to inform the MDS (asynchronously) 336 * if our wanted caps set expands. 337 */ 338int ceph_open(struct inode *inode, struct file *file) 339{ 340 struct ceph_inode_info *ci = ceph_inode(inode); 341 struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb); 342 struct ceph_mds_client *mdsc = fsc->mdsc; 343 struct ceph_mds_request *req; 344 struct ceph_file_info *fi = file->private_data; 345 int err; 346 int flags, fmode, wanted; 347 348 if (fi) { 349 dout("open file %p is already opened\n", file); 350 return 0; 351 } 352 353 /* filter out O_CREAT|O_EXCL; vfs did that already. yuck. */ 354 flags = file->f_flags & ~(O_CREAT|O_EXCL); 355 if (S_ISDIR(inode->i_mode)) 356 flags = O_DIRECTORY; /* mds likes to know */ 357 358 dout("open inode %p ino %llx.%llx file %p flags %d (%d)\n", inode, 359 ceph_vinop(inode), file, flags, file->f_flags); 360 fmode = ceph_flags_to_mode(flags); 361 wanted = ceph_caps_for_mode(fmode); 362 363 /* snapped files are read-only */ 364 if (ceph_snap(inode) != CEPH_NOSNAP && (file->f_mode & FMODE_WRITE)) 365 return -EROFS; 366 367 /* trivially open snapdir */ 368 if (ceph_snap(inode) == CEPH_SNAPDIR) { 369 return ceph_init_file(inode, file, fmode); 370 } 371 372 /* 373 * No need to block if we have caps on the auth MDS (for 374 * write) or any MDS (for read). Update wanted set 375 * asynchronously. 376 */ 377 spin_lock(&ci->i_ceph_lock); 378 if (__ceph_is_any_real_caps(ci) && 379 (((fmode & CEPH_FILE_MODE_WR) == 0) || ci->i_auth_cap)) { 380 int mds_wanted = __ceph_caps_mds_wanted(ci, true); 381 int issued = __ceph_caps_issued(ci, NULL); 382 383 dout("open %p fmode %d want %s issued %s using existing\n", 384 inode, fmode, ceph_cap_string(wanted), 385 ceph_cap_string(issued)); 386 __ceph_touch_fmode(ci, mdsc, fmode); 387 spin_unlock(&ci->i_ceph_lock); 388 389 /* adjust wanted? */ 390 if ((issued & wanted) != wanted && 391 (mds_wanted & wanted) != wanted && 392 ceph_snap(inode) != CEPH_SNAPDIR) 393 ceph_check_caps(ci, 0, NULL); 394 395 return ceph_init_file(inode, file, fmode); 396 } else if (ceph_snap(inode) != CEPH_NOSNAP && 397 (ci->i_snap_caps & wanted) == wanted) { 398 __ceph_touch_fmode(ci, mdsc, fmode); 399 spin_unlock(&ci->i_ceph_lock); 400 return ceph_init_file(inode, file, fmode); 401 } 402 403 spin_unlock(&ci->i_ceph_lock); 404 405 dout("open fmode %d wants %s\n", fmode, ceph_cap_string(wanted)); 406 req = prepare_open_request(inode->i_sb, flags, 0); 407 if (IS_ERR(req)) { 408 err = PTR_ERR(req); 409 goto out; 410 } 411 req->r_inode = inode; 412 ihold(inode); 413 414 req->r_num_caps = 1; 415 err = ceph_mdsc_do_request(mdsc, NULL, req); 416 if (!err) 417 err = ceph_init_file(inode, file, req->r_fmode); 418 ceph_mdsc_put_request(req); 419 dout("open result=%d on %llx.%llx\n", err, ceph_vinop(inode)); 420out: 421 return err; 422} 423 424/* Clone the layout from a synchronous create, if the dir now has Dc caps */ 425static void 426cache_file_layout(struct inode *dst, struct inode *src) 427{ 428 struct ceph_inode_info *cdst = ceph_inode(dst); 429 struct ceph_inode_info *csrc = ceph_inode(src); 430 431 spin_lock(&cdst->i_ceph_lock); 432 if ((__ceph_caps_issued(cdst, NULL) & CEPH_CAP_DIR_CREATE) && 433 !ceph_file_layout_is_valid(&cdst->i_cached_layout)) { 434 memcpy(&cdst->i_cached_layout, &csrc->i_layout, 435 sizeof(cdst->i_cached_layout)); 436 rcu_assign_pointer(cdst->i_cached_layout.pool_ns, 437 ceph_try_get_string(csrc->i_layout.pool_ns)); 438 } 439 spin_unlock(&cdst->i_ceph_lock); 440} 441 442/* 443 * Try to set up an async create. We need caps, a file layout, and inode number, 444 * and either a lease on the dentry or complete dir info. If any of those 445 * criteria are not satisfied, then return false and the caller can go 446 * synchronous. 447 */ 448static int try_prep_async_create(struct inode *dir, struct dentry *dentry, 449 struct ceph_file_layout *lo, u64 *pino) 450{ 451 struct ceph_inode_info *ci = ceph_inode(dir); 452 struct ceph_dentry_info *di = ceph_dentry(dentry); 453 int got = 0, want = CEPH_CAP_FILE_EXCL | CEPH_CAP_DIR_CREATE; 454 u64 ino; 455 456 spin_lock(&ci->i_ceph_lock); 457 /* No auth cap means no chance for Dc caps */ 458 if (!ci->i_auth_cap) 459 goto no_async; 460 461 /* Any delegated inos? */ 462 if (xa_empty(&ci->i_auth_cap->session->s_delegated_inos)) 463 goto no_async; 464 465 if (!ceph_file_layout_is_valid(&ci->i_cached_layout)) 466 goto no_async; 467 468 if ((__ceph_caps_issued(ci, NULL) & want) != want) 469 goto no_async; 470 471 if (d_in_lookup(dentry)) { 472 if (!__ceph_dir_is_complete(ci)) 473 goto no_async; 474 spin_lock(&dentry->d_lock); 475 di->lease_shared_gen = atomic_read(&ci->i_shared_gen); 476 spin_unlock(&dentry->d_lock); 477 } else if (atomic_read(&ci->i_shared_gen) != 478 READ_ONCE(di->lease_shared_gen)) { 479 goto no_async; 480 } 481 482 ino = ceph_get_deleg_ino(ci->i_auth_cap->session); 483 if (!ino) 484 goto no_async; 485 486 *pino = ino; 487 ceph_take_cap_refs(ci, want, false); 488 memcpy(lo, &ci->i_cached_layout, sizeof(*lo)); 489 rcu_assign_pointer(lo->pool_ns, 490 ceph_try_get_string(ci->i_cached_layout.pool_ns)); 491 got = want; 492no_async: 493 spin_unlock(&ci->i_ceph_lock); 494 return got; 495} 496 497static void restore_deleg_ino(struct inode *dir, u64 ino) 498{ 499 struct ceph_inode_info *ci = ceph_inode(dir); 500 struct ceph_mds_session *s = NULL; 501 502 spin_lock(&ci->i_ceph_lock); 503 if (ci->i_auth_cap) 504 s = ceph_get_mds_session(ci->i_auth_cap->session); 505 spin_unlock(&ci->i_ceph_lock); 506 if (s) { 507 int err = ceph_restore_deleg_ino(s, ino); 508 if (err) 509 pr_warn("ceph: unable to restore delegated ino 0x%llx to session: %d\n", 510 ino, err); 511 ceph_put_mds_session(s); 512 } 513} 514 515static void ceph_async_create_cb(struct ceph_mds_client *mdsc, 516 struct ceph_mds_request *req) 517{ 518 int result = req->r_err ? req->r_err : 519 le32_to_cpu(req->r_reply_info.head->result); 520 521 if (result == -EJUKEBOX) 522 goto out; 523 524 mapping_set_error(req->r_parent->i_mapping, result); 525 526 if (result) { 527 struct dentry *dentry = req->r_dentry; 528 int pathlen = 0; 529 u64 base = 0; 530 char *path = ceph_mdsc_build_path(req->r_dentry, &pathlen, 531 &base, 0); 532 533 ceph_dir_clear_complete(req->r_parent); 534 if (!d_unhashed(dentry)) 535 d_drop(dentry); 536 537 /* FIXME: start returning I/O errors on all accesses? */ 538 pr_warn("ceph: async create failure path=(%llx)%s result=%d!\n", 539 base, IS_ERR(path) ? "<<bad>>" : path, result); 540 ceph_mdsc_free_path(path, pathlen); 541 } 542 543 if (req->r_target_inode) { 544 struct ceph_inode_info *ci = ceph_inode(req->r_target_inode); 545 u64 ino = ceph_vino(req->r_target_inode).ino; 546 547 if (req->r_deleg_ino != ino) 548 pr_warn("%s: inode number mismatch! err=%d deleg_ino=0x%llx target=0x%llx\n", 549 __func__, req->r_err, req->r_deleg_ino, ino); 550 mapping_set_error(req->r_target_inode->i_mapping, result); 551 552 spin_lock(&ci->i_ceph_lock); 553 if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) { 554 ci->i_ceph_flags &= ~CEPH_I_ASYNC_CREATE; 555 wake_up_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT); 556 } 557 ceph_kick_flushing_inode_caps(req->r_session, ci); 558 spin_unlock(&ci->i_ceph_lock); 559 } else { 560 pr_warn("%s: no req->r_target_inode for 0x%llx\n", __func__, 561 req->r_deleg_ino); 562 } 563out: 564 ceph_mdsc_release_dir_caps(req); 565} 566 567static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry, 568 struct file *file, umode_t mode, 569 struct ceph_mds_request *req, 570 struct ceph_acl_sec_ctx *as_ctx, 571 struct ceph_file_layout *lo) 572{ 573 int ret; 574 char xattr_buf[4]; 575 struct ceph_mds_reply_inode in = { }; 576 struct ceph_mds_reply_info_in iinfo = { .in = &in }; 577 struct ceph_inode_info *ci = ceph_inode(dir); 578 struct inode *inode; 579 struct timespec64 now; 580 struct ceph_string *pool_ns; 581 struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb); 582 struct ceph_vino vino = { .ino = req->r_deleg_ino, 583 .snap = CEPH_NOSNAP }; 584 585 ktime_get_real_ts64(&now); 586 587 inode = ceph_get_inode(dentry->d_sb, vino); 588 if (IS_ERR(inode)) 589 return PTR_ERR(inode); 590 591 iinfo.inline_version = CEPH_INLINE_NONE; 592 iinfo.change_attr = 1; 593 ceph_encode_timespec64(&iinfo.btime, &now); 594 595 if (req->r_pagelist) { 596 iinfo.xattr_len = req->r_pagelist->length; 597 iinfo.xattr_data = req->r_pagelist->mapped_tail; 598 } else { 599 /* fake it */ 600 iinfo.xattr_len = ARRAY_SIZE(xattr_buf); 601 iinfo.xattr_data = xattr_buf; 602 memset(iinfo.xattr_data, 0, iinfo.xattr_len); 603 } 604 605 in.ino = cpu_to_le64(vino.ino); 606 in.snapid = cpu_to_le64(CEPH_NOSNAP); 607 in.version = cpu_to_le64(1); // ??? 608 in.cap.caps = in.cap.wanted = cpu_to_le32(CEPH_CAP_ALL_FILE); 609 in.cap.cap_id = cpu_to_le64(1); 610 in.cap.realm = cpu_to_le64(ci->i_snap_realm->ino); 611 in.cap.flags = CEPH_CAP_FLAG_AUTH; 612 in.ctime = in.mtime = in.atime = iinfo.btime; 613 in.truncate_seq = cpu_to_le32(1); 614 in.truncate_size = cpu_to_le64(-1ULL); 615 in.xattr_version = cpu_to_le64(1); 616 in.uid = cpu_to_le32(from_kuid(&init_user_ns, current_fsuid())); 617 if (dir->i_mode & S_ISGID) { 618 in.gid = cpu_to_le32(from_kgid(&init_user_ns, dir->i_gid)); 619 620 /* Directories always inherit the setgid bit. */ 621 if (S_ISDIR(mode)) 622 mode |= S_ISGID; 623 else if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP) && 624 !in_group_p(dir->i_gid) && 625 !capable_wrt_inode_uidgid(dir, CAP_FSETID)) 626 mode &= ~S_ISGID; 627 } else { 628 in.gid = cpu_to_le32(from_kgid(&init_user_ns, current_fsgid())); 629 } 630 in.mode = cpu_to_le32((u32)mode); 631 632 in.nlink = cpu_to_le32(1); 633 in.max_size = cpu_to_le64(lo->stripe_unit); 634 635 ceph_file_layout_to_legacy(lo, &in.layout); 636 /* lo is private, so pool_ns can't change */ 637 pool_ns = rcu_dereference_raw(lo->pool_ns); 638 if (pool_ns) { 639 iinfo.pool_ns_len = pool_ns->len; 640 iinfo.pool_ns_data = pool_ns->str; 641 } 642 643 down_read(&mdsc->snap_rwsem); 644 ret = ceph_fill_inode(inode, NULL, &iinfo, NULL, req->r_session, 645 req->r_fmode, NULL); 646 up_read(&mdsc->snap_rwsem); 647 if (ret) { 648 dout("%s failed to fill inode: %d\n", __func__, ret); 649 ceph_dir_clear_complete(dir); 650 if (!d_unhashed(dentry)) 651 d_drop(dentry); 652 if (inode->i_state & I_NEW) 653 discard_new_inode(inode); 654 } else { 655 struct dentry *dn; 656 657 dout("%s d_adding new inode 0x%llx to 0x%llx/%s\n", __func__, 658 vino.ino, ceph_ino(dir), dentry->d_name.name); 659 ceph_dir_clear_ordered(dir); 660 ceph_init_inode_acls(inode, as_ctx); 661 if (inode->i_state & I_NEW) { 662 /* 663 * If it's not I_NEW, then someone created this before 664 * we got here. Assume the server is aware of it at 665 * that point and don't worry about setting 666 * CEPH_I_ASYNC_CREATE. 667 */ 668 ceph_inode(inode)->i_ceph_flags = CEPH_I_ASYNC_CREATE; 669 unlock_new_inode(inode); 670 } 671 if (d_in_lookup(dentry) || d_really_is_negative(dentry)) { 672 if (!d_unhashed(dentry)) 673 d_drop(dentry); 674 dn = d_splice_alias(inode, dentry); 675 WARN_ON_ONCE(dn && dn != dentry); 676 } 677 file->f_mode |= FMODE_CREATED; 678 ret = finish_open(file, dentry, ceph_open); 679 } 680 return ret; 681} 682 683/* 684 * Do a lookup + open with a single request. If we get a non-existent 685 * file or symlink, return 1 so the VFS can retry. 686 */ 687int ceph_atomic_open(struct inode *dir, struct dentry *dentry, 688 struct file *file, unsigned flags, umode_t mode) 689{ 690 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); 691 struct ceph_mds_client *mdsc = fsc->mdsc; 692 struct ceph_mds_request *req; 693 struct dentry *dn; 694 struct ceph_acl_sec_ctx as_ctx = {}; 695 bool try_async = ceph_test_mount_opt(fsc, ASYNC_DIROPS); 696 int mask; 697 int err; 698 699 dout("atomic_open %p dentry %p '%pd' %s flags %d mode 0%o\n", 700 dir, dentry, dentry, 701 d_unhashed(dentry) ? "unhashed" : "hashed", flags, mode); 702 703 if (dentry->d_name.len > NAME_MAX) 704 return -ENAMETOOLONG; 705 706 /* 707 * Do not truncate the file, since atomic_open is called before the 708 * permission check. The caller will do the truncation afterward. 709 */ 710 flags &= ~O_TRUNC; 711 712 if (flags & O_CREAT) { 713 if (ceph_quota_is_max_files_exceeded(dir)) 714 return -EDQUOT; 715 err = ceph_pre_init_acls(dir, &mode, &as_ctx); 716 if (err < 0) 717 return err; 718 err = ceph_security_init_secctx(dentry, mode, &as_ctx); 719 if (err < 0) 720 goto out_ctx; 721 /* Async create can't handle more than a page of xattrs */ 722 if (as_ctx.pagelist && 723 !list_is_singular(&as_ctx.pagelist->head)) 724 try_async = false; 725 } else if (!d_in_lookup(dentry)) { 726 /* If it's not being looked up, it's negative */ 727 return -ENOENT; 728 } 729retry: 730 /* do the open */ 731 req = prepare_open_request(dir->i_sb, flags, mode); 732 if (IS_ERR(req)) { 733 err = PTR_ERR(req); 734 goto out_ctx; 735 } 736 req->r_dentry = dget(dentry); 737 req->r_num_caps = 2; 738 mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED; 739 if (ceph_security_xattr_wanted(dir)) 740 mask |= CEPH_CAP_XATTR_SHARED; 741 req->r_args.open.mask = cpu_to_le32(mask); 742 req->r_parent = dir; 743 744 if (flags & O_CREAT) { 745 struct ceph_file_layout lo; 746 747 req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL; 748 req->r_dentry_unless = CEPH_CAP_FILE_EXCL; 749 if (as_ctx.pagelist) { 750 req->r_pagelist = as_ctx.pagelist; 751 as_ctx.pagelist = NULL; 752 } 753 if (try_async && 754 (req->r_dir_caps = 755 try_prep_async_create(dir, dentry, &lo, 756 &req->r_deleg_ino))) { 757 set_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags); 758 req->r_args.open.flags |= cpu_to_le32(CEPH_O_EXCL); 759 req->r_callback = ceph_async_create_cb; 760 err = ceph_mdsc_submit_request(mdsc, dir, req); 761 if (!err) { 762 err = ceph_finish_async_create(dir, dentry, 763 file, mode, req, 764 &as_ctx, &lo); 765 } else if (err == -EJUKEBOX) { 766 restore_deleg_ino(dir, req->r_deleg_ino); 767 ceph_mdsc_put_request(req); 768 try_async = false; 769 ceph_put_string(rcu_dereference_raw(lo.pool_ns)); 770 goto retry; 771 } 772 ceph_put_string(rcu_dereference_raw(lo.pool_ns)); 773 goto out_req; 774 } 775 } 776 777 set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); 778 err = ceph_mdsc_do_request(mdsc, (flags & O_CREAT) ? dir : NULL, req); 779 err = ceph_handle_snapdir(req, dentry, err); 780 if (err) 781 goto out_req; 782 783 if ((flags & O_CREAT) && !req->r_reply_info.head->is_dentry) 784 err = ceph_handle_notrace_create(dir, dentry); 785 786 if (d_in_lookup(dentry)) { 787 dn = ceph_finish_lookup(req, dentry, err); 788 if (IS_ERR(dn)) 789 err = PTR_ERR(dn); 790 } else { 791 /* we were given a hashed negative dentry */ 792 dn = NULL; 793 } 794 if (err) 795 goto out_req; 796 if (dn || d_really_is_negative(dentry) || d_is_symlink(dentry)) { 797 /* make vfs retry on splice, ENOENT, or symlink */ 798 dout("atomic_open finish_no_open on dn %p\n", dn); 799 err = finish_no_open(file, dn); 800 } else { 801 dout("atomic_open finish_open on dn %p\n", dn); 802 if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) { 803 struct inode *newino = d_inode(dentry); 804 805 cache_file_layout(dir, newino); 806 ceph_init_inode_acls(newino, &as_ctx); 807 file->f_mode |= FMODE_CREATED; 808 } 809 err = finish_open(file, dentry, ceph_open); 810 } 811out_req: 812 ceph_mdsc_put_request(req); 813out_ctx: 814 ceph_release_acl_sec_ctx(&as_ctx); 815 dout("atomic_open result=%d\n", err); 816 return err; 817} 818 819int ceph_release(struct inode *inode, struct file *file) 820{ 821 struct ceph_inode_info *ci = ceph_inode(inode); 822 823 if (S_ISDIR(inode->i_mode)) { 824 struct ceph_dir_file_info *dfi = file->private_data; 825 dout("release inode %p dir file %p\n", inode, file); 826 WARN_ON(!list_empty(&dfi->file_info.rw_contexts)); 827 828 ceph_put_fmode(ci, dfi->file_info.fmode, 1); 829 830 if (dfi->last_readdir) 831 ceph_mdsc_put_request(dfi->last_readdir); 832 kfree(dfi->last_name); 833 kfree(dfi->dir_info); 834 kmem_cache_free(ceph_dir_file_cachep, dfi); 835 } else { 836 struct ceph_file_info *fi = file->private_data; 837 dout("release inode %p regular file %p\n", inode, file); 838 WARN_ON(!list_empty(&fi->rw_contexts)); 839 840 ceph_put_fmode(ci, fi->fmode, 1); 841 842 kmem_cache_free(ceph_file_cachep, fi); 843 } 844 845 /* wake up anyone waiting for caps on this inode */ 846 wake_up_all(&ci->i_cap_wq); 847 return 0; 848} 849 850enum { 851 HAVE_RETRIED = 1, 852 CHECK_EOF = 2, 853 READ_INLINE = 3, 854}; 855 856/* 857 * Completely synchronous read and write methods. Direct from __user 858 * buffer to osd, or directly to user pages (if O_DIRECT). 859 * 860 * If the read spans object boundary, just do multiple reads. (That's not 861 * atomic, but good enough for now.) 862 * 863 * If we get a short result from the OSD, check against i_size; we need to 864 * only return a short read to the caller if we hit EOF. 865 */ 866static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, 867 int *retry_op) 868{ 869 struct file *file = iocb->ki_filp; 870 struct inode *inode = file_inode(file); 871 struct ceph_inode_info *ci = ceph_inode(inode); 872 struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 873 struct ceph_osd_client *osdc = &fsc->client->osdc; 874 ssize_t ret; 875 u64 off = iocb->ki_pos; 876 u64 len = iov_iter_count(to); 877 878 dout("sync_read on file %p %llu~%u %s\n", file, off, (unsigned)len, 879 (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); 880 881 if (!len) 882 return 0; 883 /* 884 * flush any page cache pages in this range. this 885 * will make concurrent normal and sync io slow, 886 * but it will at least behave sensibly when they are 887 * in sequence. 888 */ 889 ret = filemap_write_and_wait_range(inode->i_mapping, 890 off, off + len - 1); 891 if (ret < 0) 892 return ret; 893 894 ret = 0; 895 while ((len = iov_iter_count(to)) > 0) { 896 struct ceph_osd_request *req; 897 struct page **pages; 898 int num_pages; 899 size_t page_off; 900 u64 i_size; 901 bool more; 902 int idx; 903 size_t left; 904 905 req = ceph_osdc_new_request(osdc, &ci->i_layout, 906 ci->i_vino, off, &len, 0, 1, 907 CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, 908 NULL, ci->i_truncate_seq, 909 ci->i_truncate_size, false); 910 if (IS_ERR(req)) { 911 ret = PTR_ERR(req); 912 break; 913 } 914 915 more = len < iov_iter_count(to); 916 917 num_pages = calc_pages_for(off, len); 918 page_off = off & ~PAGE_MASK; 919 pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); 920 if (IS_ERR(pages)) { 921 ceph_osdc_put_request(req); 922 ret = PTR_ERR(pages); 923 break; 924 } 925 926 osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_off, 927 false, false); 928 ret = ceph_osdc_start_request(osdc, req, false); 929 if (!ret) 930 ret = ceph_osdc_wait_request(osdc, req); 931 932 ceph_update_read_latency(&fsc->mdsc->metric, 933 req->r_start_latency, 934 req->r_end_latency, 935 ret); 936 937 ceph_osdc_put_request(req); 938 939 i_size = i_size_read(inode); 940 dout("sync_read %llu~%llu got %zd i_size %llu%s\n", 941 off, len, ret, i_size, (more ? " MORE" : "")); 942 943 if (ret == -ENOENT) 944 ret = 0; 945 if (ret >= 0 && ret < len && (off + ret < i_size)) { 946 int zlen = min(len - ret, i_size - off - ret); 947 int zoff = page_off + ret; 948 dout("sync_read zero gap %llu~%llu\n", 949 off + ret, off + ret + zlen); 950 ceph_zero_page_vector_range(zoff, zlen, pages); 951 ret += zlen; 952 } 953 954 idx = 0; 955 left = ret > 0 ? ret : 0; 956 while (left > 0) { 957 size_t len, copied; 958 page_off = off & ~PAGE_MASK; 959 len = min_t(size_t, left, PAGE_SIZE - page_off); 960 SetPageUptodate(pages[idx]); 961 copied = copy_page_to_iter(pages[idx++], 962 page_off, len, to); 963 off += copied; 964 left -= copied; 965 if (copied < len) { 966 ret = -EFAULT; 967 break; 968 } 969 } 970 ceph_release_page_vector(pages, num_pages); 971 972 if (ret < 0) { 973 if (ret == -EBLOCKLISTED) 974 fsc->blocklisted = true; 975 break; 976 } 977 978 if (off >= i_size || !more) 979 break; 980 } 981 982 if (off > iocb->ki_pos) { 983 if (ret >= 0 && 984 iov_iter_count(to) > 0 && off >= i_size_read(inode)) 985 *retry_op = CHECK_EOF; 986 ret = off - iocb->ki_pos; 987 iocb->ki_pos = off; 988 } 989 990 dout("sync_read result %zd retry_op %d\n", ret, *retry_op); 991 return ret; 992} 993 994struct ceph_aio_request { 995 struct kiocb *iocb; 996 size_t total_len; 997 bool write; 998 bool should_dirty; 999 int error; 1000 struct list_head osd_reqs; 1001 unsigned num_reqs; 1002 atomic_t pending_reqs; 1003 struct timespec64 mtime; 1004 struct ceph_cap_flush *prealloc_cf; 1005}; 1006 1007struct ceph_aio_work { 1008 struct work_struct work; 1009 struct ceph_osd_request *req; 1010}; 1011 1012static void ceph_aio_retry_work(struct work_struct *work); 1013 1014static void ceph_aio_complete(struct inode *inode, 1015 struct ceph_aio_request *aio_req) 1016{ 1017 struct ceph_inode_info *ci = ceph_inode(inode); 1018 int ret; 1019 1020 if (!atomic_dec_and_test(&aio_req->pending_reqs)) 1021 return; 1022 1023 if (aio_req->iocb->ki_flags & IOCB_DIRECT) 1024 inode_dio_end(inode); 1025 1026 ret = aio_req->error; 1027 if (!ret) 1028 ret = aio_req->total_len; 1029 1030 dout("ceph_aio_complete %p rc %d\n", inode, ret); 1031 1032 if (ret >= 0 && aio_req->write) { 1033 int dirty; 1034 1035 loff_t endoff = aio_req->iocb->ki_pos + aio_req->total_len; 1036 if (endoff > i_size_read(inode)) { 1037 if (ceph_inode_set_size(inode, endoff)) 1038 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); 1039 } 1040 1041 spin_lock(&ci->i_ceph_lock); 1042 ci->i_inline_version = CEPH_INLINE_NONE; 1043 dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, 1044 &aio_req->prealloc_cf); 1045 spin_unlock(&ci->i_ceph_lock); 1046 if (dirty) 1047 __mark_inode_dirty(inode, dirty); 1048 1049 } 1050 1051 ceph_put_cap_refs(ci, (aio_req->write ? CEPH_CAP_FILE_WR : 1052 CEPH_CAP_FILE_RD)); 1053 1054 aio_req->iocb->ki_complete(aio_req->iocb, ret, 0); 1055 1056 ceph_free_cap_flush(aio_req->prealloc_cf); 1057 kfree(aio_req); 1058} 1059 1060static void ceph_aio_complete_req(struct ceph_osd_request *req) 1061{ 1062 int rc = req->r_result; 1063 struct inode *inode = req->r_inode; 1064 struct ceph_aio_request *aio_req = req->r_priv; 1065 struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0); 1066 struct ceph_client_metric *metric = &ceph_sb_to_mdsc(inode->i_sb)->metric; 1067 1068 BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_BVECS); 1069 BUG_ON(!osd_data->num_bvecs); 1070 1071 dout("ceph_aio_complete_req %p rc %d bytes %u\n", 1072 inode, rc, osd_data->bvec_pos.iter.bi_size); 1073 1074 /* r_start_latency == 0 means the request was not submitted */ 1075 if (req->r_start_latency) { 1076 if (aio_req->write) 1077 ceph_update_write_latency(metric, req->r_start_latency, 1078 req->r_end_latency, rc); 1079 else 1080 ceph_update_read_latency(metric, req->r_start_latency, 1081 req->r_end_latency, rc); 1082 } 1083 1084 if (rc == -EOLDSNAPC) { 1085 struct ceph_aio_work *aio_work; 1086 BUG_ON(!aio_req->write); 1087 1088 aio_work = kmalloc(sizeof(*aio_work), GFP_NOFS); 1089 if (aio_work) { 1090 INIT_WORK(&aio_work->work, ceph_aio_retry_work); 1091 aio_work->req = req; 1092 queue_work(ceph_inode_to_client(inode)->inode_wq, 1093 &aio_work->work); 1094 return; 1095 } 1096 rc = -ENOMEM; 1097 } else if (!aio_req->write) { 1098 if (rc == -ENOENT) 1099 rc = 0; 1100 if (rc >= 0 && osd_data->bvec_pos.iter.bi_size > rc) { 1101 struct iov_iter i; 1102 int zlen = osd_data->bvec_pos.iter.bi_size - rc; 1103 1104 /* 1105 * If read is satisfied by single OSD request, 1106 * it can pass EOF. Otherwise read is within 1107 * i_size. 1108 */ 1109 if (aio_req->num_reqs == 1) { 1110 loff_t i_size = i_size_read(inode); 1111 loff_t endoff = aio_req->iocb->ki_pos + rc; 1112 if (endoff < i_size) 1113 zlen = min_t(size_t, zlen, 1114 i_size - endoff); 1115 aio_req->total_len = rc + zlen; 1116 } 1117 1118 iov_iter_bvec(&i, READ, osd_data->bvec_pos.bvecs, 1119 osd_data->num_bvecs, 1120 osd_data->bvec_pos.iter.bi_size); 1121 iov_iter_advance(&i, rc); 1122 iov_iter_zero(zlen, &i); 1123 } 1124 } 1125 1126 put_bvecs(osd_data->bvec_pos.bvecs, osd_data->num_bvecs, 1127 aio_req->should_dirty); 1128 ceph_osdc_put_request(req); 1129 1130 if (rc < 0) 1131 cmpxchg(&aio_req->error, 0, rc); 1132 1133 ceph_aio_complete(inode, aio_req); 1134 return; 1135} 1136 1137static void ceph_aio_retry_work(struct work_struct *work) 1138{ 1139 struct ceph_aio_work *aio_work = 1140 container_of(work, struct ceph_aio_work, work); 1141 struct ceph_osd_request *orig_req = aio_work->req; 1142 struct ceph_aio_request *aio_req = orig_req->r_priv; 1143 struct inode *inode = orig_req->r_inode; 1144 struct ceph_inode_info *ci = ceph_inode(inode); 1145 struct ceph_snap_context *snapc; 1146 struct ceph_osd_request *req; 1147 int ret; 1148 1149 spin_lock(&ci->i_ceph_lock); 1150 if (__ceph_have_pending_cap_snap(ci)) { 1151 struct ceph_cap_snap *capsnap = 1152 list_last_entry(&ci->i_cap_snaps, 1153 struct ceph_cap_snap, 1154 ci_item); 1155 snapc = ceph_get_snap_context(capsnap->context); 1156 } else { 1157 BUG_ON(!ci->i_head_snapc); 1158 snapc = ceph_get_snap_context(ci->i_head_snapc); 1159 } 1160 spin_unlock(&ci->i_ceph_lock); 1161 1162 req = ceph_osdc_alloc_request(orig_req->r_osdc, snapc, 1, 1163 false, GFP_NOFS); 1164 if (!req) { 1165 ret = -ENOMEM; 1166 req = orig_req; 1167 goto out; 1168 } 1169 1170 req->r_flags = /* CEPH_OSD_FLAG_ORDERSNAP | */ CEPH_OSD_FLAG_WRITE; 1171 ceph_oloc_copy(&req->r_base_oloc, &orig_req->r_base_oloc); 1172 ceph_oid_copy(&req->r_base_oid, &orig_req->r_base_oid); 1173 1174 req->r_ops[0] = orig_req->r_ops[0]; 1175 1176 req->r_mtime = aio_req->mtime; 1177 req->r_data_offset = req->r_ops[0].extent.offset; 1178 1179 ret = ceph_osdc_alloc_messages(req, GFP_NOFS); 1180 if (ret) { 1181 ceph_osdc_put_request(req); 1182 req = orig_req; 1183 goto out; 1184 } 1185 1186 ceph_osdc_put_request(orig_req); 1187 1188 req->r_callback = ceph_aio_complete_req; 1189 req->r_inode = inode; 1190 req->r_priv = aio_req; 1191 1192 ret = ceph_osdc_start_request(req->r_osdc, req, false); 1193out: 1194 if (ret < 0) { 1195 req->r_result = ret; 1196 ceph_aio_complete_req(req); 1197 } 1198 1199 ceph_put_snap_context(snapc); 1200 kfree(aio_work); 1201} 1202 1203static ssize_t 1204ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, 1205 struct ceph_snap_context *snapc, 1206 struct ceph_cap_flush **pcf) 1207{ 1208 struct file *file = iocb->ki_filp; 1209 struct inode *inode = file_inode(file); 1210 struct ceph_inode_info *ci = ceph_inode(inode); 1211 struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 1212 struct ceph_client_metric *metric = &fsc->mdsc->metric; 1213 struct ceph_vino vino; 1214 struct ceph_osd_request *req; 1215 struct bio_vec *bvecs; 1216 struct ceph_aio_request *aio_req = NULL; 1217 int num_pages = 0; 1218 int flags; 1219 int ret = 0; 1220 struct timespec64 mtime = current_time(inode); 1221 size_t count = iov_iter_count(iter); 1222 loff_t pos = iocb->ki_pos; 1223 bool write = iov_iter_rw(iter) == WRITE; 1224 bool should_dirty = !write && iter_is_iovec(iter); 1225 1226 if (write && ceph_snap(file_inode(file)) != CEPH_NOSNAP) 1227 return -EROFS; 1228 1229 dout("sync_direct_%s on file %p %lld~%u snapc %p seq %lld\n", 1230 (write ? "write" : "read"), file, pos, (unsigned)count, 1231 snapc, snapc ? snapc->seq : 0); 1232 1233 if (write) { 1234 int ret2 = invalidate_inode_pages2_range(inode->i_mapping, 1235 pos >> PAGE_SHIFT, 1236 (pos + count - 1) >> PAGE_SHIFT); 1237 if (ret2 < 0) 1238 dout("invalidate_inode_pages2_range returned %d\n", ret2); 1239 1240 flags = /* CEPH_OSD_FLAG_ORDERSNAP | */ CEPH_OSD_FLAG_WRITE; 1241 } else { 1242 flags = CEPH_OSD_FLAG_READ; 1243 } 1244 1245 while (iov_iter_count(iter) > 0) { 1246 u64 size = iov_iter_count(iter); 1247 ssize_t len; 1248 1249 if (write) 1250 size = min_t(u64, size, fsc->mount_options->wsize); 1251 else 1252 size = min_t(u64, size, fsc->mount_options->rsize); 1253 1254 vino = ceph_vino(inode); 1255 req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, 1256 vino, pos, &size, 0, 1257 1, 1258 write ? CEPH_OSD_OP_WRITE : 1259 CEPH_OSD_OP_READ, 1260 flags, snapc, 1261 ci->i_truncate_seq, 1262 ci->i_truncate_size, 1263 false); 1264 if (IS_ERR(req)) { 1265 ret = PTR_ERR(req); 1266 break; 1267 } 1268 1269 len = iter_get_bvecs_alloc(iter, size, &bvecs, &num_pages); 1270 if (len < 0) { 1271 ceph_osdc_put_request(req); 1272 ret = len; 1273 break; 1274 } 1275 if (len != size) 1276 osd_req_op_extent_update(req, 0, len); 1277 1278 /* 1279 * To simplify error handling, allow AIO when IO within i_size 1280 * or IO can be satisfied by single OSD request. 1281 */ 1282 if (pos == iocb->ki_pos && !is_sync_kiocb(iocb) && 1283 (len == count || pos + count <= i_size_read(inode))) { 1284 aio_req = kzalloc(sizeof(*aio_req), GFP_KERNEL); 1285 if (aio_req) { 1286 aio_req->iocb = iocb; 1287 aio_req->write = write; 1288 aio_req->should_dirty = should_dirty; 1289 INIT_LIST_HEAD(&aio_req->osd_reqs); 1290 if (write) { 1291 aio_req->mtime = mtime; 1292 swap(aio_req->prealloc_cf, *pcf); 1293 } 1294 } 1295 /* ignore error */ 1296 } 1297 1298 if (write) { 1299 /* 1300 * throw out any page cache pages in this range. this 1301 * may block. 1302 */ 1303 truncate_inode_pages_range(inode->i_mapping, pos, 1304 PAGE_ALIGN(pos + len) - 1); 1305 1306 req->r_mtime = mtime; 1307 } 1308 1309 osd_req_op_extent_osd_data_bvecs(req, 0, bvecs, num_pages, len); 1310 1311 if (aio_req) { 1312 aio_req->total_len += len; 1313 aio_req->num_reqs++; 1314 atomic_inc(&aio_req->pending_reqs); 1315 1316 req->r_callback = ceph_aio_complete_req; 1317 req->r_inode = inode; 1318 req->r_priv = aio_req; 1319 list_add_tail(&req->r_private_item, &aio_req->osd_reqs); 1320 1321 pos += len; 1322 continue; 1323 } 1324 1325 ret = ceph_osdc_start_request(req->r_osdc, req, false); 1326 if (!ret) 1327 ret = ceph_osdc_wait_request(&fsc->client->osdc, req); 1328 1329 if (write) 1330 ceph_update_write_latency(metric, req->r_start_latency, 1331 req->r_end_latency, ret); 1332 else 1333 ceph_update_read_latency(metric, req->r_start_latency, 1334 req->r_end_latency, ret); 1335 1336 size = i_size_read(inode); 1337 if (!write) { 1338 if (ret == -ENOENT) 1339 ret = 0; 1340 if (ret >= 0 && ret < len && pos + ret < size) { 1341 struct iov_iter i; 1342 int zlen = min_t(size_t, len - ret, 1343 size - pos - ret); 1344 1345 iov_iter_bvec(&i, READ, bvecs, num_pages, len); 1346 iov_iter_advance(&i, ret); 1347 iov_iter_zero(zlen, &i); 1348 ret += zlen; 1349 } 1350 if (ret >= 0) 1351 len = ret; 1352 } 1353 1354 put_bvecs(bvecs, num_pages, should_dirty); 1355 ceph_osdc_put_request(req); 1356 if (ret < 0) 1357 break; 1358 1359 pos += len; 1360 if (!write && pos >= size) 1361 break; 1362 1363 if (write && pos > size) { 1364 if (ceph_inode_set_size(inode, pos)) 1365 ceph_check_caps(ceph_inode(inode), 1366 CHECK_CAPS_AUTHONLY, 1367 NULL); 1368 } 1369 } 1370 1371 if (aio_req) { 1372 LIST_HEAD(osd_reqs); 1373 1374 if (aio_req->num_reqs == 0) { 1375 kfree(aio_req); 1376 return ret; 1377 } 1378 1379 ceph_get_cap_refs(ci, write ? CEPH_CAP_FILE_WR : 1380 CEPH_CAP_FILE_RD); 1381 1382 list_splice(&aio_req->osd_reqs, &osd_reqs); 1383 inode_dio_begin(inode); 1384 while (!list_empty(&osd_reqs)) { 1385 req = list_first_entry(&osd_reqs, 1386 struct ceph_osd_request, 1387 r_private_item); 1388 list_del_init(&req->r_private_item); 1389 if (ret >= 0) 1390 ret = ceph_osdc_start_request(req->r_osdc, 1391 req, false); 1392 if (ret < 0) { 1393 req->r_result = ret; 1394 ceph_aio_complete_req(req); 1395 } 1396 } 1397 return -EIOCBQUEUED; 1398 } 1399 1400 if (ret != -EOLDSNAPC && pos > iocb->ki_pos) { 1401 ret = pos - iocb->ki_pos; 1402 iocb->ki_pos = pos; 1403 } 1404 return ret; 1405} 1406 1407/* 1408 * Synchronous write, straight from __user pointer or user pages. 1409 * 1410 * If write spans object boundary, just do multiple writes. (For a 1411 * correct atomic write, we should e.g. take write locks on all 1412 * objects, rollback on failure, etc.) 1413 */ 1414static ssize_t 1415ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, 1416 struct ceph_snap_context *snapc) 1417{ 1418 struct file *file = iocb->ki_filp; 1419 struct inode *inode = file_inode(file); 1420 struct ceph_inode_info *ci = ceph_inode(inode); 1421 struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 1422 struct ceph_vino vino; 1423 struct ceph_osd_request *req; 1424 struct page **pages; 1425 u64 len; 1426 int num_pages; 1427 int written = 0; 1428 int flags; 1429 int ret; 1430 bool check_caps = false; 1431 struct timespec64 mtime = current_time(inode); 1432 size_t count = iov_iter_count(from); 1433 1434 if (ceph_snap(file_inode(file)) != CEPH_NOSNAP) 1435 return -EROFS; 1436 1437 dout("sync_write on file %p %lld~%u snapc %p seq %lld\n", 1438 file, pos, (unsigned)count, snapc, snapc->seq); 1439 1440 ret = filemap_write_and_wait_range(inode->i_mapping, 1441 pos, pos + count - 1); 1442 if (ret < 0) 1443 return ret; 1444 1445 ret = invalidate_inode_pages2_range(inode->i_mapping, 1446 pos >> PAGE_SHIFT, 1447 (pos + count - 1) >> PAGE_SHIFT); 1448 if (ret < 0) 1449 dout("invalidate_inode_pages2_range returned %d\n", ret); 1450 1451 flags = /* CEPH_OSD_FLAG_ORDERSNAP | */ CEPH_OSD_FLAG_WRITE; 1452 1453 while ((len = iov_iter_count(from)) > 0) { 1454 size_t left; 1455 int n; 1456 1457 vino = ceph_vino(inode); 1458 req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, 1459 vino, pos, &len, 0, 1, 1460 CEPH_OSD_OP_WRITE, flags, snapc, 1461 ci->i_truncate_seq, 1462 ci->i_truncate_size, 1463 false); 1464 if (IS_ERR(req)) { 1465 ret = PTR_ERR(req); 1466 break; 1467 } 1468 1469 /* 1470 * write from beginning of first page, 1471 * regardless of io alignment 1472 */ 1473 num_pages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; 1474 1475 pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); 1476 if (IS_ERR(pages)) { 1477 ret = PTR_ERR(pages); 1478 goto out; 1479 } 1480 1481 left = len; 1482 for (n = 0; n < num_pages; n++) { 1483 size_t plen = min_t(size_t, left, PAGE_SIZE); 1484 ret = copy_page_from_iter(pages[n], 0, plen, from); 1485 if (ret != plen) { 1486 ret = -EFAULT; 1487 break; 1488 } 1489 left -= ret; 1490 } 1491 1492 if (ret < 0) { 1493 ceph_release_page_vector(pages, num_pages); 1494 goto out; 1495 } 1496 1497 req->r_inode = inode; 1498 1499 osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, 1500 false, true); 1501 1502 req->r_mtime = mtime; 1503 ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); 1504 if (!ret) 1505 ret = ceph_osdc_wait_request(&fsc->client->osdc, req); 1506 1507 ceph_update_write_latency(&fsc->mdsc->metric, req->r_start_latency, 1508 req->r_end_latency, ret); 1509out: 1510 ceph_osdc_put_request(req); 1511 if (ret != 0) { 1512 ceph_set_error_write(ci); 1513 break; 1514 } 1515 1516 ceph_clear_error_write(ci); 1517 pos += len; 1518 written += len; 1519 if (pos > i_size_read(inode)) { 1520 check_caps = ceph_inode_set_size(inode, pos); 1521 if (check_caps) 1522 ceph_check_caps(ceph_inode(inode), 1523 CHECK_CAPS_AUTHONLY, 1524 NULL); 1525 } 1526 1527 } 1528 1529 if (ret != -EOLDSNAPC && written > 0) { 1530 ret = written; 1531 iocb->ki_pos = pos; 1532 } 1533 return ret; 1534} 1535 1536/* 1537 * Wrap generic_file_aio_read with checks for cap bits on the inode. 1538 * Atomically grab references, so that those bits are not released 1539 * back to the MDS mid-read. 1540 * 1541 * Hmm, the sync read case isn't actually async... should it be? 1542 */ 1543static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to) 1544{ 1545 struct file *filp = iocb->ki_filp; 1546 struct ceph_file_info *fi = filp->private_data; 1547 size_t len = iov_iter_count(to); 1548 struct inode *inode = file_inode(filp); 1549 struct ceph_inode_info *ci = ceph_inode(inode); 1550 struct page *pinned_page = NULL; 1551 bool direct_lock = iocb->ki_flags & IOCB_DIRECT; 1552 ssize_t ret; 1553 int want, got = 0; 1554 int retry_op = 0, read = 0; 1555 1556again: 1557 dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n", 1558 inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, inode); 1559 1560 if (direct_lock) 1561 ceph_start_io_direct(inode); 1562 else 1563 ceph_start_io_read(inode); 1564 1565 if (fi->fmode & CEPH_FILE_MODE_LAZY) 1566 want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; 1567 else 1568 want = CEPH_CAP_FILE_CACHE; 1569 ret = ceph_get_caps(filp, CEPH_CAP_FILE_RD, want, -1, 1570 &got, &pinned_page); 1571 if (ret < 0) { 1572 if (iocb->ki_flags & IOCB_DIRECT) 1573 ceph_end_io_direct(inode); 1574 else 1575 ceph_end_io_read(inode); 1576 return ret; 1577 } 1578 1579 if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 || 1580 (iocb->ki_flags & IOCB_DIRECT) || 1581 (fi->flags & CEPH_F_SYNC)) { 1582 1583 dout("aio_sync_read %p %llx.%llx %llu~%u got cap refs on %s\n", 1584 inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, 1585 ceph_cap_string(got)); 1586 1587 if (ci->i_inline_version == CEPH_INLINE_NONE) { 1588 if (!retry_op && (iocb->ki_flags & IOCB_DIRECT)) { 1589 ret = ceph_direct_read_write(iocb, to, 1590 NULL, NULL); 1591 if (ret >= 0 && ret < len) 1592 retry_op = CHECK_EOF; 1593 } else { 1594 ret = ceph_sync_read(iocb, to, &retry_op); 1595 } 1596 } else { 1597 retry_op = READ_INLINE; 1598 } 1599 } else { 1600 CEPH_DEFINE_RW_CONTEXT(rw_ctx, got); 1601 dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n", 1602 inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, 1603 ceph_cap_string(got)); 1604 ceph_add_rw_context(fi, &rw_ctx); 1605 ret = generic_file_read_iter(iocb, to); 1606 ceph_del_rw_context(fi, &rw_ctx); 1607 } 1608 1609 dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n", 1610 inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret); 1611 if (pinned_page) { 1612 put_page(pinned_page); 1613 pinned_page = NULL; 1614 } 1615 ceph_put_cap_refs(ci, got); 1616 1617 if (direct_lock) 1618 ceph_end_io_direct(inode); 1619 else 1620 ceph_end_io_read(inode); 1621 1622 if (retry_op > HAVE_RETRIED && ret >= 0) { 1623 int statret; 1624 struct page *page = NULL; 1625 loff_t i_size; 1626 if (retry_op == READ_INLINE) { 1627 page = __page_cache_alloc(GFP_KERNEL); 1628 if (!page) 1629 return -ENOMEM; 1630 } 1631 1632 statret = __ceph_do_getattr(inode, page, 1633 CEPH_STAT_CAP_INLINE_DATA, !!page); 1634 if (statret < 0) { 1635 if (page) 1636 __free_page(page); 1637 if (statret == -ENODATA) { 1638 BUG_ON(retry_op != READ_INLINE); 1639 goto again; 1640 } 1641 return statret; 1642 } 1643 1644 i_size = i_size_read(inode); 1645 if (retry_op == READ_INLINE) { 1646 BUG_ON(ret > 0 || read > 0); 1647 if (iocb->ki_pos < i_size && 1648 iocb->ki_pos < PAGE_SIZE) { 1649 loff_t end = min_t(loff_t, i_size, 1650 iocb->ki_pos + len); 1651 end = min_t(loff_t, end, PAGE_SIZE); 1652 if (statret < end) 1653 zero_user_segment(page, statret, end); 1654 ret = copy_page_to_iter(page, 1655 iocb->ki_pos & ~PAGE_MASK, 1656 end - iocb->ki_pos, to); 1657 iocb->ki_pos += ret; 1658 read += ret; 1659 } 1660 if (iocb->ki_pos < i_size && read < len) { 1661 size_t zlen = min_t(size_t, len - read, 1662 i_size - iocb->ki_pos); 1663 ret = iov_iter_zero(zlen, to); 1664 iocb->ki_pos += ret; 1665 read += ret; 1666 } 1667 __free_pages(page, 0); 1668 return read; 1669 } 1670 1671 /* hit EOF or hole? */ 1672 if (retry_op == CHECK_EOF && iocb->ki_pos < i_size && 1673 ret < len) { 1674 dout("sync_read hit hole, ppos %lld < size %lld" 1675 ", reading more\n", iocb->ki_pos, i_size); 1676 1677 read += ret; 1678 len -= ret; 1679 retry_op = HAVE_RETRIED; 1680 goto again; 1681 } 1682 } 1683 1684 if (ret >= 0) 1685 ret += read; 1686 1687 return ret; 1688} 1689 1690/* 1691 * Take cap references to avoid releasing caps to MDS mid-write. 1692 * 1693 * If we are synchronous, and write with an old snap context, the OSD 1694 * may return EOLDSNAPC. In that case, retry the write.. _after_ 1695 * dropping our cap refs and allowing the pending snap to logically 1696 * complete _before_ this write occurs. 1697 * 1698 * If we are near ENOSPC, write synchronously. 1699 */ 1700static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) 1701{ 1702 struct file *file = iocb->ki_filp; 1703 struct ceph_file_info *fi = file->private_data; 1704 struct inode *inode = file_inode(file); 1705 struct ceph_inode_info *ci = ceph_inode(inode); 1706 struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 1707 struct ceph_osd_client *osdc = &fsc->client->osdc; 1708 struct ceph_cap_flush *prealloc_cf; 1709 ssize_t count, written = 0; 1710 int err, want, got; 1711 bool direct_lock = false; 1712 u32 map_flags; 1713 u64 pool_flags; 1714 loff_t pos; 1715 loff_t limit = max(i_size_read(inode), fsc->max_file_size); 1716 1717 if (ceph_snap(inode) != CEPH_NOSNAP) 1718 return -EROFS; 1719 1720 prealloc_cf = ceph_alloc_cap_flush(); 1721 if (!prealloc_cf) 1722 return -ENOMEM; 1723 1724 if ((iocb->ki_flags & (IOCB_DIRECT | IOCB_APPEND)) == IOCB_DIRECT) 1725 direct_lock = true; 1726 1727retry_snap: 1728 if (direct_lock) 1729 ceph_start_io_direct(inode); 1730 else 1731 ceph_start_io_write(inode); 1732 1733 /* We can write back this queue in page reclaim */ 1734 current->backing_dev_info = inode_to_bdi(inode); 1735 1736 if (iocb->ki_flags & IOCB_APPEND) { 1737 err = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false); 1738 if (err < 0) 1739 goto out; 1740 } 1741 1742 err = generic_write_checks(iocb, from); 1743 if (err <= 0) 1744 goto out; 1745 1746 pos = iocb->ki_pos; 1747 if (unlikely(pos >= limit)) { 1748 err = -EFBIG; 1749 goto out; 1750 } else { 1751 iov_iter_truncate(from, limit - pos); 1752 } 1753 1754 count = iov_iter_count(from); 1755 if (ceph_quota_is_max_bytes_exceeded(inode, pos + count)) { 1756 err = -EDQUOT; 1757 goto out; 1758 } 1759 1760 down_read(&osdc->lock); 1761 map_flags = osdc->osdmap->flags; 1762 pool_flags = ceph_pg_pool_flags(osdc->osdmap, ci->i_layout.pool_id); 1763 up_read(&osdc->lock); 1764 if ((map_flags & CEPH_OSDMAP_FULL) || 1765 (pool_flags & CEPH_POOL_FLAG_FULL)) { 1766 err = -ENOSPC; 1767 goto out; 1768 } 1769 1770 err = file_remove_privs(file); 1771 if (err) 1772 goto out; 1773 1774 if (ci->i_inline_version != CEPH_INLINE_NONE) { 1775 err = ceph_uninline_data(file, NULL); 1776 if (err < 0) 1777 goto out; 1778 } 1779 1780 dout("aio_write %p %llx.%llx %llu~%zd getting caps. i_size %llu\n", 1781 inode, ceph_vinop(inode), pos, count, i_size_read(inode)); 1782 if (fi->fmode & CEPH_FILE_MODE_LAZY) 1783 want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; 1784 else 1785 want = CEPH_CAP_FILE_BUFFER; 1786 got = 0; 1787 err = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, pos + count, 1788 &got, NULL); 1789 if (err < 0) 1790 goto out; 1791 1792 err = file_update_time(file); 1793 if (err) 1794 goto out_caps; 1795 1796 inode_inc_iversion_raw(inode); 1797 1798 dout("aio_write %p %llx.%llx %llu~%zd got cap refs on %s\n", 1799 inode, ceph_vinop(inode), pos, count, ceph_cap_string(got)); 1800 1801 if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 || 1802 (iocb->ki_flags & IOCB_DIRECT) || (fi->flags & CEPH_F_SYNC) || 1803 (ci->i_ceph_flags & CEPH_I_ERROR_WRITE)) { 1804 struct ceph_snap_context *snapc; 1805 struct iov_iter data; 1806 1807 spin_lock(&ci->i_ceph_lock); 1808 if (__ceph_have_pending_cap_snap(ci)) { 1809 struct ceph_cap_snap *capsnap = 1810 list_last_entry(&ci->i_cap_snaps, 1811 struct ceph_cap_snap, 1812 ci_item); 1813 snapc = ceph_get_snap_context(capsnap->context); 1814 } else { 1815 BUG_ON(!ci->i_head_snapc); 1816 snapc = ceph_get_snap_context(ci->i_head_snapc); 1817 } 1818 spin_unlock(&ci->i_ceph_lock); 1819 1820 /* we might need to revert back to that point */ 1821 data = *from; 1822 if (iocb->ki_flags & IOCB_DIRECT) 1823 written = ceph_direct_read_write(iocb, &data, snapc, 1824 &prealloc_cf); 1825 else 1826 written = ceph_sync_write(iocb, &data, pos, snapc); 1827 if (direct_lock) 1828 ceph_end_io_direct(inode); 1829 else 1830 ceph_end_io_write(inode); 1831 if (written > 0) 1832 iov_iter_advance(from, written); 1833 ceph_put_snap_context(snapc); 1834 } else { 1835 /* 1836 * No need to acquire the i_truncate_mutex. Because 1837 * the MDS revokes Fwb caps before sending truncate 1838 * message to us. We can't get Fwb cap while there 1839 * are pending vmtruncate. So write and vmtruncate 1840 * can not run at the same time 1841 */ 1842 written = generic_perform_write(file, from, pos); 1843 if (likely(written >= 0)) 1844 iocb->ki_pos = pos + written; 1845 ceph_end_io_write(inode); 1846 } 1847 1848 if (written >= 0) { 1849 int dirty; 1850 1851 spin_lock(&ci->i_ceph_lock); 1852 ci->i_inline_version = CEPH_INLINE_NONE; 1853 dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, 1854 &prealloc_cf); 1855 spin_unlock(&ci->i_ceph_lock); 1856 if (dirty) 1857 __mark_inode_dirty(inode, dirty); 1858 if (ceph_quota_is_max_bytes_approaching(inode, iocb->ki_pos)) 1859 ceph_check_caps(ci, 0, NULL); 1860 } 1861 1862 dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n", 1863 inode, ceph_vinop(inode), pos, (unsigned)count, 1864 ceph_cap_string(got)); 1865 ceph_put_cap_refs(ci, got); 1866 1867 if (written == -EOLDSNAPC) { 1868 dout("aio_write %p %llx.%llx %llu~%u" "got EOLDSNAPC, retrying\n", 1869 inode, ceph_vinop(inode), pos, (unsigned)count); 1870 goto retry_snap; 1871 } 1872 1873 if (written >= 0) { 1874 if ((map_flags & CEPH_OSDMAP_NEARFULL) || 1875 (pool_flags & CEPH_POOL_FLAG_NEARFULL)) 1876 iocb->ki_flags |= IOCB_DSYNC; 1877 written = generic_write_sync(iocb, written); 1878 } 1879 1880 goto out_unlocked; 1881out_caps: 1882 ceph_put_cap_refs(ci, got); 1883out: 1884 if (direct_lock) 1885 ceph_end_io_direct(inode); 1886 else 1887 ceph_end_io_write(inode); 1888out_unlocked: 1889 ceph_free_cap_flush(prealloc_cf); 1890 current->backing_dev_info = NULL; 1891 return written ? written : err; 1892} 1893 1894/* 1895 * llseek. be sure to verify file size on SEEK_END. 1896 */ 1897static loff_t ceph_llseek(struct file *file, loff_t offset, int whence) 1898{ 1899 struct inode *inode = file->f_mapping->host; 1900 struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 1901 loff_t i_size; 1902 loff_t ret; 1903 1904 inode_lock(inode); 1905 1906 if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) { 1907 ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false); 1908 if (ret < 0) 1909 goto out; 1910 } 1911 1912 i_size = i_size_read(inode); 1913 switch (whence) { 1914 case SEEK_END: 1915 offset += i_size; 1916 break; 1917 case SEEK_CUR: 1918 /* 1919 * Here we special-case the lseek(fd, 0, SEEK_CUR) 1920 * position-querying operation. Avoid rewriting the "same" 1921 * f_pos value back to the file because a concurrent read(), 1922 * write() or lseek() might have altered it 1923 */ 1924 if (offset == 0) { 1925 ret = file->f_pos; 1926 goto out; 1927 } 1928 offset += file->f_pos; 1929 break; 1930 case SEEK_DATA: 1931 if (offset < 0 || offset >= i_size) { 1932 ret = -ENXIO; 1933 goto out; 1934 } 1935 break; 1936 case SEEK_HOLE: 1937 if (offset < 0 || offset >= i_size) { 1938 ret = -ENXIO; 1939 goto out; 1940 } 1941 offset = i_size; 1942 break; 1943 } 1944 1945 ret = vfs_setpos(file, offset, max(i_size, fsc->max_file_size)); 1946 1947out: 1948 inode_unlock(inode); 1949 return ret; 1950} 1951 1952static inline void ceph_zero_partial_page( 1953 struct inode *inode, loff_t offset, unsigned size) 1954{ 1955 struct page *page; 1956 pgoff_t index = offset >> PAGE_SHIFT; 1957 1958 page = find_lock_page(inode->i_mapping, index); 1959 if (page) { 1960 wait_on_page_writeback(page); 1961 zero_user(page, offset & (PAGE_SIZE - 1), size); 1962 unlock_page(page); 1963 put_page(page); 1964 } 1965} 1966 1967static void ceph_zero_pagecache_range(struct inode *inode, loff_t offset, 1968 loff_t length) 1969{ 1970 loff_t nearly = round_up(offset, PAGE_SIZE); 1971 if (offset < nearly) { 1972 loff_t size = nearly - offset; 1973 if (length < size) 1974 size = length; 1975 ceph_zero_partial_page(inode, offset, size); 1976 offset += size; 1977 length -= size; 1978 } 1979 if (length >= PAGE_SIZE) { 1980 loff_t size = round_down(length, PAGE_SIZE); 1981 truncate_pagecache_range(inode, offset, offset + size - 1); 1982 offset += size; 1983 length -= size; 1984 } 1985 if (length) 1986 ceph_zero_partial_page(inode, offset, length); 1987} 1988 1989static int ceph_zero_partial_object(struct inode *inode, 1990 loff_t offset, loff_t *length) 1991{ 1992 struct ceph_inode_info *ci = ceph_inode(inode); 1993 struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 1994 struct ceph_osd_request *req; 1995 int ret = 0; 1996 loff_t zero = 0; 1997 int op; 1998 1999 if (!length) { 2000 op = offset ? CEPH_OSD_OP_DELETE : CEPH_OSD_OP_TRUNCATE; 2001 length = &zero; 2002 } else { 2003 op = CEPH_OSD_OP_ZERO; 2004 } 2005 2006 req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, 2007 ceph_vino(inode), 2008 offset, length, 2009 0, 1, op, 2010 CEPH_OSD_FLAG_WRITE, 2011 NULL, 0, 0, false); 2012 if (IS_ERR(req)) { 2013 ret = PTR_ERR(req); 2014 goto out; 2015 } 2016 2017 req->r_mtime = inode->i_mtime; 2018 ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); 2019 if (!ret) { 2020 ret = ceph_osdc_wait_request(&fsc->client->osdc, req); 2021 if (ret == -ENOENT) 2022 ret = 0; 2023 } 2024 ceph_osdc_put_request(req); 2025 2026out: 2027 return ret; 2028} 2029 2030static int ceph_zero_objects(struct inode *inode, loff_t offset, loff_t length) 2031{ 2032 int ret = 0; 2033 struct ceph_inode_info *ci = ceph_inode(inode); 2034 s32 stripe_unit = ci->i_layout.stripe_unit; 2035 s32 stripe_count = ci->i_layout.stripe_count; 2036 s32 object_size = ci->i_layout.object_size; 2037 u64 object_set_size = object_size * stripe_count; 2038 u64 nearly, t; 2039 2040 /* round offset up to next period boundary */ 2041 nearly = offset + object_set_size - 1; 2042 t = nearly; 2043 nearly -= do_div(t, object_set_size); 2044 2045 while (length && offset < nearly) { 2046 loff_t size = length; 2047 ret = ceph_zero_partial_object(inode, offset, &size); 2048 if (ret < 0) 2049 return ret; 2050 offset += size; 2051 length -= size; 2052 } 2053 while (length >= object_set_size) { 2054 int i; 2055 loff_t pos = offset; 2056 for (i = 0; i < stripe_count; ++i) { 2057 ret = ceph_zero_partial_object(inode, pos, NULL); 2058 if (ret < 0) 2059 return ret; 2060 pos += stripe_unit; 2061 } 2062 offset += object_set_size; 2063 length -= object_set_size; 2064 } 2065 while (length) { 2066 loff_t size = length; 2067 ret = ceph_zero_partial_object(inode, offset, &size); 2068 if (ret < 0) 2069 return ret; 2070 offset += size; 2071 length -= size; 2072 } 2073 return ret; 2074} 2075 2076static long ceph_fallocate(struct file *file, int mode, 2077 loff_t offset, loff_t length) 2078{ 2079 struct ceph_file_info *fi = file->private_data; 2080 struct inode *inode = file_inode(file); 2081 struct ceph_inode_info *ci = ceph_inode(inode); 2082 struct ceph_cap_flush *prealloc_cf; 2083 int want, got = 0; 2084 int dirty; 2085 int ret = 0; 2086 loff_t endoff = 0; 2087 loff_t size; 2088 2089 if (mode != (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) 2090 return -EOPNOTSUPP; 2091 2092 if (!S_ISREG(inode->i_mode)) 2093 return -EOPNOTSUPP; 2094 2095 prealloc_cf = ceph_alloc_cap_flush(); 2096 if (!prealloc_cf) 2097 return -ENOMEM; 2098 2099 inode_lock(inode); 2100 2101 if (ceph_snap(inode) != CEPH_NOSNAP) { 2102 ret = -EROFS; 2103 goto unlock; 2104 } 2105 2106 if (ci->i_inline_version != CEPH_INLINE_NONE) { 2107 ret = ceph_uninline_data(file, NULL); 2108 if (ret < 0) 2109 goto unlock; 2110 } 2111 2112 size = i_size_read(inode); 2113 2114 /* Are we punching a hole beyond EOF? */ 2115 if (offset >= size) 2116 goto unlock; 2117 if ((offset + length) > size) 2118 length = size - offset; 2119 2120 if (fi->fmode & CEPH_FILE_MODE_LAZY) 2121 want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; 2122 else 2123 want = CEPH_CAP_FILE_BUFFER; 2124 2125 ret = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, endoff, &got, NULL); 2126 if (ret < 0) 2127 goto unlock; 2128 2129 ceph_zero_pagecache_range(inode, offset, length); 2130 ret = ceph_zero_objects(inode, offset, length); 2131 2132 if (!ret) { 2133 spin_lock(&ci->i_ceph_lock); 2134 ci->i_inline_version = CEPH_INLINE_NONE; 2135 dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, 2136 &prealloc_cf); 2137 spin_unlock(&ci->i_ceph_lock); 2138 if (dirty) 2139 __mark_inode_dirty(inode, dirty); 2140 } 2141 2142 ceph_put_cap_refs(ci, got); 2143unlock: 2144 inode_unlock(inode); 2145 ceph_free_cap_flush(prealloc_cf); 2146 return ret; 2147} 2148 2149/* 2150 * This function tries to get FILE_WR capabilities for dst_ci and FILE_RD for 2151 * src_ci. Two attempts are made to obtain both caps, and an error is return if 2152 * this fails; zero is returned on success. 2153 */ 2154static int get_rd_wr_caps(struct file *src_filp, int *src_got, 2155 struct file *dst_filp, 2156 loff_t dst_endoff, int *dst_got) 2157{ 2158 int ret = 0; 2159 bool retrying = false; 2160 2161retry_caps: 2162 ret = ceph_get_caps(dst_filp, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, 2163 dst_endoff, dst_got, NULL); 2164 if (ret < 0) 2165 return ret; 2166 2167 /* 2168 * Since we're already holding the FILE_WR capability for the dst file, 2169 * we would risk a deadlock by using ceph_get_caps. Thus, we'll do some 2170 * retry dance instead to try to get both capabilities. 2171 */ 2172 ret = ceph_try_get_caps(file_inode(src_filp), 2173 CEPH_CAP_FILE_RD, CEPH_CAP_FILE_SHARED, 2174 false, src_got); 2175 if (ret <= 0) { 2176 /* Start by dropping dst_ci caps and getting src_ci caps */ 2177 ceph_put_cap_refs(ceph_inode(file_inode(dst_filp)), *dst_got); 2178 if (retrying) { 2179 if (!ret) 2180 /* ceph_try_get_caps masks EAGAIN */ 2181 ret = -EAGAIN; 2182 return ret; 2183 } 2184 ret = ceph_get_caps(src_filp, CEPH_CAP_FILE_RD, 2185 CEPH_CAP_FILE_SHARED, -1, src_got, NULL); 2186 if (ret < 0) 2187 return ret; 2188 /*... drop src_ci caps too, and retry */ 2189 ceph_put_cap_refs(ceph_inode(file_inode(src_filp)), *src_got); 2190 retrying = true; 2191 goto retry_caps; 2192 } 2193 return ret; 2194} 2195 2196static void put_rd_wr_caps(struct ceph_inode_info *src_ci, int src_got, 2197 struct ceph_inode_info *dst_ci, int dst_got) 2198{ 2199 ceph_put_cap_refs(src_ci, src_got); 2200 ceph_put_cap_refs(dst_ci, dst_got); 2201} 2202 2203/* 2204 * This function does several size-related checks, returning an error if: 2205 * - source file is smaller than off+len 2206 * - destination file size is not OK (inode_newsize_ok()) 2207 * - max bytes quotas is exceeded 2208 */ 2209static int is_file_size_ok(struct inode *src_inode, struct inode *dst_inode, 2210 loff_t src_off, loff_t dst_off, size_t len) 2211{ 2212 loff_t size, endoff; 2213 2214 size = i_size_read(src_inode); 2215 /* 2216 * Don't copy beyond source file EOF. Instead of simply setting length 2217 * to (size - src_off), just drop to VFS default implementation, as the 2218 * local i_size may be stale due to other clients writing to the source 2219 * inode. 2220 */ 2221 if (src_off + len > size) { 2222 dout("Copy beyond EOF (%llu + %zu > %llu)\n", 2223 src_off, len, size); 2224 return -EOPNOTSUPP; 2225 } 2226 size = i_size_read(dst_inode); 2227 2228 endoff = dst_off + len; 2229 if (inode_newsize_ok(dst_inode, endoff)) 2230 return -EOPNOTSUPP; 2231 2232 if (ceph_quota_is_max_bytes_exceeded(dst_inode, endoff)) 2233 return -EDQUOT; 2234 2235 return 0; 2236} 2237 2238static ssize_t ceph_do_objects_copy(struct ceph_inode_info *src_ci, u64 *src_off, 2239 struct ceph_inode_info *dst_ci, u64 *dst_off, 2240 struct ceph_fs_client *fsc, 2241 size_t len, unsigned int flags) 2242{ 2243 struct ceph_object_locator src_oloc, dst_oloc; 2244 struct ceph_object_id src_oid, dst_oid; 2245 size_t bytes = 0; 2246 u64 src_objnum, src_objoff, dst_objnum, dst_objoff; 2247 u32 src_objlen, dst_objlen; 2248 u32 object_size = src_ci->i_layout.object_size; 2249 int ret; 2250 2251 src_oloc.pool = src_ci->i_layout.pool_id; 2252 src_oloc.pool_ns = ceph_try_get_string(src_ci->i_layout.pool_ns); 2253 dst_oloc.pool = dst_ci->i_layout.pool_id; 2254 dst_oloc.pool_ns = ceph_try_get_string(dst_ci->i_layout.pool_ns); 2255 2256 while (len >= object_size) { 2257 ceph_calc_file_object_mapping(&src_ci->i_layout, *src_off, 2258 object_size, &src_objnum, 2259 &src_objoff, &src_objlen); 2260 ceph_calc_file_object_mapping(&dst_ci->i_layout, *dst_off, 2261 object_size, &dst_objnum, 2262 &dst_objoff, &dst_objlen); 2263 ceph_oid_init(&src_oid); 2264 ceph_oid_printf(&src_oid, "%llx.%08llx", 2265 src_ci->i_vino.ino, src_objnum); 2266 ceph_oid_init(&dst_oid); 2267 ceph_oid_printf(&dst_oid, "%llx.%08llx", 2268 dst_ci->i_vino.ino, dst_objnum); 2269 /* Do an object remote copy */ 2270 ret = ceph_osdc_copy_from(&fsc->client->osdc, 2271 src_ci->i_vino.snap, 0, 2272 &src_oid, &src_oloc, 2273 CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL | 2274 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE, 2275 &dst_oid, &dst_oloc, 2276 CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL | 2277 CEPH_OSD_OP_FLAG_FADVISE_DONTNEED, 2278 dst_ci->i_truncate_seq, 2279 dst_ci->i_truncate_size, 2280 CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ); 2281 if (ret) { 2282 if (ret == -EOPNOTSUPP) { 2283 fsc->have_copy_from2 = false; 2284 pr_notice("OSDs don't support copy-from2; disabling copy offload\n"); 2285 } 2286 dout("ceph_osdc_copy_from returned %d\n", ret); 2287 if (!bytes) 2288 bytes = ret; 2289 goto out; 2290 } 2291 len -= object_size; 2292 bytes += object_size; 2293 *src_off += object_size; 2294 *dst_off += object_size; 2295 } 2296 2297out: 2298 ceph_oloc_destroy(&src_oloc); 2299 ceph_oloc_destroy(&dst_oloc); 2300 return bytes; 2301} 2302 2303static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, 2304 struct file *dst_file, loff_t dst_off, 2305 size_t len, unsigned int flags) 2306{ 2307 struct inode *src_inode = file_inode(src_file); 2308 struct inode *dst_inode = file_inode(dst_file); 2309 struct ceph_inode_info *src_ci = ceph_inode(src_inode); 2310 struct ceph_inode_info *dst_ci = ceph_inode(dst_inode); 2311 struct ceph_cap_flush *prealloc_cf; 2312 struct ceph_fs_client *src_fsc = ceph_inode_to_client(src_inode); 2313 loff_t size; 2314 ssize_t ret = -EIO, bytes; 2315 u64 src_objnum, dst_objnum, src_objoff, dst_objoff; 2316 u32 src_objlen, dst_objlen; 2317 int src_got = 0, dst_got = 0, err, dirty; 2318 2319 if (src_inode->i_sb != dst_inode->i_sb) { 2320 struct ceph_fs_client *dst_fsc = ceph_inode_to_client(dst_inode); 2321 2322 if (ceph_fsid_compare(&src_fsc->client->fsid, 2323 &dst_fsc->client->fsid)) { 2324 dout("Copying files across clusters: src: %pU dst: %pU\n", 2325 &src_fsc->client->fsid, &dst_fsc->client->fsid); 2326 return -EXDEV; 2327 } 2328 } 2329 if (ceph_snap(dst_inode) != CEPH_NOSNAP) 2330 return -EROFS; 2331 2332 /* 2333 * Some of the checks below will return -EOPNOTSUPP, which will force a 2334 * fallback to the default VFS copy_file_range implementation. This is 2335 * desirable in several cases (for ex, the 'len' is smaller than the 2336 * size of the objects, or in cases where that would be more 2337 * efficient). 2338 */ 2339 2340 if (ceph_test_mount_opt(src_fsc, NOCOPYFROM)) 2341 return -EOPNOTSUPP; 2342 2343 if (!src_fsc->have_copy_from2) 2344 return -EOPNOTSUPP; 2345 2346 /* 2347 * Striped file layouts require that we copy partial objects, but the 2348 * OSD copy-from operation only supports full-object copies. Limit 2349 * this to non-striped file layouts for now. 2350 */ 2351 if ((src_ci->i_layout.stripe_unit != dst_ci->i_layout.stripe_unit) || 2352 (src_ci->i_layout.stripe_count != 1) || 2353 (dst_ci->i_layout.stripe_count != 1) || 2354 (src_ci->i_layout.object_size != dst_ci->i_layout.object_size)) { 2355 dout("Invalid src/dst files layout\n"); 2356 return -EOPNOTSUPP; 2357 } 2358 2359 if (len < src_ci->i_layout.object_size) 2360 return -EOPNOTSUPP; /* no remote copy will be done */ 2361 2362 prealloc_cf = ceph_alloc_cap_flush(); 2363 if (!prealloc_cf) 2364 return -ENOMEM; 2365 2366 /* Start by sync'ing the source and destination files */ 2367 ret = file_write_and_wait_range(src_file, src_off, (src_off + len)); 2368 if (ret < 0) { 2369 dout("failed to write src file (%zd)\n", ret); 2370 goto out; 2371 } 2372 ret = file_write_and_wait_range(dst_file, dst_off, (dst_off + len)); 2373 if (ret < 0) { 2374 dout("failed to write dst file (%zd)\n", ret); 2375 goto out; 2376 } 2377 2378 /* 2379 * We need FILE_WR caps for dst_ci and FILE_RD for src_ci as other 2380 * clients may have dirty data in their caches. And OSDs know nothing 2381 * about caps, so they can't safely do the remote object copies. 2382 */ 2383 err = get_rd_wr_caps(src_file, &src_got, 2384 dst_file, (dst_off + len), &dst_got); 2385 if (err < 0) { 2386 dout("get_rd_wr_caps returned %d\n", err); 2387 ret = -EOPNOTSUPP; 2388 goto out; 2389 } 2390 2391 ret = is_file_size_ok(src_inode, dst_inode, src_off, dst_off, len); 2392 if (ret < 0) 2393 goto out_caps; 2394 2395 /* Drop dst file cached pages */ 2396 ret = invalidate_inode_pages2_range(dst_inode->i_mapping, 2397 dst_off >> PAGE_SHIFT, 2398 (dst_off + len) >> PAGE_SHIFT); 2399 if (ret < 0) { 2400 dout("Failed to invalidate inode pages (%zd)\n", ret); 2401 ret = 0; /* XXX */ 2402 } 2403 ceph_calc_file_object_mapping(&src_ci->i_layout, src_off, 2404 src_ci->i_layout.object_size, 2405 &src_objnum, &src_objoff, &src_objlen); 2406 ceph_calc_file_object_mapping(&dst_ci->i_layout, dst_off, 2407 dst_ci->i_layout.object_size, 2408 &dst_objnum, &dst_objoff, &dst_objlen); 2409 /* object-level offsets need to the same */ 2410 if (src_objoff != dst_objoff) { 2411 ret = -EOPNOTSUPP; 2412 goto out_caps; 2413 } 2414 2415 /* 2416 * Do a manual copy if the object offset isn't object aligned. 2417 * 'src_objlen' contains the bytes left until the end of the object, 2418 * starting at the src_off 2419 */ 2420 if (src_objoff) { 2421 dout("Initial partial copy of %u bytes\n", src_objlen); 2422 2423 /* 2424 * we need to temporarily drop all caps as we'll be calling 2425 * {read,write}_iter, which will get caps again. 2426 */ 2427 put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got); 2428 ret = do_splice_direct(src_file, &src_off, dst_file, 2429 &dst_off, src_objlen, flags); 2430 /* Abort on short copies or on error */ 2431 if (ret < (long)src_objlen) { 2432 dout("Failed partial copy (%zd)\n", ret); 2433 goto out; 2434 } 2435 len -= ret; 2436 err = get_rd_wr_caps(src_file, &src_got, 2437 dst_file, (dst_off + len), &dst_got); 2438 if (err < 0) 2439 goto out; 2440 err = is_file_size_ok(src_inode, dst_inode, 2441 src_off, dst_off, len); 2442 if (err < 0) 2443 goto out_caps; 2444 } 2445 2446 size = i_size_read(dst_inode); 2447 bytes = ceph_do_objects_copy(src_ci, &src_off, dst_ci, &dst_off, 2448 src_fsc, len, flags); 2449 if (bytes <= 0) { 2450 if (!ret) 2451 ret = bytes; 2452 goto out_caps; 2453 } 2454 dout("Copied %zu bytes out of %zu\n", bytes, len); 2455 len -= bytes; 2456 ret += bytes; 2457 2458 file_update_time(dst_file); 2459 inode_inc_iversion_raw(dst_inode); 2460 2461 if (dst_off > size) { 2462 /* Let the MDS know about dst file size change */ 2463 if (ceph_inode_set_size(dst_inode, dst_off) || 2464 ceph_quota_is_max_bytes_approaching(dst_inode, dst_off)) 2465 ceph_check_caps(dst_ci, CHECK_CAPS_AUTHONLY, NULL); 2466 } 2467 /* Mark Fw dirty */ 2468 spin_lock(&dst_ci->i_ceph_lock); 2469 dst_ci->i_inline_version = CEPH_INLINE_NONE; 2470 dirty = __ceph_mark_dirty_caps(dst_ci, CEPH_CAP_FILE_WR, &prealloc_cf); 2471 spin_unlock(&dst_ci->i_ceph_lock); 2472 if (dirty) 2473 __mark_inode_dirty(dst_inode, dirty); 2474 2475out_caps: 2476 put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got); 2477 2478 /* 2479 * Do the final manual copy if we still have some bytes left, unless 2480 * there were errors in remote object copies (len >= object_size). 2481 */ 2482 if (len && (len < src_ci->i_layout.object_size)) { 2483 dout("Final partial copy of %zu bytes\n", len); 2484 bytes = do_splice_direct(src_file, &src_off, dst_file, 2485 &dst_off, len, flags); 2486 if (bytes > 0) 2487 ret += bytes; 2488 else 2489 dout("Failed partial copy (%zd)\n", bytes); 2490 } 2491 2492out: 2493 ceph_free_cap_flush(prealloc_cf); 2494 2495 return ret; 2496} 2497 2498static ssize_t ceph_copy_file_range(struct file *src_file, loff_t src_off, 2499 struct file *dst_file, loff_t dst_off, 2500 size_t len, unsigned int flags) 2501{ 2502 ssize_t ret; 2503 2504 ret = __ceph_copy_file_range(src_file, src_off, dst_file, dst_off, 2505 len, flags); 2506 2507 if (ret == -EOPNOTSUPP || ret == -EXDEV) 2508 ret = generic_copy_file_range(src_file, src_off, dst_file, 2509 dst_off, len, flags); 2510 return ret; 2511} 2512 2513const struct file_operations ceph_file_fops = { 2514 .open = ceph_open, 2515 .release = ceph_release, 2516 .llseek = ceph_llseek, 2517 .read_iter = ceph_read_iter, 2518 .write_iter = ceph_write_iter, 2519 .mmap = ceph_mmap, 2520 .fsync = ceph_fsync, 2521 .lock = ceph_lock, 2522 .setlease = simple_nosetlease, 2523 .flock = ceph_flock, 2524 .splice_read = generic_file_splice_read, 2525 .splice_write = iter_file_splice_write, 2526 .unlocked_ioctl = ceph_ioctl, 2527 .compat_ioctl = compat_ptr_ioctl, 2528 .fallocate = ceph_fallocate, 2529 .copy_file_range = ceph_copy_file_range, 2530}; 2531