162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0 262306a36Sopenharmony_ci#include <linux/ceph/ceph_debug.h> 362306a36Sopenharmony_ci#include <linux/ceph/striper.h> 462306a36Sopenharmony_ci 562306a36Sopenharmony_ci#include <linux/module.h> 662306a36Sopenharmony_ci#include <linux/sched.h> 762306a36Sopenharmony_ci#include <linux/slab.h> 862306a36Sopenharmony_ci#include <linux/file.h> 962306a36Sopenharmony_ci#include <linux/mount.h> 1062306a36Sopenharmony_ci#include <linux/namei.h> 1162306a36Sopenharmony_ci#include <linux/writeback.h> 1262306a36Sopenharmony_ci#include <linux/falloc.h> 1362306a36Sopenharmony_ci#include <linux/iversion.h> 1462306a36Sopenharmony_ci#include <linux/ktime.h> 1562306a36Sopenharmony_ci 1662306a36Sopenharmony_ci#include "super.h" 1762306a36Sopenharmony_ci#include "mds_client.h" 1862306a36Sopenharmony_ci#include "cache.h" 1962306a36Sopenharmony_ci#include "io.h" 2062306a36Sopenharmony_ci#include "metric.h" 2162306a36Sopenharmony_ci 2262306a36Sopenharmony_cistatic __le32 ceph_flags_sys2wire(u32 flags) 2362306a36Sopenharmony_ci{ 2462306a36Sopenharmony_ci u32 wire_flags = 0; 2562306a36Sopenharmony_ci 2662306a36Sopenharmony_ci switch (flags & O_ACCMODE) { 2762306a36Sopenharmony_ci case O_RDONLY: 2862306a36Sopenharmony_ci wire_flags |= CEPH_O_RDONLY; 2962306a36Sopenharmony_ci break; 3062306a36Sopenharmony_ci case O_WRONLY: 3162306a36Sopenharmony_ci wire_flags |= CEPH_O_WRONLY; 3262306a36Sopenharmony_ci break; 3362306a36Sopenharmony_ci case O_RDWR: 3462306a36Sopenharmony_ci wire_flags |= CEPH_O_RDWR; 3562306a36Sopenharmony_ci break; 3662306a36Sopenharmony_ci } 3762306a36Sopenharmony_ci 3862306a36Sopenharmony_ci flags &= ~O_ACCMODE; 3962306a36Sopenharmony_ci 4062306a36Sopenharmony_ci#define ceph_sys2wire(a) if (flags & a) { wire_flags |= CEPH_##a; flags &= ~a; } 4162306a36Sopenharmony_ci 4262306a36Sopenharmony_ci ceph_sys2wire(O_CREAT); 4362306a36Sopenharmony_ci ceph_sys2wire(O_EXCL); 4462306a36Sopenharmony_ci ceph_sys2wire(O_TRUNC); 4562306a36Sopenharmony_ci ceph_sys2wire(O_DIRECTORY); 4662306a36Sopenharmony_ci ceph_sys2wire(O_NOFOLLOW); 4762306a36Sopenharmony_ci 4862306a36Sopenharmony_ci#undef ceph_sys2wire 4962306a36Sopenharmony_ci 5062306a36Sopenharmony_ci if (flags) 5162306a36Sopenharmony_ci dout("unused open flags: %x\n", flags); 5262306a36Sopenharmony_ci 5362306a36Sopenharmony_ci return cpu_to_le32(wire_flags); 5462306a36Sopenharmony_ci} 5562306a36Sopenharmony_ci 5662306a36Sopenharmony_ci/* 5762306a36Sopenharmony_ci * Ceph file operations 5862306a36Sopenharmony_ci * 5962306a36Sopenharmony_ci * Implement basic open/close functionality, and implement 6062306a36Sopenharmony_ci * read/write. 6162306a36Sopenharmony_ci * 6262306a36Sopenharmony_ci * We implement three modes of file I/O: 6362306a36Sopenharmony_ci * - buffered uses the generic_file_aio_{read,write} helpers 6462306a36Sopenharmony_ci * 6562306a36Sopenharmony_ci * - synchronous is used when there is multi-client read/write 6662306a36Sopenharmony_ci * sharing, avoids the page cache, and synchronously waits for an 6762306a36Sopenharmony_ci * ack from the OSD. 6862306a36Sopenharmony_ci * 6962306a36Sopenharmony_ci * - direct io takes the variant of the sync path that references 7062306a36Sopenharmony_ci * user pages directly. 7162306a36Sopenharmony_ci * 7262306a36Sopenharmony_ci * fsync() flushes and waits on dirty pages, but just queues metadata 7362306a36Sopenharmony_ci * for writeback: since the MDS can recover size and mtime there is no 7462306a36Sopenharmony_ci * need to wait for MDS acknowledgement. 7562306a36Sopenharmony_ci */ 7662306a36Sopenharmony_ci 7762306a36Sopenharmony_ci/* 7862306a36Sopenharmony_ci * How many pages to get in one call to iov_iter_get_pages(). This 7962306a36Sopenharmony_ci * determines the size of the on-stack array used as a buffer. 8062306a36Sopenharmony_ci */ 8162306a36Sopenharmony_ci#define ITER_GET_BVECS_PAGES 64 8262306a36Sopenharmony_ci 8362306a36Sopenharmony_cistatic ssize_t __iter_get_bvecs(struct iov_iter *iter, size_t maxsize, 8462306a36Sopenharmony_ci struct bio_vec *bvecs) 8562306a36Sopenharmony_ci{ 8662306a36Sopenharmony_ci size_t size = 0; 8762306a36Sopenharmony_ci int bvec_idx = 0; 8862306a36Sopenharmony_ci 8962306a36Sopenharmony_ci if (maxsize > iov_iter_count(iter)) 9062306a36Sopenharmony_ci maxsize = iov_iter_count(iter); 9162306a36Sopenharmony_ci 9262306a36Sopenharmony_ci while (size < maxsize) { 9362306a36Sopenharmony_ci struct page *pages[ITER_GET_BVECS_PAGES]; 9462306a36Sopenharmony_ci ssize_t bytes; 9562306a36Sopenharmony_ci size_t start; 9662306a36Sopenharmony_ci int idx = 0; 9762306a36Sopenharmony_ci 9862306a36Sopenharmony_ci bytes = iov_iter_get_pages2(iter, pages, maxsize - size, 9962306a36Sopenharmony_ci ITER_GET_BVECS_PAGES, &start); 10062306a36Sopenharmony_ci if (bytes < 0) 10162306a36Sopenharmony_ci return size ?: bytes; 10262306a36Sopenharmony_ci 10362306a36Sopenharmony_ci size += bytes; 10462306a36Sopenharmony_ci 10562306a36Sopenharmony_ci for ( ; bytes; idx++, bvec_idx++) { 10662306a36Sopenharmony_ci int len = min_t(int, bytes, PAGE_SIZE - start); 10762306a36Sopenharmony_ci 10862306a36Sopenharmony_ci bvec_set_page(&bvecs[bvec_idx], pages[idx], len, start); 10962306a36Sopenharmony_ci bytes -= len; 11062306a36Sopenharmony_ci start = 0; 11162306a36Sopenharmony_ci } 11262306a36Sopenharmony_ci } 11362306a36Sopenharmony_ci 11462306a36Sopenharmony_ci return size; 11562306a36Sopenharmony_ci} 11662306a36Sopenharmony_ci 11762306a36Sopenharmony_ci/* 11862306a36Sopenharmony_ci * iov_iter_get_pages() only considers one iov_iter segment, no matter 11962306a36Sopenharmony_ci * what maxsize or maxpages are given. For ITER_BVEC that is a single 12062306a36Sopenharmony_ci * page. 12162306a36Sopenharmony_ci * 12262306a36Sopenharmony_ci * Attempt to get up to @maxsize bytes worth of pages from @iter. 12362306a36Sopenharmony_ci * Return the number of bytes in the created bio_vec array, or an error. 12462306a36Sopenharmony_ci */ 12562306a36Sopenharmony_cistatic ssize_t iter_get_bvecs_alloc(struct iov_iter *iter, size_t maxsize, 12662306a36Sopenharmony_ci struct bio_vec **bvecs, int *num_bvecs) 12762306a36Sopenharmony_ci{ 12862306a36Sopenharmony_ci struct bio_vec *bv; 12962306a36Sopenharmony_ci size_t orig_count = iov_iter_count(iter); 13062306a36Sopenharmony_ci ssize_t bytes; 13162306a36Sopenharmony_ci int npages; 13262306a36Sopenharmony_ci 13362306a36Sopenharmony_ci iov_iter_truncate(iter, maxsize); 13462306a36Sopenharmony_ci npages = iov_iter_npages(iter, INT_MAX); 13562306a36Sopenharmony_ci iov_iter_reexpand(iter, orig_count); 13662306a36Sopenharmony_ci 13762306a36Sopenharmony_ci /* 13862306a36Sopenharmony_ci * __iter_get_bvecs() may populate only part of the array -- zero it 13962306a36Sopenharmony_ci * out. 14062306a36Sopenharmony_ci */ 14162306a36Sopenharmony_ci bv = kvmalloc_array(npages, sizeof(*bv), GFP_KERNEL | __GFP_ZERO); 14262306a36Sopenharmony_ci if (!bv) 14362306a36Sopenharmony_ci return -ENOMEM; 14462306a36Sopenharmony_ci 14562306a36Sopenharmony_ci bytes = __iter_get_bvecs(iter, maxsize, bv); 14662306a36Sopenharmony_ci if (bytes < 0) { 14762306a36Sopenharmony_ci /* 14862306a36Sopenharmony_ci * No pages were pinned -- just free the array. 14962306a36Sopenharmony_ci */ 15062306a36Sopenharmony_ci kvfree(bv); 15162306a36Sopenharmony_ci return bytes; 15262306a36Sopenharmony_ci } 15362306a36Sopenharmony_ci 15462306a36Sopenharmony_ci *bvecs = bv; 15562306a36Sopenharmony_ci *num_bvecs = npages; 15662306a36Sopenharmony_ci return bytes; 15762306a36Sopenharmony_ci} 15862306a36Sopenharmony_ci 15962306a36Sopenharmony_cistatic void put_bvecs(struct bio_vec *bvecs, int num_bvecs, bool should_dirty) 16062306a36Sopenharmony_ci{ 16162306a36Sopenharmony_ci int i; 16262306a36Sopenharmony_ci 16362306a36Sopenharmony_ci for (i = 0; i < num_bvecs; i++) { 16462306a36Sopenharmony_ci if (bvecs[i].bv_page) { 16562306a36Sopenharmony_ci if (should_dirty) 16662306a36Sopenharmony_ci set_page_dirty_lock(bvecs[i].bv_page); 16762306a36Sopenharmony_ci put_page(bvecs[i].bv_page); 16862306a36Sopenharmony_ci } 16962306a36Sopenharmony_ci } 17062306a36Sopenharmony_ci kvfree(bvecs); 17162306a36Sopenharmony_ci} 17262306a36Sopenharmony_ci 17362306a36Sopenharmony_ci/* 17462306a36Sopenharmony_ci * Prepare an open request. Preallocate ceph_cap to avoid an 17562306a36Sopenharmony_ci * inopportune ENOMEM later. 17662306a36Sopenharmony_ci */ 17762306a36Sopenharmony_cistatic struct ceph_mds_request * 17862306a36Sopenharmony_ciprepare_open_request(struct super_block *sb, int flags, int create_mode) 17962306a36Sopenharmony_ci{ 18062306a36Sopenharmony_ci struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(sb); 18162306a36Sopenharmony_ci struct ceph_mds_request *req; 18262306a36Sopenharmony_ci int want_auth = USE_ANY_MDS; 18362306a36Sopenharmony_ci int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN; 18462306a36Sopenharmony_ci 18562306a36Sopenharmony_ci if (flags & (O_WRONLY|O_RDWR|O_CREAT|O_TRUNC)) 18662306a36Sopenharmony_ci want_auth = USE_AUTH_MDS; 18762306a36Sopenharmony_ci 18862306a36Sopenharmony_ci req = ceph_mdsc_create_request(mdsc, op, want_auth); 18962306a36Sopenharmony_ci if (IS_ERR(req)) 19062306a36Sopenharmony_ci goto out; 19162306a36Sopenharmony_ci req->r_fmode = ceph_flags_to_mode(flags); 19262306a36Sopenharmony_ci req->r_args.open.flags = ceph_flags_sys2wire(flags); 19362306a36Sopenharmony_ci req->r_args.open.mode = cpu_to_le32(create_mode); 19462306a36Sopenharmony_ciout: 19562306a36Sopenharmony_ci return req; 19662306a36Sopenharmony_ci} 19762306a36Sopenharmony_ci 19862306a36Sopenharmony_cistatic int ceph_init_file_info(struct inode *inode, struct file *file, 19962306a36Sopenharmony_ci int fmode, bool isdir) 20062306a36Sopenharmony_ci{ 20162306a36Sopenharmony_ci struct ceph_inode_info *ci = ceph_inode(inode); 20262306a36Sopenharmony_ci struct ceph_mount_options *opt = 20362306a36Sopenharmony_ci ceph_inode_to_client(&ci->netfs.inode)->mount_options; 20462306a36Sopenharmony_ci struct ceph_file_info *fi; 20562306a36Sopenharmony_ci int ret; 20662306a36Sopenharmony_ci 20762306a36Sopenharmony_ci dout("%s %p %p 0%o (%s)\n", __func__, inode, file, 20862306a36Sopenharmony_ci inode->i_mode, isdir ? "dir" : "regular"); 20962306a36Sopenharmony_ci BUG_ON(inode->i_fop->release != ceph_release); 21062306a36Sopenharmony_ci 21162306a36Sopenharmony_ci if (isdir) { 21262306a36Sopenharmony_ci struct ceph_dir_file_info *dfi = 21362306a36Sopenharmony_ci kmem_cache_zalloc(ceph_dir_file_cachep, GFP_KERNEL); 21462306a36Sopenharmony_ci if (!dfi) 21562306a36Sopenharmony_ci return -ENOMEM; 21662306a36Sopenharmony_ci 21762306a36Sopenharmony_ci file->private_data = dfi; 21862306a36Sopenharmony_ci fi = &dfi->file_info; 21962306a36Sopenharmony_ci dfi->next_offset = 2; 22062306a36Sopenharmony_ci dfi->readdir_cache_idx = -1; 22162306a36Sopenharmony_ci } else { 22262306a36Sopenharmony_ci fi = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL); 22362306a36Sopenharmony_ci if (!fi) 22462306a36Sopenharmony_ci return -ENOMEM; 22562306a36Sopenharmony_ci 22662306a36Sopenharmony_ci if (opt->flags & CEPH_MOUNT_OPT_NOPAGECACHE) 22762306a36Sopenharmony_ci fi->flags |= CEPH_F_SYNC; 22862306a36Sopenharmony_ci 22962306a36Sopenharmony_ci file->private_data = fi; 23062306a36Sopenharmony_ci } 23162306a36Sopenharmony_ci 23262306a36Sopenharmony_ci ceph_get_fmode(ci, fmode, 1); 23362306a36Sopenharmony_ci fi->fmode = fmode; 23462306a36Sopenharmony_ci 23562306a36Sopenharmony_ci spin_lock_init(&fi->rw_contexts_lock); 23662306a36Sopenharmony_ci INIT_LIST_HEAD(&fi->rw_contexts); 23762306a36Sopenharmony_ci fi->filp_gen = READ_ONCE(ceph_inode_to_client(inode)->filp_gen); 23862306a36Sopenharmony_ci 23962306a36Sopenharmony_ci if ((file->f_mode & FMODE_WRITE) && ceph_has_inline_data(ci)) { 24062306a36Sopenharmony_ci ret = ceph_uninline_data(file); 24162306a36Sopenharmony_ci if (ret < 0) 24262306a36Sopenharmony_ci goto error; 24362306a36Sopenharmony_ci } 24462306a36Sopenharmony_ci 24562306a36Sopenharmony_ci return 0; 24662306a36Sopenharmony_ci 24762306a36Sopenharmony_cierror: 24862306a36Sopenharmony_ci ceph_fscache_unuse_cookie(inode, file->f_mode & FMODE_WRITE); 24962306a36Sopenharmony_ci ceph_put_fmode(ci, fi->fmode, 1); 25062306a36Sopenharmony_ci kmem_cache_free(ceph_file_cachep, fi); 25162306a36Sopenharmony_ci /* wake up anyone waiting for caps on this inode */ 25262306a36Sopenharmony_ci wake_up_all(&ci->i_cap_wq); 25362306a36Sopenharmony_ci return ret; 25462306a36Sopenharmony_ci} 25562306a36Sopenharmony_ci 25662306a36Sopenharmony_ci/* 25762306a36Sopenharmony_ci * initialize private struct file data. 25862306a36Sopenharmony_ci * if we fail, clean up by dropping fmode reference on the ceph_inode 25962306a36Sopenharmony_ci */ 26062306a36Sopenharmony_cistatic int ceph_init_file(struct inode *inode, struct file *file, int fmode) 26162306a36Sopenharmony_ci{ 26262306a36Sopenharmony_ci int ret = 0; 26362306a36Sopenharmony_ci 26462306a36Sopenharmony_ci switch (inode->i_mode & S_IFMT) { 26562306a36Sopenharmony_ci case S_IFREG: 26662306a36Sopenharmony_ci ceph_fscache_use_cookie(inode, file->f_mode & FMODE_WRITE); 26762306a36Sopenharmony_ci fallthrough; 26862306a36Sopenharmony_ci case S_IFDIR: 26962306a36Sopenharmony_ci ret = ceph_init_file_info(inode, file, fmode, 27062306a36Sopenharmony_ci S_ISDIR(inode->i_mode)); 27162306a36Sopenharmony_ci break; 27262306a36Sopenharmony_ci 27362306a36Sopenharmony_ci case S_IFLNK: 27462306a36Sopenharmony_ci dout("init_file %p %p 0%o (symlink)\n", inode, file, 27562306a36Sopenharmony_ci inode->i_mode); 27662306a36Sopenharmony_ci break; 27762306a36Sopenharmony_ci 27862306a36Sopenharmony_ci default: 27962306a36Sopenharmony_ci dout("init_file %p %p 0%o (special)\n", inode, file, 28062306a36Sopenharmony_ci inode->i_mode); 28162306a36Sopenharmony_ci /* 28262306a36Sopenharmony_ci * we need to drop the open ref now, since we don't 28362306a36Sopenharmony_ci * have .release set to ceph_release. 28462306a36Sopenharmony_ci */ 28562306a36Sopenharmony_ci BUG_ON(inode->i_fop->release == ceph_release); 28662306a36Sopenharmony_ci 28762306a36Sopenharmony_ci /* call the proper open fop */ 28862306a36Sopenharmony_ci ret = inode->i_fop->open(inode, file); 28962306a36Sopenharmony_ci } 29062306a36Sopenharmony_ci return ret; 29162306a36Sopenharmony_ci} 29262306a36Sopenharmony_ci 29362306a36Sopenharmony_ci/* 29462306a36Sopenharmony_ci * try renew caps after session gets killed. 29562306a36Sopenharmony_ci */ 29662306a36Sopenharmony_ciint ceph_renew_caps(struct inode *inode, int fmode) 29762306a36Sopenharmony_ci{ 29862306a36Sopenharmony_ci struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); 29962306a36Sopenharmony_ci struct ceph_inode_info *ci = ceph_inode(inode); 30062306a36Sopenharmony_ci struct ceph_mds_request *req; 30162306a36Sopenharmony_ci int err, flags, wanted; 30262306a36Sopenharmony_ci 30362306a36Sopenharmony_ci spin_lock(&ci->i_ceph_lock); 30462306a36Sopenharmony_ci __ceph_touch_fmode(ci, mdsc, fmode); 30562306a36Sopenharmony_ci wanted = __ceph_caps_file_wanted(ci); 30662306a36Sopenharmony_ci if (__ceph_is_any_real_caps(ci) && 30762306a36Sopenharmony_ci (!(wanted & CEPH_CAP_ANY_WR) || ci->i_auth_cap)) { 30862306a36Sopenharmony_ci int issued = __ceph_caps_issued(ci, NULL); 30962306a36Sopenharmony_ci spin_unlock(&ci->i_ceph_lock); 31062306a36Sopenharmony_ci dout("renew caps %p want %s issued %s updating mds_wanted\n", 31162306a36Sopenharmony_ci inode, ceph_cap_string(wanted), ceph_cap_string(issued)); 31262306a36Sopenharmony_ci ceph_check_caps(ci, 0); 31362306a36Sopenharmony_ci return 0; 31462306a36Sopenharmony_ci } 31562306a36Sopenharmony_ci spin_unlock(&ci->i_ceph_lock); 31662306a36Sopenharmony_ci 31762306a36Sopenharmony_ci flags = 0; 31862306a36Sopenharmony_ci if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR)) 31962306a36Sopenharmony_ci flags = O_RDWR; 32062306a36Sopenharmony_ci else if (wanted & CEPH_CAP_FILE_RD) 32162306a36Sopenharmony_ci flags = O_RDONLY; 32262306a36Sopenharmony_ci else if (wanted & CEPH_CAP_FILE_WR) 32362306a36Sopenharmony_ci flags = O_WRONLY; 32462306a36Sopenharmony_ci#ifdef O_LAZY 32562306a36Sopenharmony_ci if (wanted & CEPH_CAP_FILE_LAZYIO) 32662306a36Sopenharmony_ci flags |= O_LAZY; 32762306a36Sopenharmony_ci#endif 32862306a36Sopenharmony_ci 32962306a36Sopenharmony_ci req = prepare_open_request(inode->i_sb, flags, 0); 33062306a36Sopenharmony_ci if (IS_ERR(req)) { 33162306a36Sopenharmony_ci err = PTR_ERR(req); 33262306a36Sopenharmony_ci goto out; 33362306a36Sopenharmony_ci } 33462306a36Sopenharmony_ci 33562306a36Sopenharmony_ci req->r_inode = inode; 33662306a36Sopenharmony_ci ihold(inode); 33762306a36Sopenharmony_ci req->r_num_caps = 1; 33862306a36Sopenharmony_ci 33962306a36Sopenharmony_ci err = ceph_mdsc_do_request(mdsc, NULL, req); 34062306a36Sopenharmony_ci ceph_mdsc_put_request(req); 34162306a36Sopenharmony_ciout: 34262306a36Sopenharmony_ci dout("renew caps %p open result=%d\n", inode, err); 34362306a36Sopenharmony_ci return err < 0 ? err : 0; 34462306a36Sopenharmony_ci} 34562306a36Sopenharmony_ci 34662306a36Sopenharmony_ci/* 34762306a36Sopenharmony_ci * If we already have the requisite capabilities, we can satisfy 34862306a36Sopenharmony_ci * the open request locally (no need to request new caps from the 34962306a36Sopenharmony_ci * MDS). We do, however, need to inform the MDS (asynchronously) 35062306a36Sopenharmony_ci * if our wanted caps set expands. 35162306a36Sopenharmony_ci */ 35262306a36Sopenharmony_ciint ceph_open(struct inode *inode, struct file *file) 35362306a36Sopenharmony_ci{ 35462306a36Sopenharmony_ci struct ceph_inode_info *ci = ceph_inode(inode); 35562306a36Sopenharmony_ci struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb); 35662306a36Sopenharmony_ci struct ceph_mds_client *mdsc = fsc->mdsc; 35762306a36Sopenharmony_ci struct ceph_mds_request *req; 35862306a36Sopenharmony_ci struct ceph_file_info *fi = file->private_data; 35962306a36Sopenharmony_ci int err; 36062306a36Sopenharmony_ci int flags, fmode, wanted; 36162306a36Sopenharmony_ci 36262306a36Sopenharmony_ci if (fi) { 36362306a36Sopenharmony_ci dout("open file %p is already opened\n", file); 36462306a36Sopenharmony_ci return 0; 36562306a36Sopenharmony_ci } 36662306a36Sopenharmony_ci 36762306a36Sopenharmony_ci /* filter out O_CREAT|O_EXCL; vfs did that already. yuck. */ 36862306a36Sopenharmony_ci flags = file->f_flags & ~(O_CREAT|O_EXCL); 36962306a36Sopenharmony_ci if (S_ISDIR(inode->i_mode)) { 37062306a36Sopenharmony_ci flags = O_DIRECTORY; /* mds likes to know */ 37162306a36Sopenharmony_ci } else if (S_ISREG(inode->i_mode)) { 37262306a36Sopenharmony_ci err = fscrypt_file_open(inode, file); 37362306a36Sopenharmony_ci if (err) 37462306a36Sopenharmony_ci return err; 37562306a36Sopenharmony_ci } 37662306a36Sopenharmony_ci 37762306a36Sopenharmony_ci dout("open inode %p ino %llx.%llx file %p flags %d (%d)\n", inode, 37862306a36Sopenharmony_ci ceph_vinop(inode), file, flags, file->f_flags); 37962306a36Sopenharmony_ci fmode = ceph_flags_to_mode(flags); 38062306a36Sopenharmony_ci wanted = ceph_caps_for_mode(fmode); 38162306a36Sopenharmony_ci 38262306a36Sopenharmony_ci /* snapped files are read-only */ 38362306a36Sopenharmony_ci if (ceph_snap(inode) != CEPH_NOSNAP && (file->f_mode & FMODE_WRITE)) 38462306a36Sopenharmony_ci return -EROFS; 38562306a36Sopenharmony_ci 38662306a36Sopenharmony_ci /* trivially open snapdir */ 38762306a36Sopenharmony_ci if (ceph_snap(inode) == CEPH_SNAPDIR) { 38862306a36Sopenharmony_ci return ceph_init_file(inode, file, fmode); 38962306a36Sopenharmony_ci } 39062306a36Sopenharmony_ci 39162306a36Sopenharmony_ci /* 39262306a36Sopenharmony_ci * No need to block if we have caps on the auth MDS (for 39362306a36Sopenharmony_ci * write) or any MDS (for read). Update wanted set 39462306a36Sopenharmony_ci * asynchronously. 39562306a36Sopenharmony_ci */ 39662306a36Sopenharmony_ci spin_lock(&ci->i_ceph_lock); 39762306a36Sopenharmony_ci if (__ceph_is_any_real_caps(ci) && 39862306a36Sopenharmony_ci (((fmode & CEPH_FILE_MODE_WR) == 0) || ci->i_auth_cap)) { 39962306a36Sopenharmony_ci int mds_wanted = __ceph_caps_mds_wanted(ci, true); 40062306a36Sopenharmony_ci int issued = __ceph_caps_issued(ci, NULL); 40162306a36Sopenharmony_ci 40262306a36Sopenharmony_ci dout("open %p fmode %d want %s issued %s using existing\n", 40362306a36Sopenharmony_ci inode, fmode, ceph_cap_string(wanted), 40462306a36Sopenharmony_ci ceph_cap_string(issued)); 40562306a36Sopenharmony_ci __ceph_touch_fmode(ci, mdsc, fmode); 40662306a36Sopenharmony_ci spin_unlock(&ci->i_ceph_lock); 40762306a36Sopenharmony_ci 40862306a36Sopenharmony_ci /* adjust wanted? */ 40962306a36Sopenharmony_ci if ((issued & wanted) != wanted && 41062306a36Sopenharmony_ci (mds_wanted & wanted) != wanted && 41162306a36Sopenharmony_ci ceph_snap(inode) != CEPH_SNAPDIR) 41262306a36Sopenharmony_ci ceph_check_caps(ci, 0); 41362306a36Sopenharmony_ci 41462306a36Sopenharmony_ci return ceph_init_file(inode, file, fmode); 41562306a36Sopenharmony_ci } else if (ceph_snap(inode) != CEPH_NOSNAP && 41662306a36Sopenharmony_ci (ci->i_snap_caps & wanted) == wanted) { 41762306a36Sopenharmony_ci __ceph_touch_fmode(ci, mdsc, fmode); 41862306a36Sopenharmony_ci spin_unlock(&ci->i_ceph_lock); 41962306a36Sopenharmony_ci return ceph_init_file(inode, file, fmode); 42062306a36Sopenharmony_ci } 42162306a36Sopenharmony_ci 42262306a36Sopenharmony_ci spin_unlock(&ci->i_ceph_lock); 42362306a36Sopenharmony_ci 42462306a36Sopenharmony_ci dout("open fmode %d wants %s\n", fmode, ceph_cap_string(wanted)); 42562306a36Sopenharmony_ci req = prepare_open_request(inode->i_sb, flags, 0); 42662306a36Sopenharmony_ci if (IS_ERR(req)) { 42762306a36Sopenharmony_ci err = PTR_ERR(req); 42862306a36Sopenharmony_ci goto out; 42962306a36Sopenharmony_ci } 43062306a36Sopenharmony_ci req->r_inode = inode; 43162306a36Sopenharmony_ci ihold(inode); 43262306a36Sopenharmony_ci 43362306a36Sopenharmony_ci req->r_num_caps = 1; 43462306a36Sopenharmony_ci err = ceph_mdsc_do_request(mdsc, NULL, req); 43562306a36Sopenharmony_ci if (!err) 43662306a36Sopenharmony_ci err = ceph_init_file(inode, file, req->r_fmode); 43762306a36Sopenharmony_ci ceph_mdsc_put_request(req); 43862306a36Sopenharmony_ci dout("open result=%d on %llx.%llx\n", err, ceph_vinop(inode)); 43962306a36Sopenharmony_ciout: 44062306a36Sopenharmony_ci return err; 44162306a36Sopenharmony_ci} 44262306a36Sopenharmony_ci 44362306a36Sopenharmony_ci/* Clone the layout from a synchronous create, if the dir now has Dc caps */ 44462306a36Sopenharmony_cistatic void 44562306a36Sopenharmony_cicache_file_layout(struct inode *dst, struct inode *src) 44662306a36Sopenharmony_ci{ 44762306a36Sopenharmony_ci struct ceph_inode_info *cdst = ceph_inode(dst); 44862306a36Sopenharmony_ci struct ceph_inode_info *csrc = ceph_inode(src); 44962306a36Sopenharmony_ci 45062306a36Sopenharmony_ci spin_lock(&cdst->i_ceph_lock); 45162306a36Sopenharmony_ci if ((__ceph_caps_issued(cdst, NULL) & CEPH_CAP_DIR_CREATE) && 45262306a36Sopenharmony_ci !ceph_file_layout_is_valid(&cdst->i_cached_layout)) { 45362306a36Sopenharmony_ci memcpy(&cdst->i_cached_layout, &csrc->i_layout, 45462306a36Sopenharmony_ci sizeof(cdst->i_cached_layout)); 45562306a36Sopenharmony_ci rcu_assign_pointer(cdst->i_cached_layout.pool_ns, 45662306a36Sopenharmony_ci ceph_try_get_string(csrc->i_layout.pool_ns)); 45762306a36Sopenharmony_ci } 45862306a36Sopenharmony_ci spin_unlock(&cdst->i_ceph_lock); 45962306a36Sopenharmony_ci} 46062306a36Sopenharmony_ci 46162306a36Sopenharmony_ci/* 46262306a36Sopenharmony_ci * Try to set up an async create. We need caps, a file layout, and inode number, 46362306a36Sopenharmony_ci * and either a lease on the dentry or complete dir info. If any of those 46462306a36Sopenharmony_ci * criteria are not satisfied, then return false and the caller can go 46562306a36Sopenharmony_ci * synchronous. 46662306a36Sopenharmony_ci */ 46762306a36Sopenharmony_cistatic int try_prep_async_create(struct inode *dir, struct dentry *dentry, 46862306a36Sopenharmony_ci struct ceph_file_layout *lo, u64 *pino) 46962306a36Sopenharmony_ci{ 47062306a36Sopenharmony_ci struct ceph_inode_info *ci = ceph_inode(dir); 47162306a36Sopenharmony_ci struct ceph_dentry_info *di = ceph_dentry(dentry); 47262306a36Sopenharmony_ci int got = 0, want = CEPH_CAP_FILE_EXCL | CEPH_CAP_DIR_CREATE; 47362306a36Sopenharmony_ci u64 ino; 47462306a36Sopenharmony_ci 47562306a36Sopenharmony_ci spin_lock(&ci->i_ceph_lock); 47662306a36Sopenharmony_ci /* No auth cap means no chance for Dc caps */ 47762306a36Sopenharmony_ci if (!ci->i_auth_cap) 47862306a36Sopenharmony_ci goto no_async; 47962306a36Sopenharmony_ci 48062306a36Sopenharmony_ci /* Any delegated inos? */ 48162306a36Sopenharmony_ci if (xa_empty(&ci->i_auth_cap->session->s_delegated_inos)) 48262306a36Sopenharmony_ci goto no_async; 48362306a36Sopenharmony_ci 48462306a36Sopenharmony_ci if (!ceph_file_layout_is_valid(&ci->i_cached_layout)) 48562306a36Sopenharmony_ci goto no_async; 48662306a36Sopenharmony_ci 48762306a36Sopenharmony_ci if ((__ceph_caps_issued(ci, NULL) & want) != want) 48862306a36Sopenharmony_ci goto no_async; 48962306a36Sopenharmony_ci 49062306a36Sopenharmony_ci if (d_in_lookup(dentry)) { 49162306a36Sopenharmony_ci if (!__ceph_dir_is_complete(ci)) 49262306a36Sopenharmony_ci goto no_async; 49362306a36Sopenharmony_ci spin_lock(&dentry->d_lock); 49462306a36Sopenharmony_ci di->lease_shared_gen = atomic_read(&ci->i_shared_gen); 49562306a36Sopenharmony_ci spin_unlock(&dentry->d_lock); 49662306a36Sopenharmony_ci } else if (atomic_read(&ci->i_shared_gen) != 49762306a36Sopenharmony_ci READ_ONCE(di->lease_shared_gen)) { 49862306a36Sopenharmony_ci goto no_async; 49962306a36Sopenharmony_ci } 50062306a36Sopenharmony_ci 50162306a36Sopenharmony_ci ino = ceph_get_deleg_ino(ci->i_auth_cap->session); 50262306a36Sopenharmony_ci if (!ino) 50362306a36Sopenharmony_ci goto no_async; 50462306a36Sopenharmony_ci 50562306a36Sopenharmony_ci *pino = ino; 50662306a36Sopenharmony_ci ceph_take_cap_refs(ci, want, false); 50762306a36Sopenharmony_ci memcpy(lo, &ci->i_cached_layout, sizeof(*lo)); 50862306a36Sopenharmony_ci rcu_assign_pointer(lo->pool_ns, 50962306a36Sopenharmony_ci ceph_try_get_string(ci->i_cached_layout.pool_ns)); 51062306a36Sopenharmony_ci got = want; 51162306a36Sopenharmony_cino_async: 51262306a36Sopenharmony_ci spin_unlock(&ci->i_ceph_lock); 51362306a36Sopenharmony_ci return got; 51462306a36Sopenharmony_ci} 51562306a36Sopenharmony_ci 51662306a36Sopenharmony_cistatic void restore_deleg_ino(struct inode *dir, u64 ino) 51762306a36Sopenharmony_ci{ 51862306a36Sopenharmony_ci struct ceph_inode_info *ci = ceph_inode(dir); 51962306a36Sopenharmony_ci struct ceph_mds_session *s = NULL; 52062306a36Sopenharmony_ci 52162306a36Sopenharmony_ci spin_lock(&ci->i_ceph_lock); 52262306a36Sopenharmony_ci if (ci->i_auth_cap) 52362306a36Sopenharmony_ci s = ceph_get_mds_session(ci->i_auth_cap->session); 52462306a36Sopenharmony_ci spin_unlock(&ci->i_ceph_lock); 52562306a36Sopenharmony_ci if (s) { 52662306a36Sopenharmony_ci int err = ceph_restore_deleg_ino(s, ino); 52762306a36Sopenharmony_ci if (err) 52862306a36Sopenharmony_ci pr_warn("ceph: unable to restore delegated ino 0x%llx to session: %d\n", 52962306a36Sopenharmony_ci ino, err); 53062306a36Sopenharmony_ci ceph_put_mds_session(s); 53162306a36Sopenharmony_ci } 53262306a36Sopenharmony_ci} 53362306a36Sopenharmony_ci 53462306a36Sopenharmony_cistatic void wake_async_create_waiters(struct inode *inode, 53562306a36Sopenharmony_ci struct ceph_mds_session *session) 53662306a36Sopenharmony_ci{ 53762306a36Sopenharmony_ci struct ceph_inode_info *ci = ceph_inode(inode); 53862306a36Sopenharmony_ci bool check_cap = false; 53962306a36Sopenharmony_ci 54062306a36Sopenharmony_ci spin_lock(&ci->i_ceph_lock); 54162306a36Sopenharmony_ci if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) { 54262306a36Sopenharmony_ci ci->i_ceph_flags &= ~CEPH_I_ASYNC_CREATE; 54362306a36Sopenharmony_ci wake_up_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT); 54462306a36Sopenharmony_ci 54562306a36Sopenharmony_ci if (ci->i_ceph_flags & CEPH_I_ASYNC_CHECK_CAPS) { 54662306a36Sopenharmony_ci ci->i_ceph_flags &= ~CEPH_I_ASYNC_CHECK_CAPS; 54762306a36Sopenharmony_ci check_cap = true; 54862306a36Sopenharmony_ci } 54962306a36Sopenharmony_ci } 55062306a36Sopenharmony_ci ceph_kick_flushing_inode_caps(session, ci); 55162306a36Sopenharmony_ci spin_unlock(&ci->i_ceph_lock); 55262306a36Sopenharmony_ci 55362306a36Sopenharmony_ci if (check_cap) 55462306a36Sopenharmony_ci ceph_check_caps(ci, CHECK_CAPS_FLUSH); 55562306a36Sopenharmony_ci} 55662306a36Sopenharmony_ci 55762306a36Sopenharmony_cistatic void ceph_async_create_cb(struct ceph_mds_client *mdsc, 55862306a36Sopenharmony_ci struct ceph_mds_request *req) 55962306a36Sopenharmony_ci{ 56062306a36Sopenharmony_ci struct dentry *dentry = req->r_dentry; 56162306a36Sopenharmony_ci struct inode *dinode = d_inode(dentry); 56262306a36Sopenharmony_ci struct inode *tinode = req->r_target_inode; 56362306a36Sopenharmony_ci int result = req->r_err ? req->r_err : 56462306a36Sopenharmony_ci le32_to_cpu(req->r_reply_info.head->result); 56562306a36Sopenharmony_ci 56662306a36Sopenharmony_ci WARN_ON_ONCE(dinode && tinode && dinode != tinode); 56762306a36Sopenharmony_ci 56862306a36Sopenharmony_ci /* MDS changed -- caller must resubmit */ 56962306a36Sopenharmony_ci if (result == -EJUKEBOX) 57062306a36Sopenharmony_ci goto out; 57162306a36Sopenharmony_ci 57262306a36Sopenharmony_ci mapping_set_error(req->r_parent->i_mapping, result); 57362306a36Sopenharmony_ci 57462306a36Sopenharmony_ci if (result) { 57562306a36Sopenharmony_ci int pathlen = 0; 57662306a36Sopenharmony_ci u64 base = 0; 57762306a36Sopenharmony_ci char *path = ceph_mdsc_build_path(req->r_dentry, &pathlen, 57862306a36Sopenharmony_ci &base, 0); 57962306a36Sopenharmony_ci 58062306a36Sopenharmony_ci pr_warn("async create failure path=(%llx)%s result=%d!\n", 58162306a36Sopenharmony_ci base, IS_ERR(path) ? "<<bad>>" : path, result); 58262306a36Sopenharmony_ci ceph_mdsc_free_path(path, pathlen); 58362306a36Sopenharmony_ci 58462306a36Sopenharmony_ci ceph_dir_clear_complete(req->r_parent); 58562306a36Sopenharmony_ci if (!d_unhashed(dentry)) 58662306a36Sopenharmony_ci d_drop(dentry); 58762306a36Sopenharmony_ci 58862306a36Sopenharmony_ci if (dinode) { 58962306a36Sopenharmony_ci mapping_set_error(dinode->i_mapping, result); 59062306a36Sopenharmony_ci ceph_inode_shutdown(dinode); 59162306a36Sopenharmony_ci wake_async_create_waiters(dinode, req->r_session); 59262306a36Sopenharmony_ci } 59362306a36Sopenharmony_ci } 59462306a36Sopenharmony_ci 59562306a36Sopenharmony_ci if (tinode) { 59662306a36Sopenharmony_ci u64 ino = ceph_vino(tinode).ino; 59762306a36Sopenharmony_ci 59862306a36Sopenharmony_ci if (req->r_deleg_ino != ino) 59962306a36Sopenharmony_ci pr_warn("%s: inode number mismatch! err=%d deleg_ino=0x%llx target=0x%llx\n", 60062306a36Sopenharmony_ci __func__, req->r_err, req->r_deleg_ino, ino); 60162306a36Sopenharmony_ci 60262306a36Sopenharmony_ci mapping_set_error(tinode->i_mapping, result); 60362306a36Sopenharmony_ci wake_async_create_waiters(tinode, req->r_session); 60462306a36Sopenharmony_ci } else if (!result) { 60562306a36Sopenharmony_ci pr_warn("%s: no req->r_target_inode for 0x%llx\n", __func__, 60662306a36Sopenharmony_ci req->r_deleg_ino); 60762306a36Sopenharmony_ci } 60862306a36Sopenharmony_ciout: 60962306a36Sopenharmony_ci ceph_mdsc_release_dir_caps(req); 61062306a36Sopenharmony_ci} 61162306a36Sopenharmony_ci 61262306a36Sopenharmony_cistatic int ceph_finish_async_create(struct inode *dir, struct inode *inode, 61362306a36Sopenharmony_ci struct dentry *dentry, 61462306a36Sopenharmony_ci struct file *file, umode_t mode, 61562306a36Sopenharmony_ci struct ceph_mds_request *req, 61662306a36Sopenharmony_ci struct ceph_acl_sec_ctx *as_ctx, 61762306a36Sopenharmony_ci struct ceph_file_layout *lo) 61862306a36Sopenharmony_ci{ 61962306a36Sopenharmony_ci int ret; 62062306a36Sopenharmony_ci char xattr_buf[4]; 62162306a36Sopenharmony_ci struct ceph_mds_reply_inode in = { }; 62262306a36Sopenharmony_ci struct ceph_mds_reply_info_in iinfo = { .in = &in }; 62362306a36Sopenharmony_ci struct ceph_inode_info *ci = ceph_inode(dir); 62462306a36Sopenharmony_ci struct ceph_dentry_info *di = ceph_dentry(dentry); 62562306a36Sopenharmony_ci struct timespec64 now; 62662306a36Sopenharmony_ci struct ceph_string *pool_ns; 62762306a36Sopenharmony_ci struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb); 62862306a36Sopenharmony_ci struct ceph_vino vino = { .ino = req->r_deleg_ino, 62962306a36Sopenharmony_ci .snap = CEPH_NOSNAP }; 63062306a36Sopenharmony_ci 63162306a36Sopenharmony_ci ktime_get_real_ts64(&now); 63262306a36Sopenharmony_ci 63362306a36Sopenharmony_ci iinfo.inline_version = CEPH_INLINE_NONE; 63462306a36Sopenharmony_ci iinfo.change_attr = 1; 63562306a36Sopenharmony_ci ceph_encode_timespec64(&iinfo.btime, &now); 63662306a36Sopenharmony_ci 63762306a36Sopenharmony_ci if (req->r_pagelist) { 63862306a36Sopenharmony_ci iinfo.xattr_len = req->r_pagelist->length; 63962306a36Sopenharmony_ci iinfo.xattr_data = req->r_pagelist->mapped_tail; 64062306a36Sopenharmony_ci } else { 64162306a36Sopenharmony_ci /* fake it */ 64262306a36Sopenharmony_ci iinfo.xattr_len = ARRAY_SIZE(xattr_buf); 64362306a36Sopenharmony_ci iinfo.xattr_data = xattr_buf; 64462306a36Sopenharmony_ci memset(iinfo.xattr_data, 0, iinfo.xattr_len); 64562306a36Sopenharmony_ci } 64662306a36Sopenharmony_ci 64762306a36Sopenharmony_ci in.ino = cpu_to_le64(vino.ino); 64862306a36Sopenharmony_ci in.snapid = cpu_to_le64(CEPH_NOSNAP); 64962306a36Sopenharmony_ci in.version = cpu_to_le64(1); // ??? 65062306a36Sopenharmony_ci in.cap.caps = in.cap.wanted = cpu_to_le32(CEPH_CAP_ALL_FILE); 65162306a36Sopenharmony_ci in.cap.cap_id = cpu_to_le64(1); 65262306a36Sopenharmony_ci in.cap.realm = cpu_to_le64(ci->i_snap_realm->ino); 65362306a36Sopenharmony_ci in.cap.flags = CEPH_CAP_FLAG_AUTH; 65462306a36Sopenharmony_ci in.ctime = in.mtime = in.atime = iinfo.btime; 65562306a36Sopenharmony_ci in.truncate_seq = cpu_to_le32(1); 65662306a36Sopenharmony_ci in.truncate_size = cpu_to_le64(-1ULL); 65762306a36Sopenharmony_ci in.xattr_version = cpu_to_le64(1); 65862306a36Sopenharmony_ci in.uid = cpu_to_le32(from_kuid(&init_user_ns, current_fsuid())); 65962306a36Sopenharmony_ci if (dir->i_mode & S_ISGID) { 66062306a36Sopenharmony_ci in.gid = cpu_to_le32(from_kgid(&init_user_ns, dir->i_gid)); 66162306a36Sopenharmony_ci 66262306a36Sopenharmony_ci /* Directories always inherit the setgid bit. */ 66362306a36Sopenharmony_ci if (S_ISDIR(mode)) 66462306a36Sopenharmony_ci mode |= S_ISGID; 66562306a36Sopenharmony_ci } else { 66662306a36Sopenharmony_ci in.gid = cpu_to_le32(from_kgid(&init_user_ns, current_fsgid())); 66762306a36Sopenharmony_ci } 66862306a36Sopenharmony_ci in.mode = cpu_to_le32((u32)mode); 66962306a36Sopenharmony_ci 67062306a36Sopenharmony_ci in.nlink = cpu_to_le32(1); 67162306a36Sopenharmony_ci in.max_size = cpu_to_le64(lo->stripe_unit); 67262306a36Sopenharmony_ci 67362306a36Sopenharmony_ci ceph_file_layout_to_legacy(lo, &in.layout); 67462306a36Sopenharmony_ci /* lo is private, so pool_ns can't change */ 67562306a36Sopenharmony_ci pool_ns = rcu_dereference_raw(lo->pool_ns); 67662306a36Sopenharmony_ci if (pool_ns) { 67762306a36Sopenharmony_ci iinfo.pool_ns_len = pool_ns->len; 67862306a36Sopenharmony_ci iinfo.pool_ns_data = pool_ns->str; 67962306a36Sopenharmony_ci } 68062306a36Sopenharmony_ci 68162306a36Sopenharmony_ci down_read(&mdsc->snap_rwsem); 68262306a36Sopenharmony_ci ret = ceph_fill_inode(inode, NULL, &iinfo, NULL, req->r_session, 68362306a36Sopenharmony_ci req->r_fmode, NULL); 68462306a36Sopenharmony_ci up_read(&mdsc->snap_rwsem); 68562306a36Sopenharmony_ci if (ret) { 68662306a36Sopenharmony_ci dout("%s failed to fill inode: %d\n", __func__, ret); 68762306a36Sopenharmony_ci ceph_dir_clear_complete(dir); 68862306a36Sopenharmony_ci if (!d_unhashed(dentry)) 68962306a36Sopenharmony_ci d_drop(dentry); 69062306a36Sopenharmony_ci discard_new_inode(inode); 69162306a36Sopenharmony_ci } else { 69262306a36Sopenharmony_ci struct dentry *dn; 69362306a36Sopenharmony_ci 69462306a36Sopenharmony_ci dout("%s d_adding new inode 0x%llx to 0x%llx/%s\n", __func__, 69562306a36Sopenharmony_ci vino.ino, ceph_ino(dir), dentry->d_name.name); 69662306a36Sopenharmony_ci ceph_dir_clear_ordered(dir); 69762306a36Sopenharmony_ci ceph_init_inode_acls(inode, as_ctx); 69862306a36Sopenharmony_ci if (inode->i_state & I_NEW) { 69962306a36Sopenharmony_ci /* 70062306a36Sopenharmony_ci * If it's not I_NEW, then someone created this before 70162306a36Sopenharmony_ci * we got here. Assume the server is aware of it at 70262306a36Sopenharmony_ci * that point and don't worry about setting 70362306a36Sopenharmony_ci * CEPH_I_ASYNC_CREATE. 70462306a36Sopenharmony_ci */ 70562306a36Sopenharmony_ci ceph_inode(inode)->i_ceph_flags = CEPH_I_ASYNC_CREATE; 70662306a36Sopenharmony_ci unlock_new_inode(inode); 70762306a36Sopenharmony_ci } 70862306a36Sopenharmony_ci if (d_in_lookup(dentry) || d_really_is_negative(dentry)) { 70962306a36Sopenharmony_ci if (!d_unhashed(dentry)) 71062306a36Sopenharmony_ci d_drop(dentry); 71162306a36Sopenharmony_ci dn = d_splice_alias(inode, dentry); 71262306a36Sopenharmony_ci WARN_ON_ONCE(dn && dn != dentry); 71362306a36Sopenharmony_ci } 71462306a36Sopenharmony_ci file->f_mode |= FMODE_CREATED; 71562306a36Sopenharmony_ci ret = finish_open(file, dentry, ceph_open); 71662306a36Sopenharmony_ci } 71762306a36Sopenharmony_ci 71862306a36Sopenharmony_ci spin_lock(&dentry->d_lock); 71962306a36Sopenharmony_ci di->flags &= ~CEPH_DENTRY_ASYNC_CREATE; 72062306a36Sopenharmony_ci wake_up_bit(&di->flags, CEPH_DENTRY_ASYNC_CREATE_BIT); 72162306a36Sopenharmony_ci spin_unlock(&dentry->d_lock); 72262306a36Sopenharmony_ci 72362306a36Sopenharmony_ci return ret; 72462306a36Sopenharmony_ci} 72562306a36Sopenharmony_ci 72662306a36Sopenharmony_ci/* 72762306a36Sopenharmony_ci * Do a lookup + open with a single request. If we get a non-existent 72862306a36Sopenharmony_ci * file or symlink, return 1 so the VFS can retry. 72962306a36Sopenharmony_ci */ 73062306a36Sopenharmony_ciint ceph_atomic_open(struct inode *dir, struct dentry *dentry, 73162306a36Sopenharmony_ci struct file *file, unsigned flags, umode_t mode) 73262306a36Sopenharmony_ci{ 73362306a36Sopenharmony_ci struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); 73462306a36Sopenharmony_ci struct ceph_mds_client *mdsc = fsc->mdsc; 73562306a36Sopenharmony_ci struct ceph_mds_request *req; 73662306a36Sopenharmony_ci struct inode *new_inode = NULL; 73762306a36Sopenharmony_ci struct dentry *dn; 73862306a36Sopenharmony_ci struct ceph_acl_sec_ctx as_ctx = {}; 73962306a36Sopenharmony_ci bool try_async = ceph_test_mount_opt(fsc, ASYNC_DIROPS); 74062306a36Sopenharmony_ci int mask; 74162306a36Sopenharmony_ci int err; 74262306a36Sopenharmony_ci 74362306a36Sopenharmony_ci dout("atomic_open %p dentry %p '%pd' %s flags %d mode 0%o\n", 74462306a36Sopenharmony_ci dir, dentry, dentry, 74562306a36Sopenharmony_ci d_unhashed(dentry) ? "unhashed" : "hashed", flags, mode); 74662306a36Sopenharmony_ci 74762306a36Sopenharmony_ci if (dentry->d_name.len > NAME_MAX) 74862306a36Sopenharmony_ci return -ENAMETOOLONG; 74962306a36Sopenharmony_ci 75062306a36Sopenharmony_ci err = ceph_wait_on_conflict_unlink(dentry); 75162306a36Sopenharmony_ci if (err) 75262306a36Sopenharmony_ci return err; 75362306a36Sopenharmony_ci /* 75462306a36Sopenharmony_ci * Do not truncate the file, since atomic_open is called before the 75562306a36Sopenharmony_ci * permission check. The caller will do the truncation afterward. 75662306a36Sopenharmony_ci */ 75762306a36Sopenharmony_ci flags &= ~O_TRUNC; 75862306a36Sopenharmony_ci 75962306a36Sopenharmony_ciretry: 76062306a36Sopenharmony_ci if (flags & O_CREAT) { 76162306a36Sopenharmony_ci if (ceph_quota_is_max_files_exceeded(dir)) 76262306a36Sopenharmony_ci return -EDQUOT; 76362306a36Sopenharmony_ci 76462306a36Sopenharmony_ci new_inode = ceph_new_inode(dir, dentry, &mode, &as_ctx); 76562306a36Sopenharmony_ci if (IS_ERR(new_inode)) { 76662306a36Sopenharmony_ci err = PTR_ERR(new_inode); 76762306a36Sopenharmony_ci goto out_ctx; 76862306a36Sopenharmony_ci } 76962306a36Sopenharmony_ci /* Async create can't handle more than a page of xattrs */ 77062306a36Sopenharmony_ci if (as_ctx.pagelist && 77162306a36Sopenharmony_ci !list_is_singular(&as_ctx.pagelist->head)) 77262306a36Sopenharmony_ci try_async = false; 77362306a36Sopenharmony_ci } else if (!d_in_lookup(dentry)) { 77462306a36Sopenharmony_ci /* If it's not being looked up, it's negative */ 77562306a36Sopenharmony_ci return -ENOENT; 77662306a36Sopenharmony_ci } 77762306a36Sopenharmony_ci 77862306a36Sopenharmony_ci /* do the open */ 77962306a36Sopenharmony_ci req = prepare_open_request(dir->i_sb, flags, mode); 78062306a36Sopenharmony_ci if (IS_ERR(req)) { 78162306a36Sopenharmony_ci err = PTR_ERR(req); 78262306a36Sopenharmony_ci goto out_ctx; 78362306a36Sopenharmony_ci } 78462306a36Sopenharmony_ci req->r_dentry = dget(dentry); 78562306a36Sopenharmony_ci req->r_num_caps = 2; 78662306a36Sopenharmony_ci mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED; 78762306a36Sopenharmony_ci if (ceph_security_xattr_wanted(dir)) 78862306a36Sopenharmony_ci mask |= CEPH_CAP_XATTR_SHARED; 78962306a36Sopenharmony_ci req->r_args.open.mask = cpu_to_le32(mask); 79062306a36Sopenharmony_ci req->r_parent = dir; 79162306a36Sopenharmony_ci ihold(dir); 79262306a36Sopenharmony_ci if (IS_ENCRYPTED(dir)) { 79362306a36Sopenharmony_ci set_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags); 79462306a36Sopenharmony_ci err = fscrypt_prepare_lookup_partial(dir, dentry); 79562306a36Sopenharmony_ci if (err < 0) 79662306a36Sopenharmony_ci goto out_req; 79762306a36Sopenharmony_ci } 79862306a36Sopenharmony_ci 79962306a36Sopenharmony_ci if (flags & O_CREAT) { 80062306a36Sopenharmony_ci struct ceph_file_layout lo; 80162306a36Sopenharmony_ci 80262306a36Sopenharmony_ci req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL | 80362306a36Sopenharmony_ci CEPH_CAP_XATTR_EXCL; 80462306a36Sopenharmony_ci req->r_dentry_unless = CEPH_CAP_FILE_EXCL; 80562306a36Sopenharmony_ci 80662306a36Sopenharmony_ci ceph_as_ctx_to_req(req, &as_ctx); 80762306a36Sopenharmony_ci 80862306a36Sopenharmony_ci if (try_async && (req->r_dir_caps = 80962306a36Sopenharmony_ci try_prep_async_create(dir, dentry, &lo, 81062306a36Sopenharmony_ci &req->r_deleg_ino))) { 81162306a36Sopenharmony_ci struct ceph_vino vino = { .ino = req->r_deleg_ino, 81262306a36Sopenharmony_ci .snap = CEPH_NOSNAP }; 81362306a36Sopenharmony_ci struct ceph_dentry_info *di = ceph_dentry(dentry); 81462306a36Sopenharmony_ci 81562306a36Sopenharmony_ci set_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags); 81662306a36Sopenharmony_ci req->r_args.open.flags |= cpu_to_le32(CEPH_O_EXCL); 81762306a36Sopenharmony_ci req->r_callback = ceph_async_create_cb; 81862306a36Sopenharmony_ci 81962306a36Sopenharmony_ci /* Hash inode before RPC */ 82062306a36Sopenharmony_ci new_inode = ceph_get_inode(dir->i_sb, vino, new_inode); 82162306a36Sopenharmony_ci if (IS_ERR(new_inode)) { 82262306a36Sopenharmony_ci err = PTR_ERR(new_inode); 82362306a36Sopenharmony_ci new_inode = NULL; 82462306a36Sopenharmony_ci goto out_req; 82562306a36Sopenharmony_ci } 82662306a36Sopenharmony_ci WARN_ON_ONCE(!(new_inode->i_state & I_NEW)); 82762306a36Sopenharmony_ci 82862306a36Sopenharmony_ci spin_lock(&dentry->d_lock); 82962306a36Sopenharmony_ci di->flags |= CEPH_DENTRY_ASYNC_CREATE; 83062306a36Sopenharmony_ci spin_unlock(&dentry->d_lock); 83162306a36Sopenharmony_ci 83262306a36Sopenharmony_ci err = ceph_mdsc_submit_request(mdsc, dir, req); 83362306a36Sopenharmony_ci if (!err) { 83462306a36Sopenharmony_ci err = ceph_finish_async_create(dir, new_inode, 83562306a36Sopenharmony_ci dentry, file, 83662306a36Sopenharmony_ci mode, req, 83762306a36Sopenharmony_ci &as_ctx, &lo); 83862306a36Sopenharmony_ci new_inode = NULL; 83962306a36Sopenharmony_ci } else if (err == -EJUKEBOX) { 84062306a36Sopenharmony_ci restore_deleg_ino(dir, req->r_deleg_ino); 84162306a36Sopenharmony_ci ceph_mdsc_put_request(req); 84262306a36Sopenharmony_ci discard_new_inode(new_inode); 84362306a36Sopenharmony_ci ceph_release_acl_sec_ctx(&as_ctx); 84462306a36Sopenharmony_ci memset(&as_ctx, 0, sizeof(as_ctx)); 84562306a36Sopenharmony_ci new_inode = NULL; 84662306a36Sopenharmony_ci try_async = false; 84762306a36Sopenharmony_ci ceph_put_string(rcu_dereference_raw(lo.pool_ns)); 84862306a36Sopenharmony_ci goto retry; 84962306a36Sopenharmony_ci } 85062306a36Sopenharmony_ci ceph_put_string(rcu_dereference_raw(lo.pool_ns)); 85162306a36Sopenharmony_ci goto out_req; 85262306a36Sopenharmony_ci } 85362306a36Sopenharmony_ci } 85462306a36Sopenharmony_ci 85562306a36Sopenharmony_ci set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); 85662306a36Sopenharmony_ci req->r_new_inode = new_inode; 85762306a36Sopenharmony_ci new_inode = NULL; 85862306a36Sopenharmony_ci err = ceph_mdsc_do_request(mdsc, (flags & O_CREAT) ? dir : NULL, req); 85962306a36Sopenharmony_ci if (err == -ENOENT) { 86062306a36Sopenharmony_ci dentry = ceph_handle_snapdir(req, dentry); 86162306a36Sopenharmony_ci if (IS_ERR(dentry)) { 86262306a36Sopenharmony_ci err = PTR_ERR(dentry); 86362306a36Sopenharmony_ci goto out_req; 86462306a36Sopenharmony_ci } 86562306a36Sopenharmony_ci err = 0; 86662306a36Sopenharmony_ci } 86762306a36Sopenharmony_ci 86862306a36Sopenharmony_ci if (!err && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry) 86962306a36Sopenharmony_ci err = ceph_handle_notrace_create(dir, dentry); 87062306a36Sopenharmony_ci 87162306a36Sopenharmony_ci if (d_in_lookup(dentry)) { 87262306a36Sopenharmony_ci dn = ceph_finish_lookup(req, dentry, err); 87362306a36Sopenharmony_ci if (IS_ERR(dn)) 87462306a36Sopenharmony_ci err = PTR_ERR(dn); 87562306a36Sopenharmony_ci } else { 87662306a36Sopenharmony_ci /* we were given a hashed negative dentry */ 87762306a36Sopenharmony_ci dn = NULL; 87862306a36Sopenharmony_ci } 87962306a36Sopenharmony_ci if (err) 88062306a36Sopenharmony_ci goto out_req; 88162306a36Sopenharmony_ci if (dn || d_really_is_negative(dentry) || d_is_symlink(dentry)) { 88262306a36Sopenharmony_ci /* make vfs retry on splice, ENOENT, or symlink */ 88362306a36Sopenharmony_ci dout("atomic_open finish_no_open on dn %p\n", dn); 88462306a36Sopenharmony_ci err = finish_no_open(file, dn); 88562306a36Sopenharmony_ci } else { 88662306a36Sopenharmony_ci if (IS_ENCRYPTED(dir) && 88762306a36Sopenharmony_ci !fscrypt_has_permitted_context(dir, d_inode(dentry))) { 88862306a36Sopenharmony_ci pr_warn("Inconsistent encryption context (parent %llx:%llx child %llx:%llx)\n", 88962306a36Sopenharmony_ci ceph_vinop(dir), ceph_vinop(d_inode(dentry))); 89062306a36Sopenharmony_ci goto out_req; 89162306a36Sopenharmony_ci } 89262306a36Sopenharmony_ci 89362306a36Sopenharmony_ci dout("atomic_open finish_open on dn %p\n", dn); 89462306a36Sopenharmony_ci if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) { 89562306a36Sopenharmony_ci struct inode *newino = d_inode(dentry); 89662306a36Sopenharmony_ci 89762306a36Sopenharmony_ci cache_file_layout(dir, newino); 89862306a36Sopenharmony_ci ceph_init_inode_acls(newino, &as_ctx); 89962306a36Sopenharmony_ci file->f_mode |= FMODE_CREATED; 90062306a36Sopenharmony_ci } 90162306a36Sopenharmony_ci err = finish_open(file, dentry, ceph_open); 90262306a36Sopenharmony_ci } 90362306a36Sopenharmony_ciout_req: 90462306a36Sopenharmony_ci ceph_mdsc_put_request(req); 90562306a36Sopenharmony_ci iput(new_inode); 90662306a36Sopenharmony_ciout_ctx: 90762306a36Sopenharmony_ci ceph_release_acl_sec_ctx(&as_ctx); 90862306a36Sopenharmony_ci dout("atomic_open result=%d\n", err); 90962306a36Sopenharmony_ci return err; 91062306a36Sopenharmony_ci} 91162306a36Sopenharmony_ci 91262306a36Sopenharmony_ciint ceph_release(struct inode *inode, struct file *file) 91362306a36Sopenharmony_ci{ 91462306a36Sopenharmony_ci struct ceph_inode_info *ci = ceph_inode(inode); 91562306a36Sopenharmony_ci 91662306a36Sopenharmony_ci if (S_ISDIR(inode->i_mode)) { 91762306a36Sopenharmony_ci struct ceph_dir_file_info *dfi = file->private_data; 91862306a36Sopenharmony_ci dout("release inode %p dir file %p\n", inode, file); 91962306a36Sopenharmony_ci WARN_ON(!list_empty(&dfi->file_info.rw_contexts)); 92062306a36Sopenharmony_ci 92162306a36Sopenharmony_ci ceph_put_fmode(ci, dfi->file_info.fmode, 1); 92262306a36Sopenharmony_ci 92362306a36Sopenharmony_ci if (dfi->last_readdir) 92462306a36Sopenharmony_ci ceph_mdsc_put_request(dfi->last_readdir); 92562306a36Sopenharmony_ci kfree(dfi->last_name); 92662306a36Sopenharmony_ci kfree(dfi->dir_info); 92762306a36Sopenharmony_ci kmem_cache_free(ceph_dir_file_cachep, dfi); 92862306a36Sopenharmony_ci } else { 92962306a36Sopenharmony_ci struct ceph_file_info *fi = file->private_data; 93062306a36Sopenharmony_ci dout("release inode %p regular file %p\n", inode, file); 93162306a36Sopenharmony_ci WARN_ON(!list_empty(&fi->rw_contexts)); 93262306a36Sopenharmony_ci 93362306a36Sopenharmony_ci ceph_fscache_unuse_cookie(inode, file->f_mode & FMODE_WRITE); 93462306a36Sopenharmony_ci ceph_put_fmode(ci, fi->fmode, 1); 93562306a36Sopenharmony_ci 93662306a36Sopenharmony_ci kmem_cache_free(ceph_file_cachep, fi); 93762306a36Sopenharmony_ci } 93862306a36Sopenharmony_ci 93962306a36Sopenharmony_ci /* wake up anyone waiting for caps on this inode */ 94062306a36Sopenharmony_ci wake_up_all(&ci->i_cap_wq); 94162306a36Sopenharmony_ci return 0; 94262306a36Sopenharmony_ci} 94362306a36Sopenharmony_ci 94462306a36Sopenharmony_cienum { 94562306a36Sopenharmony_ci HAVE_RETRIED = 1, 94662306a36Sopenharmony_ci CHECK_EOF = 2, 94762306a36Sopenharmony_ci READ_INLINE = 3, 94862306a36Sopenharmony_ci}; 94962306a36Sopenharmony_ci 95062306a36Sopenharmony_ci/* 95162306a36Sopenharmony_ci * Completely synchronous read and write methods. Direct from __user 95262306a36Sopenharmony_ci * buffer to osd, or directly to user pages (if O_DIRECT). 95362306a36Sopenharmony_ci * 95462306a36Sopenharmony_ci * If the read spans object boundary, just do multiple reads. (That's not 95562306a36Sopenharmony_ci * atomic, but good enough for now.) 95662306a36Sopenharmony_ci * 95762306a36Sopenharmony_ci * If we get a short result from the OSD, check against i_size; we need to 95862306a36Sopenharmony_ci * only return a short read to the caller if we hit EOF. 95962306a36Sopenharmony_ci */ 96062306a36Sopenharmony_cissize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos, 96162306a36Sopenharmony_ci struct iov_iter *to, int *retry_op, 96262306a36Sopenharmony_ci u64 *last_objver) 96362306a36Sopenharmony_ci{ 96462306a36Sopenharmony_ci struct ceph_inode_info *ci = ceph_inode(inode); 96562306a36Sopenharmony_ci struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 96662306a36Sopenharmony_ci struct ceph_osd_client *osdc = &fsc->client->osdc; 96762306a36Sopenharmony_ci ssize_t ret; 96862306a36Sopenharmony_ci u64 off = *ki_pos; 96962306a36Sopenharmony_ci u64 len = iov_iter_count(to); 97062306a36Sopenharmony_ci u64 i_size = i_size_read(inode); 97162306a36Sopenharmony_ci bool sparse = IS_ENCRYPTED(inode) || ceph_test_mount_opt(fsc, SPARSEREAD); 97262306a36Sopenharmony_ci u64 objver = 0; 97362306a36Sopenharmony_ci 97462306a36Sopenharmony_ci dout("sync_read on inode %p %llx~%llx\n", inode, *ki_pos, len); 97562306a36Sopenharmony_ci 97662306a36Sopenharmony_ci if (ceph_inode_is_shutdown(inode)) 97762306a36Sopenharmony_ci return -EIO; 97862306a36Sopenharmony_ci 97962306a36Sopenharmony_ci if (!len) 98062306a36Sopenharmony_ci return 0; 98162306a36Sopenharmony_ci /* 98262306a36Sopenharmony_ci * flush any page cache pages in this range. this 98362306a36Sopenharmony_ci * will make concurrent normal and sync io slow, 98462306a36Sopenharmony_ci * but it will at least behave sensibly when they are 98562306a36Sopenharmony_ci * in sequence. 98662306a36Sopenharmony_ci */ 98762306a36Sopenharmony_ci ret = filemap_write_and_wait_range(inode->i_mapping, 98862306a36Sopenharmony_ci off, off + len - 1); 98962306a36Sopenharmony_ci if (ret < 0) 99062306a36Sopenharmony_ci return ret; 99162306a36Sopenharmony_ci 99262306a36Sopenharmony_ci ret = 0; 99362306a36Sopenharmony_ci while ((len = iov_iter_count(to)) > 0) { 99462306a36Sopenharmony_ci struct ceph_osd_request *req; 99562306a36Sopenharmony_ci struct page **pages; 99662306a36Sopenharmony_ci int num_pages; 99762306a36Sopenharmony_ci size_t page_off; 99862306a36Sopenharmony_ci bool more; 99962306a36Sopenharmony_ci int idx; 100062306a36Sopenharmony_ci size_t left; 100162306a36Sopenharmony_ci struct ceph_osd_req_op *op; 100262306a36Sopenharmony_ci u64 read_off = off; 100362306a36Sopenharmony_ci u64 read_len = len; 100462306a36Sopenharmony_ci 100562306a36Sopenharmony_ci /* determine new offset/length if encrypted */ 100662306a36Sopenharmony_ci ceph_fscrypt_adjust_off_and_len(inode, &read_off, &read_len); 100762306a36Sopenharmony_ci 100862306a36Sopenharmony_ci dout("sync_read orig %llu~%llu reading %llu~%llu", 100962306a36Sopenharmony_ci off, len, read_off, read_len); 101062306a36Sopenharmony_ci 101162306a36Sopenharmony_ci req = ceph_osdc_new_request(osdc, &ci->i_layout, 101262306a36Sopenharmony_ci ci->i_vino, read_off, &read_len, 0, 1, 101362306a36Sopenharmony_ci sparse ? CEPH_OSD_OP_SPARSE_READ : 101462306a36Sopenharmony_ci CEPH_OSD_OP_READ, 101562306a36Sopenharmony_ci CEPH_OSD_FLAG_READ, 101662306a36Sopenharmony_ci NULL, ci->i_truncate_seq, 101762306a36Sopenharmony_ci ci->i_truncate_size, false); 101862306a36Sopenharmony_ci if (IS_ERR(req)) { 101962306a36Sopenharmony_ci ret = PTR_ERR(req); 102062306a36Sopenharmony_ci break; 102162306a36Sopenharmony_ci } 102262306a36Sopenharmony_ci 102362306a36Sopenharmony_ci /* adjust len downward if the request truncated the len */ 102462306a36Sopenharmony_ci if (off + len > read_off + read_len) 102562306a36Sopenharmony_ci len = read_off + read_len - off; 102662306a36Sopenharmony_ci more = len < iov_iter_count(to); 102762306a36Sopenharmony_ci 102862306a36Sopenharmony_ci num_pages = calc_pages_for(read_off, read_len); 102962306a36Sopenharmony_ci page_off = offset_in_page(off); 103062306a36Sopenharmony_ci pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); 103162306a36Sopenharmony_ci if (IS_ERR(pages)) { 103262306a36Sopenharmony_ci ceph_osdc_put_request(req); 103362306a36Sopenharmony_ci ret = PTR_ERR(pages); 103462306a36Sopenharmony_ci break; 103562306a36Sopenharmony_ci } 103662306a36Sopenharmony_ci 103762306a36Sopenharmony_ci osd_req_op_extent_osd_data_pages(req, 0, pages, read_len, 103862306a36Sopenharmony_ci offset_in_page(read_off), 103962306a36Sopenharmony_ci false, false); 104062306a36Sopenharmony_ci 104162306a36Sopenharmony_ci op = &req->r_ops[0]; 104262306a36Sopenharmony_ci if (sparse) { 104362306a36Sopenharmony_ci ret = ceph_alloc_sparse_ext_map(op); 104462306a36Sopenharmony_ci if (ret) { 104562306a36Sopenharmony_ci ceph_osdc_put_request(req); 104662306a36Sopenharmony_ci break; 104762306a36Sopenharmony_ci } 104862306a36Sopenharmony_ci } 104962306a36Sopenharmony_ci 105062306a36Sopenharmony_ci ceph_osdc_start_request(osdc, req); 105162306a36Sopenharmony_ci ret = ceph_osdc_wait_request(osdc, req); 105262306a36Sopenharmony_ci 105362306a36Sopenharmony_ci ceph_update_read_metrics(&fsc->mdsc->metric, 105462306a36Sopenharmony_ci req->r_start_latency, 105562306a36Sopenharmony_ci req->r_end_latency, 105662306a36Sopenharmony_ci read_len, ret); 105762306a36Sopenharmony_ci 105862306a36Sopenharmony_ci if (ret > 0) 105962306a36Sopenharmony_ci objver = req->r_version; 106062306a36Sopenharmony_ci 106162306a36Sopenharmony_ci i_size = i_size_read(inode); 106262306a36Sopenharmony_ci dout("sync_read %llu~%llu got %zd i_size %llu%s\n", 106362306a36Sopenharmony_ci off, len, ret, i_size, (more ? " MORE" : "")); 106462306a36Sopenharmony_ci 106562306a36Sopenharmony_ci /* Fix it to go to end of extent map */ 106662306a36Sopenharmony_ci if (sparse && ret >= 0) 106762306a36Sopenharmony_ci ret = ceph_sparse_ext_map_end(op); 106862306a36Sopenharmony_ci else if (ret == -ENOENT) 106962306a36Sopenharmony_ci ret = 0; 107062306a36Sopenharmony_ci 107162306a36Sopenharmony_ci if (ret > 0 && IS_ENCRYPTED(inode)) { 107262306a36Sopenharmony_ci int fret; 107362306a36Sopenharmony_ci 107462306a36Sopenharmony_ci fret = ceph_fscrypt_decrypt_extents(inode, pages, 107562306a36Sopenharmony_ci read_off, op->extent.sparse_ext, 107662306a36Sopenharmony_ci op->extent.sparse_ext_cnt); 107762306a36Sopenharmony_ci if (fret < 0) { 107862306a36Sopenharmony_ci ret = fret; 107962306a36Sopenharmony_ci ceph_osdc_put_request(req); 108062306a36Sopenharmony_ci break; 108162306a36Sopenharmony_ci } 108262306a36Sopenharmony_ci 108362306a36Sopenharmony_ci /* account for any partial block at the beginning */ 108462306a36Sopenharmony_ci fret -= (off - read_off); 108562306a36Sopenharmony_ci 108662306a36Sopenharmony_ci /* 108762306a36Sopenharmony_ci * Short read after big offset adjustment? 108862306a36Sopenharmony_ci * Nothing is usable, just call it a zero 108962306a36Sopenharmony_ci * len read. 109062306a36Sopenharmony_ci */ 109162306a36Sopenharmony_ci fret = max(fret, 0); 109262306a36Sopenharmony_ci 109362306a36Sopenharmony_ci /* account for partial block at the end */ 109462306a36Sopenharmony_ci ret = min_t(ssize_t, fret, len); 109562306a36Sopenharmony_ci } 109662306a36Sopenharmony_ci 109762306a36Sopenharmony_ci ceph_osdc_put_request(req); 109862306a36Sopenharmony_ci 109962306a36Sopenharmony_ci /* Short read but not EOF? Zero out the remainder. */ 110062306a36Sopenharmony_ci if (ret >= 0 && ret < len && (off + ret < i_size)) { 110162306a36Sopenharmony_ci int zlen = min(len - ret, i_size - off - ret); 110262306a36Sopenharmony_ci int zoff = page_off + ret; 110362306a36Sopenharmony_ci 110462306a36Sopenharmony_ci dout("sync_read zero gap %llu~%llu\n", 110562306a36Sopenharmony_ci off + ret, off + ret + zlen); 110662306a36Sopenharmony_ci ceph_zero_page_vector_range(zoff, zlen, pages); 110762306a36Sopenharmony_ci ret += zlen; 110862306a36Sopenharmony_ci } 110962306a36Sopenharmony_ci 111062306a36Sopenharmony_ci idx = 0; 111162306a36Sopenharmony_ci if (ret <= 0) 111262306a36Sopenharmony_ci left = 0; 111362306a36Sopenharmony_ci else if (off + ret > i_size) 111462306a36Sopenharmony_ci left = i_size - off; 111562306a36Sopenharmony_ci else 111662306a36Sopenharmony_ci left = ret; 111762306a36Sopenharmony_ci while (left > 0) { 111862306a36Sopenharmony_ci size_t plen, copied; 111962306a36Sopenharmony_ci 112062306a36Sopenharmony_ci plen = min_t(size_t, left, PAGE_SIZE - page_off); 112162306a36Sopenharmony_ci SetPageUptodate(pages[idx]); 112262306a36Sopenharmony_ci copied = copy_page_to_iter(pages[idx++], 112362306a36Sopenharmony_ci page_off, plen, to); 112462306a36Sopenharmony_ci off += copied; 112562306a36Sopenharmony_ci left -= copied; 112662306a36Sopenharmony_ci page_off = 0; 112762306a36Sopenharmony_ci if (copied < plen) { 112862306a36Sopenharmony_ci ret = -EFAULT; 112962306a36Sopenharmony_ci break; 113062306a36Sopenharmony_ci } 113162306a36Sopenharmony_ci } 113262306a36Sopenharmony_ci ceph_release_page_vector(pages, num_pages); 113362306a36Sopenharmony_ci 113462306a36Sopenharmony_ci if (ret < 0) { 113562306a36Sopenharmony_ci if (ret == -EBLOCKLISTED) 113662306a36Sopenharmony_ci fsc->blocklisted = true; 113762306a36Sopenharmony_ci break; 113862306a36Sopenharmony_ci } 113962306a36Sopenharmony_ci 114062306a36Sopenharmony_ci if (off >= i_size || !more) 114162306a36Sopenharmony_ci break; 114262306a36Sopenharmony_ci } 114362306a36Sopenharmony_ci 114462306a36Sopenharmony_ci if (ret > 0) { 114562306a36Sopenharmony_ci if (off >= i_size) { 114662306a36Sopenharmony_ci *retry_op = CHECK_EOF; 114762306a36Sopenharmony_ci ret = i_size - *ki_pos; 114862306a36Sopenharmony_ci *ki_pos = i_size; 114962306a36Sopenharmony_ci } else { 115062306a36Sopenharmony_ci ret = off - *ki_pos; 115162306a36Sopenharmony_ci *ki_pos = off; 115262306a36Sopenharmony_ci } 115362306a36Sopenharmony_ci 115462306a36Sopenharmony_ci if (last_objver) 115562306a36Sopenharmony_ci *last_objver = objver; 115662306a36Sopenharmony_ci } 115762306a36Sopenharmony_ci dout("sync_read result %zd retry_op %d\n", ret, *retry_op); 115862306a36Sopenharmony_ci return ret; 115962306a36Sopenharmony_ci} 116062306a36Sopenharmony_ci 116162306a36Sopenharmony_cistatic ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, 116262306a36Sopenharmony_ci int *retry_op) 116362306a36Sopenharmony_ci{ 116462306a36Sopenharmony_ci struct file *file = iocb->ki_filp; 116562306a36Sopenharmony_ci struct inode *inode = file_inode(file); 116662306a36Sopenharmony_ci 116762306a36Sopenharmony_ci dout("sync_read on file %p %llx~%zx %s\n", file, iocb->ki_pos, 116862306a36Sopenharmony_ci iov_iter_count(to), (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); 116962306a36Sopenharmony_ci 117062306a36Sopenharmony_ci return __ceph_sync_read(inode, &iocb->ki_pos, to, retry_op, NULL); 117162306a36Sopenharmony_ci} 117262306a36Sopenharmony_ci 117362306a36Sopenharmony_cistruct ceph_aio_request { 117462306a36Sopenharmony_ci struct kiocb *iocb; 117562306a36Sopenharmony_ci size_t total_len; 117662306a36Sopenharmony_ci bool write; 117762306a36Sopenharmony_ci bool should_dirty; 117862306a36Sopenharmony_ci int error; 117962306a36Sopenharmony_ci struct list_head osd_reqs; 118062306a36Sopenharmony_ci unsigned num_reqs; 118162306a36Sopenharmony_ci atomic_t pending_reqs; 118262306a36Sopenharmony_ci struct timespec64 mtime; 118362306a36Sopenharmony_ci struct ceph_cap_flush *prealloc_cf; 118462306a36Sopenharmony_ci}; 118562306a36Sopenharmony_ci 118662306a36Sopenharmony_cistruct ceph_aio_work { 118762306a36Sopenharmony_ci struct work_struct work; 118862306a36Sopenharmony_ci struct ceph_osd_request *req; 118962306a36Sopenharmony_ci}; 119062306a36Sopenharmony_ci 119162306a36Sopenharmony_cistatic void ceph_aio_retry_work(struct work_struct *work); 119262306a36Sopenharmony_ci 119362306a36Sopenharmony_cistatic void ceph_aio_complete(struct inode *inode, 119462306a36Sopenharmony_ci struct ceph_aio_request *aio_req) 119562306a36Sopenharmony_ci{ 119662306a36Sopenharmony_ci struct ceph_inode_info *ci = ceph_inode(inode); 119762306a36Sopenharmony_ci int ret; 119862306a36Sopenharmony_ci 119962306a36Sopenharmony_ci if (!atomic_dec_and_test(&aio_req->pending_reqs)) 120062306a36Sopenharmony_ci return; 120162306a36Sopenharmony_ci 120262306a36Sopenharmony_ci if (aio_req->iocb->ki_flags & IOCB_DIRECT) 120362306a36Sopenharmony_ci inode_dio_end(inode); 120462306a36Sopenharmony_ci 120562306a36Sopenharmony_ci ret = aio_req->error; 120662306a36Sopenharmony_ci if (!ret) 120762306a36Sopenharmony_ci ret = aio_req->total_len; 120862306a36Sopenharmony_ci 120962306a36Sopenharmony_ci dout("ceph_aio_complete %p rc %d\n", inode, ret); 121062306a36Sopenharmony_ci 121162306a36Sopenharmony_ci if (ret >= 0 && aio_req->write) { 121262306a36Sopenharmony_ci int dirty; 121362306a36Sopenharmony_ci 121462306a36Sopenharmony_ci loff_t endoff = aio_req->iocb->ki_pos + aio_req->total_len; 121562306a36Sopenharmony_ci if (endoff > i_size_read(inode)) { 121662306a36Sopenharmony_ci if (ceph_inode_set_size(inode, endoff)) 121762306a36Sopenharmony_ci ceph_check_caps(ci, CHECK_CAPS_AUTHONLY); 121862306a36Sopenharmony_ci } 121962306a36Sopenharmony_ci 122062306a36Sopenharmony_ci spin_lock(&ci->i_ceph_lock); 122162306a36Sopenharmony_ci dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, 122262306a36Sopenharmony_ci &aio_req->prealloc_cf); 122362306a36Sopenharmony_ci spin_unlock(&ci->i_ceph_lock); 122462306a36Sopenharmony_ci if (dirty) 122562306a36Sopenharmony_ci __mark_inode_dirty(inode, dirty); 122662306a36Sopenharmony_ci 122762306a36Sopenharmony_ci } 122862306a36Sopenharmony_ci 122962306a36Sopenharmony_ci ceph_put_cap_refs(ci, (aio_req->write ? CEPH_CAP_FILE_WR : 123062306a36Sopenharmony_ci CEPH_CAP_FILE_RD)); 123162306a36Sopenharmony_ci 123262306a36Sopenharmony_ci aio_req->iocb->ki_complete(aio_req->iocb, ret); 123362306a36Sopenharmony_ci 123462306a36Sopenharmony_ci ceph_free_cap_flush(aio_req->prealloc_cf); 123562306a36Sopenharmony_ci kfree(aio_req); 123662306a36Sopenharmony_ci} 123762306a36Sopenharmony_ci 123862306a36Sopenharmony_cistatic void ceph_aio_complete_req(struct ceph_osd_request *req) 123962306a36Sopenharmony_ci{ 124062306a36Sopenharmony_ci int rc = req->r_result; 124162306a36Sopenharmony_ci struct inode *inode = req->r_inode; 124262306a36Sopenharmony_ci struct ceph_aio_request *aio_req = req->r_priv; 124362306a36Sopenharmony_ci struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0); 124462306a36Sopenharmony_ci struct ceph_osd_req_op *op = &req->r_ops[0]; 124562306a36Sopenharmony_ci struct ceph_client_metric *metric = &ceph_sb_to_mdsc(inode->i_sb)->metric; 124662306a36Sopenharmony_ci unsigned int len = osd_data->bvec_pos.iter.bi_size; 124762306a36Sopenharmony_ci bool sparse = (op->op == CEPH_OSD_OP_SPARSE_READ); 124862306a36Sopenharmony_ci 124962306a36Sopenharmony_ci BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_BVECS); 125062306a36Sopenharmony_ci BUG_ON(!osd_data->num_bvecs); 125162306a36Sopenharmony_ci 125262306a36Sopenharmony_ci dout("ceph_aio_complete_req %p rc %d bytes %u\n", inode, rc, len); 125362306a36Sopenharmony_ci 125462306a36Sopenharmony_ci if (rc == -EOLDSNAPC) { 125562306a36Sopenharmony_ci struct ceph_aio_work *aio_work; 125662306a36Sopenharmony_ci BUG_ON(!aio_req->write); 125762306a36Sopenharmony_ci 125862306a36Sopenharmony_ci aio_work = kmalloc(sizeof(*aio_work), GFP_NOFS); 125962306a36Sopenharmony_ci if (aio_work) { 126062306a36Sopenharmony_ci INIT_WORK(&aio_work->work, ceph_aio_retry_work); 126162306a36Sopenharmony_ci aio_work->req = req; 126262306a36Sopenharmony_ci queue_work(ceph_inode_to_client(inode)->inode_wq, 126362306a36Sopenharmony_ci &aio_work->work); 126462306a36Sopenharmony_ci return; 126562306a36Sopenharmony_ci } 126662306a36Sopenharmony_ci rc = -ENOMEM; 126762306a36Sopenharmony_ci } else if (!aio_req->write) { 126862306a36Sopenharmony_ci if (sparse && rc >= 0) 126962306a36Sopenharmony_ci rc = ceph_sparse_ext_map_end(op); 127062306a36Sopenharmony_ci if (rc == -ENOENT) 127162306a36Sopenharmony_ci rc = 0; 127262306a36Sopenharmony_ci if (rc >= 0 && len > rc) { 127362306a36Sopenharmony_ci struct iov_iter i; 127462306a36Sopenharmony_ci int zlen = len - rc; 127562306a36Sopenharmony_ci 127662306a36Sopenharmony_ci /* 127762306a36Sopenharmony_ci * If read is satisfied by single OSD request, 127862306a36Sopenharmony_ci * it can pass EOF. Otherwise read is within 127962306a36Sopenharmony_ci * i_size. 128062306a36Sopenharmony_ci */ 128162306a36Sopenharmony_ci if (aio_req->num_reqs == 1) { 128262306a36Sopenharmony_ci loff_t i_size = i_size_read(inode); 128362306a36Sopenharmony_ci loff_t endoff = aio_req->iocb->ki_pos + rc; 128462306a36Sopenharmony_ci if (endoff < i_size) 128562306a36Sopenharmony_ci zlen = min_t(size_t, zlen, 128662306a36Sopenharmony_ci i_size - endoff); 128762306a36Sopenharmony_ci aio_req->total_len = rc + zlen; 128862306a36Sopenharmony_ci } 128962306a36Sopenharmony_ci 129062306a36Sopenharmony_ci iov_iter_bvec(&i, ITER_DEST, osd_data->bvec_pos.bvecs, 129162306a36Sopenharmony_ci osd_data->num_bvecs, len); 129262306a36Sopenharmony_ci iov_iter_advance(&i, rc); 129362306a36Sopenharmony_ci iov_iter_zero(zlen, &i); 129462306a36Sopenharmony_ci } 129562306a36Sopenharmony_ci } 129662306a36Sopenharmony_ci 129762306a36Sopenharmony_ci /* r_start_latency == 0 means the request was not submitted */ 129862306a36Sopenharmony_ci if (req->r_start_latency) { 129962306a36Sopenharmony_ci if (aio_req->write) 130062306a36Sopenharmony_ci ceph_update_write_metrics(metric, req->r_start_latency, 130162306a36Sopenharmony_ci req->r_end_latency, len, rc); 130262306a36Sopenharmony_ci else 130362306a36Sopenharmony_ci ceph_update_read_metrics(metric, req->r_start_latency, 130462306a36Sopenharmony_ci req->r_end_latency, len, rc); 130562306a36Sopenharmony_ci } 130662306a36Sopenharmony_ci 130762306a36Sopenharmony_ci put_bvecs(osd_data->bvec_pos.bvecs, osd_data->num_bvecs, 130862306a36Sopenharmony_ci aio_req->should_dirty); 130962306a36Sopenharmony_ci ceph_osdc_put_request(req); 131062306a36Sopenharmony_ci 131162306a36Sopenharmony_ci if (rc < 0) 131262306a36Sopenharmony_ci cmpxchg(&aio_req->error, 0, rc); 131362306a36Sopenharmony_ci 131462306a36Sopenharmony_ci ceph_aio_complete(inode, aio_req); 131562306a36Sopenharmony_ci return; 131662306a36Sopenharmony_ci} 131762306a36Sopenharmony_ci 131862306a36Sopenharmony_cistatic void ceph_aio_retry_work(struct work_struct *work) 131962306a36Sopenharmony_ci{ 132062306a36Sopenharmony_ci struct ceph_aio_work *aio_work = 132162306a36Sopenharmony_ci container_of(work, struct ceph_aio_work, work); 132262306a36Sopenharmony_ci struct ceph_osd_request *orig_req = aio_work->req; 132362306a36Sopenharmony_ci struct ceph_aio_request *aio_req = orig_req->r_priv; 132462306a36Sopenharmony_ci struct inode *inode = orig_req->r_inode; 132562306a36Sopenharmony_ci struct ceph_inode_info *ci = ceph_inode(inode); 132662306a36Sopenharmony_ci struct ceph_snap_context *snapc; 132762306a36Sopenharmony_ci struct ceph_osd_request *req; 132862306a36Sopenharmony_ci int ret; 132962306a36Sopenharmony_ci 133062306a36Sopenharmony_ci spin_lock(&ci->i_ceph_lock); 133162306a36Sopenharmony_ci if (__ceph_have_pending_cap_snap(ci)) { 133262306a36Sopenharmony_ci struct ceph_cap_snap *capsnap = 133362306a36Sopenharmony_ci list_last_entry(&ci->i_cap_snaps, 133462306a36Sopenharmony_ci struct ceph_cap_snap, 133562306a36Sopenharmony_ci ci_item); 133662306a36Sopenharmony_ci snapc = ceph_get_snap_context(capsnap->context); 133762306a36Sopenharmony_ci } else { 133862306a36Sopenharmony_ci BUG_ON(!ci->i_head_snapc); 133962306a36Sopenharmony_ci snapc = ceph_get_snap_context(ci->i_head_snapc); 134062306a36Sopenharmony_ci } 134162306a36Sopenharmony_ci spin_unlock(&ci->i_ceph_lock); 134262306a36Sopenharmony_ci 134362306a36Sopenharmony_ci req = ceph_osdc_alloc_request(orig_req->r_osdc, snapc, 1, 134462306a36Sopenharmony_ci false, GFP_NOFS); 134562306a36Sopenharmony_ci if (!req) { 134662306a36Sopenharmony_ci ret = -ENOMEM; 134762306a36Sopenharmony_ci req = orig_req; 134862306a36Sopenharmony_ci goto out; 134962306a36Sopenharmony_ci } 135062306a36Sopenharmony_ci 135162306a36Sopenharmony_ci req->r_flags = /* CEPH_OSD_FLAG_ORDERSNAP | */ CEPH_OSD_FLAG_WRITE; 135262306a36Sopenharmony_ci ceph_oloc_copy(&req->r_base_oloc, &orig_req->r_base_oloc); 135362306a36Sopenharmony_ci ceph_oid_copy(&req->r_base_oid, &orig_req->r_base_oid); 135462306a36Sopenharmony_ci 135562306a36Sopenharmony_ci req->r_ops[0] = orig_req->r_ops[0]; 135662306a36Sopenharmony_ci 135762306a36Sopenharmony_ci req->r_mtime = aio_req->mtime; 135862306a36Sopenharmony_ci req->r_data_offset = req->r_ops[0].extent.offset; 135962306a36Sopenharmony_ci 136062306a36Sopenharmony_ci ret = ceph_osdc_alloc_messages(req, GFP_NOFS); 136162306a36Sopenharmony_ci if (ret) { 136262306a36Sopenharmony_ci ceph_osdc_put_request(req); 136362306a36Sopenharmony_ci req = orig_req; 136462306a36Sopenharmony_ci goto out; 136562306a36Sopenharmony_ci } 136662306a36Sopenharmony_ci 136762306a36Sopenharmony_ci ceph_osdc_put_request(orig_req); 136862306a36Sopenharmony_ci 136962306a36Sopenharmony_ci req->r_callback = ceph_aio_complete_req; 137062306a36Sopenharmony_ci req->r_inode = inode; 137162306a36Sopenharmony_ci req->r_priv = aio_req; 137262306a36Sopenharmony_ci 137362306a36Sopenharmony_ci ceph_osdc_start_request(req->r_osdc, req); 137462306a36Sopenharmony_ciout: 137562306a36Sopenharmony_ci if (ret < 0) { 137662306a36Sopenharmony_ci req->r_result = ret; 137762306a36Sopenharmony_ci ceph_aio_complete_req(req); 137862306a36Sopenharmony_ci } 137962306a36Sopenharmony_ci 138062306a36Sopenharmony_ci ceph_put_snap_context(snapc); 138162306a36Sopenharmony_ci kfree(aio_work); 138262306a36Sopenharmony_ci} 138362306a36Sopenharmony_ci 138462306a36Sopenharmony_cistatic ssize_t 138562306a36Sopenharmony_ciceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, 138662306a36Sopenharmony_ci struct ceph_snap_context *snapc, 138762306a36Sopenharmony_ci struct ceph_cap_flush **pcf) 138862306a36Sopenharmony_ci{ 138962306a36Sopenharmony_ci struct file *file = iocb->ki_filp; 139062306a36Sopenharmony_ci struct inode *inode = file_inode(file); 139162306a36Sopenharmony_ci struct ceph_inode_info *ci = ceph_inode(inode); 139262306a36Sopenharmony_ci struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 139362306a36Sopenharmony_ci struct ceph_client_metric *metric = &fsc->mdsc->metric; 139462306a36Sopenharmony_ci struct ceph_vino vino; 139562306a36Sopenharmony_ci struct ceph_osd_request *req; 139662306a36Sopenharmony_ci struct bio_vec *bvecs; 139762306a36Sopenharmony_ci struct ceph_aio_request *aio_req = NULL; 139862306a36Sopenharmony_ci int num_pages = 0; 139962306a36Sopenharmony_ci int flags; 140062306a36Sopenharmony_ci int ret = 0; 140162306a36Sopenharmony_ci struct timespec64 mtime = current_time(inode); 140262306a36Sopenharmony_ci size_t count = iov_iter_count(iter); 140362306a36Sopenharmony_ci loff_t pos = iocb->ki_pos; 140462306a36Sopenharmony_ci bool write = iov_iter_rw(iter) == WRITE; 140562306a36Sopenharmony_ci bool should_dirty = !write && user_backed_iter(iter); 140662306a36Sopenharmony_ci bool sparse = ceph_test_mount_opt(fsc, SPARSEREAD); 140762306a36Sopenharmony_ci 140862306a36Sopenharmony_ci if (write && ceph_snap(file_inode(file)) != CEPH_NOSNAP) 140962306a36Sopenharmony_ci return -EROFS; 141062306a36Sopenharmony_ci 141162306a36Sopenharmony_ci dout("sync_direct_%s on file %p %lld~%u snapc %p seq %lld\n", 141262306a36Sopenharmony_ci (write ? "write" : "read"), file, pos, (unsigned)count, 141362306a36Sopenharmony_ci snapc, snapc ? snapc->seq : 0); 141462306a36Sopenharmony_ci 141562306a36Sopenharmony_ci if (write) { 141662306a36Sopenharmony_ci int ret2; 141762306a36Sopenharmony_ci 141862306a36Sopenharmony_ci ceph_fscache_invalidate(inode, true); 141962306a36Sopenharmony_ci 142062306a36Sopenharmony_ci ret2 = invalidate_inode_pages2_range(inode->i_mapping, 142162306a36Sopenharmony_ci pos >> PAGE_SHIFT, 142262306a36Sopenharmony_ci (pos + count - 1) >> PAGE_SHIFT); 142362306a36Sopenharmony_ci if (ret2 < 0) 142462306a36Sopenharmony_ci dout("invalidate_inode_pages2_range returned %d\n", ret2); 142562306a36Sopenharmony_ci 142662306a36Sopenharmony_ci flags = /* CEPH_OSD_FLAG_ORDERSNAP | */ CEPH_OSD_FLAG_WRITE; 142762306a36Sopenharmony_ci } else { 142862306a36Sopenharmony_ci flags = CEPH_OSD_FLAG_READ; 142962306a36Sopenharmony_ci } 143062306a36Sopenharmony_ci 143162306a36Sopenharmony_ci while (iov_iter_count(iter) > 0) { 143262306a36Sopenharmony_ci u64 size = iov_iter_count(iter); 143362306a36Sopenharmony_ci ssize_t len; 143462306a36Sopenharmony_ci struct ceph_osd_req_op *op; 143562306a36Sopenharmony_ci int readop = sparse ? CEPH_OSD_OP_SPARSE_READ : CEPH_OSD_OP_READ; 143662306a36Sopenharmony_ci 143762306a36Sopenharmony_ci if (write) 143862306a36Sopenharmony_ci size = min_t(u64, size, fsc->mount_options->wsize); 143962306a36Sopenharmony_ci else 144062306a36Sopenharmony_ci size = min_t(u64, size, fsc->mount_options->rsize); 144162306a36Sopenharmony_ci 144262306a36Sopenharmony_ci vino = ceph_vino(inode); 144362306a36Sopenharmony_ci req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, 144462306a36Sopenharmony_ci vino, pos, &size, 0, 144562306a36Sopenharmony_ci 1, 144662306a36Sopenharmony_ci write ? CEPH_OSD_OP_WRITE : readop, 144762306a36Sopenharmony_ci flags, snapc, 144862306a36Sopenharmony_ci ci->i_truncate_seq, 144962306a36Sopenharmony_ci ci->i_truncate_size, 145062306a36Sopenharmony_ci false); 145162306a36Sopenharmony_ci if (IS_ERR(req)) { 145262306a36Sopenharmony_ci ret = PTR_ERR(req); 145362306a36Sopenharmony_ci break; 145462306a36Sopenharmony_ci } 145562306a36Sopenharmony_ci 145662306a36Sopenharmony_ci len = iter_get_bvecs_alloc(iter, size, &bvecs, &num_pages); 145762306a36Sopenharmony_ci if (len < 0) { 145862306a36Sopenharmony_ci ceph_osdc_put_request(req); 145962306a36Sopenharmony_ci ret = len; 146062306a36Sopenharmony_ci break; 146162306a36Sopenharmony_ci } 146262306a36Sopenharmony_ci if (len != size) 146362306a36Sopenharmony_ci osd_req_op_extent_update(req, 0, len); 146462306a36Sopenharmony_ci 146562306a36Sopenharmony_ci /* 146662306a36Sopenharmony_ci * To simplify error handling, allow AIO when IO within i_size 146762306a36Sopenharmony_ci * or IO can be satisfied by single OSD request. 146862306a36Sopenharmony_ci */ 146962306a36Sopenharmony_ci if (pos == iocb->ki_pos && !is_sync_kiocb(iocb) && 147062306a36Sopenharmony_ci (len == count || pos + count <= i_size_read(inode))) { 147162306a36Sopenharmony_ci aio_req = kzalloc(sizeof(*aio_req), GFP_KERNEL); 147262306a36Sopenharmony_ci if (aio_req) { 147362306a36Sopenharmony_ci aio_req->iocb = iocb; 147462306a36Sopenharmony_ci aio_req->write = write; 147562306a36Sopenharmony_ci aio_req->should_dirty = should_dirty; 147662306a36Sopenharmony_ci INIT_LIST_HEAD(&aio_req->osd_reqs); 147762306a36Sopenharmony_ci if (write) { 147862306a36Sopenharmony_ci aio_req->mtime = mtime; 147962306a36Sopenharmony_ci swap(aio_req->prealloc_cf, *pcf); 148062306a36Sopenharmony_ci } 148162306a36Sopenharmony_ci } 148262306a36Sopenharmony_ci /* ignore error */ 148362306a36Sopenharmony_ci } 148462306a36Sopenharmony_ci 148562306a36Sopenharmony_ci if (write) { 148662306a36Sopenharmony_ci /* 148762306a36Sopenharmony_ci * throw out any page cache pages in this range. this 148862306a36Sopenharmony_ci * may block. 148962306a36Sopenharmony_ci */ 149062306a36Sopenharmony_ci truncate_inode_pages_range(inode->i_mapping, pos, 149162306a36Sopenharmony_ci PAGE_ALIGN(pos + len) - 1); 149262306a36Sopenharmony_ci 149362306a36Sopenharmony_ci req->r_mtime = mtime; 149462306a36Sopenharmony_ci } 149562306a36Sopenharmony_ci 149662306a36Sopenharmony_ci osd_req_op_extent_osd_data_bvecs(req, 0, bvecs, num_pages, len); 149762306a36Sopenharmony_ci op = &req->r_ops[0]; 149862306a36Sopenharmony_ci if (sparse) { 149962306a36Sopenharmony_ci ret = ceph_alloc_sparse_ext_map(op); 150062306a36Sopenharmony_ci if (ret) { 150162306a36Sopenharmony_ci ceph_osdc_put_request(req); 150262306a36Sopenharmony_ci break; 150362306a36Sopenharmony_ci } 150462306a36Sopenharmony_ci } 150562306a36Sopenharmony_ci 150662306a36Sopenharmony_ci if (aio_req) { 150762306a36Sopenharmony_ci aio_req->total_len += len; 150862306a36Sopenharmony_ci aio_req->num_reqs++; 150962306a36Sopenharmony_ci atomic_inc(&aio_req->pending_reqs); 151062306a36Sopenharmony_ci 151162306a36Sopenharmony_ci req->r_callback = ceph_aio_complete_req; 151262306a36Sopenharmony_ci req->r_inode = inode; 151362306a36Sopenharmony_ci req->r_priv = aio_req; 151462306a36Sopenharmony_ci list_add_tail(&req->r_private_item, &aio_req->osd_reqs); 151562306a36Sopenharmony_ci 151662306a36Sopenharmony_ci pos += len; 151762306a36Sopenharmony_ci continue; 151862306a36Sopenharmony_ci } 151962306a36Sopenharmony_ci 152062306a36Sopenharmony_ci ceph_osdc_start_request(req->r_osdc, req); 152162306a36Sopenharmony_ci ret = ceph_osdc_wait_request(&fsc->client->osdc, req); 152262306a36Sopenharmony_ci 152362306a36Sopenharmony_ci if (write) 152462306a36Sopenharmony_ci ceph_update_write_metrics(metric, req->r_start_latency, 152562306a36Sopenharmony_ci req->r_end_latency, len, ret); 152662306a36Sopenharmony_ci else 152762306a36Sopenharmony_ci ceph_update_read_metrics(metric, req->r_start_latency, 152862306a36Sopenharmony_ci req->r_end_latency, len, ret); 152962306a36Sopenharmony_ci 153062306a36Sopenharmony_ci size = i_size_read(inode); 153162306a36Sopenharmony_ci if (!write) { 153262306a36Sopenharmony_ci if (sparse && ret >= 0) 153362306a36Sopenharmony_ci ret = ceph_sparse_ext_map_end(op); 153462306a36Sopenharmony_ci else if (ret == -ENOENT) 153562306a36Sopenharmony_ci ret = 0; 153662306a36Sopenharmony_ci 153762306a36Sopenharmony_ci if (ret >= 0 && ret < len && pos + ret < size) { 153862306a36Sopenharmony_ci struct iov_iter i; 153962306a36Sopenharmony_ci int zlen = min_t(size_t, len - ret, 154062306a36Sopenharmony_ci size - pos - ret); 154162306a36Sopenharmony_ci 154262306a36Sopenharmony_ci iov_iter_bvec(&i, ITER_DEST, bvecs, num_pages, len); 154362306a36Sopenharmony_ci iov_iter_advance(&i, ret); 154462306a36Sopenharmony_ci iov_iter_zero(zlen, &i); 154562306a36Sopenharmony_ci ret += zlen; 154662306a36Sopenharmony_ci } 154762306a36Sopenharmony_ci if (ret >= 0) 154862306a36Sopenharmony_ci len = ret; 154962306a36Sopenharmony_ci } 155062306a36Sopenharmony_ci 155162306a36Sopenharmony_ci put_bvecs(bvecs, num_pages, should_dirty); 155262306a36Sopenharmony_ci ceph_osdc_put_request(req); 155362306a36Sopenharmony_ci if (ret < 0) 155462306a36Sopenharmony_ci break; 155562306a36Sopenharmony_ci 155662306a36Sopenharmony_ci pos += len; 155762306a36Sopenharmony_ci if (!write && pos >= size) 155862306a36Sopenharmony_ci break; 155962306a36Sopenharmony_ci 156062306a36Sopenharmony_ci if (write && pos > size) { 156162306a36Sopenharmony_ci if (ceph_inode_set_size(inode, pos)) 156262306a36Sopenharmony_ci ceph_check_caps(ceph_inode(inode), 156362306a36Sopenharmony_ci CHECK_CAPS_AUTHONLY); 156462306a36Sopenharmony_ci } 156562306a36Sopenharmony_ci } 156662306a36Sopenharmony_ci 156762306a36Sopenharmony_ci if (aio_req) { 156862306a36Sopenharmony_ci LIST_HEAD(osd_reqs); 156962306a36Sopenharmony_ci 157062306a36Sopenharmony_ci if (aio_req->num_reqs == 0) { 157162306a36Sopenharmony_ci kfree(aio_req); 157262306a36Sopenharmony_ci return ret; 157362306a36Sopenharmony_ci } 157462306a36Sopenharmony_ci 157562306a36Sopenharmony_ci ceph_get_cap_refs(ci, write ? CEPH_CAP_FILE_WR : 157662306a36Sopenharmony_ci CEPH_CAP_FILE_RD); 157762306a36Sopenharmony_ci 157862306a36Sopenharmony_ci list_splice(&aio_req->osd_reqs, &osd_reqs); 157962306a36Sopenharmony_ci inode_dio_begin(inode); 158062306a36Sopenharmony_ci while (!list_empty(&osd_reqs)) { 158162306a36Sopenharmony_ci req = list_first_entry(&osd_reqs, 158262306a36Sopenharmony_ci struct ceph_osd_request, 158362306a36Sopenharmony_ci r_private_item); 158462306a36Sopenharmony_ci list_del_init(&req->r_private_item); 158562306a36Sopenharmony_ci if (ret >= 0) 158662306a36Sopenharmony_ci ceph_osdc_start_request(req->r_osdc, req); 158762306a36Sopenharmony_ci if (ret < 0) { 158862306a36Sopenharmony_ci req->r_result = ret; 158962306a36Sopenharmony_ci ceph_aio_complete_req(req); 159062306a36Sopenharmony_ci } 159162306a36Sopenharmony_ci } 159262306a36Sopenharmony_ci return -EIOCBQUEUED; 159362306a36Sopenharmony_ci } 159462306a36Sopenharmony_ci 159562306a36Sopenharmony_ci if (ret != -EOLDSNAPC && pos > iocb->ki_pos) { 159662306a36Sopenharmony_ci ret = pos - iocb->ki_pos; 159762306a36Sopenharmony_ci iocb->ki_pos = pos; 159862306a36Sopenharmony_ci } 159962306a36Sopenharmony_ci return ret; 160062306a36Sopenharmony_ci} 160162306a36Sopenharmony_ci 160262306a36Sopenharmony_ci/* 160362306a36Sopenharmony_ci * Synchronous write, straight from __user pointer or user pages. 160462306a36Sopenharmony_ci * 160562306a36Sopenharmony_ci * If write spans object boundary, just do multiple writes. (For a 160662306a36Sopenharmony_ci * correct atomic write, we should e.g. take write locks on all 160762306a36Sopenharmony_ci * objects, rollback on failure, etc.) 160862306a36Sopenharmony_ci */ 160962306a36Sopenharmony_cistatic ssize_t 161062306a36Sopenharmony_ciceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, 161162306a36Sopenharmony_ci struct ceph_snap_context *snapc) 161262306a36Sopenharmony_ci{ 161362306a36Sopenharmony_ci struct file *file = iocb->ki_filp; 161462306a36Sopenharmony_ci struct inode *inode = file_inode(file); 161562306a36Sopenharmony_ci struct ceph_inode_info *ci = ceph_inode(inode); 161662306a36Sopenharmony_ci struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 161762306a36Sopenharmony_ci struct ceph_osd_client *osdc = &fsc->client->osdc; 161862306a36Sopenharmony_ci struct ceph_osd_request *req; 161962306a36Sopenharmony_ci struct page **pages; 162062306a36Sopenharmony_ci u64 len; 162162306a36Sopenharmony_ci int num_pages; 162262306a36Sopenharmony_ci int written = 0; 162362306a36Sopenharmony_ci int ret; 162462306a36Sopenharmony_ci bool check_caps = false; 162562306a36Sopenharmony_ci struct timespec64 mtime = current_time(inode); 162662306a36Sopenharmony_ci size_t count = iov_iter_count(from); 162762306a36Sopenharmony_ci 162862306a36Sopenharmony_ci if (ceph_snap(file_inode(file)) != CEPH_NOSNAP) 162962306a36Sopenharmony_ci return -EROFS; 163062306a36Sopenharmony_ci 163162306a36Sopenharmony_ci dout("sync_write on file %p %lld~%u snapc %p seq %lld\n", 163262306a36Sopenharmony_ci file, pos, (unsigned)count, snapc, snapc->seq); 163362306a36Sopenharmony_ci 163462306a36Sopenharmony_ci ret = filemap_write_and_wait_range(inode->i_mapping, 163562306a36Sopenharmony_ci pos, pos + count - 1); 163662306a36Sopenharmony_ci if (ret < 0) 163762306a36Sopenharmony_ci return ret; 163862306a36Sopenharmony_ci 163962306a36Sopenharmony_ci ceph_fscache_invalidate(inode, false); 164062306a36Sopenharmony_ci 164162306a36Sopenharmony_ci while ((len = iov_iter_count(from)) > 0) { 164262306a36Sopenharmony_ci size_t left; 164362306a36Sopenharmony_ci int n; 164462306a36Sopenharmony_ci u64 write_pos = pos; 164562306a36Sopenharmony_ci u64 write_len = len; 164662306a36Sopenharmony_ci u64 objnum, objoff; 164762306a36Sopenharmony_ci u32 xlen; 164862306a36Sopenharmony_ci u64 assert_ver = 0; 164962306a36Sopenharmony_ci bool rmw; 165062306a36Sopenharmony_ci bool first, last; 165162306a36Sopenharmony_ci struct iov_iter saved_iter = *from; 165262306a36Sopenharmony_ci size_t off; 165362306a36Sopenharmony_ci 165462306a36Sopenharmony_ci ceph_fscrypt_adjust_off_and_len(inode, &write_pos, &write_len); 165562306a36Sopenharmony_ci 165662306a36Sopenharmony_ci /* clamp the length to the end of first object */ 165762306a36Sopenharmony_ci ceph_calc_file_object_mapping(&ci->i_layout, write_pos, 165862306a36Sopenharmony_ci write_len, &objnum, &objoff, 165962306a36Sopenharmony_ci &xlen); 166062306a36Sopenharmony_ci write_len = xlen; 166162306a36Sopenharmony_ci 166262306a36Sopenharmony_ci /* adjust len downward if it goes beyond current object */ 166362306a36Sopenharmony_ci if (pos + len > write_pos + write_len) 166462306a36Sopenharmony_ci len = write_pos + write_len - pos; 166562306a36Sopenharmony_ci 166662306a36Sopenharmony_ci /* 166762306a36Sopenharmony_ci * If we had to adjust the length or position to align with a 166862306a36Sopenharmony_ci * crypto block, then we must do a read/modify/write cycle. We 166962306a36Sopenharmony_ci * use a version assertion to redrive the thing if something 167062306a36Sopenharmony_ci * changes in between. 167162306a36Sopenharmony_ci */ 167262306a36Sopenharmony_ci first = pos != write_pos; 167362306a36Sopenharmony_ci last = (pos + len) != (write_pos + write_len); 167462306a36Sopenharmony_ci rmw = first || last; 167562306a36Sopenharmony_ci 167662306a36Sopenharmony_ci dout("sync_write ino %llx %lld~%llu adjusted %lld~%llu -- %srmw\n", 167762306a36Sopenharmony_ci ci->i_vino.ino, pos, len, write_pos, write_len, 167862306a36Sopenharmony_ci rmw ? "" : "no "); 167962306a36Sopenharmony_ci 168062306a36Sopenharmony_ci /* 168162306a36Sopenharmony_ci * The data is emplaced into the page as it would be if it were 168262306a36Sopenharmony_ci * in an array of pagecache pages. 168362306a36Sopenharmony_ci */ 168462306a36Sopenharmony_ci num_pages = calc_pages_for(write_pos, write_len); 168562306a36Sopenharmony_ci pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); 168662306a36Sopenharmony_ci if (IS_ERR(pages)) { 168762306a36Sopenharmony_ci ret = PTR_ERR(pages); 168862306a36Sopenharmony_ci break; 168962306a36Sopenharmony_ci } 169062306a36Sopenharmony_ci 169162306a36Sopenharmony_ci /* Do we need to preload the pages? */ 169262306a36Sopenharmony_ci if (rmw) { 169362306a36Sopenharmony_ci u64 first_pos = write_pos; 169462306a36Sopenharmony_ci u64 last_pos = (write_pos + write_len) - CEPH_FSCRYPT_BLOCK_SIZE; 169562306a36Sopenharmony_ci u64 read_len = CEPH_FSCRYPT_BLOCK_SIZE; 169662306a36Sopenharmony_ci struct ceph_osd_req_op *op; 169762306a36Sopenharmony_ci 169862306a36Sopenharmony_ci /* We should only need to do this for encrypted inodes */ 169962306a36Sopenharmony_ci WARN_ON_ONCE(!IS_ENCRYPTED(inode)); 170062306a36Sopenharmony_ci 170162306a36Sopenharmony_ci /* No need to do two reads if first and last blocks are same */ 170262306a36Sopenharmony_ci if (first && last_pos == first_pos) 170362306a36Sopenharmony_ci last = false; 170462306a36Sopenharmony_ci 170562306a36Sopenharmony_ci /* 170662306a36Sopenharmony_ci * Allocate a read request for one or two extents, 170762306a36Sopenharmony_ci * depending on how the request was aligned. 170862306a36Sopenharmony_ci */ 170962306a36Sopenharmony_ci req = ceph_osdc_new_request(osdc, &ci->i_layout, 171062306a36Sopenharmony_ci ci->i_vino, first ? first_pos : last_pos, 171162306a36Sopenharmony_ci &read_len, 0, (first && last) ? 2 : 1, 171262306a36Sopenharmony_ci CEPH_OSD_OP_SPARSE_READ, CEPH_OSD_FLAG_READ, 171362306a36Sopenharmony_ci NULL, ci->i_truncate_seq, 171462306a36Sopenharmony_ci ci->i_truncate_size, false); 171562306a36Sopenharmony_ci if (IS_ERR(req)) { 171662306a36Sopenharmony_ci ceph_release_page_vector(pages, num_pages); 171762306a36Sopenharmony_ci ret = PTR_ERR(req); 171862306a36Sopenharmony_ci break; 171962306a36Sopenharmony_ci } 172062306a36Sopenharmony_ci 172162306a36Sopenharmony_ci /* Something is misaligned! */ 172262306a36Sopenharmony_ci if (read_len != CEPH_FSCRYPT_BLOCK_SIZE) { 172362306a36Sopenharmony_ci ceph_osdc_put_request(req); 172462306a36Sopenharmony_ci ceph_release_page_vector(pages, num_pages); 172562306a36Sopenharmony_ci ret = -EIO; 172662306a36Sopenharmony_ci break; 172762306a36Sopenharmony_ci } 172862306a36Sopenharmony_ci 172962306a36Sopenharmony_ci /* Add extent for first block? */ 173062306a36Sopenharmony_ci op = &req->r_ops[0]; 173162306a36Sopenharmony_ci 173262306a36Sopenharmony_ci if (first) { 173362306a36Sopenharmony_ci osd_req_op_extent_osd_data_pages(req, 0, pages, 173462306a36Sopenharmony_ci CEPH_FSCRYPT_BLOCK_SIZE, 173562306a36Sopenharmony_ci offset_in_page(first_pos), 173662306a36Sopenharmony_ci false, false); 173762306a36Sopenharmony_ci /* We only expect a single extent here */ 173862306a36Sopenharmony_ci ret = __ceph_alloc_sparse_ext_map(op, 1); 173962306a36Sopenharmony_ci if (ret) { 174062306a36Sopenharmony_ci ceph_osdc_put_request(req); 174162306a36Sopenharmony_ci ceph_release_page_vector(pages, num_pages); 174262306a36Sopenharmony_ci break; 174362306a36Sopenharmony_ci } 174462306a36Sopenharmony_ci } 174562306a36Sopenharmony_ci 174662306a36Sopenharmony_ci /* Add extent for last block */ 174762306a36Sopenharmony_ci if (last) { 174862306a36Sopenharmony_ci /* Init the other extent if first extent has been used */ 174962306a36Sopenharmony_ci if (first) { 175062306a36Sopenharmony_ci op = &req->r_ops[1]; 175162306a36Sopenharmony_ci osd_req_op_extent_init(req, 1, 175262306a36Sopenharmony_ci CEPH_OSD_OP_SPARSE_READ, 175362306a36Sopenharmony_ci last_pos, CEPH_FSCRYPT_BLOCK_SIZE, 175462306a36Sopenharmony_ci ci->i_truncate_size, 175562306a36Sopenharmony_ci ci->i_truncate_seq); 175662306a36Sopenharmony_ci } 175762306a36Sopenharmony_ci 175862306a36Sopenharmony_ci ret = __ceph_alloc_sparse_ext_map(op, 1); 175962306a36Sopenharmony_ci if (ret) { 176062306a36Sopenharmony_ci ceph_osdc_put_request(req); 176162306a36Sopenharmony_ci ceph_release_page_vector(pages, num_pages); 176262306a36Sopenharmony_ci break; 176362306a36Sopenharmony_ci } 176462306a36Sopenharmony_ci 176562306a36Sopenharmony_ci osd_req_op_extent_osd_data_pages(req, first ? 1 : 0, 176662306a36Sopenharmony_ci &pages[num_pages - 1], 176762306a36Sopenharmony_ci CEPH_FSCRYPT_BLOCK_SIZE, 176862306a36Sopenharmony_ci offset_in_page(last_pos), 176962306a36Sopenharmony_ci false, false); 177062306a36Sopenharmony_ci } 177162306a36Sopenharmony_ci 177262306a36Sopenharmony_ci ceph_osdc_start_request(osdc, req); 177362306a36Sopenharmony_ci ret = ceph_osdc_wait_request(osdc, req); 177462306a36Sopenharmony_ci 177562306a36Sopenharmony_ci /* FIXME: length field is wrong if there are 2 extents */ 177662306a36Sopenharmony_ci ceph_update_read_metrics(&fsc->mdsc->metric, 177762306a36Sopenharmony_ci req->r_start_latency, 177862306a36Sopenharmony_ci req->r_end_latency, 177962306a36Sopenharmony_ci read_len, ret); 178062306a36Sopenharmony_ci 178162306a36Sopenharmony_ci /* Ok if object is not already present */ 178262306a36Sopenharmony_ci if (ret == -ENOENT) { 178362306a36Sopenharmony_ci /* 178462306a36Sopenharmony_ci * If there is no object, then we can't assert 178562306a36Sopenharmony_ci * on its version. Set it to 0, and we'll use an 178662306a36Sopenharmony_ci * exclusive create instead. 178762306a36Sopenharmony_ci */ 178862306a36Sopenharmony_ci ceph_osdc_put_request(req); 178962306a36Sopenharmony_ci ret = 0; 179062306a36Sopenharmony_ci 179162306a36Sopenharmony_ci /* 179262306a36Sopenharmony_ci * zero out the soon-to-be uncopied parts of the 179362306a36Sopenharmony_ci * first and last pages. 179462306a36Sopenharmony_ci */ 179562306a36Sopenharmony_ci if (first) 179662306a36Sopenharmony_ci zero_user_segment(pages[0], 0, 179762306a36Sopenharmony_ci offset_in_page(first_pos)); 179862306a36Sopenharmony_ci if (last) 179962306a36Sopenharmony_ci zero_user_segment(pages[num_pages - 1], 180062306a36Sopenharmony_ci offset_in_page(last_pos), 180162306a36Sopenharmony_ci PAGE_SIZE); 180262306a36Sopenharmony_ci } else { 180362306a36Sopenharmony_ci if (ret < 0) { 180462306a36Sopenharmony_ci ceph_osdc_put_request(req); 180562306a36Sopenharmony_ci ceph_release_page_vector(pages, num_pages); 180662306a36Sopenharmony_ci break; 180762306a36Sopenharmony_ci } 180862306a36Sopenharmony_ci 180962306a36Sopenharmony_ci op = &req->r_ops[0]; 181062306a36Sopenharmony_ci if (op->extent.sparse_ext_cnt == 0) { 181162306a36Sopenharmony_ci if (first) 181262306a36Sopenharmony_ci zero_user_segment(pages[0], 0, 181362306a36Sopenharmony_ci offset_in_page(first_pos)); 181462306a36Sopenharmony_ci else 181562306a36Sopenharmony_ci zero_user_segment(pages[num_pages - 1], 181662306a36Sopenharmony_ci offset_in_page(last_pos), 181762306a36Sopenharmony_ci PAGE_SIZE); 181862306a36Sopenharmony_ci } else if (op->extent.sparse_ext_cnt != 1 || 181962306a36Sopenharmony_ci ceph_sparse_ext_map_end(op) != 182062306a36Sopenharmony_ci CEPH_FSCRYPT_BLOCK_SIZE) { 182162306a36Sopenharmony_ci ret = -EIO; 182262306a36Sopenharmony_ci ceph_osdc_put_request(req); 182362306a36Sopenharmony_ci ceph_release_page_vector(pages, num_pages); 182462306a36Sopenharmony_ci break; 182562306a36Sopenharmony_ci } 182662306a36Sopenharmony_ci 182762306a36Sopenharmony_ci if (first && last) { 182862306a36Sopenharmony_ci op = &req->r_ops[1]; 182962306a36Sopenharmony_ci if (op->extent.sparse_ext_cnt == 0) { 183062306a36Sopenharmony_ci zero_user_segment(pages[num_pages - 1], 183162306a36Sopenharmony_ci offset_in_page(last_pos), 183262306a36Sopenharmony_ci PAGE_SIZE); 183362306a36Sopenharmony_ci } else if (op->extent.sparse_ext_cnt != 1 || 183462306a36Sopenharmony_ci ceph_sparse_ext_map_end(op) != 183562306a36Sopenharmony_ci CEPH_FSCRYPT_BLOCK_SIZE) { 183662306a36Sopenharmony_ci ret = -EIO; 183762306a36Sopenharmony_ci ceph_osdc_put_request(req); 183862306a36Sopenharmony_ci ceph_release_page_vector(pages, num_pages); 183962306a36Sopenharmony_ci break; 184062306a36Sopenharmony_ci } 184162306a36Sopenharmony_ci } 184262306a36Sopenharmony_ci 184362306a36Sopenharmony_ci /* Grab assert version. It must be non-zero. */ 184462306a36Sopenharmony_ci assert_ver = req->r_version; 184562306a36Sopenharmony_ci WARN_ON_ONCE(ret > 0 && assert_ver == 0); 184662306a36Sopenharmony_ci 184762306a36Sopenharmony_ci ceph_osdc_put_request(req); 184862306a36Sopenharmony_ci if (first) { 184962306a36Sopenharmony_ci ret = ceph_fscrypt_decrypt_block_inplace(inode, 185062306a36Sopenharmony_ci pages[0], CEPH_FSCRYPT_BLOCK_SIZE, 185162306a36Sopenharmony_ci offset_in_page(first_pos), 185262306a36Sopenharmony_ci first_pos >> CEPH_FSCRYPT_BLOCK_SHIFT); 185362306a36Sopenharmony_ci if (ret < 0) { 185462306a36Sopenharmony_ci ceph_release_page_vector(pages, num_pages); 185562306a36Sopenharmony_ci break; 185662306a36Sopenharmony_ci } 185762306a36Sopenharmony_ci } 185862306a36Sopenharmony_ci if (last) { 185962306a36Sopenharmony_ci ret = ceph_fscrypt_decrypt_block_inplace(inode, 186062306a36Sopenharmony_ci pages[num_pages - 1], 186162306a36Sopenharmony_ci CEPH_FSCRYPT_BLOCK_SIZE, 186262306a36Sopenharmony_ci offset_in_page(last_pos), 186362306a36Sopenharmony_ci last_pos >> CEPH_FSCRYPT_BLOCK_SHIFT); 186462306a36Sopenharmony_ci if (ret < 0) { 186562306a36Sopenharmony_ci ceph_release_page_vector(pages, num_pages); 186662306a36Sopenharmony_ci break; 186762306a36Sopenharmony_ci } 186862306a36Sopenharmony_ci } 186962306a36Sopenharmony_ci } 187062306a36Sopenharmony_ci } 187162306a36Sopenharmony_ci 187262306a36Sopenharmony_ci left = len; 187362306a36Sopenharmony_ci off = offset_in_page(pos); 187462306a36Sopenharmony_ci for (n = 0; n < num_pages; n++) { 187562306a36Sopenharmony_ci size_t plen = min_t(size_t, left, PAGE_SIZE - off); 187662306a36Sopenharmony_ci 187762306a36Sopenharmony_ci /* copy the data */ 187862306a36Sopenharmony_ci ret = copy_page_from_iter(pages[n], off, plen, from); 187962306a36Sopenharmony_ci if (ret != plen) { 188062306a36Sopenharmony_ci ret = -EFAULT; 188162306a36Sopenharmony_ci break; 188262306a36Sopenharmony_ci } 188362306a36Sopenharmony_ci off = 0; 188462306a36Sopenharmony_ci left -= ret; 188562306a36Sopenharmony_ci } 188662306a36Sopenharmony_ci if (ret < 0) { 188762306a36Sopenharmony_ci dout("sync_write write failed with %d\n", ret); 188862306a36Sopenharmony_ci ceph_release_page_vector(pages, num_pages); 188962306a36Sopenharmony_ci break; 189062306a36Sopenharmony_ci } 189162306a36Sopenharmony_ci 189262306a36Sopenharmony_ci if (IS_ENCRYPTED(inode)) { 189362306a36Sopenharmony_ci ret = ceph_fscrypt_encrypt_pages(inode, pages, 189462306a36Sopenharmony_ci write_pos, write_len, 189562306a36Sopenharmony_ci GFP_KERNEL); 189662306a36Sopenharmony_ci if (ret < 0) { 189762306a36Sopenharmony_ci dout("encryption failed with %d\n", ret); 189862306a36Sopenharmony_ci ceph_release_page_vector(pages, num_pages); 189962306a36Sopenharmony_ci break; 190062306a36Sopenharmony_ci } 190162306a36Sopenharmony_ci } 190262306a36Sopenharmony_ci 190362306a36Sopenharmony_ci req = ceph_osdc_new_request(osdc, &ci->i_layout, 190462306a36Sopenharmony_ci ci->i_vino, write_pos, &write_len, 190562306a36Sopenharmony_ci rmw ? 1 : 0, rmw ? 2 : 1, 190662306a36Sopenharmony_ci CEPH_OSD_OP_WRITE, 190762306a36Sopenharmony_ci CEPH_OSD_FLAG_WRITE, 190862306a36Sopenharmony_ci snapc, ci->i_truncate_seq, 190962306a36Sopenharmony_ci ci->i_truncate_size, false); 191062306a36Sopenharmony_ci if (IS_ERR(req)) { 191162306a36Sopenharmony_ci ret = PTR_ERR(req); 191262306a36Sopenharmony_ci ceph_release_page_vector(pages, num_pages); 191362306a36Sopenharmony_ci break; 191462306a36Sopenharmony_ci } 191562306a36Sopenharmony_ci 191662306a36Sopenharmony_ci dout("sync_write write op %lld~%llu\n", write_pos, write_len); 191762306a36Sopenharmony_ci osd_req_op_extent_osd_data_pages(req, rmw ? 1 : 0, pages, write_len, 191862306a36Sopenharmony_ci offset_in_page(write_pos), false, 191962306a36Sopenharmony_ci true); 192062306a36Sopenharmony_ci req->r_inode = inode; 192162306a36Sopenharmony_ci req->r_mtime = mtime; 192262306a36Sopenharmony_ci 192362306a36Sopenharmony_ci /* Set up the assertion */ 192462306a36Sopenharmony_ci if (rmw) { 192562306a36Sopenharmony_ci /* 192662306a36Sopenharmony_ci * Set up the assertion. If we don't have a version 192762306a36Sopenharmony_ci * number, then the object doesn't exist yet. Use an 192862306a36Sopenharmony_ci * exclusive create instead of a version assertion in 192962306a36Sopenharmony_ci * that case. 193062306a36Sopenharmony_ci */ 193162306a36Sopenharmony_ci if (assert_ver) { 193262306a36Sopenharmony_ci osd_req_op_init(req, 0, CEPH_OSD_OP_ASSERT_VER, 0); 193362306a36Sopenharmony_ci req->r_ops[0].assert_ver.ver = assert_ver; 193462306a36Sopenharmony_ci } else { 193562306a36Sopenharmony_ci osd_req_op_init(req, 0, CEPH_OSD_OP_CREATE, 193662306a36Sopenharmony_ci CEPH_OSD_OP_FLAG_EXCL); 193762306a36Sopenharmony_ci } 193862306a36Sopenharmony_ci } 193962306a36Sopenharmony_ci 194062306a36Sopenharmony_ci ceph_osdc_start_request(osdc, req); 194162306a36Sopenharmony_ci ret = ceph_osdc_wait_request(osdc, req); 194262306a36Sopenharmony_ci 194362306a36Sopenharmony_ci ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, 194462306a36Sopenharmony_ci req->r_end_latency, len, ret); 194562306a36Sopenharmony_ci ceph_osdc_put_request(req); 194662306a36Sopenharmony_ci if (ret != 0) { 194762306a36Sopenharmony_ci dout("sync_write osd write returned %d\n", ret); 194862306a36Sopenharmony_ci /* Version changed! Must re-do the rmw cycle */ 194962306a36Sopenharmony_ci if ((assert_ver && (ret == -ERANGE || ret == -EOVERFLOW)) || 195062306a36Sopenharmony_ci (!assert_ver && ret == -EEXIST)) { 195162306a36Sopenharmony_ci /* We should only ever see this on a rmw */ 195262306a36Sopenharmony_ci WARN_ON_ONCE(!rmw); 195362306a36Sopenharmony_ci 195462306a36Sopenharmony_ci /* The version should never go backward */ 195562306a36Sopenharmony_ci WARN_ON_ONCE(ret == -EOVERFLOW); 195662306a36Sopenharmony_ci 195762306a36Sopenharmony_ci *from = saved_iter; 195862306a36Sopenharmony_ci 195962306a36Sopenharmony_ci /* FIXME: limit number of times we loop? */ 196062306a36Sopenharmony_ci continue; 196162306a36Sopenharmony_ci } 196262306a36Sopenharmony_ci ceph_set_error_write(ci); 196362306a36Sopenharmony_ci break; 196462306a36Sopenharmony_ci } 196562306a36Sopenharmony_ci 196662306a36Sopenharmony_ci ceph_clear_error_write(ci); 196762306a36Sopenharmony_ci 196862306a36Sopenharmony_ci /* 196962306a36Sopenharmony_ci * We successfully wrote to a range of the file. Declare 197062306a36Sopenharmony_ci * that region of the pagecache invalid. 197162306a36Sopenharmony_ci */ 197262306a36Sopenharmony_ci ret = invalidate_inode_pages2_range( 197362306a36Sopenharmony_ci inode->i_mapping, 197462306a36Sopenharmony_ci pos >> PAGE_SHIFT, 197562306a36Sopenharmony_ci (pos + len - 1) >> PAGE_SHIFT); 197662306a36Sopenharmony_ci if (ret < 0) { 197762306a36Sopenharmony_ci dout("invalidate_inode_pages2_range returned %d\n", 197862306a36Sopenharmony_ci ret); 197962306a36Sopenharmony_ci ret = 0; 198062306a36Sopenharmony_ci } 198162306a36Sopenharmony_ci pos += len; 198262306a36Sopenharmony_ci written += len; 198362306a36Sopenharmony_ci dout("sync_write written %d\n", written); 198462306a36Sopenharmony_ci if (pos > i_size_read(inode)) { 198562306a36Sopenharmony_ci check_caps = ceph_inode_set_size(inode, pos); 198662306a36Sopenharmony_ci if (check_caps) 198762306a36Sopenharmony_ci ceph_check_caps(ceph_inode(inode), 198862306a36Sopenharmony_ci CHECK_CAPS_AUTHONLY); 198962306a36Sopenharmony_ci } 199062306a36Sopenharmony_ci 199162306a36Sopenharmony_ci } 199262306a36Sopenharmony_ci 199362306a36Sopenharmony_ci if (ret != -EOLDSNAPC && written > 0) { 199462306a36Sopenharmony_ci ret = written; 199562306a36Sopenharmony_ci iocb->ki_pos = pos; 199662306a36Sopenharmony_ci } 199762306a36Sopenharmony_ci dout("sync_write returning %d\n", ret); 199862306a36Sopenharmony_ci return ret; 199962306a36Sopenharmony_ci} 200062306a36Sopenharmony_ci 200162306a36Sopenharmony_ci/* 200262306a36Sopenharmony_ci * Wrap generic_file_aio_read with checks for cap bits on the inode. 200362306a36Sopenharmony_ci * Atomically grab references, so that those bits are not released 200462306a36Sopenharmony_ci * back to the MDS mid-read. 200562306a36Sopenharmony_ci * 200662306a36Sopenharmony_ci * Hmm, the sync read case isn't actually async... should it be? 200762306a36Sopenharmony_ci */ 200862306a36Sopenharmony_cistatic ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to) 200962306a36Sopenharmony_ci{ 201062306a36Sopenharmony_ci struct file *filp = iocb->ki_filp; 201162306a36Sopenharmony_ci struct ceph_file_info *fi = filp->private_data; 201262306a36Sopenharmony_ci size_t len = iov_iter_count(to); 201362306a36Sopenharmony_ci struct inode *inode = file_inode(filp); 201462306a36Sopenharmony_ci struct ceph_inode_info *ci = ceph_inode(inode); 201562306a36Sopenharmony_ci bool direct_lock = iocb->ki_flags & IOCB_DIRECT; 201662306a36Sopenharmony_ci ssize_t ret; 201762306a36Sopenharmony_ci int want = 0, got = 0; 201862306a36Sopenharmony_ci int retry_op = 0, read = 0; 201962306a36Sopenharmony_ci 202062306a36Sopenharmony_ciagain: 202162306a36Sopenharmony_ci dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n", 202262306a36Sopenharmony_ci inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, inode); 202362306a36Sopenharmony_ci 202462306a36Sopenharmony_ci if (ceph_inode_is_shutdown(inode)) 202562306a36Sopenharmony_ci return -ESTALE; 202662306a36Sopenharmony_ci 202762306a36Sopenharmony_ci if (direct_lock) 202862306a36Sopenharmony_ci ceph_start_io_direct(inode); 202962306a36Sopenharmony_ci else 203062306a36Sopenharmony_ci ceph_start_io_read(inode); 203162306a36Sopenharmony_ci 203262306a36Sopenharmony_ci if (!(fi->flags & CEPH_F_SYNC) && !direct_lock) 203362306a36Sopenharmony_ci want |= CEPH_CAP_FILE_CACHE; 203462306a36Sopenharmony_ci if (fi->fmode & CEPH_FILE_MODE_LAZY) 203562306a36Sopenharmony_ci want |= CEPH_CAP_FILE_LAZYIO; 203662306a36Sopenharmony_ci 203762306a36Sopenharmony_ci ret = ceph_get_caps(filp, CEPH_CAP_FILE_RD, want, -1, &got); 203862306a36Sopenharmony_ci if (ret < 0) { 203962306a36Sopenharmony_ci if (direct_lock) 204062306a36Sopenharmony_ci ceph_end_io_direct(inode); 204162306a36Sopenharmony_ci else 204262306a36Sopenharmony_ci ceph_end_io_read(inode); 204362306a36Sopenharmony_ci return ret; 204462306a36Sopenharmony_ci } 204562306a36Sopenharmony_ci 204662306a36Sopenharmony_ci if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 || 204762306a36Sopenharmony_ci (iocb->ki_flags & IOCB_DIRECT) || 204862306a36Sopenharmony_ci (fi->flags & CEPH_F_SYNC)) { 204962306a36Sopenharmony_ci 205062306a36Sopenharmony_ci dout("aio_sync_read %p %llx.%llx %llu~%u got cap refs on %s\n", 205162306a36Sopenharmony_ci inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, 205262306a36Sopenharmony_ci ceph_cap_string(got)); 205362306a36Sopenharmony_ci 205462306a36Sopenharmony_ci if (!ceph_has_inline_data(ci)) { 205562306a36Sopenharmony_ci if (!retry_op && 205662306a36Sopenharmony_ci (iocb->ki_flags & IOCB_DIRECT) && 205762306a36Sopenharmony_ci !IS_ENCRYPTED(inode)) { 205862306a36Sopenharmony_ci ret = ceph_direct_read_write(iocb, to, 205962306a36Sopenharmony_ci NULL, NULL); 206062306a36Sopenharmony_ci if (ret >= 0 && ret < len) 206162306a36Sopenharmony_ci retry_op = CHECK_EOF; 206262306a36Sopenharmony_ci } else { 206362306a36Sopenharmony_ci ret = ceph_sync_read(iocb, to, &retry_op); 206462306a36Sopenharmony_ci } 206562306a36Sopenharmony_ci } else { 206662306a36Sopenharmony_ci retry_op = READ_INLINE; 206762306a36Sopenharmony_ci } 206862306a36Sopenharmony_ci } else { 206962306a36Sopenharmony_ci CEPH_DEFINE_RW_CONTEXT(rw_ctx, got); 207062306a36Sopenharmony_ci dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n", 207162306a36Sopenharmony_ci inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, 207262306a36Sopenharmony_ci ceph_cap_string(got)); 207362306a36Sopenharmony_ci ceph_add_rw_context(fi, &rw_ctx); 207462306a36Sopenharmony_ci ret = generic_file_read_iter(iocb, to); 207562306a36Sopenharmony_ci ceph_del_rw_context(fi, &rw_ctx); 207662306a36Sopenharmony_ci } 207762306a36Sopenharmony_ci 207862306a36Sopenharmony_ci dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n", 207962306a36Sopenharmony_ci inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret); 208062306a36Sopenharmony_ci ceph_put_cap_refs(ci, got); 208162306a36Sopenharmony_ci 208262306a36Sopenharmony_ci if (direct_lock) 208362306a36Sopenharmony_ci ceph_end_io_direct(inode); 208462306a36Sopenharmony_ci else 208562306a36Sopenharmony_ci ceph_end_io_read(inode); 208662306a36Sopenharmony_ci 208762306a36Sopenharmony_ci if (retry_op > HAVE_RETRIED && ret >= 0) { 208862306a36Sopenharmony_ci int statret; 208962306a36Sopenharmony_ci struct page *page = NULL; 209062306a36Sopenharmony_ci loff_t i_size; 209162306a36Sopenharmony_ci if (retry_op == READ_INLINE) { 209262306a36Sopenharmony_ci page = __page_cache_alloc(GFP_KERNEL); 209362306a36Sopenharmony_ci if (!page) 209462306a36Sopenharmony_ci return -ENOMEM; 209562306a36Sopenharmony_ci } 209662306a36Sopenharmony_ci 209762306a36Sopenharmony_ci statret = __ceph_do_getattr(inode, page, 209862306a36Sopenharmony_ci CEPH_STAT_CAP_INLINE_DATA, !!page); 209962306a36Sopenharmony_ci if (statret < 0) { 210062306a36Sopenharmony_ci if (page) 210162306a36Sopenharmony_ci __free_page(page); 210262306a36Sopenharmony_ci if (statret == -ENODATA) { 210362306a36Sopenharmony_ci BUG_ON(retry_op != READ_INLINE); 210462306a36Sopenharmony_ci goto again; 210562306a36Sopenharmony_ci } 210662306a36Sopenharmony_ci return statret; 210762306a36Sopenharmony_ci } 210862306a36Sopenharmony_ci 210962306a36Sopenharmony_ci i_size = i_size_read(inode); 211062306a36Sopenharmony_ci if (retry_op == READ_INLINE) { 211162306a36Sopenharmony_ci BUG_ON(ret > 0 || read > 0); 211262306a36Sopenharmony_ci if (iocb->ki_pos < i_size && 211362306a36Sopenharmony_ci iocb->ki_pos < PAGE_SIZE) { 211462306a36Sopenharmony_ci loff_t end = min_t(loff_t, i_size, 211562306a36Sopenharmony_ci iocb->ki_pos + len); 211662306a36Sopenharmony_ci end = min_t(loff_t, end, PAGE_SIZE); 211762306a36Sopenharmony_ci if (statret < end) 211862306a36Sopenharmony_ci zero_user_segment(page, statret, end); 211962306a36Sopenharmony_ci ret = copy_page_to_iter(page, 212062306a36Sopenharmony_ci iocb->ki_pos & ~PAGE_MASK, 212162306a36Sopenharmony_ci end - iocb->ki_pos, to); 212262306a36Sopenharmony_ci iocb->ki_pos += ret; 212362306a36Sopenharmony_ci read += ret; 212462306a36Sopenharmony_ci } 212562306a36Sopenharmony_ci if (iocb->ki_pos < i_size && read < len) { 212662306a36Sopenharmony_ci size_t zlen = min_t(size_t, len - read, 212762306a36Sopenharmony_ci i_size - iocb->ki_pos); 212862306a36Sopenharmony_ci ret = iov_iter_zero(zlen, to); 212962306a36Sopenharmony_ci iocb->ki_pos += ret; 213062306a36Sopenharmony_ci read += ret; 213162306a36Sopenharmony_ci } 213262306a36Sopenharmony_ci __free_pages(page, 0); 213362306a36Sopenharmony_ci return read; 213462306a36Sopenharmony_ci } 213562306a36Sopenharmony_ci 213662306a36Sopenharmony_ci /* hit EOF or hole? */ 213762306a36Sopenharmony_ci if (retry_op == CHECK_EOF && iocb->ki_pos < i_size && 213862306a36Sopenharmony_ci ret < len) { 213962306a36Sopenharmony_ci dout("sync_read hit hole, ppos %lld < size %lld" 214062306a36Sopenharmony_ci ", reading more\n", iocb->ki_pos, i_size); 214162306a36Sopenharmony_ci 214262306a36Sopenharmony_ci read += ret; 214362306a36Sopenharmony_ci len -= ret; 214462306a36Sopenharmony_ci retry_op = HAVE_RETRIED; 214562306a36Sopenharmony_ci goto again; 214662306a36Sopenharmony_ci } 214762306a36Sopenharmony_ci } 214862306a36Sopenharmony_ci 214962306a36Sopenharmony_ci if (ret >= 0) 215062306a36Sopenharmony_ci ret += read; 215162306a36Sopenharmony_ci 215262306a36Sopenharmony_ci return ret; 215362306a36Sopenharmony_ci} 215462306a36Sopenharmony_ci 215562306a36Sopenharmony_ci/* 215662306a36Sopenharmony_ci * Wrap filemap_splice_read with checks for cap bits on the inode. 215762306a36Sopenharmony_ci * Atomically grab references, so that those bits are not released 215862306a36Sopenharmony_ci * back to the MDS mid-read. 215962306a36Sopenharmony_ci */ 216062306a36Sopenharmony_cistatic ssize_t ceph_splice_read(struct file *in, loff_t *ppos, 216162306a36Sopenharmony_ci struct pipe_inode_info *pipe, 216262306a36Sopenharmony_ci size_t len, unsigned int flags) 216362306a36Sopenharmony_ci{ 216462306a36Sopenharmony_ci struct ceph_file_info *fi = in->private_data; 216562306a36Sopenharmony_ci struct inode *inode = file_inode(in); 216662306a36Sopenharmony_ci struct ceph_inode_info *ci = ceph_inode(inode); 216762306a36Sopenharmony_ci ssize_t ret; 216862306a36Sopenharmony_ci int want = 0, got = 0; 216962306a36Sopenharmony_ci CEPH_DEFINE_RW_CONTEXT(rw_ctx, 0); 217062306a36Sopenharmony_ci 217162306a36Sopenharmony_ci dout("splice_read %p %llx.%llx %llu~%zu trying to get caps on %p\n", 217262306a36Sopenharmony_ci inode, ceph_vinop(inode), *ppos, len, inode); 217362306a36Sopenharmony_ci 217462306a36Sopenharmony_ci if (ceph_inode_is_shutdown(inode)) 217562306a36Sopenharmony_ci return -ESTALE; 217662306a36Sopenharmony_ci 217762306a36Sopenharmony_ci if (ceph_has_inline_data(ci) || 217862306a36Sopenharmony_ci (fi->flags & CEPH_F_SYNC)) 217962306a36Sopenharmony_ci return copy_splice_read(in, ppos, pipe, len, flags); 218062306a36Sopenharmony_ci 218162306a36Sopenharmony_ci ceph_start_io_read(inode); 218262306a36Sopenharmony_ci 218362306a36Sopenharmony_ci want = CEPH_CAP_FILE_CACHE; 218462306a36Sopenharmony_ci if (fi->fmode & CEPH_FILE_MODE_LAZY) 218562306a36Sopenharmony_ci want |= CEPH_CAP_FILE_LAZYIO; 218662306a36Sopenharmony_ci 218762306a36Sopenharmony_ci ret = ceph_get_caps(in, CEPH_CAP_FILE_RD, want, -1, &got); 218862306a36Sopenharmony_ci if (ret < 0) 218962306a36Sopenharmony_ci goto out_end; 219062306a36Sopenharmony_ci 219162306a36Sopenharmony_ci if ((got & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) == 0) { 219262306a36Sopenharmony_ci dout("splice_read/sync %p %llx.%llx %llu~%zu got cap refs on %s\n", 219362306a36Sopenharmony_ci inode, ceph_vinop(inode), *ppos, len, 219462306a36Sopenharmony_ci ceph_cap_string(got)); 219562306a36Sopenharmony_ci 219662306a36Sopenharmony_ci ceph_put_cap_refs(ci, got); 219762306a36Sopenharmony_ci ceph_end_io_read(inode); 219862306a36Sopenharmony_ci return copy_splice_read(in, ppos, pipe, len, flags); 219962306a36Sopenharmony_ci } 220062306a36Sopenharmony_ci 220162306a36Sopenharmony_ci dout("splice_read %p %llx.%llx %llu~%zu got cap refs on %s\n", 220262306a36Sopenharmony_ci inode, ceph_vinop(inode), *ppos, len, ceph_cap_string(got)); 220362306a36Sopenharmony_ci 220462306a36Sopenharmony_ci rw_ctx.caps = got; 220562306a36Sopenharmony_ci ceph_add_rw_context(fi, &rw_ctx); 220662306a36Sopenharmony_ci ret = filemap_splice_read(in, ppos, pipe, len, flags); 220762306a36Sopenharmony_ci ceph_del_rw_context(fi, &rw_ctx); 220862306a36Sopenharmony_ci 220962306a36Sopenharmony_ci dout("splice_read %p %llx.%llx dropping cap refs on %s = %zd\n", 221062306a36Sopenharmony_ci inode, ceph_vinop(inode), ceph_cap_string(got), ret); 221162306a36Sopenharmony_ci 221262306a36Sopenharmony_ci ceph_put_cap_refs(ci, got); 221362306a36Sopenharmony_ciout_end: 221462306a36Sopenharmony_ci ceph_end_io_read(inode); 221562306a36Sopenharmony_ci return ret; 221662306a36Sopenharmony_ci} 221762306a36Sopenharmony_ci 221862306a36Sopenharmony_ci/* 221962306a36Sopenharmony_ci * Take cap references to avoid releasing caps to MDS mid-write. 222062306a36Sopenharmony_ci * 222162306a36Sopenharmony_ci * If we are synchronous, and write with an old snap context, the OSD 222262306a36Sopenharmony_ci * may return EOLDSNAPC. In that case, retry the write.. _after_ 222362306a36Sopenharmony_ci * dropping our cap refs and allowing the pending snap to logically 222462306a36Sopenharmony_ci * complete _before_ this write occurs. 222562306a36Sopenharmony_ci * 222662306a36Sopenharmony_ci * If we are near ENOSPC, write synchronously. 222762306a36Sopenharmony_ci */ 222862306a36Sopenharmony_cistatic ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) 222962306a36Sopenharmony_ci{ 223062306a36Sopenharmony_ci struct file *file = iocb->ki_filp; 223162306a36Sopenharmony_ci struct ceph_file_info *fi = file->private_data; 223262306a36Sopenharmony_ci struct inode *inode = file_inode(file); 223362306a36Sopenharmony_ci struct ceph_inode_info *ci = ceph_inode(inode); 223462306a36Sopenharmony_ci struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 223562306a36Sopenharmony_ci struct ceph_osd_client *osdc = &fsc->client->osdc; 223662306a36Sopenharmony_ci struct ceph_cap_flush *prealloc_cf; 223762306a36Sopenharmony_ci ssize_t count, written = 0; 223862306a36Sopenharmony_ci int err, want = 0, got; 223962306a36Sopenharmony_ci bool direct_lock = false; 224062306a36Sopenharmony_ci u32 map_flags; 224162306a36Sopenharmony_ci u64 pool_flags; 224262306a36Sopenharmony_ci loff_t pos; 224362306a36Sopenharmony_ci loff_t limit = max(i_size_read(inode), fsc->max_file_size); 224462306a36Sopenharmony_ci 224562306a36Sopenharmony_ci if (ceph_inode_is_shutdown(inode)) 224662306a36Sopenharmony_ci return -ESTALE; 224762306a36Sopenharmony_ci 224862306a36Sopenharmony_ci if (ceph_snap(inode) != CEPH_NOSNAP) 224962306a36Sopenharmony_ci return -EROFS; 225062306a36Sopenharmony_ci 225162306a36Sopenharmony_ci prealloc_cf = ceph_alloc_cap_flush(); 225262306a36Sopenharmony_ci if (!prealloc_cf) 225362306a36Sopenharmony_ci return -ENOMEM; 225462306a36Sopenharmony_ci 225562306a36Sopenharmony_ci if ((iocb->ki_flags & (IOCB_DIRECT | IOCB_APPEND)) == IOCB_DIRECT) 225662306a36Sopenharmony_ci direct_lock = true; 225762306a36Sopenharmony_ci 225862306a36Sopenharmony_ciretry_snap: 225962306a36Sopenharmony_ci if (direct_lock) 226062306a36Sopenharmony_ci ceph_start_io_direct(inode); 226162306a36Sopenharmony_ci else 226262306a36Sopenharmony_ci ceph_start_io_write(inode); 226362306a36Sopenharmony_ci 226462306a36Sopenharmony_ci if (iocb->ki_flags & IOCB_APPEND) { 226562306a36Sopenharmony_ci err = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false); 226662306a36Sopenharmony_ci if (err < 0) 226762306a36Sopenharmony_ci goto out; 226862306a36Sopenharmony_ci } 226962306a36Sopenharmony_ci 227062306a36Sopenharmony_ci err = generic_write_checks(iocb, from); 227162306a36Sopenharmony_ci if (err <= 0) 227262306a36Sopenharmony_ci goto out; 227362306a36Sopenharmony_ci 227462306a36Sopenharmony_ci pos = iocb->ki_pos; 227562306a36Sopenharmony_ci if (unlikely(pos >= limit)) { 227662306a36Sopenharmony_ci err = -EFBIG; 227762306a36Sopenharmony_ci goto out; 227862306a36Sopenharmony_ci } else { 227962306a36Sopenharmony_ci iov_iter_truncate(from, limit - pos); 228062306a36Sopenharmony_ci } 228162306a36Sopenharmony_ci 228262306a36Sopenharmony_ci count = iov_iter_count(from); 228362306a36Sopenharmony_ci if (ceph_quota_is_max_bytes_exceeded(inode, pos + count)) { 228462306a36Sopenharmony_ci err = -EDQUOT; 228562306a36Sopenharmony_ci goto out; 228662306a36Sopenharmony_ci } 228762306a36Sopenharmony_ci 228862306a36Sopenharmony_ci down_read(&osdc->lock); 228962306a36Sopenharmony_ci map_flags = osdc->osdmap->flags; 229062306a36Sopenharmony_ci pool_flags = ceph_pg_pool_flags(osdc->osdmap, ci->i_layout.pool_id); 229162306a36Sopenharmony_ci up_read(&osdc->lock); 229262306a36Sopenharmony_ci if ((map_flags & CEPH_OSDMAP_FULL) || 229362306a36Sopenharmony_ci (pool_flags & CEPH_POOL_FLAG_FULL)) { 229462306a36Sopenharmony_ci err = -ENOSPC; 229562306a36Sopenharmony_ci goto out; 229662306a36Sopenharmony_ci } 229762306a36Sopenharmony_ci 229862306a36Sopenharmony_ci err = file_remove_privs(file); 229962306a36Sopenharmony_ci if (err) 230062306a36Sopenharmony_ci goto out; 230162306a36Sopenharmony_ci 230262306a36Sopenharmony_ci dout("aio_write %p %llx.%llx %llu~%zd getting caps. i_size %llu\n", 230362306a36Sopenharmony_ci inode, ceph_vinop(inode), pos, count, i_size_read(inode)); 230462306a36Sopenharmony_ci if (!(fi->flags & CEPH_F_SYNC) && !direct_lock) 230562306a36Sopenharmony_ci want |= CEPH_CAP_FILE_BUFFER; 230662306a36Sopenharmony_ci if (fi->fmode & CEPH_FILE_MODE_LAZY) 230762306a36Sopenharmony_ci want |= CEPH_CAP_FILE_LAZYIO; 230862306a36Sopenharmony_ci got = 0; 230962306a36Sopenharmony_ci err = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, pos + count, &got); 231062306a36Sopenharmony_ci if (err < 0) 231162306a36Sopenharmony_ci goto out; 231262306a36Sopenharmony_ci 231362306a36Sopenharmony_ci err = file_update_time(file); 231462306a36Sopenharmony_ci if (err) 231562306a36Sopenharmony_ci goto out_caps; 231662306a36Sopenharmony_ci 231762306a36Sopenharmony_ci inode_inc_iversion_raw(inode); 231862306a36Sopenharmony_ci 231962306a36Sopenharmony_ci dout("aio_write %p %llx.%llx %llu~%zd got cap refs on %s\n", 232062306a36Sopenharmony_ci inode, ceph_vinop(inode), pos, count, ceph_cap_string(got)); 232162306a36Sopenharmony_ci 232262306a36Sopenharmony_ci if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 || 232362306a36Sopenharmony_ci (iocb->ki_flags & IOCB_DIRECT) || (fi->flags & CEPH_F_SYNC) || 232462306a36Sopenharmony_ci (ci->i_ceph_flags & CEPH_I_ERROR_WRITE)) { 232562306a36Sopenharmony_ci struct ceph_snap_context *snapc; 232662306a36Sopenharmony_ci struct iov_iter data; 232762306a36Sopenharmony_ci 232862306a36Sopenharmony_ci spin_lock(&ci->i_ceph_lock); 232962306a36Sopenharmony_ci if (__ceph_have_pending_cap_snap(ci)) { 233062306a36Sopenharmony_ci struct ceph_cap_snap *capsnap = 233162306a36Sopenharmony_ci list_last_entry(&ci->i_cap_snaps, 233262306a36Sopenharmony_ci struct ceph_cap_snap, 233362306a36Sopenharmony_ci ci_item); 233462306a36Sopenharmony_ci snapc = ceph_get_snap_context(capsnap->context); 233562306a36Sopenharmony_ci } else { 233662306a36Sopenharmony_ci BUG_ON(!ci->i_head_snapc); 233762306a36Sopenharmony_ci snapc = ceph_get_snap_context(ci->i_head_snapc); 233862306a36Sopenharmony_ci } 233962306a36Sopenharmony_ci spin_unlock(&ci->i_ceph_lock); 234062306a36Sopenharmony_ci 234162306a36Sopenharmony_ci /* we might need to revert back to that point */ 234262306a36Sopenharmony_ci data = *from; 234362306a36Sopenharmony_ci if ((iocb->ki_flags & IOCB_DIRECT) && !IS_ENCRYPTED(inode)) 234462306a36Sopenharmony_ci written = ceph_direct_read_write(iocb, &data, snapc, 234562306a36Sopenharmony_ci &prealloc_cf); 234662306a36Sopenharmony_ci else 234762306a36Sopenharmony_ci written = ceph_sync_write(iocb, &data, pos, snapc); 234862306a36Sopenharmony_ci if (direct_lock) 234962306a36Sopenharmony_ci ceph_end_io_direct(inode); 235062306a36Sopenharmony_ci else 235162306a36Sopenharmony_ci ceph_end_io_write(inode); 235262306a36Sopenharmony_ci if (written > 0) 235362306a36Sopenharmony_ci iov_iter_advance(from, written); 235462306a36Sopenharmony_ci ceph_put_snap_context(snapc); 235562306a36Sopenharmony_ci } else { 235662306a36Sopenharmony_ci /* 235762306a36Sopenharmony_ci * No need to acquire the i_truncate_mutex. Because 235862306a36Sopenharmony_ci * the MDS revokes Fwb caps before sending truncate 235962306a36Sopenharmony_ci * message to us. We can't get Fwb cap while there 236062306a36Sopenharmony_ci * are pending vmtruncate. So write and vmtruncate 236162306a36Sopenharmony_ci * can not run at the same time 236262306a36Sopenharmony_ci */ 236362306a36Sopenharmony_ci written = generic_perform_write(iocb, from); 236462306a36Sopenharmony_ci ceph_end_io_write(inode); 236562306a36Sopenharmony_ci } 236662306a36Sopenharmony_ci 236762306a36Sopenharmony_ci if (written >= 0) { 236862306a36Sopenharmony_ci int dirty; 236962306a36Sopenharmony_ci 237062306a36Sopenharmony_ci spin_lock(&ci->i_ceph_lock); 237162306a36Sopenharmony_ci dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, 237262306a36Sopenharmony_ci &prealloc_cf); 237362306a36Sopenharmony_ci spin_unlock(&ci->i_ceph_lock); 237462306a36Sopenharmony_ci if (dirty) 237562306a36Sopenharmony_ci __mark_inode_dirty(inode, dirty); 237662306a36Sopenharmony_ci if (ceph_quota_is_max_bytes_approaching(inode, iocb->ki_pos)) 237762306a36Sopenharmony_ci ceph_check_caps(ci, CHECK_CAPS_FLUSH); 237862306a36Sopenharmony_ci } 237962306a36Sopenharmony_ci 238062306a36Sopenharmony_ci dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n", 238162306a36Sopenharmony_ci inode, ceph_vinop(inode), pos, (unsigned)count, 238262306a36Sopenharmony_ci ceph_cap_string(got)); 238362306a36Sopenharmony_ci ceph_put_cap_refs(ci, got); 238462306a36Sopenharmony_ci 238562306a36Sopenharmony_ci if (written == -EOLDSNAPC) { 238662306a36Sopenharmony_ci dout("aio_write %p %llx.%llx %llu~%u" "got EOLDSNAPC, retrying\n", 238762306a36Sopenharmony_ci inode, ceph_vinop(inode), pos, (unsigned)count); 238862306a36Sopenharmony_ci goto retry_snap; 238962306a36Sopenharmony_ci } 239062306a36Sopenharmony_ci 239162306a36Sopenharmony_ci if (written >= 0) { 239262306a36Sopenharmony_ci if ((map_flags & CEPH_OSDMAP_NEARFULL) || 239362306a36Sopenharmony_ci (pool_flags & CEPH_POOL_FLAG_NEARFULL)) 239462306a36Sopenharmony_ci iocb->ki_flags |= IOCB_DSYNC; 239562306a36Sopenharmony_ci written = generic_write_sync(iocb, written); 239662306a36Sopenharmony_ci } 239762306a36Sopenharmony_ci 239862306a36Sopenharmony_ci goto out_unlocked; 239962306a36Sopenharmony_ciout_caps: 240062306a36Sopenharmony_ci ceph_put_cap_refs(ci, got); 240162306a36Sopenharmony_ciout: 240262306a36Sopenharmony_ci if (direct_lock) 240362306a36Sopenharmony_ci ceph_end_io_direct(inode); 240462306a36Sopenharmony_ci else 240562306a36Sopenharmony_ci ceph_end_io_write(inode); 240662306a36Sopenharmony_ciout_unlocked: 240762306a36Sopenharmony_ci ceph_free_cap_flush(prealloc_cf); 240862306a36Sopenharmony_ci return written ? written : err; 240962306a36Sopenharmony_ci} 241062306a36Sopenharmony_ci 241162306a36Sopenharmony_ci/* 241262306a36Sopenharmony_ci * llseek. be sure to verify file size on SEEK_END. 241362306a36Sopenharmony_ci */ 241462306a36Sopenharmony_cistatic loff_t ceph_llseek(struct file *file, loff_t offset, int whence) 241562306a36Sopenharmony_ci{ 241662306a36Sopenharmony_ci if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) { 241762306a36Sopenharmony_ci struct inode *inode = file_inode(file); 241862306a36Sopenharmony_ci int ret; 241962306a36Sopenharmony_ci 242062306a36Sopenharmony_ci ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false); 242162306a36Sopenharmony_ci if (ret < 0) 242262306a36Sopenharmony_ci return ret; 242362306a36Sopenharmony_ci } 242462306a36Sopenharmony_ci return generic_file_llseek(file, offset, whence); 242562306a36Sopenharmony_ci} 242662306a36Sopenharmony_ci 242762306a36Sopenharmony_cistatic inline void ceph_zero_partial_page( 242862306a36Sopenharmony_ci struct inode *inode, loff_t offset, unsigned size) 242962306a36Sopenharmony_ci{ 243062306a36Sopenharmony_ci struct page *page; 243162306a36Sopenharmony_ci pgoff_t index = offset >> PAGE_SHIFT; 243262306a36Sopenharmony_ci 243362306a36Sopenharmony_ci page = find_lock_page(inode->i_mapping, index); 243462306a36Sopenharmony_ci if (page) { 243562306a36Sopenharmony_ci wait_on_page_writeback(page); 243662306a36Sopenharmony_ci zero_user(page, offset & (PAGE_SIZE - 1), size); 243762306a36Sopenharmony_ci unlock_page(page); 243862306a36Sopenharmony_ci put_page(page); 243962306a36Sopenharmony_ci } 244062306a36Sopenharmony_ci} 244162306a36Sopenharmony_ci 244262306a36Sopenharmony_cistatic void ceph_zero_pagecache_range(struct inode *inode, loff_t offset, 244362306a36Sopenharmony_ci loff_t length) 244462306a36Sopenharmony_ci{ 244562306a36Sopenharmony_ci loff_t nearly = round_up(offset, PAGE_SIZE); 244662306a36Sopenharmony_ci if (offset < nearly) { 244762306a36Sopenharmony_ci loff_t size = nearly - offset; 244862306a36Sopenharmony_ci if (length < size) 244962306a36Sopenharmony_ci size = length; 245062306a36Sopenharmony_ci ceph_zero_partial_page(inode, offset, size); 245162306a36Sopenharmony_ci offset += size; 245262306a36Sopenharmony_ci length -= size; 245362306a36Sopenharmony_ci } 245462306a36Sopenharmony_ci if (length >= PAGE_SIZE) { 245562306a36Sopenharmony_ci loff_t size = round_down(length, PAGE_SIZE); 245662306a36Sopenharmony_ci truncate_pagecache_range(inode, offset, offset + size - 1); 245762306a36Sopenharmony_ci offset += size; 245862306a36Sopenharmony_ci length -= size; 245962306a36Sopenharmony_ci } 246062306a36Sopenharmony_ci if (length) 246162306a36Sopenharmony_ci ceph_zero_partial_page(inode, offset, length); 246262306a36Sopenharmony_ci} 246362306a36Sopenharmony_ci 246462306a36Sopenharmony_cistatic int ceph_zero_partial_object(struct inode *inode, 246562306a36Sopenharmony_ci loff_t offset, loff_t *length) 246662306a36Sopenharmony_ci{ 246762306a36Sopenharmony_ci struct ceph_inode_info *ci = ceph_inode(inode); 246862306a36Sopenharmony_ci struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 246962306a36Sopenharmony_ci struct ceph_osd_request *req; 247062306a36Sopenharmony_ci int ret = 0; 247162306a36Sopenharmony_ci loff_t zero = 0; 247262306a36Sopenharmony_ci int op; 247362306a36Sopenharmony_ci 247462306a36Sopenharmony_ci if (ceph_inode_is_shutdown(inode)) 247562306a36Sopenharmony_ci return -EIO; 247662306a36Sopenharmony_ci 247762306a36Sopenharmony_ci if (!length) { 247862306a36Sopenharmony_ci op = offset ? CEPH_OSD_OP_DELETE : CEPH_OSD_OP_TRUNCATE; 247962306a36Sopenharmony_ci length = &zero; 248062306a36Sopenharmony_ci } else { 248162306a36Sopenharmony_ci op = CEPH_OSD_OP_ZERO; 248262306a36Sopenharmony_ci } 248362306a36Sopenharmony_ci 248462306a36Sopenharmony_ci req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, 248562306a36Sopenharmony_ci ceph_vino(inode), 248662306a36Sopenharmony_ci offset, length, 248762306a36Sopenharmony_ci 0, 1, op, 248862306a36Sopenharmony_ci CEPH_OSD_FLAG_WRITE, 248962306a36Sopenharmony_ci NULL, 0, 0, false); 249062306a36Sopenharmony_ci if (IS_ERR(req)) { 249162306a36Sopenharmony_ci ret = PTR_ERR(req); 249262306a36Sopenharmony_ci goto out; 249362306a36Sopenharmony_ci } 249462306a36Sopenharmony_ci 249562306a36Sopenharmony_ci req->r_mtime = inode->i_mtime; 249662306a36Sopenharmony_ci ceph_osdc_start_request(&fsc->client->osdc, req); 249762306a36Sopenharmony_ci ret = ceph_osdc_wait_request(&fsc->client->osdc, req); 249862306a36Sopenharmony_ci if (ret == -ENOENT) 249962306a36Sopenharmony_ci ret = 0; 250062306a36Sopenharmony_ci ceph_osdc_put_request(req); 250162306a36Sopenharmony_ci 250262306a36Sopenharmony_ciout: 250362306a36Sopenharmony_ci return ret; 250462306a36Sopenharmony_ci} 250562306a36Sopenharmony_ci 250662306a36Sopenharmony_cistatic int ceph_zero_objects(struct inode *inode, loff_t offset, loff_t length) 250762306a36Sopenharmony_ci{ 250862306a36Sopenharmony_ci int ret = 0; 250962306a36Sopenharmony_ci struct ceph_inode_info *ci = ceph_inode(inode); 251062306a36Sopenharmony_ci s32 stripe_unit = ci->i_layout.stripe_unit; 251162306a36Sopenharmony_ci s32 stripe_count = ci->i_layout.stripe_count; 251262306a36Sopenharmony_ci s32 object_size = ci->i_layout.object_size; 251362306a36Sopenharmony_ci u64 object_set_size = object_size * stripe_count; 251462306a36Sopenharmony_ci u64 nearly, t; 251562306a36Sopenharmony_ci 251662306a36Sopenharmony_ci /* round offset up to next period boundary */ 251762306a36Sopenharmony_ci nearly = offset + object_set_size - 1; 251862306a36Sopenharmony_ci t = nearly; 251962306a36Sopenharmony_ci nearly -= do_div(t, object_set_size); 252062306a36Sopenharmony_ci 252162306a36Sopenharmony_ci while (length && offset < nearly) { 252262306a36Sopenharmony_ci loff_t size = length; 252362306a36Sopenharmony_ci ret = ceph_zero_partial_object(inode, offset, &size); 252462306a36Sopenharmony_ci if (ret < 0) 252562306a36Sopenharmony_ci return ret; 252662306a36Sopenharmony_ci offset += size; 252762306a36Sopenharmony_ci length -= size; 252862306a36Sopenharmony_ci } 252962306a36Sopenharmony_ci while (length >= object_set_size) { 253062306a36Sopenharmony_ci int i; 253162306a36Sopenharmony_ci loff_t pos = offset; 253262306a36Sopenharmony_ci for (i = 0; i < stripe_count; ++i) { 253362306a36Sopenharmony_ci ret = ceph_zero_partial_object(inode, pos, NULL); 253462306a36Sopenharmony_ci if (ret < 0) 253562306a36Sopenharmony_ci return ret; 253662306a36Sopenharmony_ci pos += stripe_unit; 253762306a36Sopenharmony_ci } 253862306a36Sopenharmony_ci offset += object_set_size; 253962306a36Sopenharmony_ci length -= object_set_size; 254062306a36Sopenharmony_ci } 254162306a36Sopenharmony_ci while (length) { 254262306a36Sopenharmony_ci loff_t size = length; 254362306a36Sopenharmony_ci ret = ceph_zero_partial_object(inode, offset, &size); 254462306a36Sopenharmony_ci if (ret < 0) 254562306a36Sopenharmony_ci return ret; 254662306a36Sopenharmony_ci offset += size; 254762306a36Sopenharmony_ci length -= size; 254862306a36Sopenharmony_ci } 254962306a36Sopenharmony_ci return ret; 255062306a36Sopenharmony_ci} 255162306a36Sopenharmony_ci 255262306a36Sopenharmony_cistatic long ceph_fallocate(struct file *file, int mode, 255362306a36Sopenharmony_ci loff_t offset, loff_t length) 255462306a36Sopenharmony_ci{ 255562306a36Sopenharmony_ci struct ceph_file_info *fi = file->private_data; 255662306a36Sopenharmony_ci struct inode *inode = file_inode(file); 255762306a36Sopenharmony_ci struct ceph_inode_info *ci = ceph_inode(inode); 255862306a36Sopenharmony_ci struct ceph_cap_flush *prealloc_cf; 255962306a36Sopenharmony_ci int want, got = 0; 256062306a36Sopenharmony_ci int dirty; 256162306a36Sopenharmony_ci int ret = 0; 256262306a36Sopenharmony_ci loff_t endoff = 0; 256362306a36Sopenharmony_ci loff_t size; 256462306a36Sopenharmony_ci 256562306a36Sopenharmony_ci dout("%s %p %llx.%llx mode %x, offset %llu length %llu\n", __func__, 256662306a36Sopenharmony_ci inode, ceph_vinop(inode), mode, offset, length); 256762306a36Sopenharmony_ci 256862306a36Sopenharmony_ci if (mode != (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) 256962306a36Sopenharmony_ci return -EOPNOTSUPP; 257062306a36Sopenharmony_ci 257162306a36Sopenharmony_ci if (!S_ISREG(inode->i_mode)) 257262306a36Sopenharmony_ci return -EOPNOTSUPP; 257362306a36Sopenharmony_ci 257462306a36Sopenharmony_ci if (IS_ENCRYPTED(inode)) 257562306a36Sopenharmony_ci return -EOPNOTSUPP; 257662306a36Sopenharmony_ci 257762306a36Sopenharmony_ci prealloc_cf = ceph_alloc_cap_flush(); 257862306a36Sopenharmony_ci if (!prealloc_cf) 257962306a36Sopenharmony_ci return -ENOMEM; 258062306a36Sopenharmony_ci 258162306a36Sopenharmony_ci inode_lock(inode); 258262306a36Sopenharmony_ci 258362306a36Sopenharmony_ci if (ceph_snap(inode) != CEPH_NOSNAP) { 258462306a36Sopenharmony_ci ret = -EROFS; 258562306a36Sopenharmony_ci goto unlock; 258662306a36Sopenharmony_ci } 258762306a36Sopenharmony_ci 258862306a36Sopenharmony_ci size = i_size_read(inode); 258962306a36Sopenharmony_ci 259062306a36Sopenharmony_ci /* Are we punching a hole beyond EOF? */ 259162306a36Sopenharmony_ci if (offset >= size) 259262306a36Sopenharmony_ci goto unlock; 259362306a36Sopenharmony_ci if ((offset + length) > size) 259462306a36Sopenharmony_ci length = size - offset; 259562306a36Sopenharmony_ci 259662306a36Sopenharmony_ci if (fi->fmode & CEPH_FILE_MODE_LAZY) 259762306a36Sopenharmony_ci want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; 259862306a36Sopenharmony_ci else 259962306a36Sopenharmony_ci want = CEPH_CAP_FILE_BUFFER; 260062306a36Sopenharmony_ci 260162306a36Sopenharmony_ci ret = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, endoff, &got); 260262306a36Sopenharmony_ci if (ret < 0) 260362306a36Sopenharmony_ci goto unlock; 260462306a36Sopenharmony_ci 260562306a36Sopenharmony_ci ret = file_modified(file); 260662306a36Sopenharmony_ci if (ret) 260762306a36Sopenharmony_ci goto put_caps; 260862306a36Sopenharmony_ci 260962306a36Sopenharmony_ci filemap_invalidate_lock(inode->i_mapping); 261062306a36Sopenharmony_ci ceph_fscache_invalidate(inode, false); 261162306a36Sopenharmony_ci ceph_zero_pagecache_range(inode, offset, length); 261262306a36Sopenharmony_ci ret = ceph_zero_objects(inode, offset, length); 261362306a36Sopenharmony_ci 261462306a36Sopenharmony_ci if (!ret) { 261562306a36Sopenharmony_ci spin_lock(&ci->i_ceph_lock); 261662306a36Sopenharmony_ci dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, 261762306a36Sopenharmony_ci &prealloc_cf); 261862306a36Sopenharmony_ci spin_unlock(&ci->i_ceph_lock); 261962306a36Sopenharmony_ci if (dirty) 262062306a36Sopenharmony_ci __mark_inode_dirty(inode, dirty); 262162306a36Sopenharmony_ci } 262262306a36Sopenharmony_ci filemap_invalidate_unlock(inode->i_mapping); 262362306a36Sopenharmony_ci 262462306a36Sopenharmony_ciput_caps: 262562306a36Sopenharmony_ci ceph_put_cap_refs(ci, got); 262662306a36Sopenharmony_ciunlock: 262762306a36Sopenharmony_ci inode_unlock(inode); 262862306a36Sopenharmony_ci ceph_free_cap_flush(prealloc_cf); 262962306a36Sopenharmony_ci return ret; 263062306a36Sopenharmony_ci} 263162306a36Sopenharmony_ci 263262306a36Sopenharmony_ci/* 263362306a36Sopenharmony_ci * This function tries to get FILE_WR capabilities for dst_ci and FILE_RD for 263462306a36Sopenharmony_ci * src_ci. Two attempts are made to obtain both caps, and an error is return if 263562306a36Sopenharmony_ci * this fails; zero is returned on success. 263662306a36Sopenharmony_ci */ 263762306a36Sopenharmony_cistatic int get_rd_wr_caps(struct file *src_filp, int *src_got, 263862306a36Sopenharmony_ci struct file *dst_filp, 263962306a36Sopenharmony_ci loff_t dst_endoff, int *dst_got) 264062306a36Sopenharmony_ci{ 264162306a36Sopenharmony_ci int ret = 0; 264262306a36Sopenharmony_ci bool retrying = false; 264362306a36Sopenharmony_ci 264462306a36Sopenharmony_ciretry_caps: 264562306a36Sopenharmony_ci ret = ceph_get_caps(dst_filp, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, 264662306a36Sopenharmony_ci dst_endoff, dst_got); 264762306a36Sopenharmony_ci if (ret < 0) 264862306a36Sopenharmony_ci return ret; 264962306a36Sopenharmony_ci 265062306a36Sopenharmony_ci /* 265162306a36Sopenharmony_ci * Since we're already holding the FILE_WR capability for the dst file, 265262306a36Sopenharmony_ci * we would risk a deadlock by using ceph_get_caps. Thus, we'll do some 265362306a36Sopenharmony_ci * retry dance instead to try to get both capabilities. 265462306a36Sopenharmony_ci */ 265562306a36Sopenharmony_ci ret = ceph_try_get_caps(file_inode(src_filp), 265662306a36Sopenharmony_ci CEPH_CAP_FILE_RD, CEPH_CAP_FILE_SHARED, 265762306a36Sopenharmony_ci false, src_got); 265862306a36Sopenharmony_ci if (ret <= 0) { 265962306a36Sopenharmony_ci /* Start by dropping dst_ci caps and getting src_ci caps */ 266062306a36Sopenharmony_ci ceph_put_cap_refs(ceph_inode(file_inode(dst_filp)), *dst_got); 266162306a36Sopenharmony_ci if (retrying) { 266262306a36Sopenharmony_ci if (!ret) 266362306a36Sopenharmony_ci /* ceph_try_get_caps masks EAGAIN */ 266462306a36Sopenharmony_ci ret = -EAGAIN; 266562306a36Sopenharmony_ci return ret; 266662306a36Sopenharmony_ci } 266762306a36Sopenharmony_ci ret = ceph_get_caps(src_filp, CEPH_CAP_FILE_RD, 266862306a36Sopenharmony_ci CEPH_CAP_FILE_SHARED, -1, src_got); 266962306a36Sopenharmony_ci if (ret < 0) 267062306a36Sopenharmony_ci return ret; 267162306a36Sopenharmony_ci /*... drop src_ci caps too, and retry */ 267262306a36Sopenharmony_ci ceph_put_cap_refs(ceph_inode(file_inode(src_filp)), *src_got); 267362306a36Sopenharmony_ci retrying = true; 267462306a36Sopenharmony_ci goto retry_caps; 267562306a36Sopenharmony_ci } 267662306a36Sopenharmony_ci return ret; 267762306a36Sopenharmony_ci} 267862306a36Sopenharmony_ci 267962306a36Sopenharmony_cistatic void put_rd_wr_caps(struct ceph_inode_info *src_ci, int src_got, 268062306a36Sopenharmony_ci struct ceph_inode_info *dst_ci, int dst_got) 268162306a36Sopenharmony_ci{ 268262306a36Sopenharmony_ci ceph_put_cap_refs(src_ci, src_got); 268362306a36Sopenharmony_ci ceph_put_cap_refs(dst_ci, dst_got); 268462306a36Sopenharmony_ci} 268562306a36Sopenharmony_ci 268662306a36Sopenharmony_ci/* 268762306a36Sopenharmony_ci * This function does several size-related checks, returning an error if: 268862306a36Sopenharmony_ci * - source file is smaller than off+len 268962306a36Sopenharmony_ci * - destination file size is not OK (inode_newsize_ok()) 269062306a36Sopenharmony_ci * - max bytes quotas is exceeded 269162306a36Sopenharmony_ci */ 269262306a36Sopenharmony_cistatic int is_file_size_ok(struct inode *src_inode, struct inode *dst_inode, 269362306a36Sopenharmony_ci loff_t src_off, loff_t dst_off, size_t len) 269462306a36Sopenharmony_ci{ 269562306a36Sopenharmony_ci loff_t size, endoff; 269662306a36Sopenharmony_ci 269762306a36Sopenharmony_ci size = i_size_read(src_inode); 269862306a36Sopenharmony_ci /* 269962306a36Sopenharmony_ci * Don't copy beyond source file EOF. Instead of simply setting length 270062306a36Sopenharmony_ci * to (size - src_off), just drop to VFS default implementation, as the 270162306a36Sopenharmony_ci * local i_size may be stale due to other clients writing to the source 270262306a36Sopenharmony_ci * inode. 270362306a36Sopenharmony_ci */ 270462306a36Sopenharmony_ci if (src_off + len > size) { 270562306a36Sopenharmony_ci dout("Copy beyond EOF (%llu + %zu > %llu)\n", 270662306a36Sopenharmony_ci src_off, len, size); 270762306a36Sopenharmony_ci return -EOPNOTSUPP; 270862306a36Sopenharmony_ci } 270962306a36Sopenharmony_ci size = i_size_read(dst_inode); 271062306a36Sopenharmony_ci 271162306a36Sopenharmony_ci endoff = dst_off + len; 271262306a36Sopenharmony_ci if (inode_newsize_ok(dst_inode, endoff)) 271362306a36Sopenharmony_ci return -EOPNOTSUPP; 271462306a36Sopenharmony_ci 271562306a36Sopenharmony_ci if (ceph_quota_is_max_bytes_exceeded(dst_inode, endoff)) 271662306a36Sopenharmony_ci return -EDQUOT; 271762306a36Sopenharmony_ci 271862306a36Sopenharmony_ci return 0; 271962306a36Sopenharmony_ci} 272062306a36Sopenharmony_ci 272162306a36Sopenharmony_cistatic struct ceph_osd_request * 272262306a36Sopenharmony_ciceph_alloc_copyfrom_request(struct ceph_osd_client *osdc, 272362306a36Sopenharmony_ci u64 src_snapid, 272462306a36Sopenharmony_ci struct ceph_object_id *src_oid, 272562306a36Sopenharmony_ci struct ceph_object_locator *src_oloc, 272662306a36Sopenharmony_ci struct ceph_object_id *dst_oid, 272762306a36Sopenharmony_ci struct ceph_object_locator *dst_oloc, 272862306a36Sopenharmony_ci u32 truncate_seq, u64 truncate_size) 272962306a36Sopenharmony_ci{ 273062306a36Sopenharmony_ci struct ceph_osd_request *req; 273162306a36Sopenharmony_ci int ret; 273262306a36Sopenharmony_ci u32 src_fadvise_flags = 273362306a36Sopenharmony_ci CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL | 273462306a36Sopenharmony_ci CEPH_OSD_OP_FLAG_FADVISE_NOCACHE; 273562306a36Sopenharmony_ci u32 dst_fadvise_flags = 273662306a36Sopenharmony_ci CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL | 273762306a36Sopenharmony_ci CEPH_OSD_OP_FLAG_FADVISE_DONTNEED; 273862306a36Sopenharmony_ci 273962306a36Sopenharmony_ci req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL); 274062306a36Sopenharmony_ci if (!req) 274162306a36Sopenharmony_ci return ERR_PTR(-ENOMEM); 274262306a36Sopenharmony_ci 274362306a36Sopenharmony_ci req->r_flags = CEPH_OSD_FLAG_WRITE; 274462306a36Sopenharmony_ci 274562306a36Sopenharmony_ci ceph_oloc_copy(&req->r_t.base_oloc, dst_oloc); 274662306a36Sopenharmony_ci ceph_oid_copy(&req->r_t.base_oid, dst_oid); 274762306a36Sopenharmony_ci 274862306a36Sopenharmony_ci ret = osd_req_op_copy_from_init(req, src_snapid, 0, 274962306a36Sopenharmony_ci src_oid, src_oloc, 275062306a36Sopenharmony_ci src_fadvise_flags, 275162306a36Sopenharmony_ci dst_fadvise_flags, 275262306a36Sopenharmony_ci truncate_seq, 275362306a36Sopenharmony_ci truncate_size, 275462306a36Sopenharmony_ci CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ); 275562306a36Sopenharmony_ci if (ret) 275662306a36Sopenharmony_ci goto out; 275762306a36Sopenharmony_ci 275862306a36Sopenharmony_ci ret = ceph_osdc_alloc_messages(req, GFP_KERNEL); 275962306a36Sopenharmony_ci if (ret) 276062306a36Sopenharmony_ci goto out; 276162306a36Sopenharmony_ci 276262306a36Sopenharmony_ci return req; 276362306a36Sopenharmony_ci 276462306a36Sopenharmony_ciout: 276562306a36Sopenharmony_ci ceph_osdc_put_request(req); 276662306a36Sopenharmony_ci return ERR_PTR(ret); 276762306a36Sopenharmony_ci} 276862306a36Sopenharmony_ci 276962306a36Sopenharmony_cistatic ssize_t ceph_do_objects_copy(struct ceph_inode_info *src_ci, u64 *src_off, 277062306a36Sopenharmony_ci struct ceph_inode_info *dst_ci, u64 *dst_off, 277162306a36Sopenharmony_ci struct ceph_fs_client *fsc, 277262306a36Sopenharmony_ci size_t len, unsigned int flags) 277362306a36Sopenharmony_ci{ 277462306a36Sopenharmony_ci struct ceph_object_locator src_oloc, dst_oloc; 277562306a36Sopenharmony_ci struct ceph_object_id src_oid, dst_oid; 277662306a36Sopenharmony_ci struct ceph_osd_client *osdc; 277762306a36Sopenharmony_ci struct ceph_osd_request *req; 277862306a36Sopenharmony_ci size_t bytes = 0; 277962306a36Sopenharmony_ci u64 src_objnum, src_objoff, dst_objnum, dst_objoff; 278062306a36Sopenharmony_ci u32 src_objlen, dst_objlen; 278162306a36Sopenharmony_ci u32 object_size = src_ci->i_layout.object_size; 278262306a36Sopenharmony_ci int ret; 278362306a36Sopenharmony_ci 278462306a36Sopenharmony_ci src_oloc.pool = src_ci->i_layout.pool_id; 278562306a36Sopenharmony_ci src_oloc.pool_ns = ceph_try_get_string(src_ci->i_layout.pool_ns); 278662306a36Sopenharmony_ci dst_oloc.pool = dst_ci->i_layout.pool_id; 278762306a36Sopenharmony_ci dst_oloc.pool_ns = ceph_try_get_string(dst_ci->i_layout.pool_ns); 278862306a36Sopenharmony_ci osdc = &fsc->client->osdc; 278962306a36Sopenharmony_ci 279062306a36Sopenharmony_ci while (len >= object_size) { 279162306a36Sopenharmony_ci ceph_calc_file_object_mapping(&src_ci->i_layout, *src_off, 279262306a36Sopenharmony_ci object_size, &src_objnum, 279362306a36Sopenharmony_ci &src_objoff, &src_objlen); 279462306a36Sopenharmony_ci ceph_calc_file_object_mapping(&dst_ci->i_layout, *dst_off, 279562306a36Sopenharmony_ci object_size, &dst_objnum, 279662306a36Sopenharmony_ci &dst_objoff, &dst_objlen); 279762306a36Sopenharmony_ci ceph_oid_init(&src_oid); 279862306a36Sopenharmony_ci ceph_oid_printf(&src_oid, "%llx.%08llx", 279962306a36Sopenharmony_ci src_ci->i_vino.ino, src_objnum); 280062306a36Sopenharmony_ci ceph_oid_init(&dst_oid); 280162306a36Sopenharmony_ci ceph_oid_printf(&dst_oid, "%llx.%08llx", 280262306a36Sopenharmony_ci dst_ci->i_vino.ino, dst_objnum); 280362306a36Sopenharmony_ci /* Do an object remote copy */ 280462306a36Sopenharmony_ci req = ceph_alloc_copyfrom_request(osdc, src_ci->i_vino.snap, 280562306a36Sopenharmony_ci &src_oid, &src_oloc, 280662306a36Sopenharmony_ci &dst_oid, &dst_oloc, 280762306a36Sopenharmony_ci dst_ci->i_truncate_seq, 280862306a36Sopenharmony_ci dst_ci->i_truncate_size); 280962306a36Sopenharmony_ci if (IS_ERR(req)) 281062306a36Sopenharmony_ci ret = PTR_ERR(req); 281162306a36Sopenharmony_ci else { 281262306a36Sopenharmony_ci ceph_osdc_start_request(osdc, req); 281362306a36Sopenharmony_ci ret = ceph_osdc_wait_request(osdc, req); 281462306a36Sopenharmony_ci ceph_update_copyfrom_metrics(&fsc->mdsc->metric, 281562306a36Sopenharmony_ci req->r_start_latency, 281662306a36Sopenharmony_ci req->r_end_latency, 281762306a36Sopenharmony_ci object_size, ret); 281862306a36Sopenharmony_ci ceph_osdc_put_request(req); 281962306a36Sopenharmony_ci } 282062306a36Sopenharmony_ci if (ret) { 282162306a36Sopenharmony_ci if (ret == -EOPNOTSUPP) { 282262306a36Sopenharmony_ci fsc->have_copy_from2 = false; 282362306a36Sopenharmony_ci pr_notice("OSDs don't support copy-from2; disabling copy offload\n"); 282462306a36Sopenharmony_ci } 282562306a36Sopenharmony_ci dout("ceph_osdc_copy_from returned %d\n", ret); 282662306a36Sopenharmony_ci if (!bytes) 282762306a36Sopenharmony_ci bytes = ret; 282862306a36Sopenharmony_ci goto out; 282962306a36Sopenharmony_ci } 283062306a36Sopenharmony_ci len -= object_size; 283162306a36Sopenharmony_ci bytes += object_size; 283262306a36Sopenharmony_ci *src_off += object_size; 283362306a36Sopenharmony_ci *dst_off += object_size; 283462306a36Sopenharmony_ci } 283562306a36Sopenharmony_ci 283662306a36Sopenharmony_ciout: 283762306a36Sopenharmony_ci ceph_oloc_destroy(&src_oloc); 283862306a36Sopenharmony_ci ceph_oloc_destroy(&dst_oloc); 283962306a36Sopenharmony_ci return bytes; 284062306a36Sopenharmony_ci} 284162306a36Sopenharmony_ci 284262306a36Sopenharmony_cistatic ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, 284362306a36Sopenharmony_ci struct file *dst_file, loff_t dst_off, 284462306a36Sopenharmony_ci size_t len, unsigned int flags) 284562306a36Sopenharmony_ci{ 284662306a36Sopenharmony_ci struct inode *src_inode = file_inode(src_file); 284762306a36Sopenharmony_ci struct inode *dst_inode = file_inode(dst_file); 284862306a36Sopenharmony_ci struct ceph_inode_info *src_ci = ceph_inode(src_inode); 284962306a36Sopenharmony_ci struct ceph_inode_info *dst_ci = ceph_inode(dst_inode); 285062306a36Sopenharmony_ci struct ceph_cap_flush *prealloc_cf; 285162306a36Sopenharmony_ci struct ceph_fs_client *src_fsc = ceph_inode_to_client(src_inode); 285262306a36Sopenharmony_ci loff_t size; 285362306a36Sopenharmony_ci ssize_t ret = -EIO, bytes; 285462306a36Sopenharmony_ci u64 src_objnum, dst_objnum, src_objoff, dst_objoff; 285562306a36Sopenharmony_ci u32 src_objlen, dst_objlen; 285662306a36Sopenharmony_ci int src_got = 0, dst_got = 0, err, dirty; 285762306a36Sopenharmony_ci 285862306a36Sopenharmony_ci if (src_inode->i_sb != dst_inode->i_sb) { 285962306a36Sopenharmony_ci struct ceph_fs_client *dst_fsc = ceph_inode_to_client(dst_inode); 286062306a36Sopenharmony_ci 286162306a36Sopenharmony_ci if (ceph_fsid_compare(&src_fsc->client->fsid, 286262306a36Sopenharmony_ci &dst_fsc->client->fsid)) { 286362306a36Sopenharmony_ci dout("Copying files across clusters: src: %pU dst: %pU\n", 286462306a36Sopenharmony_ci &src_fsc->client->fsid, &dst_fsc->client->fsid); 286562306a36Sopenharmony_ci return -EXDEV; 286662306a36Sopenharmony_ci } 286762306a36Sopenharmony_ci } 286862306a36Sopenharmony_ci if (ceph_snap(dst_inode) != CEPH_NOSNAP) 286962306a36Sopenharmony_ci return -EROFS; 287062306a36Sopenharmony_ci 287162306a36Sopenharmony_ci /* 287262306a36Sopenharmony_ci * Some of the checks below will return -EOPNOTSUPP, which will force a 287362306a36Sopenharmony_ci * fallback to the default VFS copy_file_range implementation. This is 287462306a36Sopenharmony_ci * desirable in several cases (for ex, the 'len' is smaller than the 287562306a36Sopenharmony_ci * size of the objects, or in cases where that would be more 287662306a36Sopenharmony_ci * efficient). 287762306a36Sopenharmony_ci */ 287862306a36Sopenharmony_ci 287962306a36Sopenharmony_ci if (ceph_test_mount_opt(src_fsc, NOCOPYFROM)) 288062306a36Sopenharmony_ci return -EOPNOTSUPP; 288162306a36Sopenharmony_ci 288262306a36Sopenharmony_ci if (!src_fsc->have_copy_from2) 288362306a36Sopenharmony_ci return -EOPNOTSUPP; 288462306a36Sopenharmony_ci 288562306a36Sopenharmony_ci /* 288662306a36Sopenharmony_ci * Striped file layouts require that we copy partial objects, but the 288762306a36Sopenharmony_ci * OSD copy-from operation only supports full-object copies. Limit 288862306a36Sopenharmony_ci * this to non-striped file layouts for now. 288962306a36Sopenharmony_ci */ 289062306a36Sopenharmony_ci if ((src_ci->i_layout.stripe_unit != dst_ci->i_layout.stripe_unit) || 289162306a36Sopenharmony_ci (src_ci->i_layout.stripe_count != 1) || 289262306a36Sopenharmony_ci (dst_ci->i_layout.stripe_count != 1) || 289362306a36Sopenharmony_ci (src_ci->i_layout.object_size != dst_ci->i_layout.object_size)) { 289462306a36Sopenharmony_ci dout("Invalid src/dst files layout\n"); 289562306a36Sopenharmony_ci return -EOPNOTSUPP; 289662306a36Sopenharmony_ci } 289762306a36Sopenharmony_ci 289862306a36Sopenharmony_ci /* Every encrypted inode gets its own key, so we can't offload them */ 289962306a36Sopenharmony_ci if (IS_ENCRYPTED(src_inode) || IS_ENCRYPTED(dst_inode)) 290062306a36Sopenharmony_ci return -EOPNOTSUPP; 290162306a36Sopenharmony_ci 290262306a36Sopenharmony_ci if (len < src_ci->i_layout.object_size) 290362306a36Sopenharmony_ci return -EOPNOTSUPP; /* no remote copy will be done */ 290462306a36Sopenharmony_ci 290562306a36Sopenharmony_ci prealloc_cf = ceph_alloc_cap_flush(); 290662306a36Sopenharmony_ci if (!prealloc_cf) 290762306a36Sopenharmony_ci return -ENOMEM; 290862306a36Sopenharmony_ci 290962306a36Sopenharmony_ci /* Start by sync'ing the source and destination files */ 291062306a36Sopenharmony_ci ret = file_write_and_wait_range(src_file, src_off, (src_off + len)); 291162306a36Sopenharmony_ci if (ret < 0) { 291262306a36Sopenharmony_ci dout("failed to write src file (%zd)\n", ret); 291362306a36Sopenharmony_ci goto out; 291462306a36Sopenharmony_ci } 291562306a36Sopenharmony_ci ret = file_write_and_wait_range(dst_file, dst_off, (dst_off + len)); 291662306a36Sopenharmony_ci if (ret < 0) { 291762306a36Sopenharmony_ci dout("failed to write dst file (%zd)\n", ret); 291862306a36Sopenharmony_ci goto out; 291962306a36Sopenharmony_ci } 292062306a36Sopenharmony_ci 292162306a36Sopenharmony_ci /* 292262306a36Sopenharmony_ci * We need FILE_WR caps for dst_ci and FILE_RD for src_ci as other 292362306a36Sopenharmony_ci * clients may have dirty data in their caches. And OSDs know nothing 292462306a36Sopenharmony_ci * about caps, so they can't safely do the remote object copies. 292562306a36Sopenharmony_ci */ 292662306a36Sopenharmony_ci err = get_rd_wr_caps(src_file, &src_got, 292762306a36Sopenharmony_ci dst_file, (dst_off + len), &dst_got); 292862306a36Sopenharmony_ci if (err < 0) { 292962306a36Sopenharmony_ci dout("get_rd_wr_caps returned %d\n", err); 293062306a36Sopenharmony_ci ret = -EOPNOTSUPP; 293162306a36Sopenharmony_ci goto out; 293262306a36Sopenharmony_ci } 293362306a36Sopenharmony_ci 293462306a36Sopenharmony_ci ret = is_file_size_ok(src_inode, dst_inode, src_off, dst_off, len); 293562306a36Sopenharmony_ci if (ret < 0) 293662306a36Sopenharmony_ci goto out_caps; 293762306a36Sopenharmony_ci 293862306a36Sopenharmony_ci /* Drop dst file cached pages */ 293962306a36Sopenharmony_ci ceph_fscache_invalidate(dst_inode, false); 294062306a36Sopenharmony_ci ret = invalidate_inode_pages2_range(dst_inode->i_mapping, 294162306a36Sopenharmony_ci dst_off >> PAGE_SHIFT, 294262306a36Sopenharmony_ci (dst_off + len) >> PAGE_SHIFT); 294362306a36Sopenharmony_ci if (ret < 0) { 294462306a36Sopenharmony_ci dout("Failed to invalidate inode pages (%zd)\n", ret); 294562306a36Sopenharmony_ci ret = 0; /* XXX */ 294662306a36Sopenharmony_ci } 294762306a36Sopenharmony_ci ceph_calc_file_object_mapping(&src_ci->i_layout, src_off, 294862306a36Sopenharmony_ci src_ci->i_layout.object_size, 294962306a36Sopenharmony_ci &src_objnum, &src_objoff, &src_objlen); 295062306a36Sopenharmony_ci ceph_calc_file_object_mapping(&dst_ci->i_layout, dst_off, 295162306a36Sopenharmony_ci dst_ci->i_layout.object_size, 295262306a36Sopenharmony_ci &dst_objnum, &dst_objoff, &dst_objlen); 295362306a36Sopenharmony_ci /* object-level offsets need to the same */ 295462306a36Sopenharmony_ci if (src_objoff != dst_objoff) { 295562306a36Sopenharmony_ci ret = -EOPNOTSUPP; 295662306a36Sopenharmony_ci goto out_caps; 295762306a36Sopenharmony_ci } 295862306a36Sopenharmony_ci 295962306a36Sopenharmony_ci /* 296062306a36Sopenharmony_ci * Do a manual copy if the object offset isn't object aligned. 296162306a36Sopenharmony_ci * 'src_objlen' contains the bytes left until the end of the object, 296262306a36Sopenharmony_ci * starting at the src_off 296362306a36Sopenharmony_ci */ 296462306a36Sopenharmony_ci if (src_objoff) { 296562306a36Sopenharmony_ci dout("Initial partial copy of %u bytes\n", src_objlen); 296662306a36Sopenharmony_ci 296762306a36Sopenharmony_ci /* 296862306a36Sopenharmony_ci * we need to temporarily drop all caps as we'll be calling 296962306a36Sopenharmony_ci * {read,write}_iter, which will get caps again. 297062306a36Sopenharmony_ci */ 297162306a36Sopenharmony_ci put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got); 297262306a36Sopenharmony_ci ret = do_splice_direct(src_file, &src_off, dst_file, 297362306a36Sopenharmony_ci &dst_off, src_objlen, flags); 297462306a36Sopenharmony_ci /* Abort on short copies or on error */ 297562306a36Sopenharmony_ci if (ret < (long)src_objlen) { 297662306a36Sopenharmony_ci dout("Failed partial copy (%zd)\n", ret); 297762306a36Sopenharmony_ci goto out; 297862306a36Sopenharmony_ci } 297962306a36Sopenharmony_ci len -= ret; 298062306a36Sopenharmony_ci err = get_rd_wr_caps(src_file, &src_got, 298162306a36Sopenharmony_ci dst_file, (dst_off + len), &dst_got); 298262306a36Sopenharmony_ci if (err < 0) 298362306a36Sopenharmony_ci goto out; 298462306a36Sopenharmony_ci err = is_file_size_ok(src_inode, dst_inode, 298562306a36Sopenharmony_ci src_off, dst_off, len); 298662306a36Sopenharmony_ci if (err < 0) 298762306a36Sopenharmony_ci goto out_caps; 298862306a36Sopenharmony_ci } 298962306a36Sopenharmony_ci 299062306a36Sopenharmony_ci size = i_size_read(dst_inode); 299162306a36Sopenharmony_ci bytes = ceph_do_objects_copy(src_ci, &src_off, dst_ci, &dst_off, 299262306a36Sopenharmony_ci src_fsc, len, flags); 299362306a36Sopenharmony_ci if (bytes <= 0) { 299462306a36Sopenharmony_ci if (!ret) 299562306a36Sopenharmony_ci ret = bytes; 299662306a36Sopenharmony_ci goto out_caps; 299762306a36Sopenharmony_ci } 299862306a36Sopenharmony_ci dout("Copied %zu bytes out of %zu\n", bytes, len); 299962306a36Sopenharmony_ci len -= bytes; 300062306a36Sopenharmony_ci ret += bytes; 300162306a36Sopenharmony_ci 300262306a36Sopenharmony_ci file_update_time(dst_file); 300362306a36Sopenharmony_ci inode_inc_iversion_raw(dst_inode); 300462306a36Sopenharmony_ci 300562306a36Sopenharmony_ci if (dst_off > size) { 300662306a36Sopenharmony_ci /* Let the MDS know about dst file size change */ 300762306a36Sopenharmony_ci if (ceph_inode_set_size(dst_inode, dst_off) || 300862306a36Sopenharmony_ci ceph_quota_is_max_bytes_approaching(dst_inode, dst_off)) 300962306a36Sopenharmony_ci ceph_check_caps(dst_ci, CHECK_CAPS_AUTHONLY | CHECK_CAPS_FLUSH); 301062306a36Sopenharmony_ci } 301162306a36Sopenharmony_ci /* Mark Fw dirty */ 301262306a36Sopenharmony_ci spin_lock(&dst_ci->i_ceph_lock); 301362306a36Sopenharmony_ci dirty = __ceph_mark_dirty_caps(dst_ci, CEPH_CAP_FILE_WR, &prealloc_cf); 301462306a36Sopenharmony_ci spin_unlock(&dst_ci->i_ceph_lock); 301562306a36Sopenharmony_ci if (dirty) 301662306a36Sopenharmony_ci __mark_inode_dirty(dst_inode, dirty); 301762306a36Sopenharmony_ci 301862306a36Sopenharmony_ciout_caps: 301962306a36Sopenharmony_ci put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got); 302062306a36Sopenharmony_ci 302162306a36Sopenharmony_ci /* 302262306a36Sopenharmony_ci * Do the final manual copy if we still have some bytes left, unless 302362306a36Sopenharmony_ci * there were errors in remote object copies (len >= object_size). 302462306a36Sopenharmony_ci */ 302562306a36Sopenharmony_ci if (len && (len < src_ci->i_layout.object_size)) { 302662306a36Sopenharmony_ci dout("Final partial copy of %zu bytes\n", len); 302762306a36Sopenharmony_ci bytes = do_splice_direct(src_file, &src_off, dst_file, 302862306a36Sopenharmony_ci &dst_off, len, flags); 302962306a36Sopenharmony_ci if (bytes > 0) 303062306a36Sopenharmony_ci ret += bytes; 303162306a36Sopenharmony_ci else 303262306a36Sopenharmony_ci dout("Failed partial copy (%zd)\n", bytes); 303362306a36Sopenharmony_ci } 303462306a36Sopenharmony_ci 303562306a36Sopenharmony_ciout: 303662306a36Sopenharmony_ci ceph_free_cap_flush(prealloc_cf); 303762306a36Sopenharmony_ci 303862306a36Sopenharmony_ci return ret; 303962306a36Sopenharmony_ci} 304062306a36Sopenharmony_ci 304162306a36Sopenharmony_cistatic ssize_t ceph_copy_file_range(struct file *src_file, loff_t src_off, 304262306a36Sopenharmony_ci struct file *dst_file, loff_t dst_off, 304362306a36Sopenharmony_ci size_t len, unsigned int flags) 304462306a36Sopenharmony_ci{ 304562306a36Sopenharmony_ci ssize_t ret; 304662306a36Sopenharmony_ci 304762306a36Sopenharmony_ci ret = __ceph_copy_file_range(src_file, src_off, dst_file, dst_off, 304862306a36Sopenharmony_ci len, flags); 304962306a36Sopenharmony_ci 305062306a36Sopenharmony_ci if (ret == -EOPNOTSUPP || ret == -EXDEV) 305162306a36Sopenharmony_ci ret = generic_copy_file_range(src_file, src_off, dst_file, 305262306a36Sopenharmony_ci dst_off, len, flags); 305362306a36Sopenharmony_ci return ret; 305462306a36Sopenharmony_ci} 305562306a36Sopenharmony_ci 305662306a36Sopenharmony_ciconst struct file_operations ceph_file_fops = { 305762306a36Sopenharmony_ci .open = ceph_open, 305862306a36Sopenharmony_ci .release = ceph_release, 305962306a36Sopenharmony_ci .llseek = ceph_llseek, 306062306a36Sopenharmony_ci .read_iter = ceph_read_iter, 306162306a36Sopenharmony_ci .write_iter = ceph_write_iter, 306262306a36Sopenharmony_ci .mmap = ceph_mmap, 306362306a36Sopenharmony_ci .fsync = ceph_fsync, 306462306a36Sopenharmony_ci .lock = ceph_lock, 306562306a36Sopenharmony_ci .setlease = simple_nosetlease, 306662306a36Sopenharmony_ci .flock = ceph_flock, 306762306a36Sopenharmony_ci .splice_read = ceph_splice_read, 306862306a36Sopenharmony_ci .splice_write = iter_file_splice_write, 306962306a36Sopenharmony_ci .unlocked_ioctl = ceph_ioctl, 307062306a36Sopenharmony_ci .compat_ioctl = compat_ptr_ioctl, 307162306a36Sopenharmony_ci .fallocate = ceph_fallocate, 307262306a36Sopenharmony_ci .copy_file_range = ceph_copy_file_range, 307362306a36Sopenharmony_ci}; 3074