18c2ecf20Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0 28c2ecf20Sopenharmony_ci#include <linux/ceph/ceph_debug.h> 38c2ecf20Sopenharmony_ci#include <linux/ceph/striper.h> 48c2ecf20Sopenharmony_ci 58c2ecf20Sopenharmony_ci#include <linux/module.h> 68c2ecf20Sopenharmony_ci#include <linux/sched.h> 78c2ecf20Sopenharmony_ci#include <linux/slab.h> 88c2ecf20Sopenharmony_ci#include <linux/file.h> 98c2ecf20Sopenharmony_ci#include <linux/mount.h> 108c2ecf20Sopenharmony_ci#include <linux/namei.h> 118c2ecf20Sopenharmony_ci#include <linux/writeback.h> 128c2ecf20Sopenharmony_ci#include <linux/falloc.h> 138c2ecf20Sopenharmony_ci#include <linux/iversion.h> 148c2ecf20Sopenharmony_ci#include <linux/ktime.h> 158c2ecf20Sopenharmony_ci 168c2ecf20Sopenharmony_ci#include "super.h" 178c2ecf20Sopenharmony_ci#include "mds_client.h" 188c2ecf20Sopenharmony_ci#include "cache.h" 198c2ecf20Sopenharmony_ci#include "io.h" 208c2ecf20Sopenharmony_ci#include "metric.h" 218c2ecf20Sopenharmony_ci 228c2ecf20Sopenharmony_cistatic __le32 ceph_flags_sys2wire(u32 flags) 238c2ecf20Sopenharmony_ci{ 248c2ecf20Sopenharmony_ci u32 wire_flags = 0; 258c2ecf20Sopenharmony_ci 268c2ecf20Sopenharmony_ci switch (flags & O_ACCMODE) { 278c2ecf20Sopenharmony_ci case O_RDONLY: 288c2ecf20Sopenharmony_ci wire_flags |= CEPH_O_RDONLY; 298c2ecf20Sopenharmony_ci break; 308c2ecf20Sopenharmony_ci case O_WRONLY: 318c2ecf20Sopenharmony_ci wire_flags |= CEPH_O_WRONLY; 328c2ecf20Sopenharmony_ci break; 338c2ecf20Sopenharmony_ci case O_RDWR: 348c2ecf20Sopenharmony_ci wire_flags |= CEPH_O_RDWR; 358c2ecf20Sopenharmony_ci break; 368c2ecf20Sopenharmony_ci } 378c2ecf20Sopenharmony_ci 388c2ecf20Sopenharmony_ci flags &= ~O_ACCMODE; 398c2ecf20Sopenharmony_ci 408c2ecf20Sopenharmony_ci#define ceph_sys2wire(a) if (flags & a) { wire_flags |= CEPH_##a; flags &= ~a; } 418c2ecf20Sopenharmony_ci 428c2ecf20Sopenharmony_ci ceph_sys2wire(O_CREAT); 438c2ecf20Sopenharmony_ci ceph_sys2wire(O_EXCL); 448c2ecf20Sopenharmony_ci ceph_sys2wire(O_TRUNC); 458c2ecf20Sopenharmony_ci ceph_sys2wire(O_DIRECTORY); 468c2ecf20Sopenharmony_ci ceph_sys2wire(O_NOFOLLOW); 478c2ecf20Sopenharmony_ci 488c2ecf20Sopenharmony_ci#undef ceph_sys2wire 498c2ecf20Sopenharmony_ci 508c2ecf20Sopenharmony_ci if (flags) 518c2ecf20Sopenharmony_ci dout("unused open flags: %x\n", flags); 528c2ecf20Sopenharmony_ci 538c2ecf20Sopenharmony_ci return cpu_to_le32(wire_flags); 548c2ecf20Sopenharmony_ci} 558c2ecf20Sopenharmony_ci 568c2ecf20Sopenharmony_ci/* 578c2ecf20Sopenharmony_ci * Ceph file operations 588c2ecf20Sopenharmony_ci * 598c2ecf20Sopenharmony_ci * Implement basic open/close functionality, and implement 608c2ecf20Sopenharmony_ci * read/write. 618c2ecf20Sopenharmony_ci * 628c2ecf20Sopenharmony_ci * We implement three modes of file I/O: 638c2ecf20Sopenharmony_ci * - buffered uses the generic_file_aio_{read,write} helpers 648c2ecf20Sopenharmony_ci * 658c2ecf20Sopenharmony_ci * - synchronous is used when there is multi-client read/write 668c2ecf20Sopenharmony_ci * sharing, avoids the page cache, and synchronously waits for an 678c2ecf20Sopenharmony_ci * ack from the OSD. 688c2ecf20Sopenharmony_ci * 698c2ecf20Sopenharmony_ci * - direct io takes the variant of the sync path that references 708c2ecf20Sopenharmony_ci * user pages directly. 718c2ecf20Sopenharmony_ci * 728c2ecf20Sopenharmony_ci * fsync() flushes and waits on dirty pages, but just queues metadata 738c2ecf20Sopenharmony_ci * for writeback: since the MDS can recover size and mtime there is no 748c2ecf20Sopenharmony_ci * need to wait for MDS acknowledgement. 758c2ecf20Sopenharmony_ci */ 768c2ecf20Sopenharmony_ci 778c2ecf20Sopenharmony_ci/* 788c2ecf20Sopenharmony_ci * How many pages to get in one call to iov_iter_get_pages(). This 798c2ecf20Sopenharmony_ci * determines the size of the on-stack array used as a buffer. 808c2ecf20Sopenharmony_ci */ 818c2ecf20Sopenharmony_ci#define ITER_GET_BVECS_PAGES 64 828c2ecf20Sopenharmony_ci 838c2ecf20Sopenharmony_cistatic ssize_t __iter_get_bvecs(struct iov_iter *iter, size_t maxsize, 848c2ecf20Sopenharmony_ci struct bio_vec *bvecs) 858c2ecf20Sopenharmony_ci{ 868c2ecf20Sopenharmony_ci size_t size = 0; 878c2ecf20Sopenharmony_ci int bvec_idx = 0; 888c2ecf20Sopenharmony_ci 898c2ecf20Sopenharmony_ci if (maxsize > iov_iter_count(iter)) 908c2ecf20Sopenharmony_ci maxsize = iov_iter_count(iter); 918c2ecf20Sopenharmony_ci 928c2ecf20Sopenharmony_ci while (size < maxsize) { 938c2ecf20Sopenharmony_ci struct page *pages[ITER_GET_BVECS_PAGES]; 948c2ecf20Sopenharmony_ci ssize_t bytes; 958c2ecf20Sopenharmony_ci size_t start; 968c2ecf20Sopenharmony_ci int idx = 0; 978c2ecf20Sopenharmony_ci 988c2ecf20Sopenharmony_ci bytes = iov_iter_get_pages(iter, pages, maxsize - size, 998c2ecf20Sopenharmony_ci ITER_GET_BVECS_PAGES, &start); 1008c2ecf20Sopenharmony_ci if (bytes < 0) 1018c2ecf20Sopenharmony_ci return size ?: bytes; 1028c2ecf20Sopenharmony_ci 1038c2ecf20Sopenharmony_ci iov_iter_advance(iter, bytes); 1048c2ecf20Sopenharmony_ci size += bytes; 1058c2ecf20Sopenharmony_ci 1068c2ecf20Sopenharmony_ci for ( ; bytes; idx++, bvec_idx++) { 1078c2ecf20Sopenharmony_ci struct bio_vec bv = { 1088c2ecf20Sopenharmony_ci .bv_page = pages[idx], 1098c2ecf20Sopenharmony_ci .bv_len = min_t(int, bytes, PAGE_SIZE - start), 1108c2ecf20Sopenharmony_ci .bv_offset = start, 1118c2ecf20Sopenharmony_ci }; 1128c2ecf20Sopenharmony_ci 1138c2ecf20Sopenharmony_ci bvecs[bvec_idx] = bv; 1148c2ecf20Sopenharmony_ci bytes -= bv.bv_len; 1158c2ecf20Sopenharmony_ci start = 0; 1168c2ecf20Sopenharmony_ci } 1178c2ecf20Sopenharmony_ci } 1188c2ecf20Sopenharmony_ci 1198c2ecf20Sopenharmony_ci return size; 1208c2ecf20Sopenharmony_ci} 1218c2ecf20Sopenharmony_ci 1228c2ecf20Sopenharmony_ci/* 1238c2ecf20Sopenharmony_ci * iov_iter_get_pages() only considers one iov_iter segment, no matter 1248c2ecf20Sopenharmony_ci * what maxsize or maxpages are given. For ITER_BVEC that is a single 1258c2ecf20Sopenharmony_ci * page. 1268c2ecf20Sopenharmony_ci * 1278c2ecf20Sopenharmony_ci * Attempt to get up to @maxsize bytes worth of pages from @iter. 1288c2ecf20Sopenharmony_ci * Return the number of bytes in the created bio_vec array, or an error. 1298c2ecf20Sopenharmony_ci */ 1308c2ecf20Sopenharmony_cistatic ssize_t iter_get_bvecs_alloc(struct iov_iter *iter, size_t maxsize, 1318c2ecf20Sopenharmony_ci struct bio_vec **bvecs, int *num_bvecs) 1328c2ecf20Sopenharmony_ci{ 1338c2ecf20Sopenharmony_ci struct bio_vec *bv; 1348c2ecf20Sopenharmony_ci size_t orig_count = iov_iter_count(iter); 1358c2ecf20Sopenharmony_ci ssize_t bytes; 1368c2ecf20Sopenharmony_ci int npages; 1378c2ecf20Sopenharmony_ci 1388c2ecf20Sopenharmony_ci iov_iter_truncate(iter, maxsize); 1398c2ecf20Sopenharmony_ci npages = iov_iter_npages(iter, INT_MAX); 1408c2ecf20Sopenharmony_ci iov_iter_reexpand(iter, orig_count); 1418c2ecf20Sopenharmony_ci 1428c2ecf20Sopenharmony_ci /* 1438c2ecf20Sopenharmony_ci * __iter_get_bvecs() may populate only part of the array -- zero it 1448c2ecf20Sopenharmony_ci * out. 1458c2ecf20Sopenharmony_ci */ 1468c2ecf20Sopenharmony_ci bv = kvmalloc_array(npages, sizeof(*bv), GFP_KERNEL | __GFP_ZERO); 1478c2ecf20Sopenharmony_ci if (!bv) 1488c2ecf20Sopenharmony_ci return -ENOMEM; 1498c2ecf20Sopenharmony_ci 1508c2ecf20Sopenharmony_ci bytes = __iter_get_bvecs(iter, maxsize, bv); 1518c2ecf20Sopenharmony_ci if (bytes < 0) { 1528c2ecf20Sopenharmony_ci /* 1538c2ecf20Sopenharmony_ci * No pages were pinned -- just free the array. 1548c2ecf20Sopenharmony_ci */ 1558c2ecf20Sopenharmony_ci kvfree(bv); 1568c2ecf20Sopenharmony_ci return bytes; 1578c2ecf20Sopenharmony_ci } 1588c2ecf20Sopenharmony_ci 1598c2ecf20Sopenharmony_ci *bvecs = bv; 1608c2ecf20Sopenharmony_ci *num_bvecs = npages; 1618c2ecf20Sopenharmony_ci return bytes; 1628c2ecf20Sopenharmony_ci} 1638c2ecf20Sopenharmony_ci 1648c2ecf20Sopenharmony_cistatic void put_bvecs(struct bio_vec *bvecs, int num_bvecs, bool should_dirty) 1658c2ecf20Sopenharmony_ci{ 1668c2ecf20Sopenharmony_ci int i; 1678c2ecf20Sopenharmony_ci 1688c2ecf20Sopenharmony_ci for (i = 0; i < num_bvecs; i++) { 1698c2ecf20Sopenharmony_ci if (bvecs[i].bv_page) { 1708c2ecf20Sopenharmony_ci if (should_dirty) 1718c2ecf20Sopenharmony_ci set_page_dirty_lock(bvecs[i].bv_page); 1728c2ecf20Sopenharmony_ci put_page(bvecs[i].bv_page); 1738c2ecf20Sopenharmony_ci } 1748c2ecf20Sopenharmony_ci } 1758c2ecf20Sopenharmony_ci kvfree(bvecs); 1768c2ecf20Sopenharmony_ci} 1778c2ecf20Sopenharmony_ci 1788c2ecf20Sopenharmony_ci/* 1798c2ecf20Sopenharmony_ci * Prepare an open request. Preallocate ceph_cap to avoid an 1808c2ecf20Sopenharmony_ci * inopportune ENOMEM later. 1818c2ecf20Sopenharmony_ci */ 1828c2ecf20Sopenharmony_cistatic struct ceph_mds_request * 1838c2ecf20Sopenharmony_ciprepare_open_request(struct super_block *sb, int flags, int create_mode) 1848c2ecf20Sopenharmony_ci{ 1858c2ecf20Sopenharmony_ci struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(sb); 1868c2ecf20Sopenharmony_ci struct ceph_mds_request *req; 1878c2ecf20Sopenharmony_ci int want_auth = USE_ANY_MDS; 1888c2ecf20Sopenharmony_ci int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN; 1898c2ecf20Sopenharmony_ci 1908c2ecf20Sopenharmony_ci if (flags & (O_WRONLY|O_RDWR|O_CREAT|O_TRUNC)) 1918c2ecf20Sopenharmony_ci want_auth = USE_AUTH_MDS; 1928c2ecf20Sopenharmony_ci 1938c2ecf20Sopenharmony_ci req = ceph_mdsc_create_request(mdsc, op, want_auth); 1948c2ecf20Sopenharmony_ci if (IS_ERR(req)) 1958c2ecf20Sopenharmony_ci goto out; 1968c2ecf20Sopenharmony_ci req->r_fmode = ceph_flags_to_mode(flags); 1978c2ecf20Sopenharmony_ci req->r_args.open.flags = ceph_flags_sys2wire(flags); 1988c2ecf20Sopenharmony_ci req->r_args.open.mode = cpu_to_le32(create_mode); 1998c2ecf20Sopenharmony_ciout: 2008c2ecf20Sopenharmony_ci return req; 2018c2ecf20Sopenharmony_ci} 2028c2ecf20Sopenharmony_ci 2038c2ecf20Sopenharmony_cistatic int ceph_init_file_info(struct inode *inode, struct file *file, 2048c2ecf20Sopenharmony_ci int fmode, bool isdir) 2058c2ecf20Sopenharmony_ci{ 2068c2ecf20Sopenharmony_ci struct ceph_inode_info *ci = ceph_inode(inode); 2078c2ecf20Sopenharmony_ci struct ceph_file_info *fi; 2088c2ecf20Sopenharmony_ci 2098c2ecf20Sopenharmony_ci dout("%s %p %p 0%o (%s)\n", __func__, inode, file, 2108c2ecf20Sopenharmony_ci inode->i_mode, isdir ? "dir" : "regular"); 2118c2ecf20Sopenharmony_ci BUG_ON(inode->i_fop->release != ceph_release); 2128c2ecf20Sopenharmony_ci 2138c2ecf20Sopenharmony_ci if (isdir) { 2148c2ecf20Sopenharmony_ci struct ceph_dir_file_info *dfi = 2158c2ecf20Sopenharmony_ci kmem_cache_zalloc(ceph_dir_file_cachep, GFP_KERNEL); 2168c2ecf20Sopenharmony_ci if (!dfi) 2178c2ecf20Sopenharmony_ci return -ENOMEM; 2188c2ecf20Sopenharmony_ci 2198c2ecf20Sopenharmony_ci file->private_data = dfi; 2208c2ecf20Sopenharmony_ci fi = &dfi->file_info; 2218c2ecf20Sopenharmony_ci dfi->next_offset = 2; 2228c2ecf20Sopenharmony_ci dfi->readdir_cache_idx = -1; 2238c2ecf20Sopenharmony_ci } else { 2248c2ecf20Sopenharmony_ci fi = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL); 2258c2ecf20Sopenharmony_ci if (!fi) 2268c2ecf20Sopenharmony_ci return -ENOMEM; 2278c2ecf20Sopenharmony_ci 2288c2ecf20Sopenharmony_ci file->private_data = fi; 2298c2ecf20Sopenharmony_ci } 2308c2ecf20Sopenharmony_ci 2318c2ecf20Sopenharmony_ci ceph_get_fmode(ci, fmode, 1); 2328c2ecf20Sopenharmony_ci fi->fmode = fmode; 2338c2ecf20Sopenharmony_ci 2348c2ecf20Sopenharmony_ci spin_lock_init(&fi->rw_contexts_lock); 2358c2ecf20Sopenharmony_ci INIT_LIST_HEAD(&fi->rw_contexts); 2368c2ecf20Sopenharmony_ci fi->filp_gen = READ_ONCE(ceph_inode_to_client(inode)->filp_gen); 2378c2ecf20Sopenharmony_ci 2388c2ecf20Sopenharmony_ci return 0; 2398c2ecf20Sopenharmony_ci} 2408c2ecf20Sopenharmony_ci 2418c2ecf20Sopenharmony_ci/* 2428c2ecf20Sopenharmony_ci * initialize private struct file data. 2438c2ecf20Sopenharmony_ci * if we fail, clean up by dropping fmode reference on the ceph_inode 2448c2ecf20Sopenharmony_ci */ 2458c2ecf20Sopenharmony_cistatic int ceph_init_file(struct inode *inode, struct file *file, int fmode) 2468c2ecf20Sopenharmony_ci{ 2478c2ecf20Sopenharmony_ci int ret = 0; 2488c2ecf20Sopenharmony_ci 2498c2ecf20Sopenharmony_ci switch (inode->i_mode & S_IFMT) { 2508c2ecf20Sopenharmony_ci case S_IFREG: 2518c2ecf20Sopenharmony_ci ceph_fscache_register_inode_cookie(inode); 2528c2ecf20Sopenharmony_ci ceph_fscache_file_set_cookie(inode, file); 2538c2ecf20Sopenharmony_ci fallthrough; 2548c2ecf20Sopenharmony_ci case S_IFDIR: 2558c2ecf20Sopenharmony_ci ret = ceph_init_file_info(inode, file, fmode, 2568c2ecf20Sopenharmony_ci S_ISDIR(inode->i_mode)); 2578c2ecf20Sopenharmony_ci break; 2588c2ecf20Sopenharmony_ci 2598c2ecf20Sopenharmony_ci case S_IFLNK: 2608c2ecf20Sopenharmony_ci dout("init_file %p %p 0%o (symlink)\n", inode, file, 2618c2ecf20Sopenharmony_ci inode->i_mode); 2628c2ecf20Sopenharmony_ci break; 2638c2ecf20Sopenharmony_ci 2648c2ecf20Sopenharmony_ci default: 2658c2ecf20Sopenharmony_ci dout("init_file %p %p 0%o (special)\n", inode, file, 2668c2ecf20Sopenharmony_ci inode->i_mode); 2678c2ecf20Sopenharmony_ci /* 2688c2ecf20Sopenharmony_ci * we need to drop the open ref now, since we don't 2698c2ecf20Sopenharmony_ci * have .release set to ceph_release. 2708c2ecf20Sopenharmony_ci */ 2718c2ecf20Sopenharmony_ci BUG_ON(inode->i_fop->release == ceph_release); 2728c2ecf20Sopenharmony_ci 2738c2ecf20Sopenharmony_ci /* call the proper open fop */ 2748c2ecf20Sopenharmony_ci ret = inode->i_fop->open(inode, file); 2758c2ecf20Sopenharmony_ci } 2768c2ecf20Sopenharmony_ci return ret; 2778c2ecf20Sopenharmony_ci} 2788c2ecf20Sopenharmony_ci 2798c2ecf20Sopenharmony_ci/* 2808c2ecf20Sopenharmony_ci * try renew caps after session gets killed. 2818c2ecf20Sopenharmony_ci */ 2828c2ecf20Sopenharmony_ciint ceph_renew_caps(struct inode *inode, int fmode) 2838c2ecf20Sopenharmony_ci{ 2848c2ecf20Sopenharmony_ci struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); 2858c2ecf20Sopenharmony_ci struct ceph_inode_info *ci = ceph_inode(inode); 2868c2ecf20Sopenharmony_ci struct ceph_mds_request *req; 2878c2ecf20Sopenharmony_ci int err, flags, wanted; 2888c2ecf20Sopenharmony_ci 2898c2ecf20Sopenharmony_ci spin_lock(&ci->i_ceph_lock); 2908c2ecf20Sopenharmony_ci __ceph_touch_fmode(ci, mdsc, fmode); 2918c2ecf20Sopenharmony_ci wanted = __ceph_caps_file_wanted(ci); 2928c2ecf20Sopenharmony_ci if (__ceph_is_any_real_caps(ci) && 2938c2ecf20Sopenharmony_ci (!(wanted & CEPH_CAP_ANY_WR) || ci->i_auth_cap)) { 2948c2ecf20Sopenharmony_ci int issued = __ceph_caps_issued(ci, NULL); 2958c2ecf20Sopenharmony_ci spin_unlock(&ci->i_ceph_lock); 2968c2ecf20Sopenharmony_ci dout("renew caps %p want %s issued %s updating mds_wanted\n", 2978c2ecf20Sopenharmony_ci inode, ceph_cap_string(wanted), ceph_cap_string(issued)); 2988c2ecf20Sopenharmony_ci ceph_check_caps(ci, 0, NULL); 2998c2ecf20Sopenharmony_ci return 0; 3008c2ecf20Sopenharmony_ci } 3018c2ecf20Sopenharmony_ci spin_unlock(&ci->i_ceph_lock); 3028c2ecf20Sopenharmony_ci 3038c2ecf20Sopenharmony_ci flags = 0; 3048c2ecf20Sopenharmony_ci if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR)) 3058c2ecf20Sopenharmony_ci flags = O_RDWR; 3068c2ecf20Sopenharmony_ci else if (wanted & CEPH_CAP_FILE_RD) 3078c2ecf20Sopenharmony_ci flags = O_RDONLY; 3088c2ecf20Sopenharmony_ci else if (wanted & CEPH_CAP_FILE_WR) 3098c2ecf20Sopenharmony_ci flags = O_WRONLY; 3108c2ecf20Sopenharmony_ci#ifdef O_LAZY 3118c2ecf20Sopenharmony_ci if (wanted & CEPH_CAP_FILE_LAZYIO) 3128c2ecf20Sopenharmony_ci flags |= O_LAZY; 3138c2ecf20Sopenharmony_ci#endif 3148c2ecf20Sopenharmony_ci 3158c2ecf20Sopenharmony_ci req = prepare_open_request(inode->i_sb, flags, 0); 3168c2ecf20Sopenharmony_ci if (IS_ERR(req)) { 3178c2ecf20Sopenharmony_ci err = PTR_ERR(req); 3188c2ecf20Sopenharmony_ci goto out; 3198c2ecf20Sopenharmony_ci } 3208c2ecf20Sopenharmony_ci 3218c2ecf20Sopenharmony_ci req->r_inode = inode; 3228c2ecf20Sopenharmony_ci ihold(inode); 3238c2ecf20Sopenharmony_ci req->r_num_caps = 1; 3248c2ecf20Sopenharmony_ci 3258c2ecf20Sopenharmony_ci err = ceph_mdsc_do_request(mdsc, NULL, req); 3268c2ecf20Sopenharmony_ci ceph_mdsc_put_request(req); 3278c2ecf20Sopenharmony_ciout: 3288c2ecf20Sopenharmony_ci dout("renew caps %p open result=%d\n", inode, err); 3298c2ecf20Sopenharmony_ci return err < 0 ? err : 0; 3308c2ecf20Sopenharmony_ci} 3318c2ecf20Sopenharmony_ci 3328c2ecf20Sopenharmony_ci/* 3338c2ecf20Sopenharmony_ci * If we already have the requisite capabilities, we can satisfy 3348c2ecf20Sopenharmony_ci * the open request locally (no need to request new caps from the 3358c2ecf20Sopenharmony_ci * MDS). We do, however, need to inform the MDS (asynchronously) 3368c2ecf20Sopenharmony_ci * if our wanted caps set expands. 3378c2ecf20Sopenharmony_ci */ 3388c2ecf20Sopenharmony_ciint ceph_open(struct inode *inode, struct file *file) 3398c2ecf20Sopenharmony_ci{ 3408c2ecf20Sopenharmony_ci struct ceph_inode_info *ci = ceph_inode(inode); 3418c2ecf20Sopenharmony_ci struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb); 3428c2ecf20Sopenharmony_ci struct ceph_mds_client *mdsc = fsc->mdsc; 3438c2ecf20Sopenharmony_ci struct ceph_mds_request *req; 3448c2ecf20Sopenharmony_ci struct ceph_file_info *fi = file->private_data; 3458c2ecf20Sopenharmony_ci int err; 3468c2ecf20Sopenharmony_ci int flags, fmode, wanted; 3478c2ecf20Sopenharmony_ci 3488c2ecf20Sopenharmony_ci if (fi) { 3498c2ecf20Sopenharmony_ci dout("open file %p is already opened\n", file); 3508c2ecf20Sopenharmony_ci return 0; 3518c2ecf20Sopenharmony_ci } 3528c2ecf20Sopenharmony_ci 3538c2ecf20Sopenharmony_ci /* filter out O_CREAT|O_EXCL; vfs did that already. yuck. */ 3548c2ecf20Sopenharmony_ci flags = file->f_flags & ~(O_CREAT|O_EXCL); 3558c2ecf20Sopenharmony_ci if (S_ISDIR(inode->i_mode)) 3568c2ecf20Sopenharmony_ci flags = O_DIRECTORY; /* mds likes to know */ 3578c2ecf20Sopenharmony_ci 3588c2ecf20Sopenharmony_ci dout("open inode %p ino %llx.%llx file %p flags %d (%d)\n", inode, 3598c2ecf20Sopenharmony_ci ceph_vinop(inode), file, flags, file->f_flags); 3608c2ecf20Sopenharmony_ci fmode = ceph_flags_to_mode(flags); 3618c2ecf20Sopenharmony_ci wanted = ceph_caps_for_mode(fmode); 3628c2ecf20Sopenharmony_ci 3638c2ecf20Sopenharmony_ci /* snapped files are read-only */ 3648c2ecf20Sopenharmony_ci if (ceph_snap(inode) != CEPH_NOSNAP && (file->f_mode & FMODE_WRITE)) 3658c2ecf20Sopenharmony_ci return -EROFS; 3668c2ecf20Sopenharmony_ci 3678c2ecf20Sopenharmony_ci /* trivially open snapdir */ 3688c2ecf20Sopenharmony_ci if (ceph_snap(inode) == CEPH_SNAPDIR) { 3698c2ecf20Sopenharmony_ci return ceph_init_file(inode, file, fmode); 3708c2ecf20Sopenharmony_ci } 3718c2ecf20Sopenharmony_ci 3728c2ecf20Sopenharmony_ci /* 3738c2ecf20Sopenharmony_ci * No need to block if we have caps on the auth MDS (for 3748c2ecf20Sopenharmony_ci * write) or any MDS (for read). Update wanted set 3758c2ecf20Sopenharmony_ci * asynchronously. 3768c2ecf20Sopenharmony_ci */ 3778c2ecf20Sopenharmony_ci spin_lock(&ci->i_ceph_lock); 3788c2ecf20Sopenharmony_ci if (__ceph_is_any_real_caps(ci) && 3798c2ecf20Sopenharmony_ci (((fmode & CEPH_FILE_MODE_WR) == 0) || ci->i_auth_cap)) { 3808c2ecf20Sopenharmony_ci int mds_wanted = __ceph_caps_mds_wanted(ci, true); 3818c2ecf20Sopenharmony_ci int issued = __ceph_caps_issued(ci, NULL); 3828c2ecf20Sopenharmony_ci 3838c2ecf20Sopenharmony_ci dout("open %p fmode %d want %s issued %s using existing\n", 3848c2ecf20Sopenharmony_ci inode, fmode, ceph_cap_string(wanted), 3858c2ecf20Sopenharmony_ci ceph_cap_string(issued)); 3868c2ecf20Sopenharmony_ci __ceph_touch_fmode(ci, mdsc, fmode); 3878c2ecf20Sopenharmony_ci spin_unlock(&ci->i_ceph_lock); 3888c2ecf20Sopenharmony_ci 3898c2ecf20Sopenharmony_ci /* adjust wanted? */ 3908c2ecf20Sopenharmony_ci if ((issued & wanted) != wanted && 3918c2ecf20Sopenharmony_ci (mds_wanted & wanted) != wanted && 3928c2ecf20Sopenharmony_ci ceph_snap(inode) != CEPH_SNAPDIR) 3938c2ecf20Sopenharmony_ci ceph_check_caps(ci, 0, NULL); 3948c2ecf20Sopenharmony_ci 3958c2ecf20Sopenharmony_ci return ceph_init_file(inode, file, fmode); 3968c2ecf20Sopenharmony_ci } else if (ceph_snap(inode) != CEPH_NOSNAP && 3978c2ecf20Sopenharmony_ci (ci->i_snap_caps & wanted) == wanted) { 3988c2ecf20Sopenharmony_ci __ceph_touch_fmode(ci, mdsc, fmode); 3998c2ecf20Sopenharmony_ci spin_unlock(&ci->i_ceph_lock); 4008c2ecf20Sopenharmony_ci return ceph_init_file(inode, file, fmode); 4018c2ecf20Sopenharmony_ci } 4028c2ecf20Sopenharmony_ci 4038c2ecf20Sopenharmony_ci spin_unlock(&ci->i_ceph_lock); 4048c2ecf20Sopenharmony_ci 4058c2ecf20Sopenharmony_ci dout("open fmode %d wants %s\n", fmode, ceph_cap_string(wanted)); 4068c2ecf20Sopenharmony_ci req = prepare_open_request(inode->i_sb, flags, 0); 4078c2ecf20Sopenharmony_ci if (IS_ERR(req)) { 4088c2ecf20Sopenharmony_ci err = PTR_ERR(req); 4098c2ecf20Sopenharmony_ci goto out; 4108c2ecf20Sopenharmony_ci } 4118c2ecf20Sopenharmony_ci req->r_inode = inode; 4128c2ecf20Sopenharmony_ci ihold(inode); 4138c2ecf20Sopenharmony_ci 4148c2ecf20Sopenharmony_ci req->r_num_caps = 1; 4158c2ecf20Sopenharmony_ci err = ceph_mdsc_do_request(mdsc, NULL, req); 4168c2ecf20Sopenharmony_ci if (!err) 4178c2ecf20Sopenharmony_ci err = ceph_init_file(inode, file, req->r_fmode); 4188c2ecf20Sopenharmony_ci ceph_mdsc_put_request(req); 4198c2ecf20Sopenharmony_ci dout("open result=%d on %llx.%llx\n", err, ceph_vinop(inode)); 4208c2ecf20Sopenharmony_ciout: 4218c2ecf20Sopenharmony_ci return err; 4228c2ecf20Sopenharmony_ci} 4238c2ecf20Sopenharmony_ci 4248c2ecf20Sopenharmony_ci/* Clone the layout from a synchronous create, if the dir now has Dc caps */ 4258c2ecf20Sopenharmony_cistatic void 4268c2ecf20Sopenharmony_cicache_file_layout(struct inode *dst, struct inode *src) 4278c2ecf20Sopenharmony_ci{ 4288c2ecf20Sopenharmony_ci struct ceph_inode_info *cdst = ceph_inode(dst); 4298c2ecf20Sopenharmony_ci struct ceph_inode_info *csrc = ceph_inode(src); 4308c2ecf20Sopenharmony_ci 4318c2ecf20Sopenharmony_ci spin_lock(&cdst->i_ceph_lock); 4328c2ecf20Sopenharmony_ci if ((__ceph_caps_issued(cdst, NULL) & CEPH_CAP_DIR_CREATE) && 4338c2ecf20Sopenharmony_ci !ceph_file_layout_is_valid(&cdst->i_cached_layout)) { 4348c2ecf20Sopenharmony_ci memcpy(&cdst->i_cached_layout, &csrc->i_layout, 4358c2ecf20Sopenharmony_ci sizeof(cdst->i_cached_layout)); 4368c2ecf20Sopenharmony_ci rcu_assign_pointer(cdst->i_cached_layout.pool_ns, 4378c2ecf20Sopenharmony_ci ceph_try_get_string(csrc->i_layout.pool_ns)); 4388c2ecf20Sopenharmony_ci } 4398c2ecf20Sopenharmony_ci spin_unlock(&cdst->i_ceph_lock); 4408c2ecf20Sopenharmony_ci} 4418c2ecf20Sopenharmony_ci 4428c2ecf20Sopenharmony_ci/* 4438c2ecf20Sopenharmony_ci * Try to set up an async create. We need caps, a file layout, and inode number, 4448c2ecf20Sopenharmony_ci * and either a lease on the dentry or complete dir info. If any of those 4458c2ecf20Sopenharmony_ci * criteria are not satisfied, then return false and the caller can go 4468c2ecf20Sopenharmony_ci * synchronous. 4478c2ecf20Sopenharmony_ci */ 4488c2ecf20Sopenharmony_cistatic int try_prep_async_create(struct inode *dir, struct dentry *dentry, 4498c2ecf20Sopenharmony_ci struct ceph_file_layout *lo, u64 *pino) 4508c2ecf20Sopenharmony_ci{ 4518c2ecf20Sopenharmony_ci struct ceph_inode_info *ci = ceph_inode(dir); 4528c2ecf20Sopenharmony_ci struct ceph_dentry_info *di = ceph_dentry(dentry); 4538c2ecf20Sopenharmony_ci int got = 0, want = CEPH_CAP_FILE_EXCL | CEPH_CAP_DIR_CREATE; 4548c2ecf20Sopenharmony_ci u64 ino; 4558c2ecf20Sopenharmony_ci 4568c2ecf20Sopenharmony_ci spin_lock(&ci->i_ceph_lock); 4578c2ecf20Sopenharmony_ci /* No auth cap means no chance for Dc caps */ 4588c2ecf20Sopenharmony_ci if (!ci->i_auth_cap) 4598c2ecf20Sopenharmony_ci goto no_async; 4608c2ecf20Sopenharmony_ci 4618c2ecf20Sopenharmony_ci /* Any delegated inos? */ 4628c2ecf20Sopenharmony_ci if (xa_empty(&ci->i_auth_cap->session->s_delegated_inos)) 4638c2ecf20Sopenharmony_ci goto no_async; 4648c2ecf20Sopenharmony_ci 4658c2ecf20Sopenharmony_ci if (!ceph_file_layout_is_valid(&ci->i_cached_layout)) 4668c2ecf20Sopenharmony_ci goto no_async; 4678c2ecf20Sopenharmony_ci 4688c2ecf20Sopenharmony_ci if ((__ceph_caps_issued(ci, NULL) & want) != want) 4698c2ecf20Sopenharmony_ci goto no_async; 4708c2ecf20Sopenharmony_ci 4718c2ecf20Sopenharmony_ci if (d_in_lookup(dentry)) { 4728c2ecf20Sopenharmony_ci if (!__ceph_dir_is_complete(ci)) 4738c2ecf20Sopenharmony_ci goto no_async; 4748c2ecf20Sopenharmony_ci spin_lock(&dentry->d_lock); 4758c2ecf20Sopenharmony_ci di->lease_shared_gen = atomic_read(&ci->i_shared_gen); 4768c2ecf20Sopenharmony_ci spin_unlock(&dentry->d_lock); 4778c2ecf20Sopenharmony_ci } else if (atomic_read(&ci->i_shared_gen) != 4788c2ecf20Sopenharmony_ci READ_ONCE(di->lease_shared_gen)) { 4798c2ecf20Sopenharmony_ci goto no_async; 4808c2ecf20Sopenharmony_ci } 4818c2ecf20Sopenharmony_ci 4828c2ecf20Sopenharmony_ci ino = ceph_get_deleg_ino(ci->i_auth_cap->session); 4838c2ecf20Sopenharmony_ci if (!ino) 4848c2ecf20Sopenharmony_ci goto no_async; 4858c2ecf20Sopenharmony_ci 4868c2ecf20Sopenharmony_ci *pino = ino; 4878c2ecf20Sopenharmony_ci ceph_take_cap_refs(ci, want, false); 4888c2ecf20Sopenharmony_ci memcpy(lo, &ci->i_cached_layout, sizeof(*lo)); 4898c2ecf20Sopenharmony_ci rcu_assign_pointer(lo->pool_ns, 4908c2ecf20Sopenharmony_ci ceph_try_get_string(ci->i_cached_layout.pool_ns)); 4918c2ecf20Sopenharmony_ci got = want; 4928c2ecf20Sopenharmony_cino_async: 4938c2ecf20Sopenharmony_ci spin_unlock(&ci->i_ceph_lock); 4948c2ecf20Sopenharmony_ci return got; 4958c2ecf20Sopenharmony_ci} 4968c2ecf20Sopenharmony_ci 4978c2ecf20Sopenharmony_cistatic void restore_deleg_ino(struct inode *dir, u64 ino) 4988c2ecf20Sopenharmony_ci{ 4998c2ecf20Sopenharmony_ci struct ceph_inode_info *ci = ceph_inode(dir); 5008c2ecf20Sopenharmony_ci struct ceph_mds_session *s = NULL; 5018c2ecf20Sopenharmony_ci 5028c2ecf20Sopenharmony_ci spin_lock(&ci->i_ceph_lock); 5038c2ecf20Sopenharmony_ci if (ci->i_auth_cap) 5048c2ecf20Sopenharmony_ci s = ceph_get_mds_session(ci->i_auth_cap->session); 5058c2ecf20Sopenharmony_ci spin_unlock(&ci->i_ceph_lock); 5068c2ecf20Sopenharmony_ci if (s) { 5078c2ecf20Sopenharmony_ci int err = ceph_restore_deleg_ino(s, ino); 5088c2ecf20Sopenharmony_ci if (err) 5098c2ecf20Sopenharmony_ci pr_warn("ceph: unable to restore delegated ino 0x%llx to session: %d\n", 5108c2ecf20Sopenharmony_ci ino, err); 5118c2ecf20Sopenharmony_ci ceph_put_mds_session(s); 5128c2ecf20Sopenharmony_ci } 5138c2ecf20Sopenharmony_ci} 5148c2ecf20Sopenharmony_ci 5158c2ecf20Sopenharmony_cistatic void ceph_async_create_cb(struct ceph_mds_client *mdsc, 5168c2ecf20Sopenharmony_ci struct ceph_mds_request *req) 5178c2ecf20Sopenharmony_ci{ 5188c2ecf20Sopenharmony_ci int result = req->r_err ? req->r_err : 5198c2ecf20Sopenharmony_ci le32_to_cpu(req->r_reply_info.head->result); 5208c2ecf20Sopenharmony_ci 5218c2ecf20Sopenharmony_ci if (result == -EJUKEBOX) 5228c2ecf20Sopenharmony_ci goto out; 5238c2ecf20Sopenharmony_ci 5248c2ecf20Sopenharmony_ci mapping_set_error(req->r_parent->i_mapping, result); 5258c2ecf20Sopenharmony_ci 5268c2ecf20Sopenharmony_ci if (result) { 5278c2ecf20Sopenharmony_ci struct dentry *dentry = req->r_dentry; 5288c2ecf20Sopenharmony_ci int pathlen = 0; 5298c2ecf20Sopenharmony_ci u64 base = 0; 5308c2ecf20Sopenharmony_ci char *path = ceph_mdsc_build_path(req->r_dentry, &pathlen, 5318c2ecf20Sopenharmony_ci &base, 0); 5328c2ecf20Sopenharmony_ci 5338c2ecf20Sopenharmony_ci ceph_dir_clear_complete(req->r_parent); 5348c2ecf20Sopenharmony_ci if (!d_unhashed(dentry)) 5358c2ecf20Sopenharmony_ci d_drop(dentry); 5368c2ecf20Sopenharmony_ci 5378c2ecf20Sopenharmony_ci /* FIXME: start returning I/O errors on all accesses? */ 5388c2ecf20Sopenharmony_ci pr_warn("ceph: async create failure path=(%llx)%s result=%d!\n", 5398c2ecf20Sopenharmony_ci base, IS_ERR(path) ? "<<bad>>" : path, result); 5408c2ecf20Sopenharmony_ci ceph_mdsc_free_path(path, pathlen); 5418c2ecf20Sopenharmony_ci } 5428c2ecf20Sopenharmony_ci 5438c2ecf20Sopenharmony_ci if (req->r_target_inode) { 5448c2ecf20Sopenharmony_ci struct ceph_inode_info *ci = ceph_inode(req->r_target_inode); 5458c2ecf20Sopenharmony_ci u64 ino = ceph_vino(req->r_target_inode).ino; 5468c2ecf20Sopenharmony_ci 5478c2ecf20Sopenharmony_ci if (req->r_deleg_ino != ino) 5488c2ecf20Sopenharmony_ci pr_warn("%s: inode number mismatch! err=%d deleg_ino=0x%llx target=0x%llx\n", 5498c2ecf20Sopenharmony_ci __func__, req->r_err, req->r_deleg_ino, ino); 5508c2ecf20Sopenharmony_ci mapping_set_error(req->r_target_inode->i_mapping, result); 5518c2ecf20Sopenharmony_ci 5528c2ecf20Sopenharmony_ci spin_lock(&ci->i_ceph_lock); 5538c2ecf20Sopenharmony_ci if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) { 5548c2ecf20Sopenharmony_ci ci->i_ceph_flags &= ~CEPH_I_ASYNC_CREATE; 5558c2ecf20Sopenharmony_ci wake_up_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT); 5568c2ecf20Sopenharmony_ci } 5578c2ecf20Sopenharmony_ci ceph_kick_flushing_inode_caps(req->r_session, ci); 5588c2ecf20Sopenharmony_ci spin_unlock(&ci->i_ceph_lock); 5598c2ecf20Sopenharmony_ci } else { 5608c2ecf20Sopenharmony_ci pr_warn("%s: no req->r_target_inode for 0x%llx\n", __func__, 5618c2ecf20Sopenharmony_ci req->r_deleg_ino); 5628c2ecf20Sopenharmony_ci } 5638c2ecf20Sopenharmony_ciout: 5648c2ecf20Sopenharmony_ci ceph_mdsc_release_dir_caps(req); 5658c2ecf20Sopenharmony_ci} 5668c2ecf20Sopenharmony_ci 5678c2ecf20Sopenharmony_cistatic int ceph_finish_async_create(struct inode *dir, struct dentry *dentry, 5688c2ecf20Sopenharmony_ci struct file *file, umode_t mode, 5698c2ecf20Sopenharmony_ci struct ceph_mds_request *req, 5708c2ecf20Sopenharmony_ci struct ceph_acl_sec_ctx *as_ctx, 5718c2ecf20Sopenharmony_ci struct ceph_file_layout *lo) 5728c2ecf20Sopenharmony_ci{ 5738c2ecf20Sopenharmony_ci int ret; 5748c2ecf20Sopenharmony_ci char xattr_buf[4]; 5758c2ecf20Sopenharmony_ci struct ceph_mds_reply_inode in = { }; 5768c2ecf20Sopenharmony_ci struct ceph_mds_reply_info_in iinfo = { .in = &in }; 5778c2ecf20Sopenharmony_ci struct ceph_inode_info *ci = ceph_inode(dir); 5788c2ecf20Sopenharmony_ci struct inode *inode; 5798c2ecf20Sopenharmony_ci struct timespec64 now; 5808c2ecf20Sopenharmony_ci struct ceph_string *pool_ns; 5818c2ecf20Sopenharmony_ci struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb); 5828c2ecf20Sopenharmony_ci struct ceph_vino vino = { .ino = req->r_deleg_ino, 5838c2ecf20Sopenharmony_ci .snap = CEPH_NOSNAP }; 5848c2ecf20Sopenharmony_ci 5858c2ecf20Sopenharmony_ci ktime_get_real_ts64(&now); 5868c2ecf20Sopenharmony_ci 5878c2ecf20Sopenharmony_ci inode = ceph_get_inode(dentry->d_sb, vino); 5888c2ecf20Sopenharmony_ci if (IS_ERR(inode)) 5898c2ecf20Sopenharmony_ci return PTR_ERR(inode); 5908c2ecf20Sopenharmony_ci 5918c2ecf20Sopenharmony_ci iinfo.inline_version = CEPH_INLINE_NONE; 5928c2ecf20Sopenharmony_ci iinfo.change_attr = 1; 5938c2ecf20Sopenharmony_ci ceph_encode_timespec64(&iinfo.btime, &now); 5948c2ecf20Sopenharmony_ci 5958c2ecf20Sopenharmony_ci if (req->r_pagelist) { 5968c2ecf20Sopenharmony_ci iinfo.xattr_len = req->r_pagelist->length; 5978c2ecf20Sopenharmony_ci iinfo.xattr_data = req->r_pagelist->mapped_tail; 5988c2ecf20Sopenharmony_ci } else { 5998c2ecf20Sopenharmony_ci /* fake it */ 6008c2ecf20Sopenharmony_ci iinfo.xattr_len = ARRAY_SIZE(xattr_buf); 6018c2ecf20Sopenharmony_ci iinfo.xattr_data = xattr_buf; 6028c2ecf20Sopenharmony_ci memset(iinfo.xattr_data, 0, iinfo.xattr_len); 6038c2ecf20Sopenharmony_ci } 6048c2ecf20Sopenharmony_ci 6058c2ecf20Sopenharmony_ci in.ino = cpu_to_le64(vino.ino); 6068c2ecf20Sopenharmony_ci in.snapid = cpu_to_le64(CEPH_NOSNAP); 6078c2ecf20Sopenharmony_ci in.version = cpu_to_le64(1); // ??? 6088c2ecf20Sopenharmony_ci in.cap.caps = in.cap.wanted = cpu_to_le32(CEPH_CAP_ALL_FILE); 6098c2ecf20Sopenharmony_ci in.cap.cap_id = cpu_to_le64(1); 6108c2ecf20Sopenharmony_ci in.cap.realm = cpu_to_le64(ci->i_snap_realm->ino); 6118c2ecf20Sopenharmony_ci in.cap.flags = CEPH_CAP_FLAG_AUTH; 6128c2ecf20Sopenharmony_ci in.ctime = in.mtime = in.atime = iinfo.btime; 6138c2ecf20Sopenharmony_ci in.truncate_seq = cpu_to_le32(1); 6148c2ecf20Sopenharmony_ci in.truncate_size = cpu_to_le64(-1ULL); 6158c2ecf20Sopenharmony_ci in.xattr_version = cpu_to_le64(1); 6168c2ecf20Sopenharmony_ci in.uid = cpu_to_le32(from_kuid(&init_user_ns, current_fsuid())); 6178c2ecf20Sopenharmony_ci if (dir->i_mode & S_ISGID) { 6188c2ecf20Sopenharmony_ci in.gid = cpu_to_le32(from_kgid(&init_user_ns, dir->i_gid)); 6198c2ecf20Sopenharmony_ci 6208c2ecf20Sopenharmony_ci /* Directories always inherit the setgid bit. */ 6218c2ecf20Sopenharmony_ci if (S_ISDIR(mode)) 6228c2ecf20Sopenharmony_ci mode |= S_ISGID; 6238c2ecf20Sopenharmony_ci else if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP) && 6248c2ecf20Sopenharmony_ci !in_group_p(dir->i_gid) && 6258c2ecf20Sopenharmony_ci !capable_wrt_inode_uidgid(dir, CAP_FSETID)) 6268c2ecf20Sopenharmony_ci mode &= ~S_ISGID; 6278c2ecf20Sopenharmony_ci } else { 6288c2ecf20Sopenharmony_ci in.gid = cpu_to_le32(from_kgid(&init_user_ns, current_fsgid())); 6298c2ecf20Sopenharmony_ci } 6308c2ecf20Sopenharmony_ci in.mode = cpu_to_le32((u32)mode); 6318c2ecf20Sopenharmony_ci 6328c2ecf20Sopenharmony_ci in.nlink = cpu_to_le32(1); 6338c2ecf20Sopenharmony_ci in.max_size = cpu_to_le64(lo->stripe_unit); 6348c2ecf20Sopenharmony_ci 6358c2ecf20Sopenharmony_ci ceph_file_layout_to_legacy(lo, &in.layout); 6368c2ecf20Sopenharmony_ci /* lo is private, so pool_ns can't change */ 6378c2ecf20Sopenharmony_ci pool_ns = rcu_dereference_raw(lo->pool_ns); 6388c2ecf20Sopenharmony_ci if (pool_ns) { 6398c2ecf20Sopenharmony_ci iinfo.pool_ns_len = pool_ns->len; 6408c2ecf20Sopenharmony_ci iinfo.pool_ns_data = pool_ns->str; 6418c2ecf20Sopenharmony_ci } 6428c2ecf20Sopenharmony_ci 6438c2ecf20Sopenharmony_ci down_read(&mdsc->snap_rwsem); 6448c2ecf20Sopenharmony_ci ret = ceph_fill_inode(inode, NULL, &iinfo, NULL, req->r_session, 6458c2ecf20Sopenharmony_ci req->r_fmode, NULL); 6468c2ecf20Sopenharmony_ci up_read(&mdsc->snap_rwsem); 6478c2ecf20Sopenharmony_ci if (ret) { 6488c2ecf20Sopenharmony_ci dout("%s failed to fill inode: %d\n", __func__, ret); 6498c2ecf20Sopenharmony_ci ceph_dir_clear_complete(dir); 6508c2ecf20Sopenharmony_ci if (!d_unhashed(dentry)) 6518c2ecf20Sopenharmony_ci d_drop(dentry); 6528c2ecf20Sopenharmony_ci if (inode->i_state & I_NEW) 6538c2ecf20Sopenharmony_ci discard_new_inode(inode); 6548c2ecf20Sopenharmony_ci } else { 6558c2ecf20Sopenharmony_ci struct dentry *dn; 6568c2ecf20Sopenharmony_ci 6578c2ecf20Sopenharmony_ci dout("%s d_adding new inode 0x%llx to 0x%llx/%s\n", __func__, 6588c2ecf20Sopenharmony_ci vino.ino, ceph_ino(dir), dentry->d_name.name); 6598c2ecf20Sopenharmony_ci ceph_dir_clear_ordered(dir); 6608c2ecf20Sopenharmony_ci ceph_init_inode_acls(inode, as_ctx); 6618c2ecf20Sopenharmony_ci if (inode->i_state & I_NEW) { 6628c2ecf20Sopenharmony_ci /* 6638c2ecf20Sopenharmony_ci * If it's not I_NEW, then someone created this before 6648c2ecf20Sopenharmony_ci * we got here. Assume the server is aware of it at 6658c2ecf20Sopenharmony_ci * that point and don't worry about setting 6668c2ecf20Sopenharmony_ci * CEPH_I_ASYNC_CREATE. 6678c2ecf20Sopenharmony_ci */ 6688c2ecf20Sopenharmony_ci ceph_inode(inode)->i_ceph_flags = CEPH_I_ASYNC_CREATE; 6698c2ecf20Sopenharmony_ci unlock_new_inode(inode); 6708c2ecf20Sopenharmony_ci } 6718c2ecf20Sopenharmony_ci if (d_in_lookup(dentry) || d_really_is_negative(dentry)) { 6728c2ecf20Sopenharmony_ci if (!d_unhashed(dentry)) 6738c2ecf20Sopenharmony_ci d_drop(dentry); 6748c2ecf20Sopenharmony_ci dn = d_splice_alias(inode, dentry); 6758c2ecf20Sopenharmony_ci WARN_ON_ONCE(dn && dn != dentry); 6768c2ecf20Sopenharmony_ci } 6778c2ecf20Sopenharmony_ci file->f_mode |= FMODE_CREATED; 6788c2ecf20Sopenharmony_ci ret = finish_open(file, dentry, ceph_open); 6798c2ecf20Sopenharmony_ci } 6808c2ecf20Sopenharmony_ci return ret; 6818c2ecf20Sopenharmony_ci} 6828c2ecf20Sopenharmony_ci 6838c2ecf20Sopenharmony_ci/* 6848c2ecf20Sopenharmony_ci * Do a lookup + open with a single request. If we get a non-existent 6858c2ecf20Sopenharmony_ci * file or symlink, return 1 so the VFS can retry. 6868c2ecf20Sopenharmony_ci */ 6878c2ecf20Sopenharmony_ciint ceph_atomic_open(struct inode *dir, struct dentry *dentry, 6888c2ecf20Sopenharmony_ci struct file *file, unsigned flags, umode_t mode) 6898c2ecf20Sopenharmony_ci{ 6908c2ecf20Sopenharmony_ci struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); 6918c2ecf20Sopenharmony_ci struct ceph_mds_client *mdsc = fsc->mdsc; 6928c2ecf20Sopenharmony_ci struct ceph_mds_request *req; 6938c2ecf20Sopenharmony_ci struct dentry *dn; 6948c2ecf20Sopenharmony_ci struct ceph_acl_sec_ctx as_ctx = {}; 6958c2ecf20Sopenharmony_ci bool try_async = ceph_test_mount_opt(fsc, ASYNC_DIROPS); 6968c2ecf20Sopenharmony_ci int mask; 6978c2ecf20Sopenharmony_ci int err; 6988c2ecf20Sopenharmony_ci 6998c2ecf20Sopenharmony_ci dout("atomic_open %p dentry %p '%pd' %s flags %d mode 0%o\n", 7008c2ecf20Sopenharmony_ci dir, dentry, dentry, 7018c2ecf20Sopenharmony_ci d_unhashed(dentry) ? "unhashed" : "hashed", flags, mode); 7028c2ecf20Sopenharmony_ci 7038c2ecf20Sopenharmony_ci if (dentry->d_name.len > NAME_MAX) 7048c2ecf20Sopenharmony_ci return -ENAMETOOLONG; 7058c2ecf20Sopenharmony_ci 7068c2ecf20Sopenharmony_ci /* 7078c2ecf20Sopenharmony_ci * Do not truncate the file, since atomic_open is called before the 7088c2ecf20Sopenharmony_ci * permission check. The caller will do the truncation afterward. 7098c2ecf20Sopenharmony_ci */ 7108c2ecf20Sopenharmony_ci flags &= ~O_TRUNC; 7118c2ecf20Sopenharmony_ci 7128c2ecf20Sopenharmony_ci if (flags & O_CREAT) { 7138c2ecf20Sopenharmony_ci if (ceph_quota_is_max_files_exceeded(dir)) 7148c2ecf20Sopenharmony_ci return -EDQUOT; 7158c2ecf20Sopenharmony_ci err = ceph_pre_init_acls(dir, &mode, &as_ctx); 7168c2ecf20Sopenharmony_ci if (err < 0) 7178c2ecf20Sopenharmony_ci return err; 7188c2ecf20Sopenharmony_ci err = ceph_security_init_secctx(dentry, mode, &as_ctx); 7198c2ecf20Sopenharmony_ci if (err < 0) 7208c2ecf20Sopenharmony_ci goto out_ctx; 7218c2ecf20Sopenharmony_ci /* Async create can't handle more than a page of xattrs */ 7228c2ecf20Sopenharmony_ci if (as_ctx.pagelist && 7238c2ecf20Sopenharmony_ci !list_is_singular(&as_ctx.pagelist->head)) 7248c2ecf20Sopenharmony_ci try_async = false; 7258c2ecf20Sopenharmony_ci } else if (!d_in_lookup(dentry)) { 7268c2ecf20Sopenharmony_ci /* If it's not being looked up, it's negative */ 7278c2ecf20Sopenharmony_ci return -ENOENT; 7288c2ecf20Sopenharmony_ci } 7298c2ecf20Sopenharmony_ciretry: 7308c2ecf20Sopenharmony_ci /* do the open */ 7318c2ecf20Sopenharmony_ci req = prepare_open_request(dir->i_sb, flags, mode); 7328c2ecf20Sopenharmony_ci if (IS_ERR(req)) { 7338c2ecf20Sopenharmony_ci err = PTR_ERR(req); 7348c2ecf20Sopenharmony_ci goto out_ctx; 7358c2ecf20Sopenharmony_ci } 7368c2ecf20Sopenharmony_ci req->r_dentry = dget(dentry); 7378c2ecf20Sopenharmony_ci req->r_num_caps = 2; 7388c2ecf20Sopenharmony_ci mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED; 7398c2ecf20Sopenharmony_ci if (ceph_security_xattr_wanted(dir)) 7408c2ecf20Sopenharmony_ci mask |= CEPH_CAP_XATTR_SHARED; 7418c2ecf20Sopenharmony_ci req->r_args.open.mask = cpu_to_le32(mask); 7428c2ecf20Sopenharmony_ci req->r_parent = dir; 7438c2ecf20Sopenharmony_ci 7448c2ecf20Sopenharmony_ci if (flags & O_CREAT) { 7458c2ecf20Sopenharmony_ci struct ceph_file_layout lo; 7468c2ecf20Sopenharmony_ci 7478c2ecf20Sopenharmony_ci req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL; 7488c2ecf20Sopenharmony_ci req->r_dentry_unless = CEPH_CAP_FILE_EXCL; 7498c2ecf20Sopenharmony_ci if (as_ctx.pagelist) { 7508c2ecf20Sopenharmony_ci req->r_pagelist = as_ctx.pagelist; 7518c2ecf20Sopenharmony_ci as_ctx.pagelist = NULL; 7528c2ecf20Sopenharmony_ci } 7538c2ecf20Sopenharmony_ci if (try_async && 7548c2ecf20Sopenharmony_ci (req->r_dir_caps = 7558c2ecf20Sopenharmony_ci try_prep_async_create(dir, dentry, &lo, 7568c2ecf20Sopenharmony_ci &req->r_deleg_ino))) { 7578c2ecf20Sopenharmony_ci set_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags); 7588c2ecf20Sopenharmony_ci req->r_args.open.flags |= cpu_to_le32(CEPH_O_EXCL); 7598c2ecf20Sopenharmony_ci req->r_callback = ceph_async_create_cb; 7608c2ecf20Sopenharmony_ci err = ceph_mdsc_submit_request(mdsc, dir, req); 7618c2ecf20Sopenharmony_ci if (!err) { 7628c2ecf20Sopenharmony_ci err = ceph_finish_async_create(dir, dentry, 7638c2ecf20Sopenharmony_ci file, mode, req, 7648c2ecf20Sopenharmony_ci &as_ctx, &lo); 7658c2ecf20Sopenharmony_ci } else if (err == -EJUKEBOX) { 7668c2ecf20Sopenharmony_ci restore_deleg_ino(dir, req->r_deleg_ino); 7678c2ecf20Sopenharmony_ci ceph_mdsc_put_request(req); 7688c2ecf20Sopenharmony_ci try_async = false; 7698c2ecf20Sopenharmony_ci ceph_put_string(rcu_dereference_raw(lo.pool_ns)); 7708c2ecf20Sopenharmony_ci goto retry; 7718c2ecf20Sopenharmony_ci } 7728c2ecf20Sopenharmony_ci ceph_put_string(rcu_dereference_raw(lo.pool_ns)); 7738c2ecf20Sopenharmony_ci goto out_req; 7748c2ecf20Sopenharmony_ci } 7758c2ecf20Sopenharmony_ci } 7768c2ecf20Sopenharmony_ci 7778c2ecf20Sopenharmony_ci set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); 7788c2ecf20Sopenharmony_ci err = ceph_mdsc_do_request(mdsc, (flags & O_CREAT) ? dir : NULL, req); 7798c2ecf20Sopenharmony_ci err = ceph_handle_snapdir(req, dentry, err); 7808c2ecf20Sopenharmony_ci if (err) 7818c2ecf20Sopenharmony_ci goto out_req; 7828c2ecf20Sopenharmony_ci 7838c2ecf20Sopenharmony_ci if ((flags & O_CREAT) && !req->r_reply_info.head->is_dentry) 7848c2ecf20Sopenharmony_ci err = ceph_handle_notrace_create(dir, dentry); 7858c2ecf20Sopenharmony_ci 7868c2ecf20Sopenharmony_ci if (d_in_lookup(dentry)) { 7878c2ecf20Sopenharmony_ci dn = ceph_finish_lookup(req, dentry, err); 7888c2ecf20Sopenharmony_ci if (IS_ERR(dn)) 7898c2ecf20Sopenharmony_ci err = PTR_ERR(dn); 7908c2ecf20Sopenharmony_ci } else { 7918c2ecf20Sopenharmony_ci /* we were given a hashed negative dentry */ 7928c2ecf20Sopenharmony_ci dn = NULL; 7938c2ecf20Sopenharmony_ci } 7948c2ecf20Sopenharmony_ci if (err) 7958c2ecf20Sopenharmony_ci goto out_req; 7968c2ecf20Sopenharmony_ci if (dn || d_really_is_negative(dentry) || d_is_symlink(dentry)) { 7978c2ecf20Sopenharmony_ci /* make vfs retry on splice, ENOENT, or symlink */ 7988c2ecf20Sopenharmony_ci dout("atomic_open finish_no_open on dn %p\n", dn); 7998c2ecf20Sopenharmony_ci err = finish_no_open(file, dn); 8008c2ecf20Sopenharmony_ci } else { 8018c2ecf20Sopenharmony_ci dout("atomic_open finish_open on dn %p\n", dn); 8028c2ecf20Sopenharmony_ci if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) { 8038c2ecf20Sopenharmony_ci struct inode *newino = d_inode(dentry); 8048c2ecf20Sopenharmony_ci 8058c2ecf20Sopenharmony_ci cache_file_layout(dir, newino); 8068c2ecf20Sopenharmony_ci ceph_init_inode_acls(newino, &as_ctx); 8078c2ecf20Sopenharmony_ci file->f_mode |= FMODE_CREATED; 8088c2ecf20Sopenharmony_ci } 8098c2ecf20Sopenharmony_ci err = finish_open(file, dentry, ceph_open); 8108c2ecf20Sopenharmony_ci } 8118c2ecf20Sopenharmony_ciout_req: 8128c2ecf20Sopenharmony_ci ceph_mdsc_put_request(req); 8138c2ecf20Sopenharmony_ciout_ctx: 8148c2ecf20Sopenharmony_ci ceph_release_acl_sec_ctx(&as_ctx); 8158c2ecf20Sopenharmony_ci dout("atomic_open result=%d\n", err); 8168c2ecf20Sopenharmony_ci return err; 8178c2ecf20Sopenharmony_ci} 8188c2ecf20Sopenharmony_ci 8198c2ecf20Sopenharmony_ciint ceph_release(struct inode *inode, struct file *file) 8208c2ecf20Sopenharmony_ci{ 8218c2ecf20Sopenharmony_ci struct ceph_inode_info *ci = ceph_inode(inode); 8228c2ecf20Sopenharmony_ci 8238c2ecf20Sopenharmony_ci if (S_ISDIR(inode->i_mode)) { 8248c2ecf20Sopenharmony_ci struct ceph_dir_file_info *dfi = file->private_data; 8258c2ecf20Sopenharmony_ci dout("release inode %p dir file %p\n", inode, file); 8268c2ecf20Sopenharmony_ci WARN_ON(!list_empty(&dfi->file_info.rw_contexts)); 8278c2ecf20Sopenharmony_ci 8288c2ecf20Sopenharmony_ci ceph_put_fmode(ci, dfi->file_info.fmode, 1); 8298c2ecf20Sopenharmony_ci 8308c2ecf20Sopenharmony_ci if (dfi->last_readdir) 8318c2ecf20Sopenharmony_ci ceph_mdsc_put_request(dfi->last_readdir); 8328c2ecf20Sopenharmony_ci kfree(dfi->last_name); 8338c2ecf20Sopenharmony_ci kfree(dfi->dir_info); 8348c2ecf20Sopenharmony_ci kmem_cache_free(ceph_dir_file_cachep, dfi); 8358c2ecf20Sopenharmony_ci } else { 8368c2ecf20Sopenharmony_ci struct ceph_file_info *fi = file->private_data; 8378c2ecf20Sopenharmony_ci dout("release inode %p regular file %p\n", inode, file); 8388c2ecf20Sopenharmony_ci WARN_ON(!list_empty(&fi->rw_contexts)); 8398c2ecf20Sopenharmony_ci 8408c2ecf20Sopenharmony_ci ceph_put_fmode(ci, fi->fmode, 1); 8418c2ecf20Sopenharmony_ci 8428c2ecf20Sopenharmony_ci kmem_cache_free(ceph_file_cachep, fi); 8438c2ecf20Sopenharmony_ci } 8448c2ecf20Sopenharmony_ci 8458c2ecf20Sopenharmony_ci /* wake up anyone waiting for caps on this inode */ 8468c2ecf20Sopenharmony_ci wake_up_all(&ci->i_cap_wq); 8478c2ecf20Sopenharmony_ci return 0; 8488c2ecf20Sopenharmony_ci} 8498c2ecf20Sopenharmony_ci 8508c2ecf20Sopenharmony_cienum { 8518c2ecf20Sopenharmony_ci HAVE_RETRIED = 1, 8528c2ecf20Sopenharmony_ci CHECK_EOF = 2, 8538c2ecf20Sopenharmony_ci READ_INLINE = 3, 8548c2ecf20Sopenharmony_ci}; 8558c2ecf20Sopenharmony_ci 8568c2ecf20Sopenharmony_ci/* 8578c2ecf20Sopenharmony_ci * Completely synchronous read and write methods. Direct from __user 8588c2ecf20Sopenharmony_ci * buffer to osd, or directly to user pages (if O_DIRECT). 8598c2ecf20Sopenharmony_ci * 8608c2ecf20Sopenharmony_ci * If the read spans object boundary, just do multiple reads. (That's not 8618c2ecf20Sopenharmony_ci * atomic, but good enough for now.) 8628c2ecf20Sopenharmony_ci * 8638c2ecf20Sopenharmony_ci * If we get a short result from the OSD, check against i_size; we need to 8648c2ecf20Sopenharmony_ci * only return a short read to the caller if we hit EOF. 8658c2ecf20Sopenharmony_ci */ 8668c2ecf20Sopenharmony_cistatic ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, 8678c2ecf20Sopenharmony_ci int *retry_op) 8688c2ecf20Sopenharmony_ci{ 8698c2ecf20Sopenharmony_ci struct file *file = iocb->ki_filp; 8708c2ecf20Sopenharmony_ci struct inode *inode = file_inode(file); 8718c2ecf20Sopenharmony_ci struct ceph_inode_info *ci = ceph_inode(inode); 8728c2ecf20Sopenharmony_ci struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 8738c2ecf20Sopenharmony_ci struct ceph_osd_client *osdc = &fsc->client->osdc; 8748c2ecf20Sopenharmony_ci ssize_t ret; 8758c2ecf20Sopenharmony_ci u64 off = iocb->ki_pos; 8768c2ecf20Sopenharmony_ci u64 len = iov_iter_count(to); 8778c2ecf20Sopenharmony_ci 8788c2ecf20Sopenharmony_ci dout("sync_read on file %p %llu~%u %s\n", file, off, (unsigned)len, 8798c2ecf20Sopenharmony_ci (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); 8808c2ecf20Sopenharmony_ci 8818c2ecf20Sopenharmony_ci if (!len) 8828c2ecf20Sopenharmony_ci return 0; 8838c2ecf20Sopenharmony_ci /* 8848c2ecf20Sopenharmony_ci * flush any page cache pages in this range. this 8858c2ecf20Sopenharmony_ci * will make concurrent normal and sync io slow, 8868c2ecf20Sopenharmony_ci * but it will at least behave sensibly when they are 8878c2ecf20Sopenharmony_ci * in sequence. 8888c2ecf20Sopenharmony_ci */ 8898c2ecf20Sopenharmony_ci ret = filemap_write_and_wait_range(inode->i_mapping, 8908c2ecf20Sopenharmony_ci off, off + len - 1); 8918c2ecf20Sopenharmony_ci if (ret < 0) 8928c2ecf20Sopenharmony_ci return ret; 8938c2ecf20Sopenharmony_ci 8948c2ecf20Sopenharmony_ci ret = 0; 8958c2ecf20Sopenharmony_ci while ((len = iov_iter_count(to)) > 0) { 8968c2ecf20Sopenharmony_ci struct ceph_osd_request *req; 8978c2ecf20Sopenharmony_ci struct page **pages; 8988c2ecf20Sopenharmony_ci int num_pages; 8998c2ecf20Sopenharmony_ci size_t page_off; 9008c2ecf20Sopenharmony_ci u64 i_size; 9018c2ecf20Sopenharmony_ci bool more; 9028c2ecf20Sopenharmony_ci int idx; 9038c2ecf20Sopenharmony_ci size_t left; 9048c2ecf20Sopenharmony_ci 9058c2ecf20Sopenharmony_ci req = ceph_osdc_new_request(osdc, &ci->i_layout, 9068c2ecf20Sopenharmony_ci ci->i_vino, off, &len, 0, 1, 9078c2ecf20Sopenharmony_ci CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, 9088c2ecf20Sopenharmony_ci NULL, ci->i_truncate_seq, 9098c2ecf20Sopenharmony_ci ci->i_truncate_size, false); 9108c2ecf20Sopenharmony_ci if (IS_ERR(req)) { 9118c2ecf20Sopenharmony_ci ret = PTR_ERR(req); 9128c2ecf20Sopenharmony_ci break; 9138c2ecf20Sopenharmony_ci } 9148c2ecf20Sopenharmony_ci 9158c2ecf20Sopenharmony_ci more = len < iov_iter_count(to); 9168c2ecf20Sopenharmony_ci 9178c2ecf20Sopenharmony_ci num_pages = calc_pages_for(off, len); 9188c2ecf20Sopenharmony_ci page_off = off & ~PAGE_MASK; 9198c2ecf20Sopenharmony_ci pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); 9208c2ecf20Sopenharmony_ci if (IS_ERR(pages)) { 9218c2ecf20Sopenharmony_ci ceph_osdc_put_request(req); 9228c2ecf20Sopenharmony_ci ret = PTR_ERR(pages); 9238c2ecf20Sopenharmony_ci break; 9248c2ecf20Sopenharmony_ci } 9258c2ecf20Sopenharmony_ci 9268c2ecf20Sopenharmony_ci osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_off, 9278c2ecf20Sopenharmony_ci false, false); 9288c2ecf20Sopenharmony_ci ret = ceph_osdc_start_request(osdc, req, false); 9298c2ecf20Sopenharmony_ci if (!ret) 9308c2ecf20Sopenharmony_ci ret = ceph_osdc_wait_request(osdc, req); 9318c2ecf20Sopenharmony_ci 9328c2ecf20Sopenharmony_ci ceph_update_read_latency(&fsc->mdsc->metric, 9338c2ecf20Sopenharmony_ci req->r_start_latency, 9348c2ecf20Sopenharmony_ci req->r_end_latency, 9358c2ecf20Sopenharmony_ci ret); 9368c2ecf20Sopenharmony_ci 9378c2ecf20Sopenharmony_ci ceph_osdc_put_request(req); 9388c2ecf20Sopenharmony_ci 9398c2ecf20Sopenharmony_ci i_size = i_size_read(inode); 9408c2ecf20Sopenharmony_ci dout("sync_read %llu~%llu got %zd i_size %llu%s\n", 9418c2ecf20Sopenharmony_ci off, len, ret, i_size, (more ? " MORE" : "")); 9428c2ecf20Sopenharmony_ci 9438c2ecf20Sopenharmony_ci if (ret == -ENOENT) 9448c2ecf20Sopenharmony_ci ret = 0; 9458c2ecf20Sopenharmony_ci if (ret >= 0 && ret < len && (off + ret < i_size)) { 9468c2ecf20Sopenharmony_ci int zlen = min(len - ret, i_size - off - ret); 9478c2ecf20Sopenharmony_ci int zoff = page_off + ret; 9488c2ecf20Sopenharmony_ci dout("sync_read zero gap %llu~%llu\n", 9498c2ecf20Sopenharmony_ci off + ret, off + ret + zlen); 9508c2ecf20Sopenharmony_ci ceph_zero_page_vector_range(zoff, zlen, pages); 9518c2ecf20Sopenharmony_ci ret += zlen; 9528c2ecf20Sopenharmony_ci } 9538c2ecf20Sopenharmony_ci 9548c2ecf20Sopenharmony_ci idx = 0; 9558c2ecf20Sopenharmony_ci left = ret > 0 ? ret : 0; 9568c2ecf20Sopenharmony_ci while (left > 0) { 9578c2ecf20Sopenharmony_ci size_t len, copied; 9588c2ecf20Sopenharmony_ci page_off = off & ~PAGE_MASK; 9598c2ecf20Sopenharmony_ci len = min_t(size_t, left, PAGE_SIZE - page_off); 9608c2ecf20Sopenharmony_ci SetPageUptodate(pages[idx]); 9618c2ecf20Sopenharmony_ci copied = copy_page_to_iter(pages[idx++], 9628c2ecf20Sopenharmony_ci page_off, len, to); 9638c2ecf20Sopenharmony_ci off += copied; 9648c2ecf20Sopenharmony_ci left -= copied; 9658c2ecf20Sopenharmony_ci if (copied < len) { 9668c2ecf20Sopenharmony_ci ret = -EFAULT; 9678c2ecf20Sopenharmony_ci break; 9688c2ecf20Sopenharmony_ci } 9698c2ecf20Sopenharmony_ci } 9708c2ecf20Sopenharmony_ci ceph_release_page_vector(pages, num_pages); 9718c2ecf20Sopenharmony_ci 9728c2ecf20Sopenharmony_ci if (ret < 0) { 9738c2ecf20Sopenharmony_ci if (ret == -EBLOCKLISTED) 9748c2ecf20Sopenharmony_ci fsc->blocklisted = true; 9758c2ecf20Sopenharmony_ci break; 9768c2ecf20Sopenharmony_ci } 9778c2ecf20Sopenharmony_ci 9788c2ecf20Sopenharmony_ci if (off >= i_size || !more) 9798c2ecf20Sopenharmony_ci break; 9808c2ecf20Sopenharmony_ci } 9818c2ecf20Sopenharmony_ci 9828c2ecf20Sopenharmony_ci if (off > iocb->ki_pos) { 9838c2ecf20Sopenharmony_ci if (ret >= 0 && 9848c2ecf20Sopenharmony_ci iov_iter_count(to) > 0 && off >= i_size_read(inode)) 9858c2ecf20Sopenharmony_ci *retry_op = CHECK_EOF; 9868c2ecf20Sopenharmony_ci ret = off - iocb->ki_pos; 9878c2ecf20Sopenharmony_ci iocb->ki_pos = off; 9888c2ecf20Sopenharmony_ci } 9898c2ecf20Sopenharmony_ci 9908c2ecf20Sopenharmony_ci dout("sync_read result %zd retry_op %d\n", ret, *retry_op); 9918c2ecf20Sopenharmony_ci return ret; 9928c2ecf20Sopenharmony_ci} 9938c2ecf20Sopenharmony_ci 9948c2ecf20Sopenharmony_cistruct ceph_aio_request { 9958c2ecf20Sopenharmony_ci struct kiocb *iocb; 9968c2ecf20Sopenharmony_ci size_t total_len; 9978c2ecf20Sopenharmony_ci bool write; 9988c2ecf20Sopenharmony_ci bool should_dirty; 9998c2ecf20Sopenharmony_ci int error; 10008c2ecf20Sopenharmony_ci struct list_head osd_reqs; 10018c2ecf20Sopenharmony_ci unsigned num_reqs; 10028c2ecf20Sopenharmony_ci atomic_t pending_reqs; 10038c2ecf20Sopenharmony_ci struct timespec64 mtime; 10048c2ecf20Sopenharmony_ci struct ceph_cap_flush *prealloc_cf; 10058c2ecf20Sopenharmony_ci}; 10068c2ecf20Sopenharmony_ci 10078c2ecf20Sopenharmony_cistruct ceph_aio_work { 10088c2ecf20Sopenharmony_ci struct work_struct work; 10098c2ecf20Sopenharmony_ci struct ceph_osd_request *req; 10108c2ecf20Sopenharmony_ci}; 10118c2ecf20Sopenharmony_ci 10128c2ecf20Sopenharmony_cistatic void ceph_aio_retry_work(struct work_struct *work); 10138c2ecf20Sopenharmony_ci 10148c2ecf20Sopenharmony_cistatic void ceph_aio_complete(struct inode *inode, 10158c2ecf20Sopenharmony_ci struct ceph_aio_request *aio_req) 10168c2ecf20Sopenharmony_ci{ 10178c2ecf20Sopenharmony_ci struct ceph_inode_info *ci = ceph_inode(inode); 10188c2ecf20Sopenharmony_ci int ret; 10198c2ecf20Sopenharmony_ci 10208c2ecf20Sopenharmony_ci if (!atomic_dec_and_test(&aio_req->pending_reqs)) 10218c2ecf20Sopenharmony_ci return; 10228c2ecf20Sopenharmony_ci 10238c2ecf20Sopenharmony_ci if (aio_req->iocb->ki_flags & IOCB_DIRECT) 10248c2ecf20Sopenharmony_ci inode_dio_end(inode); 10258c2ecf20Sopenharmony_ci 10268c2ecf20Sopenharmony_ci ret = aio_req->error; 10278c2ecf20Sopenharmony_ci if (!ret) 10288c2ecf20Sopenharmony_ci ret = aio_req->total_len; 10298c2ecf20Sopenharmony_ci 10308c2ecf20Sopenharmony_ci dout("ceph_aio_complete %p rc %d\n", inode, ret); 10318c2ecf20Sopenharmony_ci 10328c2ecf20Sopenharmony_ci if (ret >= 0 && aio_req->write) { 10338c2ecf20Sopenharmony_ci int dirty; 10348c2ecf20Sopenharmony_ci 10358c2ecf20Sopenharmony_ci loff_t endoff = aio_req->iocb->ki_pos + aio_req->total_len; 10368c2ecf20Sopenharmony_ci if (endoff > i_size_read(inode)) { 10378c2ecf20Sopenharmony_ci if (ceph_inode_set_size(inode, endoff)) 10388c2ecf20Sopenharmony_ci ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); 10398c2ecf20Sopenharmony_ci } 10408c2ecf20Sopenharmony_ci 10418c2ecf20Sopenharmony_ci spin_lock(&ci->i_ceph_lock); 10428c2ecf20Sopenharmony_ci ci->i_inline_version = CEPH_INLINE_NONE; 10438c2ecf20Sopenharmony_ci dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, 10448c2ecf20Sopenharmony_ci &aio_req->prealloc_cf); 10458c2ecf20Sopenharmony_ci spin_unlock(&ci->i_ceph_lock); 10468c2ecf20Sopenharmony_ci if (dirty) 10478c2ecf20Sopenharmony_ci __mark_inode_dirty(inode, dirty); 10488c2ecf20Sopenharmony_ci 10498c2ecf20Sopenharmony_ci } 10508c2ecf20Sopenharmony_ci 10518c2ecf20Sopenharmony_ci ceph_put_cap_refs(ci, (aio_req->write ? CEPH_CAP_FILE_WR : 10528c2ecf20Sopenharmony_ci CEPH_CAP_FILE_RD)); 10538c2ecf20Sopenharmony_ci 10548c2ecf20Sopenharmony_ci aio_req->iocb->ki_complete(aio_req->iocb, ret, 0); 10558c2ecf20Sopenharmony_ci 10568c2ecf20Sopenharmony_ci ceph_free_cap_flush(aio_req->prealloc_cf); 10578c2ecf20Sopenharmony_ci kfree(aio_req); 10588c2ecf20Sopenharmony_ci} 10598c2ecf20Sopenharmony_ci 10608c2ecf20Sopenharmony_cistatic void ceph_aio_complete_req(struct ceph_osd_request *req) 10618c2ecf20Sopenharmony_ci{ 10628c2ecf20Sopenharmony_ci int rc = req->r_result; 10638c2ecf20Sopenharmony_ci struct inode *inode = req->r_inode; 10648c2ecf20Sopenharmony_ci struct ceph_aio_request *aio_req = req->r_priv; 10658c2ecf20Sopenharmony_ci struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0); 10668c2ecf20Sopenharmony_ci struct ceph_client_metric *metric = &ceph_sb_to_mdsc(inode->i_sb)->metric; 10678c2ecf20Sopenharmony_ci 10688c2ecf20Sopenharmony_ci BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_BVECS); 10698c2ecf20Sopenharmony_ci BUG_ON(!osd_data->num_bvecs); 10708c2ecf20Sopenharmony_ci 10718c2ecf20Sopenharmony_ci dout("ceph_aio_complete_req %p rc %d bytes %u\n", 10728c2ecf20Sopenharmony_ci inode, rc, osd_data->bvec_pos.iter.bi_size); 10738c2ecf20Sopenharmony_ci 10748c2ecf20Sopenharmony_ci /* r_start_latency == 0 means the request was not submitted */ 10758c2ecf20Sopenharmony_ci if (req->r_start_latency) { 10768c2ecf20Sopenharmony_ci if (aio_req->write) 10778c2ecf20Sopenharmony_ci ceph_update_write_latency(metric, req->r_start_latency, 10788c2ecf20Sopenharmony_ci req->r_end_latency, rc); 10798c2ecf20Sopenharmony_ci else 10808c2ecf20Sopenharmony_ci ceph_update_read_latency(metric, req->r_start_latency, 10818c2ecf20Sopenharmony_ci req->r_end_latency, rc); 10828c2ecf20Sopenharmony_ci } 10838c2ecf20Sopenharmony_ci 10848c2ecf20Sopenharmony_ci if (rc == -EOLDSNAPC) { 10858c2ecf20Sopenharmony_ci struct ceph_aio_work *aio_work; 10868c2ecf20Sopenharmony_ci BUG_ON(!aio_req->write); 10878c2ecf20Sopenharmony_ci 10888c2ecf20Sopenharmony_ci aio_work = kmalloc(sizeof(*aio_work), GFP_NOFS); 10898c2ecf20Sopenharmony_ci if (aio_work) { 10908c2ecf20Sopenharmony_ci INIT_WORK(&aio_work->work, ceph_aio_retry_work); 10918c2ecf20Sopenharmony_ci aio_work->req = req; 10928c2ecf20Sopenharmony_ci queue_work(ceph_inode_to_client(inode)->inode_wq, 10938c2ecf20Sopenharmony_ci &aio_work->work); 10948c2ecf20Sopenharmony_ci return; 10958c2ecf20Sopenharmony_ci } 10968c2ecf20Sopenharmony_ci rc = -ENOMEM; 10978c2ecf20Sopenharmony_ci } else if (!aio_req->write) { 10988c2ecf20Sopenharmony_ci if (rc == -ENOENT) 10998c2ecf20Sopenharmony_ci rc = 0; 11008c2ecf20Sopenharmony_ci if (rc >= 0 && osd_data->bvec_pos.iter.bi_size > rc) { 11018c2ecf20Sopenharmony_ci struct iov_iter i; 11028c2ecf20Sopenharmony_ci int zlen = osd_data->bvec_pos.iter.bi_size - rc; 11038c2ecf20Sopenharmony_ci 11048c2ecf20Sopenharmony_ci /* 11058c2ecf20Sopenharmony_ci * If read is satisfied by single OSD request, 11068c2ecf20Sopenharmony_ci * it can pass EOF. Otherwise read is within 11078c2ecf20Sopenharmony_ci * i_size. 11088c2ecf20Sopenharmony_ci */ 11098c2ecf20Sopenharmony_ci if (aio_req->num_reqs == 1) { 11108c2ecf20Sopenharmony_ci loff_t i_size = i_size_read(inode); 11118c2ecf20Sopenharmony_ci loff_t endoff = aio_req->iocb->ki_pos + rc; 11128c2ecf20Sopenharmony_ci if (endoff < i_size) 11138c2ecf20Sopenharmony_ci zlen = min_t(size_t, zlen, 11148c2ecf20Sopenharmony_ci i_size - endoff); 11158c2ecf20Sopenharmony_ci aio_req->total_len = rc + zlen; 11168c2ecf20Sopenharmony_ci } 11178c2ecf20Sopenharmony_ci 11188c2ecf20Sopenharmony_ci iov_iter_bvec(&i, READ, osd_data->bvec_pos.bvecs, 11198c2ecf20Sopenharmony_ci osd_data->num_bvecs, 11208c2ecf20Sopenharmony_ci osd_data->bvec_pos.iter.bi_size); 11218c2ecf20Sopenharmony_ci iov_iter_advance(&i, rc); 11228c2ecf20Sopenharmony_ci iov_iter_zero(zlen, &i); 11238c2ecf20Sopenharmony_ci } 11248c2ecf20Sopenharmony_ci } 11258c2ecf20Sopenharmony_ci 11268c2ecf20Sopenharmony_ci put_bvecs(osd_data->bvec_pos.bvecs, osd_data->num_bvecs, 11278c2ecf20Sopenharmony_ci aio_req->should_dirty); 11288c2ecf20Sopenharmony_ci ceph_osdc_put_request(req); 11298c2ecf20Sopenharmony_ci 11308c2ecf20Sopenharmony_ci if (rc < 0) 11318c2ecf20Sopenharmony_ci cmpxchg(&aio_req->error, 0, rc); 11328c2ecf20Sopenharmony_ci 11338c2ecf20Sopenharmony_ci ceph_aio_complete(inode, aio_req); 11348c2ecf20Sopenharmony_ci return; 11358c2ecf20Sopenharmony_ci} 11368c2ecf20Sopenharmony_ci 11378c2ecf20Sopenharmony_cistatic void ceph_aio_retry_work(struct work_struct *work) 11388c2ecf20Sopenharmony_ci{ 11398c2ecf20Sopenharmony_ci struct ceph_aio_work *aio_work = 11408c2ecf20Sopenharmony_ci container_of(work, struct ceph_aio_work, work); 11418c2ecf20Sopenharmony_ci struct ceph_osd_request *orig_req = aio_work->req; 11428c2ecf20Sopenharmony_ci struct ceph_aio_request *aio_req = orig_req->r_priv; 11438c2ecf20Sopenharmony_ci struct inode *inode = orig_req->r_inode; 11448c2ecf20Sopenharmony_ci struct ceph_inode_info *ci = ceph_inode(inode); 11458c2ecf20Sopenharmony_ci struct ceph_snap_context *snapc; 11468c2ecf20Sopenharmony_ci struct ceph_osd_request *req; 11478c2ecf20Sopenharmony_ci int ret; 11488c2ecf20Sopenharmony_ci 11498c2ecf20Sopenharmony_ci spin_lock(&ci->i_ceph_lock); 11508c2ecf20Sopenharmony_ci if (__ceph_have_pending_cap_snap(ci)) { 11518c2ecf20Sopenharmony_ci struct ceph_cap_snap *capsnap = 11528c2ecf20Sopenharmony_ci list_last_entry(&ci->i_cap_snaps, 11538c2ecf20Sopenharmony_ci struct ceph_cap_snap, 11548c2ecf20Sopenharmony_ci ci_item); 11558c2ecf20Sopenharmony_ci snapc = ceph_get_snap_context(capsnap->context); 11568c2ecf20Sopenharmony_ci } else { 11578c2ecf20Sopenharmony_ci BUG_ON(!ci->i_head_snapc); 11588c2ecf20Sopenharmony_ci snapc = ceph_get_snap_context(ci->i_head_snapc); 11598c2ecf20Sopenharmony_ci } 11608c2ecf20Sopenharmony_ci spin_unlock(&ci->i_ceph_lock); 11618c2ecf20Sopenharmony_ci 11628c2ecf20Sopenharmony_ci req = ceph_osdc_alloc_request(orig_req->r_osdc, snapc, 1, 11638c2ecf20Sopenharmony_ci false, GFP_NOFS); 11648c2ecf20Sopenharmony_ci if (!req) { 11658c2ecf20Sopenharmony_ci ret = -ENOMEM; 11668c2ecf20Sopenharmony_ci req = orig_req; 11678c2ecf20Sopenharmony_ci goto out; 11688c2ecf20Sopenharmony_ci } 11698c2ecf20Sopenharmony_ci 11708c2ecf20Sopenharmony_ci req->r_flags = /* CEPH_OSD_FLAG_ORDERSNAP | */ CEPH_OSD_FLAG_WRITE; 11718c2ecf20Sopenharmony_ci ceph_oloc_copy(&req->r_base_oloc, &orig_req->r_base_oloc); 11728c2ecf20Sopenharmony_ci ceph_oid_copy(&req->r_base_oid, &orig_req->r_base_oid); 11738c2ecf20Sopenharmony_ci 11748c2ecf20Sopenharmony_ci req->r_ops[0] = orig_req->r_ops[0]; 11758c2ecf20Sopenharmony_ci 11768c2ecf20Sopenharmony_ci req->r_mtime = aio_req->mtime; 11778c2ecf20Sopenharmony_ci req->r_data_offset = req->r_ops[0].extent.offset; 11788c2ecf20Sopenharmony_ci 11798c2ecf20Sopenharmony_ci ret = ceph_osdc_alloc_messages(req, GFP_NOFS); 11808c2ecf20Sopenharmony_ci if (ret) { 11818c2ecf20Sopenharmony_ci ceph_osdc_put_request(req); 11828c2ecf20Sopenharmony_ci req = orig_req; 11838c2ecf20Sopenharmony_ci goto out; 11848c2ecf20Sopenharmony_ci } 11858c2ecf20Sopenharmony_ci 11868c2ecf20Sopenharmony_ci ceph_osdc_put_request(orig_req); 11878c2ecf20Sopenharmony_ci 11888c2ecf20Sopenharmony_ci req->r_callback = ceph_aio_complete_req; 11898c2ecf20Sopenharmony_ci req->r_inode = inode; 11908c2ecf20Sopenharmony_ci req->r_priv = aio_req; 11918c2ecf20Sopenharmony_ci 11928c2ecf20Sopenharmony_ci ret = ceph_osdc_start_request(req->r_osdc, req, false); 11938c2ecf20Sopenharmony_ciout: 11948c2ecf20Sopenharmony_ci if (ret < 0) { 11958c2ecf20Sopenharmony_ci req->r_result = ret; 11968c2ecf20Sopenharmony_ci ceph_aio_complete_req(req); 11978c2ecf20Sopenharmony_ci } 11988c2ecf20Sopenharmony_ci 11998c2ecf20Sopenharmony_ci ceph_put_snap_context(snapc); 12008c2ecf20Sopenharmony_ci kfree(aio_work); 12018c2ecf20Sopenharmony_ci} 12028c2ecf20Sopenharmony_ci 12038c2ecf20Sopenharmony_cistatic ssize_t 12048c2ecf20Sopenharmony_ciceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, 12058c2ecf20Sopenharmony_ci struct ceph_snap_context *snapc, 12068c2ecf20Sopenharmony_ci struct ceph_cap_flush **pcf) 12078c2ecf20Sopenharmony_ci{ 12088c2ecf20Sopenharmony_ci struct file *file = iocb->ki_filp; 12098c2ecf20Sopenharmony_ci struct inode *inode = file_inode(file); 12108c2ecf20Sopenharmony_ci struct ceph_inode_info *ci = ceph_inode(inode); 12118c2ecf20Sopenharmony_ci struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 12128c2ecf20Sopenharmony_ci struct ceph_client_metric *metric = &fsc->mdsc->metric; 12138c2ecf20Sopenharmony_ci struct ceph_vino vino; 12148c2ecf20Sopenharmony_ci struct ceph_osd_request *req; 12158c2ecf20Sopenharmony_ci struct bio_vec *bvecs; 12168c2ecf20Sopenharmony_ci struct ceph_aio_request *aio_req = NULL; 12178c2ecf20Sopenharmony_ci int num_pages = 0; 12188c2ecf20Sopenharmony_ci int flags; 12198c2ecf20Sopenharmony_ci int ret = 0; 12208c2ecf20Sopenharmony_ci struct timespec64 mtime = current_time(inode); 12218c2ecf20Sopenharmony_ci size_t count = iov_iter_count(iter); 12228c2ecf20Sopenharmony_ci loff_t pos = iocb->ki_pos; 12238c2ecf20Sopenharmony_ci bool write = iov_iter_rw(iter) == WRITE; 12248c2ecf20Sopenharmony_ci bool should_dirty = !write && iter_is_iovec(iter); 12258c2ecf20Sopenharmony_ci 12268c2ecf20Sopenharmony_ci if (write && ceph_snap(file_inode(file)) != CEPH_NOSNAP) 12278c2ecf20Sopenharmony_ci return -EROFS; 12288c2ecf20Sopenharmony_ci 12298c2ecf20Sopenharmony_ci dout("sync_direct_%s on file %p %lld~%u snapc %p seq %lld\n", 12308c2ecf20Sopenharmony_ci (write ? "write" : "read"), file, pos, (unsigned)count, 12318c2ecf20Sopenharmony_ci snapc, snapc ? snapc->seq : 0); 12328c2ecf20Sopenharmony_ci 12338c2ecf20Sopenharmony_ci if (write) { 12348c2ecf20Sopenharmony_ci int ret2 = invalidate_inode_pages2_range(inode->i_mapping, 12358c2ecf20Sopenharmony_ci pos >> PAGE_SHIFT, 12368c2ecf20Sopenharmony_ci (pos + count - 1) >> PAGE_SHIFT); 12378c2ecf20Sopenharmony_ci if (ret2 < 0) 12388c2ecf20Sopenharmony_ci dout("invalidate_inode_pages2_range returned %d\n", ret2); 12398c2ecf20Sopenharmony_ci 12408c2ecf20Sopenharmony_ci flags = /* CEPH_OSD_FLAG_ORDERSNAP | */ CEPH_OSD_FLAG_WRITE; 12418c2ecf20Sopenharmony_ci } else { 12428c2ecf20Sopenharmony_ci flags = CEPH_OSD_FLAG_READ; 12438c2ecf20Sopenharmony_ci } 12448c2ecf20Sopenharmony_ci 12458c2ecf20Sopenharmony_ci while (iov_iter_count(iter) > 0) { 12468c2ecf20Sopenharmony_ci u64 size = iov_iter_count(iter); 12478c2ecf20Sopenharmony_ci ssize_t len; 12488c2ecf20Sopenharmony_ci 12498c2ecf20Sopenharmony_ci if (write) 12508c2ecf20Sopenharmony_ci size = min_t(u64, size, fsc->mount_options->wsize); 12518c2ecf20Sopenharmony_ci else 12528c2ecf20Sopenharmony_ci size = min_t(u64, size, fsc->mount_options->rsize); 12538c2ecf20Sopenharmony_ci 12548c2ecf20Sopenharmony_ci vino = ceph_vino(inode); 12558c2ecf20Sopenharmony_ci req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, 12568c2ecf20Sopenharmony_ci vino, pos, &size, 0, 12578c2ecf20Sopenharmony_ci 1, 12588c2ecf20Sopenharmony_ci write ? CEPH_OSD_OP_WRITE : 12598c2ecf20Sopenharmony_ci CEPH_OSD_OP_READ, 12608c2ecf20Sopenharmony_ci flags, snapc, 12618c2ecf20Sopenharmony_ci ci->i_truncate_seq, 12628c2ecf20Sopenharmony_ci ci->i_truncate_size, 12638c2ecf20Sopenharmony_ci false); 12648c2ecf20Sopenharmony_ci if (IS_ERR(req)) { 12658c2ecf20Sopenharmony_ci ret = PTR_ERR(req); 12668c2ecf20Sopenharmony_ci break; 12678c2ecf20Sopenharmony_ci } 12688c2ecf20Sopenharmony_ci 12698c2ecf20Sopenharmony_ci len = iter_get_bvecs_alloc(iter, size, &bvecs, &num_pages); 12708c2ecf20Sopenharmony_ci if (len < 0) { 12718c2ecf20Sopenharmony_ci ceph_osdc_put_request(req); 12728c2ecf20Sopenharmony_ci ret = len; 12738c2ecf20Sopenharmony_ci break; 12748c2ecf20Sopenharmony_ci } 12758c2ecf20Sopenharmony_ci if (len != size) 12768c2ecf20Sopenharmony_ci osd_req_op_extent_update(req, 0, len); 12778c2ecf20Sopenharmony_ci 12788c2ecf20Sopenharmony_ci /* 12798c2ecf20Sopenharmony_ci * To simplify error handling, allow AIO when IO within i_size 12808c2ecf20Sopenharmony_ci * or IO can be satisfied by single OSD request. 12818c2ecf20Sopenharmony_ci */ 12828c2ecf20Sopenharmony_ci if (pos == iocb->ki_pos && !is_sync_kiocb(iocb) && 12838c2ecf20Sopenharmony_ci (len == count || pos + count <= i_size_read(inode))) { 12848c2ecf20Sopenharmony_ci aio_req = kzalloc(sizeof(*aio_req), GFP_KERNEL); 12858c2ecf20Sopenharmony_ci if (aio_req) { 12868c2ecf20Sopenharmony_ci aio_req->iocb = iocb; 12878c2ecf20Sopenharmony_ci aio_req->write = write; 12888c2ecf20Sopenharmony_ci aio_req->should_dirty = should_dirty; 12898c2ecf20Sopenharmony_ci INIT_LIST_HEAD(&aio_req->osd_reqs); 12908c2ecf20Sopenharmony_ci if (write) { 12918c2ecf20Sopenharmony_ci aio_req->mtime = mtime; 12928c2ecf20Sopenharmony_ci swap(aio_req->prealloc_cf, *pcf); 12938c2ecf20Sopenharmony_ci } 12948c2ecf20Sopenharmony_ci } 12958c2ecf20Sopenharmony_ci /* ignore error */ 12968c2ecf20Sopenharmony_ci } 12978c2ecf20Sopenharmony_ci 12988c2ecf20Sopenharmony_ci if (write) { 12998c2ecf20Sopenharmony_ci /* 13008c2ecf20Sopenharmony_ci * throw out any page cache pages in this range. this 13018c2ecf20Sopenharmony_ci * may block. 13028c2ecf20Sopenharmony_ci */ 13038c2ecf20Sopenharmony_ci truncate_inode_pages_range(inode->i_mapping, pos, 13048c2ecf20Sopenharmony_ci PAGE_ALIGN(pos + len) - 1); 13058c2ecf20Sopenharmony_ci 13068c2ecf20Sopenharmony_ci req->r_mtime = mtime; 13078c2ecf20Sopenharmony_ci } 13088c2ecf20Sopenharmony_ci 13098c2ecf20Sopenharmony_ci osd_req_op_extent_osd_data_bvecs(req, 0, bvecs, num_pages, len); 13108c2ecf20Sopenharmony_ci 13118c2ecf20Sopenharmony_ci if (aio_req) { 13128c2ecf20Sopenharmony_ci aio_req->total_len += len; 13138c2ecf20Sopenharmony_ci aio_req->num_reqs++; 13148c2ecf20Sopenharmony_ci atomic_inc(&aio_req->pending_reqs); 13158c2ecf20Sopenharmony_ci 13168c2ecf20Sopenharmony_ci req->r_callback = ceph_aio_complete_req; 13178c2ecf20Sopenharmony_ci req->r_inode = inode; 13188c2ecf20Sopenharmony_ci req->r_priv = aio_req; 13198c2ecf20Sopenharmony_ci list_add_tail(&req->r_private_item, &aio_req->osd_reqs); 13208c2ecf20Sopenharmony_ci 13218c2ecf20Sopenharmony_ci pos += len; 13228c2ecf20Sopenharmony_ci continue; 13238c2ecf20Sopenharmony_ci } 13248c2ecf20Sopenharmony_ci 13258c2ecf20Sopenharmony_ci ret = ceph_osdc_start_request(req->r_osdc, req, false); 13268c2ecf20Sopenharmony_ci if (!ret) 13278c2ecf20Sopenharmony_ci ret = ceph_osdc_wait_request(&fsc->client->osdc, req); 13288c2ecf20Sopenharmony_ci 13298c2ecf20Sopenharmony_ci if (write) 13308c2ecf20Sopenharmony_ci ceph_update_write_latency(metric, req->r_start_latency, 13318c2ecf20Sopenharmony_ci req->r_end_latency, ret); 13328c2ecf20Sopenharmony_ci else 13338c2ecf20Sopenharmony_ci ceph_update_read_latency(metric, req->r_start_latency, 13348c2ecf20Sopenharmony_ci req->r_end_latency, ret); 13358c2ecf20Sopenharmony_ci 13368c2ecf20Sopenharmony_ci size = i_size_read(inode); 13378c2ecf20Sopenharmony_ci if (!write) { 13388c2ecf20Sopenharmony_ci if (ret == -ENOENT) 13398c2ecf20Sopenharmony_ci ret = 0; 13408c2ecf20Sopenharmony_ci if (ret >= 0 && ret < len && pos + ret < size) { 13418c2ecf20Sopenharmony_ci struct iov_iter i; 13428c2ecf20Sopenharmony_ci int zlen = min_t(size_t, len - ret, 13438c2ecf20Sopenharmony_ci size - pos - ret); 13448c2ecf20Sopenharmony_ci 13458c2ecf20Sopenharmony_ci iov_iter_bvec(&i, READ, bvecs, num_pages, len); 13468c2ecf20Sopenharmony_ci iov_iter_advance(&i, ret); 13478c2ecf20Sopenharmony_ci iov_iter_zero(zlen, &i); 13488c2ecf20Sopenharmony_ci ret += zlen; 13498c2ecf20Sopenharmony_ci } 13508c2ecf20Sopenharmony_ci if (ret >= 0) 13518c2ecf20Sopenharmony_ci len = ret; 13528c2ecf20Sopenharmony_ci } 13538c2ecf20Sopenharmony_ci 13548c2ecf20Sopenharmony_ci put_bvecs(bvecs, num_pages, should_dirty); 13558c2ecf20Sopenharmony_ci ceph_osdc_put_request(req); 13568c2ecf20Sopenharmony_ci if (ret < 0) 13578c2ecf20Sopenharmony_ci break; 13588c2ecf20Sopenharmony_ci 13598c2ecf20Sopenharmony_ci pos += len; 13608c2ecf20Sopenharmony_ci if (!write && pos >= size) 13618c2ecf20Sopenharmony_ci break; 13628c2ecf20Sopenharmony_ci 13638c2ecf20Sopenharmony_ci if (write && pos > size) { 13648c2ecf20Sopenharmony_ci if (ceph_inode_set_size(inode, pos)) 13658c2ecf20Sopenharmony_ci ceph_check_caps(ceph_inode(inode), 13668c2ecf20Sopenharmony_ci CHECK_CAPS_AUTHONLY, 13678c2ecf20Sopenharmony_ci NULL); 13688c2ecf20Sopenharmony_ci } 13698c2ecf20Sopenharmony_ci } 13708c2ecf20Sopenharmony_ci 13718c2ecf20Sopenharmony_ci if (aio_req) { 13728c2ecf20Sopenharmony_ci LIST_HEAD(osd_reqs); 13738c2ecf20Sopenharmony_ci 13748c2ecf20Sopenharmony_ci if (aio_req->num_reqs == 0) { 13758c2ecf20Sopenharmony_ci kfree(aio_req); 13768c2ecf20Sopenharmony_ci return ret; 13778c2ecf20Sopenharmony_ci } 13788c2ecf20Sopenharmony_ci 13798c2ecf20Sopenharmony_ci ceph_get_cap_refs(ci, write ? CEPH_CAP_FILE_WR : 13808c2ecf20Sopenharmony_ci CEPH_CAP_FILE_RD); 13818c2ecf20Sopenharmony_ci 13828c2ecf20Sopenharmony_ci list_splice(&aio_req->osd_reqs, &osd_reqs); 13838c2ecf20Sopenharmony_ci inode_dio_begin(inode); 13848c2ecf20Sopenharmony_ci while (!list_empty(&osd_reqs)) { 13858c2ecf20Sopenharmony_ci req = list_first_entry(&osd_reqs, 13868c2ecf20Sopenharmony_ci struct ceph_osd_request, 13878c2ecf20Sopenharmony_ci r_private_item); 13888c2ecf20Sopenharmony_ci list_del_init(&req->r_private_item); 13898c2ecf20Sopenharmony_ci if (ret >= 0) 13908c2ecf20Sopenharmony_ci ret = ceph_osdc_start_request(req->r_osdc, 13918c2ecf20Sopenharmony_ci req, false); 13928c2ecf20Sopenharmony_ci if (ret < 0) { 13938c2ecf20Sopenharmony_ci req->r_result = ret; 13948c2ecf20Sopenharmony_ci ceph_aio_complete_req(req); 13958c2ecf20Sopenharmony_ci } 13968c2ecf20Sopenharmony_ci } 13978c2ecf20Sopenharmony_ci return -EIOCBQUEUED; 13988c2ecf20Sopenharmony_ci } 13998c2ecf20Sopenharmony_ci 14008c2ecf20Sopenharmony_ci if (ret != -EOLDSNAPC && pos > iocb->ki_pos) { 14018c2ecf20Sopenharmony_ci ret = pos - iocb->ki_pos; 14028c2ecf20Sopenharmony_ci iocb->ki_pos = pos; 14038c2ecf20Sopenharmony_ci } 14048c2ecf20Sopenharmony_ci return ret; 14058c2ecf20Sopenharmony_ci} 14068c2ecf20Sopenharmony_ci 14078c2ecf20Sopenharmony_ci/* 14088c2ecf20Sopenharmony_ci * Synchronous write, straight from __user pointer or user pages. 14098c2ecf20Sopenharmony_ci * 14108c2ecf20Sopenharmony_ci * If write spans object boundary, just do multiple writes. (For a 14118c2ecf20Sopenharmony_ci * correct atomic write, we should e.g. take write locks on all 14128c2ecf20Sopenharmony_ci * objects, rollback on failure, etc.) 14138c2ecf20Sopenharmony_ci */ 14148c2ecf20Sopenharmony_cistatic ssize_t 14158c2ecf20Sopenharmony_ciceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, 14168c2ecf20Sopenharmony_ci struct ceph_snap_context *snapc) 14178c2ecf20Sopenharmony_ci{ 14188c2ecf20Sopenharmony_ci struct file *file = iocb->ki_filp; 14198c2ecf20Sopenharmony_ci struct inode *inode = file_inode(file); 14208c2ecf20Sopenharmony_ci struct ceph_inode_info *ci = ceph_inode(inode); 14218c2ecf20Sopenharmony_ci struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 14228c2ecf20Sopenharmony_ci struct ceph_vino vino; 14238c2ecf20Sopenharmony_ci struct ceph_osd_request *req; 14248c2ecf20Sopenharmony_ci struct page **pages; 14258c2ecf20Sopenharmony_ci u64 len; 14268c2ecf20Sopenharmony_ci int num_pages; 14278c2ecf20Sopenharmony_ci int written = 0; 14288c2ecf20Sopenharmony_ci int flags; 14298c2ecf20Sopenharmony_ci int ret; 14308c2ecf20Sopenharmony_ci bool check_caps = false; 14318c2ecf20Sopenharmony_ci struct timespec64 mtime = current_time(inode); 14328c2ecf20Sopenharmony_ci size_t count = iov_iter_count(from); 14338c2ecf20Sopenharmony_ci 14348c2ecf20Sopenharmony_ci if (ceph_snap(file_inode(file)) != CEPH_NOSNAP) 14358c2ecf20Sopenharmony_ci return -EROFS; 14368c2ecf20Sopenharmony_ci 14378c2ecf20Sopenharmony_ci dout("sync_write on file %p %lld~%u snapc %p seq %lld\n", 14388c2ecf20Sopenharmony_ci file, pos, (unsigned)count, snapc, snapc->seq); 14398c2ecf20Sopenharmony_ci 14408c2ecf20Sopenharmony_ci ret = filemap_write_and_wait_range(inode->i_mapping, 14418c2ecf20Sopenharmony_ci pos, pos + count - 1); 14428c2ecf20Sopenharmony_ci if (ret < 0) 14438c2ecf20Sopenharmony_ci return ret; 14448c2ecf20Sopenharmony_ci 14458c2ecf20Sopenharmony_ci ret = invalidate_inode_pages2_range(inode->i_mapping, 14468c2ecf20Sopenharmony_ci pos >> PAGE_SHIFT, 14478c2ecf20Sopenharmony_ci (pos + count - 1) >> PAGE_SHIFT); 14488c2ecf20Sopenharmony_ci if (ret < 0) 14498c2ecf20Sopenharmony_ci dout("invalidate_inode_pages2_range returned %d\n", ret); 14508c2ecf20Sopenharmony_ci 14518c2ecf20Sopenharmony_ci flags = /* CEPH_OSD_FLAG_ORDERSNAP | */ CEPH_OSD_FLAG_WRITE; 14528c2ecf20Sopenharmony_ci 14538c2ecf20Sopenharmony_ci while ((len = iov_iter_count(from)) > 0) { 14548c2ecf20Sopenharmony_ci size_t left; 14558c2ecf20Sopenharmony_ci int n; 14568c2ecf20Sopenharmony_ci 14578c2ecf20Sopenharmony_ci vino = ceph_vino(inode); 14588c2ecf20Sopenharmony_ci req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, 14598c2ecf20Sopenharmony_ci vino, pos, &len, 0, 1, 14608c2ecf20Sopenharmony_ci CEPH_OSD_OP_WRITE, flags, snapc, 14618c2ecf20Sopenharmony_ci ci->i_truncate_seq, 14628c2ecf20Sopenharmony_ci ci->i_truncate_size, 14638c2ecf20Sopenharmony_ci false); 14648c2ecf20Sopenharmony_ci if (IS_ERR(req)) { 14658c2ecf20Sopenharmony_ci ret = PTR_ERR(req); 14668c2ecf20Sopenharmony_ci break; 14678c2ecf20Sopenharmony_ci } 14688c2ecf20Sopenharmony_ci 14698c2ecf20Sopenharmony_ci /* 14708c2ecf20Sopenharmony_ci * write from beginning of first page, 14718c2ecf20Sopenharmony_ci * regardless of io alignment 14728c2ecf20Sopenharmony_ci */ 14738c2ecf20Sopenharmony_ci num_pages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; 14748c2ecf20Sopenharmony_ci 14758c2ecf20Sopenharmony_ci pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); 14768c2ecf20Sopenharmony_ci if (IS_ERR(pages)) { 14778c2ecf20Sopenharmony_ci ret = PTR_ERR(pages); 14788c2ecf20Sopenharmony_ci goto out; 14798c2ecf20Sopenharmony_ci } 14808c2ecf20Sopenharmony_ci 14818c2ecf20Sopenharmony_ci left = len; 14828c2ecf20Sopenharmony_ci for (n = 0; n < num_pages; n++) { 14838c2ecf20Sopenharmony_ci size_t plen = min_t(size_t, left, PAGE_SIZE); 14848c2ecf20Sopenharmony_ci ret = copy_page_from_iter(pages[n], 0, plen, from); 14858c2ecf20Sopenharmony_ci if (ret != plen) { 14868c2ecf20Sopenharmony_ci ret = -EFAULT; 14878c2ecf20Sopenharmony_ci break; 14888c2ecf20Sopenharmony_ci } 14898c2ecf20Sopenharmony_ci left -= ret; 14908c2ecf20Sopenharmony_ci } 14918c2ecf20Sopenharmony_ci 14928c2ecf20Sopenharmony_ci if (ret < 0) { 14938c2ecf20Sopenharmony_ci ceph_release_page_vector(pages, num_pages); 14948c2ecf20Sopenharmony_ci goto out; 14958c2ecf20Sopenharmony_ci } 14968c2ecf20Sopenharmony_ci 14978c2ecf20Sopenharmony_ci req->r_inode = inode; 14988c2ecf20Sopenharmony_ci 14998c2ecf20Sopenharmony_ci osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, 15008c2ecf20Sopenharmony_ci false, true); 15018c2ecf20Sopenharmony_ci 15028c2ecf20Sopenharmony_ci req->r_mtime = mtime; 15038c2ecf20Sopenharmony_ci ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); 15048c2ecf20Sopenharmony_ci if (!ret) 15058c2ecf20Sopenharmony_ci ret = ceph_osdc_wait_request(&fsc->client->osdc, req); 15068c2ecf20Sopenharmony_ci 15078c2ecf20Sopenharmony_ci ceph_update_write_latency(&fsc->mdsc->metric, req->r_start_latency, 15088c2ecf20Sopenharmony_ci req->r_end_latency, ret); 15098c2ecf20Sopenharmony_ciout: 15108c2ecf20Sopenharmony_ci ceph_osdc_put_request(req); 15118c2ecf20Sopenharmony_ci if (ret != 0) { 15128c2ecf20Sopenharmony_ci ceph_set_error_write(ci); 15138c2ecf20Sopenharmony_ci break; 15148c2ecf20Sopenharmony_ci } 15158c2ecf20Sopenharmony_ci 15168c2ecf20Sopenharmony_ci ceph_clear_error_write(ci); 15178c2ecf20Sopenharmony_ci pos += len; 15188c2ecf20Sopenharmony_ci written += len; 15198c2ecf20Sopenharmony_ci if (pos > i_size_read(inode)) { 15208c2ecf20Sopenharmony_ci check_caps = ceph_inode_set_size(inode, pos); 15218c2ecf20Sopenharmony_ci if (check_caps) 15228c2ecf20Sopenharmony_ci ceph_check_caps(ceph_inode(inode), 15238c2ecf20Sopenharmony_ci CHECK_CAPS_AUTHONLY, 15248c2ecf20Sopenharmony_ci NULL); 15258c2ecf20Sopenharmony_ci } 15268c2ecf20Sopenharmony_ci 15278c2ecf20Sopenharmony_ci } 15288c2ecf20Sopenharmony_ci 15298c2ecf20Sopenharmony_ci if (ret != -EOLDSNAPC && written > 0) { 15308c2ecf20Sopenharmony_ci ret = written; 15318c2ecf20Sopenharmony_ci iocb->ki_pos = pos; 15328c2ecf20Sopenharmony_ci } 15338c2ecf20Sopenharmony_ci return ret; 15348c2ecf20Sopenharmony_ci} 15358c2ecf20Sopenharmony_ci 15368c2ecf20Sopenharmony_ci/* 15378c2ecf20Sopenharmony_ci * Wrap generic_file_aio_read with checks for cap bits on the inode. 15388c2ecf20Sopenharmony_ci * Atomically grab references, so that those bits are not released 15398c2ecf20Sopenharmony_ci * back to the MDS mid-read. 15408c2ecf20Sopenharmony_ci * 15418c2ecf20Sopenharmony_ci * Hmm, the sync read case isn't actually async... should it be? 15428c2ecf20Sopenharmony_ci */ 15438c2ecf20Sopenharmony_cistatic ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to) 15448c2ecf20Sopenharmony_ci{ 15458c2ecf20Sopenharmony_ci struct file *filp = iocb->ki_filp; 15468c2ecf20Sopenharmony_ci struct ceph_file_info *fi = filp->private_data; 15478c2ecf20Sopenharmony_ci size_t len = iov_iter_count(to); 15488c2ecf20Sopenharmony_ci struct inode *inode = file_inode(filp); 15498c2ecf20Sopenharmony_ci struct ceph_inode_info *ci = ceph_inode(inode); 15508c2ecf20Sopenharmony_ci struct page *pinned_page = NULL; 15518c2ecf20Sopenharmony_ci bool direct_lock = iocb->ki_flags & IOCB_DIRECT; 15528c2ecf20Sopenharmony_ci ssize_t ret; 15538c2ecf20Sopenharmony_ci int want, got = 0; 15548c2ecf20Sopenharmony_ci int retry_op = 0, read = 0; 15558c2ecf20Sopenharmony_ci 15568c2ecf20Sopenharmony_ciagain: 15578c2ecf20Sopenharmony_ci dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n", 15588c2ecf20Sopenharmony_ci inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, inode); 15598c2ecf20Sopenharmony_ci 15608c2ecf20Sopenharmony_ci if (direct_lock) 15618c2ecf20Sopenharmony_ci ceph_start_io_direct(inode); 15628c2ecf20Sopenharmony_ci else 15638c2ecf20Sopenharmony_ci ceph_start_io_read(inode); 15648c2ecf20Sopenharmony_ci 15658c2ecf20Sopenharmony_ci if (fi->fmode & CEPH_FILE_MODE_LAZY) 15668c2ecf20Sopenharmony_ci want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; 15678c2ecf20Sopenharmony_ci else 15688c2ecf20Sopenharmony_ci want = CEPH_CAP_FILE_CACHE; 15698c2ecf20Sopenharmony_ci ret = ceph_get_caps(filp, CEPH_CAP_FILE_RD, want, -1, 15708c2ecf20Sopenharmony_ci &got, &pinned_page); 15718c2ecf20Sopenharmony_ci if (ret < 0) { 15728c2ecf20Sopenharmony_ci if (iocb->ki_flags & IOCB_DIRECT) 15738c2ecf20Sopenharmony_ci ceph_end_io_direct(inode); 15748c2ecf20Sopenharmony_ci else 15758c2ecf20Sopenharmony_ci ceph_end_io_read(inode); 15768c2ecf20Sopenharmony_ci return ret; 15778c2ecf20Sopenharmony_ci } 15788c2ecf20Sopenharmony_ci 15798c2ecf20Sopenharmony_ci if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 || 15808c2ecf20Sopenharmony_ci (iocb->ki_flags & IOCB_DIRECT) || 15818c2ecf20Sopenharmony_ci (fi->flags & CEPH_F_SYNC)) { 15828c2ecf20Sopenharmony_ci 15838c2ecf20Sopenharmony_ci dout("aio_sync_read %p %llx.%llx %llu~%u got cap refs on %s\n", 15848c2ecf20Sopenharmony_ci inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, 15858c2ecf20Sopenharmony_ci ceph_cap_string(got)); 15868c2ecf20Sopenharmony_ci 15878c2ecf20Sopenharmony_ci if (ci->i_inline_version == CEPH_INLINE_NONE) { 15888c2ecf20Sopenharmony_ci if (!retry_op && (iocb->ki_flags & IOCB_DIRECT)) { 15898c2ecf20Sopenharmony_ci ret = ceph_direct_read_write(iocb, to, 15908c2ecf20Sopenharmony_ci NULL, NULL); 15918c2ecf20Sopenharmony_ci if (ret >= 0 && ret < len) 15928c2ecf20Sopenharmony_ci retry_op = CHECK_EOF; 15938c2ecf20Sopenharmony_ci } else { 15948c2ecf20Sopenharmony_ci ret = ceph_sync_read(iocb, to, &retry_op); 15958c2ecf20Sopenharmony_ci } 15968c2ecf20Sopenharmony_ci } else { 15978c2ecf20Sopenharmony_ci retry_op = READ_INLINE; 15988c2ecf20Sopenharmony_ci } 15998c2ecf20Sopenharmony_ci } else { 16008c2ecf20Sopenharmony_ci CEPH_DEFINE_RW_CONTEXT(rw_ctx, got); 16018c2ecf20Sopenharmony_ci dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n", 16028c2ecf20Sopenharmony_ci inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, 16038c2ecf20Sopenharmony_ci ceph_cap_string(got)); 16048c2ecf20Sopenharmony_ci ceph_add_rw_context(fi, &rw_ctx); 16058c2ecf20Sopenharmony_ci ret = generic_file_read_iter(iocb, to); 16068c2ecf20Sopenharmony_ci ceph_del_rw_context(fi, &rw_ctx); 16078c2ecf20Sopenharmony_ci } 16088c2ecf20Sopenharmony_ci 16098c2ecf20Sopenharmony_ci dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n", 16108c2ecf20Sopenharmony_ci inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret); 16118c2ecf20Sopenharmony_ci if (pinned_page) { 16128c2ecf20Sopenharmony_ci put_page(pinned_page); 16138c2ecf20Sopenharmony_ci pinned_page = NULL; 16148c2ecf20Sopenharmony_ci } 16158c2ecf20Sopenharmony_ci ceph_put_cap_refs(ci, got); 16168c2ecf20Sopenharmony_ci 16178c2ecf20Sopenharmony_ci if (direct_lock) 16188c2ecf20Sopenharmony_ci ceph_end_io_direct(inode); 16198c2ecf20Sopenharmony_ci else 16208c2ecf20Sopenharmony_ci ceph_end_io_read(inode); 16218c2ecf20Sopenharmony_ci 16228c2ecf20Sopenharmony_ci if (retry_op > HAVE_RETRIED && ret >= 0) { 16238c2ecf20Sopenharmony_ci int statret; 16248c2ecf20Sopenharmony_ci struct page *page = NULL; 16258c2ecf20Sopenharmony_ci loff_t i_size; 16268c2ecf20Sopenharmony_ci if (retry_op == READ_INLINE) { 16278c2ecf20Sopenharmony_ci page = __page_cache_alloc(GFP_KERNEL); 16288c2ecf20Sopenharmony_ci if (!page) 16298c2ecf20Sopenharmony_ci return -ENOMEM; 16308c2ecf20Sopenharmony_ci } 16318c2ecf20Sopenharmony_ci 16328c2ecf20Sopenharmony_ci statret = __ceph_do_getattr(inode, page, 16338c2ecf20Sopenharmony_ci CEPH_STAT_CAP_INLINE_DATA, !!page); 16348c2ecf20Sopenharmony_ci if (statret < 0) { 16358c2ecf20Sopenharmony_ci if (page) 16368c2ecf20Sopenharmony_ci __free_page(page); 16378c2ecf20Sopenharmony_ci if (statret == -ENODATA) { 16388c2ecf20Sopenharmony_ci BUG_ON(retry_op != READ_INLINE); 16398c2ecf20Sopenharmony_ci goto again; 16408c2ecf20Sopenharmony_ci } 16418c2ecf20Sopenharmony_ci return statret; 16428c2ecf20Sopenharmony_ci } 16438c2ecf20Sopenharmony_ci 16448c2ecf20Sopenharmony_ci i_size = i_size_read(inode); 16458c2ecf20Sopenharmony_ci if (retry_op == READ_INLINE) { 16468c2ecf20Sopenharmony_ci BUG_ON(ret > 0 || read > 0); 16478c2ecf20Sopenharmony_ci if (iocb->ki_pos < i_size && 16488c2ecf20Sopenharmony_ci iocb->ki_pos < PAGE_SIZE) { 16498c2ecf20Sopenharmony_ci loff_t end = min_t(loff_t, i_size, 16508c2ecf20Sopenharmony_ci iocb->ki_pos + len); 16518c2ecf20Sopenharmony_ci end = min_t(loff_t, end, PAGE_SIZE); 16528c2ecf20Sopenharmony_ci if (statret < end) 16538c2ecf20Sopenharmony_ci zero_user_segment(page, statret, end); 16548c2ecf20Sopenharmony_ci ret = copy_page_to_iter(page, 16558c2ecf20Sopenharmony_ci iocb->ki_pos & ~PAGE_MASK, 16568c2ecf20Sopenharmony_ci end - iocb->ki_pos, to); 16578c2ecf20Sopenharmony_ci iocb->ki_pos += ret; 16588c2ecf20Sopenharmony_ci read += ret; 16598c2ecf20Sopenharmony_ci } 16608c2ecf20Sopenharmony_ci if (iocb->ki_pos < i_size && read < len) { 16618c2ecf20Sopenharmony_ci size_t zlen = min_t(size_t, len - read, 16628c2ecf20Sopenharmony_ci i_size - iocb->ki_pos); 16638c2ecf20Sopenharmony_ci ret = iov_iter_zero(zlen, to); 16648c2ecf20Sopenharmony_ci iocb->ki_pos += ret; 16658c2ecf20Sopenharmony_ci read += ret; 16668c2ecf20Sopenharmony_ci } 16678c2ecf20Sopenharmony_ci __free_pages(page, 0); 16688c2ecf20Sopenharmony_ci return read; 16698c2ecf20Sopenharmony_ci } 16708c2ecf20Sopenharmony_ci 16718c2ecf20Sopenharmony_ci /* hit EOF or hole? */ 16728c2ecf20Sopenharmony_ci if (retry_op == CHECK_EOF && iocb->ki_pos < i_size && 16738c2ecf20Sopenharmony_ci ret < len) { 16748c2ecf20Sopenharmony_ci dout("sync_read hit hole, ppos %lld < size %lld" 16758c2ecf20Sopenharmony_ci ", reading more\n", iocb->ki_pos, i_size); 16768c2ecf20Sopenharmony_ci 16778c2ecf20Sopenharmony_ci read += ret; 16788c2ecf20Sopenharmony_ci len -= ret; 16798c2ecf20Sopenharmony_ci retry_op = HAVE_RETRIED; 16808c2ecf20Sopenharmony_ci goto again; 16818c2ecf20Sopenharmony_ci } 16828c2ecf20Sopenharmony_ci } 16838c2ecf20Sopenharmony_ci 16848c2ecf20Sopenharmony_ci if (ret >= 0) 16858c2ecf20Sopenharmony_ci ret += read; 16868c2ecf20Sopenharmony_ci 16878c2ecf20Sopenharmony_ci return ret; 16888c2ecf20Sopenharmony_ci} 16898c2ecf20Sopenharmony_ci 16908c2ecf20Sopenharmony_ci/* 16918c2ecf20Sopenharmony_ci * Take cap references to avoid releasing caps to MDS mid-write. 16928c2ecf20Sopenharmony_ci * 16938c2ecf20Sopenharmony_ci * If we are synchronous, and write with an old snap context, the OSD 16948c2ecf20Sopenharmony_ci * may return EOLDSNAPC. In that case, retry the write.. _after_ 16958c2ecf20Sopenharmony_ci * dropping our cap refs and allowing the pending snap to logically 16968c2ecf20Sopenharmony_ci * complete _before_ this write occurs. 16978c2ecf20Sopenharmony_ci * 16988c2ecf20Sopenharmony_ci * If we are near ENOSPC, write synchronously. 16998c2ecf20Sopenharmony_ci */ 17008c2ecf20Sopenharmony_cistatic ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) 17018c2ecf20Sopenharmony_ci{ 17028c2ecf20Sopenharmony_ci struct file *file = iocb->ki_filp; 17038c2ecf20Sopenharmony_ci struct ceph_file_info *fi = file->private_data; 17048c2ecf20Sopenharmony_ci struct inode *inode = file_inode(file); 17058c2ecf20Sopenharmony_ci struct ceph_inode_info *ci = ceph_inode(inode); 17068c2ecf20Sopenharmony_ci struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 17078c2ecf20Sopenharmony_ci struct ceph_osd_client *osdc = &fsc->client->osdc; 17088c2ecf20Sopenharmony_ci struct ceph_cap_flush *prealloc_cf; 17098c2ecf20Sopenharmony_ci ssize_t count, written = 0; 17108c2ecf20Sopenharmony_ci int err, want, got; 17118c2ecf20Sopenharmony_ci bool direct_lock = false; 17128c2ecf20Sopenharmony_ci u32 map_flags; 17138c2ecf20Sopenharmony_ci u64 pool_flags; 17148c2ecf20Sopenharmony_ci loff_t pos; 17158c2ecf20Sopenharmony_ci loff_t limit = max(i_size_read(inode), fsc->max_file_size); 17168c2ecf20Sopenharmony_ci 17178c2ecf20Sopenharmony_ci if (ceph_snap(inode) != CEPH_NOSNAP) 17188c2ecf20Sopenharmony_ci return -EROFS; 17198c2ecf20Sopenharmony_ci 17208c2ecf20Sopenharmony_ci prealloc_cf = ceph_alloc_cap_flush(); 17218c2ecf20Sopenharmony_ci if (!prealloc_cf) 17228c2ecf20Sopenharmony_ci return -ENOMEM; 17238c2ecf20Sopenharmony_ci 17248c2ecf20Sopenharmony_ci if ((iocb->ki_flags & (IOCB_DIRECT | IOCB_APPEND)) == IOCB_DIRECT) 17258c2ecf20Sopenharmony_ci direct_lock = true; 17268c2ecf20Sopenharmony_ci 17278c2ecf20Sopenharmony_ciretry_snap: 17288c2ecf20Sopenharmony_ci if (direct_lock) 17298c2ecf20Sopenharmony_ci ceph_start_io_direct(inode); 17308c2ecf20Sopenharmony_ci else 17318c2ecf20Sopenharmony_ci ceph_start_io_write(inode); 17328c2ecf20Sopenharmony_ci 17338c2ecf20Sopenharmony_ci /* We can write back this queue in page reclaim */ 17348c2ecf20Sopenharmony_ci current->backing_dev_info = inode_to_bdi(inode); 17358c2ecf20Sopenharmony_ci 17368c2ecf20Sopenharmony_ci if (iocb->ki_flags & IOCB_APPEND) { 17378c2ecf20Sopenharmony_ci err = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false); 17388c2ecf20Sopenharmony_ci if (err < 0) 17398c2ecf20Sopenharmony_ci goto out; 17408c2ecf20Sopenharmony_ci } 17418c2ecf20Sopenharmony_ci 17428c2ecf20Sopenharmony_ci err = generic_write_checks(iocb, from); 17438c2ecf20Sopenharmony_ci if (err <= 0) 17448c2ecf20Sopenharmony_ci goto out; 17458c2ecf20Sopenharmony_ci 17468c2ecf20Sopenharmony_ci pos = iocb->ki_pos; 17478c2ecf20Sopenharmony_ci if (unlikely(pos >= limit)) { 17488c2ecf20Sopenharmony_ci err = -EFBIG; 17498c2ecf20Sopenharmony_ci goto out; 17508c2ecf20Sopenharmony_ci } else { 17518c2ecf20Sopenharmony_ci iov_iter_truncate(from, limit - pos); 17528c2ecf20Sopenharmony_ci } 17538c2ecf20Sopenharmony_ci 17548c2ecf20Sopenharmony_ci count = iov_iter_count(from); 17558c2ecf20Sopenharmony_ci if (ceph_quota_is_max_bytes_exceeded(inode, pos + count)) { 17568c2ecf20Sopenharmony_ci err = -EDQUOT; 17578c2ecf20Sopenharmony_ci goto out; 17588c2ecf20Sopenharmony_ci } 17598c2ecf20Sopenharmony_ci 17608c2ecf20Sopenharmony_ci down_read(&osdc->lock); 17618c2ecf20Sopenharmony_ci map_flags = osdc->osdmap->flags; 17628c2ecf20Sopenharmony_ci pool_flags = ceph_pg_pool_flags(osdc->osdmap, ci->i_layout.pool_id); 17638c2ecf20Sopenharmony_ci up_read(&osdc->lock); 17648c2ecf20Sopenharmony_ci if ((map_flags & CEPH_OSDMAP_FULL) || 17658c2ecf20Sopenharmony_ci (pool_flags & CEPH_POOL_FLAG_FULL)) { 17668c2ecf20Sopenharmony_ci err = -ENOSPC; 17678c2ecf20Sopenharmony_ci goto out; 17688c2ecf20Sopenharmony_ci } 17698c2ecf20Sopenharmony_ci 17708c2ecf20Sopenharmony_ci err = file_remove_privs(file); 17718c2ecf20Sopenharmony_ci if (err) 17728c2ecf20Sopenharmony_ci goto out; 17738c2ecf20Sopenharmony_ci 17748c2ecf20Sopenharmony_ci if (ci->i_inline_version != CEPH_INLINE_NONE) { 17758c2ecf20Sopenharmony_ci err = ceph_uninline_data(file, NULL); 17768c2ecf20Sopenharmony_ci if (err < 0) 17778c2ecf20Sopenharmony_ci goto out; 17788c2ecf20Sopenharmony_ci } 17798c2ecf20Sopenharmony_ci 17808c2ecf20Sopenharmony_ci dout("aio_write %p %llx.%llx %llu~%zd getting caps. i_size %llu\n", 17818c2ecf20Sopenharmony_ci inode, ceph_vinop(inode), pos, count, i_size_read(inode)); 17828c2ecf20Sopenharmony_ci if (fi->fmode & CEPH_FILE_MODE_LAZY) 17838c2ecf20Sopenharmony_ci want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; 17848c2ecf20Sopenharmony_ci else 17858c2ecf20Sopenharmony_ci want = CEPH_CAP_FILE_BUFFER; 17868c2ecf20Sopenharmony_ci got = 0; 17878c2ecf20Sopenharmony_ci err = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, pos + count, 17888c2ecf20Sopenharmony_ci &got, NULL); 17898c2ecf20Sopenharmony_ci if (err < 0) 17908c2ecf20Sopenharmony_ci goto out; 17918c2ecf20Sopenharmony_ci 17928c2ecf20Sopenharmony_ci err = file_update_time(file); 17938c2ecf20Sopenharmony_ci if (err) 17948c2ecf20Sopenharmony_ci goto out_caps; 17958c2ecf20Sopenharmony_ci 17968c2ecf20Sopenharmony_ci inode_inc_iversion_raw(inode); 17978c2ecf20Sopenharmony_ci 17988c2ecf20Sopenharmony_ci dout("aio_write %p %llx.%llx %llu~%zd got cap refs on %s\n", 17998c2ecf20Sopenharmony_ci inode, ceph_vinop(inode), pos, count, ceph_cap_string(got)); 18008c2ecf20Sopenharmony_ci 18018c2ecf20Sopenharmony_ci if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 || 18028c2ecf20Sopenharmony_ci (iocb->ki_flags & IOCB_DIRECT) || (fi->flags & CEPH_F_SYNC) || 18038c2ecf20Sopenharmony_ci (ci->i_ceph_flags & CEPH_I_ERROR_WRITE)) { 18048c2ecf20Sopenharmony_ci struct ceph_snap_context *snapc; 18058c2ecf20Sopenharmony_ci struct iov_iter data; 18068c2ecf20Sopenharmony_ci 18078c2ecf20Sopenharmony_ci spin_lock(&ci->i_ceph_lock); 18088c2ecf20Sopenharmony_ci if (__ceph_have_pending_cap_snap(ci)) { 18098c2ecf20Sopenharmony_ci struct ceph_cap_snap *capsnap = 18108c2ecf20Sopenharmony_ci list_last_entry(&ci->i_cap_snaps, 18118c2ecf20Sopenharmony_ci struct ceph_cap_snap, 18128c2ecf20Sopenharmony_ci ci_item); 18138c2ecf20Sopenharmony_ci snapc = ceph_get_snap_context(capsnap->context); 18148c2ecf20Sopenharmony_ci } else { 18158c2ecf20Sopenharmony_ci BUG_ON(!ci->i_head_snapc); 18168c2ecf20Sopenharmony_ci snapc = ceph_get_snap_context(ci->i_head_snapc); 18178c2ecf20Sopenharmony_ci } 18188c2ecf20Sopenharmony_ci spin_unlock(&ci->i_ceph_lock); 18198c2ecf20Sopenharmony_ci 18208c2ecf20Sopenharmony_ci /* we might need to revert back to that point */ 18218c2ecf20Sopenharmony_ci data = *from; 18228c2ecf20Sopenharmony_ci if (iocb->ki_flags & IOCB_DIRECT) 18238c2ecf20Sopenharmony_ci written = ceph_direct_read_write(iocb, &data, snapc, 18248c2ecf20Sopenharmony_ci &prealloc_cf); 18258c2ecf20Sopenharmony_ci else 18268c2ecf20Sopenharmony_ci written = ceph_sync_write(iocb, &data, pos, snapc); 18278c2ecf20Sopenharmony_ci if (direct_lock) 18288c2ecf20Sopenharmony_ci ceph_end_io_direct(inode); 18298c2ecf20Sopenharmony_ci else 18308c2ecf20Sopenharmony_ci ceph_end_io_write(inode); 18318c2ecf20Sopenharmony_ci if (written > 0) 18328c2ecf20Sopenharmony_ci iov_iter_advance(from, written); 18338c2ecf20Sopenharmony_ci ceph_put_snap_context(snapc); 18348c2ecf20Sopenharmony_ci } else { 18358c2ecf20Sopenharmony_ci /* 18368c2ecf20Sopenharmony_ci * No need to acquire the i_truncate_mutex. Because 18378c2ecf20Sopenharmony_ci * the MDS revokes Fwb caps before sending truncate 18388c2ecf20Sopenharmony_ci * message to us. We can't get Fwb cap while there 18398c2ecf20Sopenharmony_ci * are pending vmtruncate. So write and vmtruncate 18408c2ecf20Sopenharmony_ci * can not run at the same time 18418c2ecf20Sopenharmony_ci */ 18428c2ecf20Sopenharmony_ci written = generic_perform_write(file, from, pos); 18438c2ecf20Sopenharmony_ci if (likely(written >= 0)) 18448c2ecf20Sopenharmony_ci iocb->ki_pos = pos + written; 18458c2ecf20Sopenharmony_ci ceph_end_io_write(inode); 18468c2ecf20Sopenharmony_ci } 18478c2ecf20Sopenharmony_ci 18488c2ecf20Sopenharmony_ci if (written >= 0) { 18498c2ecf20Sopenharmony_ci int dirty; 18508c2ecf20Sopenharmony_ci 18518c2ecf20Sopenharmony_ci spin_lock(&ci->i_ceph_lock); 18528c2ecf20Sopenharmony_ci ci->i_inline_version = CEPH_INLINE_NONE; 18538c2ecf20Sopenharmony_ci dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, 18548c2ecf20Sopenharmony_ci &prealloc_cf); 18558c2ecf20Sopenharmony_ci spin_unlock(&ci->i_ceph_lock); 18568c2ecf20Sopenharmony_ci if (dirty) 18578c2ecf20Sopenharmony_ci __mark_inode_dirty(inode, dirty); 18588c2ecf20Sopenharmony_ci if (ceph_quota_is_max_bytes_approaching(inode, iocb->ki_pos)) 18598c2ecf20Sopenharmony_ci ceph_check_caps(ci, 0, NULL); 18608c2ecf20Sopenharmony_ci } 18618c2ecf20Sopenharmony_ci 18628c2ecf20Sopenharmony_ci dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n", 18638c2ecf20Sopenharmony_ci inode, ceph_vinop(inode), pos, (unsigned)count, 18648c2ecf20Sopenharmony_ci ceph_cap_string(got)); 18658c2ecf20Sopenharmony_ci ceph_put_cap_refs(ci, got); 18668c2ecf20Sopenharmony_ci 18678c2ecf20Sopenharmony_ci if (written == -EOLDSNAPC) { 18688c2ecf20Sopenharmony_ci dout("aio_write %p %llx.%llx %llu~%u" "got EOLDSNAPC, retrying\n", 18698c2ecf20Sopenharmony_ci inode, ceph_vinop(inode), pos, (unsigned)count); 18708c2ecf20Sopenharmony_ci goto retry_snap; 18718c2ecf20Sopenharmony_ci } 18728c2ecf20Sopenharmony_ci 18738c2ecf20Sopenharmony_ci if (written >= 0) { 18748c2ecf20Sopenharmony_ci if ((map_flags & CEPH_OSDMAP_NEARFULL) || 18758c2ecf20Sopenharmony_ci (pool_flags & CEPH_POOL_FLAG_NEARFULL)) 18768c2ecf20Sopenharmony_ci iocb->ki_flags |= IOCB_DSYNC; 18778c2ecf20Sopenharmony_ci written = generic_write_sync(iocb, written); 18788c2ecf20Sopenharmony_ci } 18798c2ecf20Sopenharmony_ci 18808c2ecf20Sopenharmony_ci goto out_unlocked; 18818c2ecf20Sopenharmony_ciout_caps: 18828c2ecf20Sopenharmony_ci ceph_put_cap_refs(ci, got); 18838c2ecf20Sopenharmony_ciout: 18848c2ecf20Sopenharmony_ci if (direct_lock) 18858c2ecf20Sopenharmony_ci ceph_end_io_direct(inode); 18868c2ecf20Sopenharmony_ci else 18878c2ecf20Sopenharmony_ci ceph_end_io_write(inode); 18888c2ecf20Sopenharmony_ciout_unlocked: 18898c2ecf20Sopenharmony_ci ceph_free_cap_flush(prealloc_cf); 18908c2ecf20Sopenharmony_ci current->backing_dev_info = NULL; 18918c2ecf20Sopenharmony_ci return written ? written : err; 18928c2ecf20Sopenharmony_ci} 18938c2ecf20Sopenharmony_ci 18948c2ecf20Sopenharmony_ci/* 18958c2ecf20Sopenharmony_ci * llseek. be sure to verify file size on SEEK_END. 18968c2ecf20Sopenharmony_ci */ 18978c2ecf20Sopenharmony_cistatic loff_t ceph_llseek(struct file *file, loff_t offset, int whence) 18988c2ecf20Sopenharmony_ci{ 18998c2ecf20Sopenharmony_ci struct inode *inode = file->f_mapping->host; 19008c2ecf20Sopenharmony_ci struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 19018c2ecf20Sopenharmony_ci loff_t i_size; 19028c2ecf20Sopenharmony_ci loff_t ret; 19038c2ecf20Sopenharmony_ci 19048c2ecf20Sopenharmony_ci inode_lock(inode); 19058c2ecf20Sopenharmony_ci 19068c2ecf20Sopenharmony_ci if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) { 19078c2ecf20Sopenharmony_ci ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false); 19088c2ecf20Sopenharmony_ci if (ret < 0) 19098c2ecf20Sopenharmony_ci goto out; 19108c2ecf20Sopenharmony_ci } 19118c2ecf20Sopenharmony_ci 19128c2ecf20Sopenharmony_ci i_size = i_size_read(inode); 19138c2ecf20Sopenharmony_ci switch (whence) { 19148c2ecf20Sopenharmony_ci case SEEK_END: 19158c2ecf20Sopenharmony_ci offset += i_size; 19168c2ecf20Sopenharmony_ci break; 19178c2ecf20Sopenharmony_ci case SEEK_CUR: 19188c2ecf20Sopenharmony_ci /* 19198c2ecf20Sopenharmony_ci * Here we special-case the lseek(fd, 0, SEEK_CUR) 19208c2ecf20Sopenharmony_ci * position-querying operation. Avoid rewriting the "same" 19218c2ecf20Sopenharmony_ci * f_pos value back to the file because a concurrent read(), 19228c2ecf20Sopenharmony_ci * write() or lseek() might have altered it 19238c2ecf20Sopenharmony_ci */ 19248c2ecf20Sopenharmony_ci if (offset == 0) { 19258c2ecf20Sopenharmony_ci ret = file->f_pos; 19268c2ecf20Sopenharmony_ci goto out; 19278c2ecf20Sopenharmony_ci } 19288c2ecf20Sopenharmony_ci offset += file->f_pos; 19298c2ecf20Sopenharmony_ci break; 19308c2ecf20Sopenharmony_ci case SEEK_DATA: 19318c2ecf20Sopenharmony_ci if (offset < 0 || offset >= i_size) { 19328c2ecf20Sopenharmony_ci ret = -ENXIO; 19338c2ecf20Sopenharmony_ci goto out; 19348c2ecf20Sopenharmony_ci } 19358c2ecf20Sopenharmony_ci break; 19368c2ecf20Sopenharmony_ci case SEEK_HOLE: 19378c2ecf20Sopenharmony_ci if (offset < 0 || offset >= i_size) { 19388c2ecf20Sopenharmony_ci ret = -ENXIO; 19398c2ecf20Sopenharmony_ci goto out; 19408c2ecf20Sopenharmony_ci } 19418c2ecf20Sopenharmony_ci offset = i_size; 19428c2ecf20Sopenharmony_ci break; 19438c2ecf20Sopenharmony_ci } 19448c2ecf20Sopenharmony_ci 19458c2ecf20Sopenharmony_ci ret = vfs_setpos(file, offset, max(i_size, fsc->max_file_size)); 19468c2ecf20Sopenharmony_ci 19478c2ecf20Sopenharmony_ciout: 19488c2ecf20Sopenharmony_ci inode_unlock(inode); 19498c2ecf20Sopenharmony_ci return ret; 19508c2ecf20Sopenharmony_ci} 19518c2ecf20Sopenharmony_ci 19528c2ecf20Sopenharmony_cistatic inline void ceph_zero_partial_page( 19538c2ecf20Sopenharmony_ci struct inode *inode, loff_t offset, unsigned size) 19548c2ecf20Sopenharmony_ci{ 19558c2ecf20Sopenharmony_ci struct page *page; 19568c2ecf20Sopenharmony_ci pgoff_t index = offset >> PAGE_SHIFT; 19578c2ecf20Sopenharmony_ci 19588c2ecf20Sopenharmony_ci page = find_lock_page(inode->i_mapping, index); 19598c2ecf20Sopenharmony_ci if (page) { 19608c2ecf20Sopenharmony_ci wait_on_page_writeback(page); 19618c2ecf20Sopenharmony_ci zero_user(page, offset & (PAGE_SIZE - 1), size); 19628c2ecf20Sopenharmony_ci unlock_page(page); 19638c2ecf20Sopenharmony_ci put_page(page); 19648c2ecf20Sopenharmony_ci } 19658c2ecf20Sopenharmony_ci} 19668c2ecf20Sopenharmony_ci 19678c2ecf20Sopenharmony_cistatic void ceph_zero_pagecache_range(struct inode *inode, loff_t offset, 19688c2ecf20Sopenharmony_ci loff_t length) 19698c2ecf20Sopenharmony_ci{ 19708c2ecf20Sopenharmony_ci loff_t nearly = round_up(offset, PAGE_SIZE); 19718c2ecf20Sopenharmony_ci if (offset < nearly) { 19728c2ecf20Sopenharmony_ci loff_t size = nearly - offset; 19738c2ecf20Sopenharmony_ci if (length < size) 19748c2ecf20Sopenharmony_ci size = length; 19758c2ecf20Sopenharmony_ci ceph_zero_partial_page(inode, offset, size); 19768c2ecf20Sopenharmony_ci offset += size; 19778c2ecf20Sopenharmony_ci length -= size; 19788c2ecf20Sopenharmony_ci } 19798c2ecf20Sopenharmony_ci if (length >= PAGE_SIZE) { 19808c2ecf20Sopenharmony_ci loff_t size = round_down(length, PAGE_SIZE); 19818c2ecf20Sopenharmony_ci truncate_pagecache_range(inode, offset, offset + size - 1); 19828c2ecf20Sopenharmony_ci offset += size; 19838c2ecf20Sopenharmony_ci length -= size; 19848c2ecf20Sopenharmony_ci } 19858c2ecf20Sopenharmony_ci if (length) 19868c2ecf20Sopenharmony_ci ceph_zero_partial_page(inode, offset, length); 19878c2ecf20Sopenharmony_ci} 19888c2ecf20Sopenharmony_ci 19898c2ecf20Sopenharmony_cistatic int ceph_zero_partial_object(struct inode *inode, 19908c2ecf20Sopenharmony_ci loff_t offset, loff_t *length) 19918c2ecf20Sopenharmony_ci{ 19928c2ecf20Sopenharmony_ci struct ceph_inode_info *ci = ceph_inode(inode); 19938c2ecf20Sopenharmony_ci struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 19948c2ecf20Sopenharmony_ci struct ceph_osd_request *req; 19958c2ecf20Sopenharmony_ci int ret = 0; 19968c2ecf20Sopenharmony_ci loff_t zero = 0; 19978c2ecf20Sopenharmony_ci int op; 19988c2ecf20Sopenharmony_ci 19998c2ecf20Sopenharmony_ci if (!length) { 20008c2ecf20Sopenharmony_ci op = offset ? CEPH_OSD_OP_DELETE : CEPH_OSD_OP_TRUNCATE; 20018c2ecf20Sopenharmony_ci length = &zero; 20028c2ecf20Sopenharmony_ci } else { 20038c2ecf20Sopenharmony_ci op = CEPH_OSD_OP_ZERO; 20048c2ecf20Sopenharmony_ci } 20058c2ecf20Sopenharmony_ci 20068c2ecf20Sopenharmony_ci req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, 20078c2ecf20Sopenharmony_ci ceph_vino(inode), 20088c2ecf20Sopenharmony_ci offset, length, 20098c2ecf20Sopenharmony_ci 0, 1, op, 20108c2ecf20Sopenharmony_ci CEPH_OSD_FLAG_WRITE, 20118c2ecf20Sopenharmony_ci NULL, 0, 0, false); 20128c2ecf20Sopenharmony_ci if (IS_ERR(req)) { 20138c2ecf20Sopenharmony_ci ret = PTR_ERR(req); 20148c2ecf20Sopenharmony_ci goto out; 20158c2ecf20Sopenharmony_ci } 20168c2ecf20Sopenharmony_ci 20178c2ecf20Sopenharmony_ci req->r_mtime = inode->i_mtime; 20188c2ecf20Sopenharmony_ci ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); 20198c2ecf20Sopenharmony_ci if (!ret) { 20208c2ecf20Sopenharmony_ci ret = ceph_osdc_wait_request(&fsc->client->osdc, req); 20218c2ecf20Sopenharmony_ci if (ret == -ENOENT) 20228c2ecf20Sopenharmony_ci ret = 0; 20238c2ecf20Sopenharmony_ci } 20248c2ecf20Sopenharmony_ci ceph_osdc_put_request(req); 20258c2ecf20Sopenharmony_ci 20268c2ecf20Sopenharmony_ciout: 20278c2ecf20Sopenharmony_ci return ret; 20288c2ecf20Sopenharmony_ci} 20298c2ecf20Sopenharmony_ci 20308c2ecf20Sopenharmony_cistatic int ceph_zero_objects(struct inode *inode, loff_t offset, loff_t length) 20318c2ecf20Sopenharmony_ci{ 20328c2ecf20Sopenharmony_ci int ret = 0; 20338c2ecf20Sopenharmony_ci struct ceph_inode_info *ci = ceph_inode(inode); 20348c2ecf20Sopenharmony_ci s32 stripe_unit = ci->i_layout.stripe_unit; 20358c2ecf20Sopenharmony_ci s32 stripe_count = ci->i_layout.stripe_count; 20368c2ecf20Sopenharmony_ci s32 object_size = ci->i_layout.object_size; 20378c2ecf20Sopenharmony_ci u64 object_set_size = object_size * stripe_count; 20388c2ecf20Sopenharmony_ci u64 nearly, t; 20398c2ecf20Sopenharmony_ci 20408c2ecf20Sopenharmony_ci /* round offset up to next period boundary */ 20418c2ecf20Sopenharmony_ci nearly = offset + object_set_size - 1; 20428c2ecf20Sopenharmony_ci t = nearly; 20438c2ecf20Sopenharmony_ci nearly -= do_div(t, object_set_size); 20448c2ecf20Sopenharmony_ci 20458c2ecf20Sopenharmony_ci while (length && offset < nearly) { 20468c2ecf20Sopenharmony_ci loff_t size = length; 20478c2ecf20Sopenharmony_ci ret = ceph_zero_partial_object(inode, offset, &size); 20488c2ecf20Sopenharmony_ci if (ret < 0) 20498c2ecf20Sopenharmony_ci return ret; 20508c2ecf20Sopenharmony_ci offset += size; 20518c2ecf20Sopenharmony_ci length -= size; 20528c2ecf20Sopenharmony_ci } 20538c2ecf20Sopenharmony_ci while (length >= object_set_size) { 20548c2ecf20Sopenharmony_ci int i; 20558c2ecf20Sopenharmony_ci loff_t pos = offset; 20568c2ecf20Sopenharmony_ci for (i = 0; i < stripe_count; ++i) { 20578c2ecf20Sopenharmony_ci ret = ceph_zero_partial_object(inode, pos, NULL); 20588c2ecf20Sopenharmony_ci if (ret < 0) 20598c2ecf20Sopenharmony_ci return ret; 20608c2ecf20Sopenharmony_ci pos += stripe_unit; 20618c2ecf20Sopenharmony_ci } 20628c2ecf20Sopenharmony_ci offset += object_set_size; 20638c2ecf20Sopenharmony_ci length -= object_set_size; 20648c2ecf20Sopenharmony_ci } 20658c2ecf20Sopenharmony_ci while (length) { 20668c2ecf20Sopenharmony_ci loff_t size = length; 20678c2ecf20Sopenharmony_ci ret = ceph_zero_partial_object(inode, offset, &size); 20688c2ecf20Sopenharmony_ci if (ret < 0) 20698c2ecf20Sopenharmony_ci return ret; 20708c2ecf20Sopenharmony_ci offset += size; 20718c2ecf20Sopenharmony_ci length -= size; 20728c2ecf20Sopenharmony_ci } 20738c2ecf20Sopenharmony_ci return ret; 20748c2ecf20Sopenharmony_ci} 20758c2ecf20Sopenharmony_ci 20768c2ecf20Sopenharmony_cistatic long ceph_fallocate(struct file *file, int mode, 20778c2ecf20Sopenharmony_ci loff_t offset, loff_t length) 20788c2ecf20Sopenharmony_ci{ 20798c2ecf20Sopenharmony_ci struct ceph_file_info *fi = file->private_data; 20808c2ecf20Sopenharmony_ci struct inode *inode = file_inode(file); 20818c2ecf20Sopenharmony_ci struct ceph_inode_info *ci = ceph_inode(inode); 20828c2ecf20Sopenharmony_ci struct ceph_cap_flush *prealloc_cf; 20838c2ecf20Sopenharmony_ci int want, got = 0; 20848c2ecf20Sopenharmony_ci int dirty; 20858c2ecf20Sopenharmony_ci int ret = 0; 20868c2ecf20Sopenharmony_ci loff_t endoff = 0; 20878c2ecf20Sopenharmony_ci loff_t size; 20888c2ecf20Sopenharmony_ci 20898c2ecf20Sopenharmony_ci if (mode != (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) 20908c2ecf20Sopenharmony_ci return -EOPNOTSUPP; 20918c2ecf20Sopenharmony_ci 20928c2ecf20Sopenharmony_ci if (!S_ISREG(inode->i_mode)) 20938c2ecf20Sopenharmony_ci return -EOPNOTSUPP; 20948c2ecf20Sopenharmony_ci 20958c2ecf20Sopenharmony_ci prealloc_cf = ceph_alloc_cap_flush(); 20968c2ecf20Sopenharmony_ci if (!prealloc_cf) 20978c2ecf20Sopenharmony_ci return -ENOMEM; 20988c2ecf20Sopenharmony_ci 20998c2ecf20Sopenharmony_ci inode_lock(inode); 21008c2ecf20Sopenharmony_ci 21018c2ecf20Sopenharmony_ci if (ceph_snap(inode) != CEPH_NOSNAP) { 21028c2ecf20Sopenharmony_ci ret = -EROFS; 21038c2ecf20Sopenharmony_ci goto unlock; 21048c2ecf20Sopenharmony_ci } 21058c2ecf20Sopenharmony_ci 21068c2ecf20Sopenharmony_ci if (ci->i_inline_version != CEPH_INLINE_NONE) { 21078c2ecf20Sopenharmony_ci ret = ceph_uninline_data(file, NULL); 21088c2ecf20Sopenharmony_ci if (ret < 0) 21098c2ecf20Sopenharmony_ci goto unlock; 21108c2ecf20Sopenharmony_ci } 21118c2ecf20Sopenharmony_ci 21128c2ecf20Sopenharmony_ci size = i_size_read(inode); 21138c2ecf20Sopenharmony_ci 21148c2ecf20Sopenharmony_ci /* Are we punching a hole beyond EOF? */ 21158c2ecf20Sopenharmony_ci if (offset >= size) 21168c2ecf20Sopenharmony_ci goto unlock; 21178c2ecf20Sopenharmony_ci if ((offset + length) > size) 21188c2ecf20Sopenharmony_ci length = size - offset; 21198c2ecf20Sopenharmony_ci 21208c2ecf20Sopenharmony_ci if (fi->fmode & CEPH_FILE_MODE_LAZY) 21218c2ecf20Sopenharmony_ci want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; 21228c2ecf20Sopenharmony_ci else 21238c2ecf20Sopenharmony_ci want = CEPH_CAP_FILE_BUFFER; 21248c2ecf20Sopenharmony_ci 21258c2ecf20Sopenharmony_ci ret = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, endoff, &got, NULL); 21268c2ecf20Sopenharmony_ci if (ret < 0) 21278c2ecf20Sopenharmony_ci goto unlock; 21288c2ecf20Sopenharmony_ci 21298c2ecf20Sopenharmony_ci ceph_zero_pagecache_range(inode, offset, length); 21308c2ecf20Sopenharmony_ci ret = ceph_zero_objects(inode, offset, length); 21318c2ecf20Sopenharmony_ci 21328c2ecf20Sopenharmony_ci if (!ret) { 21338c2ecf20Sopenharmony_ci spin_lock(&ci->i_ceph_lock); 21348c2ecf20Sopenharmony_ci ci->i_inline_version = CEPH_INLINE_NONE; 21358c2ecf20Sopenharmony_ci dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, 21368c2ecf20Sopenharmony_ci &prealloc_cf); 21378c2ecf20Sopenharmony_ci spin_unlock(&ci->i_ceph_lock); 21388c2ecf20Sopenharmony_ci if (dirty) 21398c2ecf20Sopenharmony_ci __mark_inode_dirty(inode, dirty); 21408c2ecf20Sopenharmony_ci } 21418c2ecf20Sopenharmony_ci 21428c2ecf20Sopenharmony_ci ceph_put_cap_refs(ci, got); 21438c2ecf20Sopenharmony_ciunlock: 21448c2ecf20Sopenharmony_ci inode_unlock(inode); 21458c2ecf20Sopenharmony_ci ceph_free_cap_flush(prealloc_cf); 21468c2ecf20Sopenharmony_ci return ret; 21478c2ecf20Sopenharmony_ci} 21488c2ecf20Sopenharmony_ci 21498c2ecf20Sopenharmony_ci/* 21508c2ecf20Sopenharmony_ci * This function tries to get FILE_WR capabilities for dst_ci and FILE_RD for 21518c2ecf20Sopenharmony_ci * src_ci. Two attempts are made to obtain both caps, and an error is return if 21528c2ecf20Sopenharmony_ci * this fails; zero is returned on success. 21538c2ecf20Sopenharmony_ci */ 21548c2ecf20Sopenharmony_cistatic int get_rd_wr_caps(struct file *src_filp, int *src_got, 21558c2ecf20Sopenharmony_ci struct file *dst_filp, 21568c2ecf20Sopenharmony_ci loff_t dst_endoff, int *dst_got) 21578c2ecf20Sopenharmony_ci{ 21588c2ecf20Sopenharmony_ci int ret = 0; 21598c2ecf20Sopenharmony_ci bool retrying = false; 21608c2ecf20Sopenharmony_ci 21618c2ecf20Sopenharmony_ciretry_caps: 21628c2ecf20Sopenharmony_ci ret = ceph_get_caps(dst_filp, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, 21638c2ecf20Sopenharmony_ci dst_endoff, dst_got, NULL); 21648c2ecf20Sopenharmony_ci if (ret < 0) 21658c2ecf20Sopenharmony_ci return ret; 21668c2ecf20Sopenharmony_ci 21678c2ecf20Sopenharmony_ci /* 21688c2ecf20Sopenharmony_ci * Since we're already holding the FILE_WR capability for the dst file, 21698c2ecf20Sopenharmony_ci * we would risk a deadlock by using ceph_get_caps. Thus, we'll do some 21708c2ecf20Sopenharmony_ci * retry dance instead to try to get both capabilities. 21718c2ecf20Sopenharmony_ci */ 21728c2ecf20Sopenharmony_ci ret = ceph_try_get_caps(file_inode(src_filp), 21738c2ecf20Sopenharmony_ci CEPH_CAP_FILE_RD, CEPH_CAP_FILE_SHARED, 21748c2ecf20Sopenharmony_ci false, src_got); 21758c2ecf20Sopenharmony_ci if (ret <= 0) { 21768c2ecf20Sopenharmony_ci /* Start by dropping dst_ci caps and getting src_ci caps */ 21778c2ecf20Sopenharmony_ci ceph_put_cap_refs(ceph_inode(file_inode(dst_filp)), *dst_got); 21788c2ecf20Sopenharmony_ci if (retrying) { 21798c2ecf20Sopenharmony_ci if (!ret) 21808c2ecf20Sopenharmony_ci /* ceph_try_get_caps masks EAGAIN */ 21818c2ecf20Sopenharmony_ci ret = -EAGAIN; 21828c2ecf20Sopenharmony_ci return ret; 21838c2ecf20Sopenharmony_ci } 21848c2ecf20Sopenharmony_ci ret = ceph_get_caps(src_filp, CEPH_CAP_FILE_RD, 21858c2ecf20Sopenharmony_ci CEPH_CAP_FILE_SHARED, -1, src_got, NULL); 21868c2ecf20Sopenharmony_ci if (ret < 0) 21878c2ecf20Sopenharmony_ci return ret; 21888c2ecf20Sopenharmony_ci /*... drop src_ci caps too, and retry */ 21898c2ecf20Sopenharmony_ci ceph_put_cap_refs(ceph_inode(file_inode(src_filp)), *src_got); 21908c2ecf20Sopenharmony_ci retrying = true; 21918c2ecf20Sopenharmony_ci goto retry_caps; 21928c2ecf20Sopenharmony_ci } 21938c2ecf20Sopenharmony_ci return ret; 21948c2ecf20Sopenharmony_ci} 21958c2ecf20Sopenharmony_ci 21968c2ecf20Sopenharmony_cistatic void put_rd_wr_caps(struct ceph_inode_info *src_ci, int src_got, 21978c2ecf20Sopenharmony_ci struct ceph_inode_info *dst_ci, int dst_got) 21988c2ecf20Sopenharmony_ci{ 21998c2ecf20Sopenharmony_ci ceph_put_cap_refs(src_ci, src_got); 22008c2ecf20Sopenharmony_ci ceph_put_cap_refs(dst_ci, dst_got); 22018c2ecf20Sopenharmony_ci} 22028c2ecf20Sopenharmony_ci 22038c2ecf20Sopenharmony_ci/* 22048c2ecf20Sopenharmony_ci * This function does several size-related checks, returning an error if: 22058c2ecf20Sopenharmony_ci * - source file is smaller than off+len 22068c2ecf20Sopenharmony_ci * - destination file size is not OK (inode_newsize_ok()) 22078c2ecf20Sopenharmony_ci * - max bytes quotas is exceeded 22088c2ecf20Sopenharmony_ci */ 22098c2ecf20Sopenharmony_cistatic int is_file_size_ok(struct inode *src_inode, struct inode *dst_inode, 22108c2ecf20Sopenharmony_ci loff_t src_off, loff_t dst_off, size_t len) 22118c2ecf20Sopenharmony_ci{ 22128c2ecf20Sopenharmony_ci loff_t size, endoff; 22138c2ecf20Sopenharmony_ci 22148c2ecf20Sopenharmony_ci size = i_size_read(src_inode); 22158c2ecf20Sopenharmony_ci /* 22168c2ecf20Sopenharmony_ci * Don't copy beyond source file EOF. Instead of simply setting length 22178c2ecf20Sopenharmony_ci * to (size - src_off), just drop to VFS default implementation, as the 22188c2ecf20Sopenharmony_ci * local i_size may be stale due to other clients writing to the source 22198c2ecf20Sopenharmony_ci * inode. 22208c2ecf20Sopenharmony_ci */ 22218c2ecf20Sopenharmony_ci if (src_off + len > size) { 22228c2ecf20Sopenharmony_ci dout("Copy beyond EOF (%llu + %zu > %llu)\n", 22238c2ecf20Sopenharmony_ci src_off, len, size); 22248c2ecf20Sopenharmony_ci return -EOPNOTSUPP; 22258c2ecf20Sopenharmony_ci } 22268c2ecf20Sopenharmony_ci size = i_size_read(dst_inode); 22278c2ecf20Sopenharmony_ci 22288c2ecf20Sopenharmony_ci endoff = dst_off + len; 22298c2ecf20Sopenharmony_ci if (inode_newsize_ok(dst_inode, endoff)) 22308c2ecf20Sopenharmony_ci return -EOPNOTSUPP; 22318c2ecf20Sopenharmony_ci 22328c2ecf20Sopenharmony_ci if (ceph_quota_is_max_bytes_exceeded(dst_inode, endoff)) 22338c2ecf20Sopenharmony_ci return -EDQUOT; 22348c2ecf20Sopenharmony_ci 22358c2ecf20Sopenharmony_ci return 0; 22368c2ecf20Sopenharmony_ci} 22378c2ecf20Sopenharmony_ci 22388c2ecf20Sopenharmony_cistatic ssize_t ceph_do_objects_copy(struct ceph_inode_info *src_ci, u64 *src_off, 22398c2ecf20Sopenharmony_ci struct ceph_inode_info *dst_ci, u64 *dst_off, 22408c2ecf20Sopenharmony_ci struct ceph_fs_client *fsc, 22418c2ecf20Sopenharmony_ci size_t len, unsigned int flags) 22428c2ecf20Sopenharmony_ci{ 22438c2ecf20Sopenharmony_ci struct ceph_object_locator src_oloc, dst_oloc; 22448c2ecf20Sopenharmony_ci struct ceph_object_id src_oid, dst_oid; 22458c2ecf20Sopenharmony_ci size_t bytes = 0; 22468c2ecf20Sopenharmony_ci u64 src_objnum, src_objoff, dst_objnum, dst_objoff; 22478c2ecf20Sopenharmony_ci u32 src_objlen, dst_objlen; 22488c2ecf20Sopenharmony_ci u32 object_size = src_ci->i_layout.object_size; 22498c2ecf20Sopenharmony_ci int ret; 22508c2ecf20Sopenharmony_ci 22518c2ecf20Sopenharmony_ci src_oloc.pool = src_ci->i_layout.pool_id; 22528c2ecf20Sopenharmony_ci src_oloc.pool_ns = ceph_try_get_string(src_ci->i_layout.pool_ns); 22538c2ecf20Sopenharmony_ci dst_oloc.pool = dst_ci->i_layout.pool_id; 22548c2ecf20Sopenharmony_ci dst_oloc.pool_ns = ceph_try_get_string(dst_ci->i_layout.pool_ns); 22558c2ecf20Sopenharmony_ci 22568c2ecf20Sopenharmony_ci while (len >= object_size) { 22578c2ecf20Sopenharmony_ci ceph_calc_file_object_mapping(&src_ci->i_layout, *src_off, 22588c2ecf20Sopenharmony_ci object_size, &src_objnum, 22598c2ecf20Sopenharmony_ci &src_objoff, &src_objlen); 22608c2ecf20Sopenharmony_ci ceph_calc_file_object_mapping(&dst_ci->i_layout, *dst_off, 22618c2ecf20Sopenharmony_ci object_size, &dst_objnum, 22628c2ecf20Sopenharmony_ci &dst_objoff, &dst_objlen); 22638c2ecf20Sopenharmony_ci ceph_oid_init(&src_oid); 22648c2ecf20Sopenharmony_ci ceph_oid_printf(&src_oid, "%llx.%08llx", 22658c2ecf20Sopenharmony_ci src_ci->i_vino.ino, src_objnum); 22668c2ecf20Sopenharmony_ci ceph_oid_init(&dst_oid); 22678c2ecf20Sopenharmony_ci ceph_oid_printf(&dst_oid, "%llx.%08llx", 22688c2ecf20Sopenharmony_ci dst_ci->i_vino.ino, dst_objnum); 22698c2ecf20Sopenharmony_ci /* Do an object remote copy */ 22708c2ecf20Sopenharmony_ci ret = ceph_osdc_copy_from(&fsc->client->osdc, 22718c2ecf20Sopenharmony_ci src_ci->i_vino.snap, 0, 22728c2ecf20Sopenharmony_ci &src_oid, &src_oloc, 22738c2ecf20Sopenharmony_ci CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL | 22748c2ecf20Sopenharmony_ci CEPH_OSD_OP_FLAG_FADVISE_NOCACHE, 22758c2ecf20Sopenharmony_ci &dst_oid, &dst_oloc, 22768c2ecf20Sopenharmony_ci CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL | 22778c2ecf20Sopenharmony_ci CEPH_OSD_OP_FLAG_FADVISE_DONTNEED, 22788c2ecf20Sopenharmony_ci dst_ci->i_truncate_seq, 22798c2ecf20Sopenharmony_ci dst_ci->i_truncate_size, 22808c2ecf20Sopenharmony_ci CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ); 22818c2ecf20Sopenharmony_ci if (ret) { 22828c2ecf20Sopenharmony_ci if (ret == -EOPNOTSUPP) { 22838c2ecf20Sopenharmony_ci fsc->have_copy_from2 = false; 22848c2ecf20Sopenharmony_ci pr_notice("OSDs don't support copy-from2; disabling copy offload\n"); 22858c2ecf20Sopenharmony_ci } 22868c2ecf20Sopenharmony_ci dout("ceph_osdc_copy_from returned %d\n", ret); 22878c2ecf20Sopenharmony_ci if (!bytes) 22888c2ecf20Sopenharmony_ci bytes = ret; 22898c2ecf20Sopenharmony_ci goto out; 22908c2ecf20Sopenharmony_ci } 22918c2ecf20Sopenharmony_ci len -= object_size; 22928c2ecf20Sopenharmony_ci bytes += object_size; 22938c2ecf20Sopenharmony_ci *src_off += object_size; 22948c2ecf20Sopenharmony_ci *dst_off += object_size; 22958c2ecf20Sopenharmony_ci } 22968c2ecf20Sopenharmony_ci 22978c2ecf20Sopenharmony_ciout: 22988c2ecf20Sopenharmony_ci ceph_oloc_destroy(&src_oloc); 22998c2ecf20Sopenharmony_ci ceph_oloc_destroy(&dst_oloc); 23008c2ecf20Sopenharmony_ci return bytes; 23018c2ecf20Sopenharmony_ci} 23028c2ecf20Sopenharmony_ci 23038c2ecf20Sopenharmony_cistatic ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, 23048c2ecf20Sopenharmony_ci struct file *dst_file, loff_t dst_off, 23058c2ecf20Sopenharmony_ci size_t len, unsigned int flags) 23068c2ecf20Sopenharmony_ci{ 23078c2ecf20Sopenharmony_ci struct inode *src_inode = file_inode(src_file); 23088c2ecf20Sopenharmony_ci struct inode *dst_inode = file_inode(dst_file); 23098c2ecf20Sopenharmony_ci struct ceph_inode_info *src_ci = ceph_inode(src_inode); 23108c2ecf20Sopenharmony_ci struct ceph_inode_info *dst_ci = ceph_inode(dst_inode); 23118c2ecf20Sopenharmony_ci struct ceph_cap_flush *prealloc_cf; 23128c2ecf20Sopenharmony_ci struct ceph_fs_client *src_fsc = ceph_inode_to_client(src_inode); 23138c2ecf20Sopenharmony_ci loff_t size; 23148c2ecf20Sopenharmony_ci ssize_t ret = -EIO, bytes; 23158c2ecf20Sopenharmony_ci u64 src_objnum, dst_objnum, src_objoff, dst_objoff; 23168c2ecf20Sopenharmony_ci u32 src_objlen, dst_objlen; 23178c2ecf20Sopenharmony_ci int src_got = 0, dst_got = 0, err, dirty; 23188c2ecf20Sopenharmony_ci 23198c2ecf20Sopenharmony_ci if (src_inode->i_sb != dst_inode->i_sb) { 23208c2ecf20Sopenharmony_ci struct ceph_fs_client *dst_fsc = ceph_inode_to_client(dst_inode); 23218c2ecf20Sopenharmony_ci 23228c2ecf20Sopenharmony_ci if (ceph_fsid_compare(&src_fsc->client->fsid, 23238c2ecf20Sopenharmony_ci &dst_fsc->client->fsid)) { 23248c2ecf20Sopenharmony_ci dout("Copying files across clusters: src: %pU dst: %pU\n", 23258c2ecf20Sopenharmony_ci &src_fsc->client->fsid, &dst_fsc->client->fsid); 23268c2ecf20Sopenharmony_ci return -EXDEV; 23278c2ecf20Sopenharmony_ci } 23288c2ecf20Sopenharmony_ci } 23298c2ecf20Sopenharmony_ci if (ceph_snap(dst_inode) != CEPH_NOSNAP) 23308c2ecf20Sopenharmony_ci return -EROFS; 23318c2ecf20Sopenharmony_ci 23328c2ecf20Sopenharmony_ci /* 23338c2ecf20Sopenharmony_ci * Some of the checks below will return -EOPNOTSUPP, which will force a 23348c2ecf20Sopenharmony_ci * fallback to the default VFS copy_file_range implementation. This is 23358c2ecf20Sopenharmony_ci * desirable in several cases (for ex, the 'len' is smaller than the 23368c2ecf20Sopenharmony_ci * size of the objects, or in cases where that would be more 23378c2ecf20Sopenharmony_ci * efficient). 23388c2ecf20Sopenharmony_ci */ 23398c2ecf20Sopenharmony_ci 23408c2ecf20Sopenharmony_ci if (ceph_test_mount_opt(src_fsc, NOCOPYFROM)) 23418c2ecf20Sopenharmony_ci return -EOPNOTSUPP; 23428c2ecf20Sopenharmony_ci 23438c2ecf20Sopenharmony_ci if (!src_fsc->have_copy_from2) 23448c2ecf20Sopenharmony_ci return -EOPNOTSUPP; 23458c2ecf20Sopenharmony_ci 23468c2ecf20Sopenharmony_ci /* 23478c2ecf20Sopenharmony_ci * Striped file layouts require that we copy partial objects, but the 23488c2ecf20Sopenharmony_ci * OSD copy-from operation only supports full-object copies. Limit 23498c2ecf20Sopenharmony_ci * this to non-striped file layouts for now. 23508c2ecf20Sopenharmony_ci */ 23518c2ecf20Sopenharmony_ci if ((src_ci->i_layout.stripe_unit != dst_ci->i_layout.stripe_unit) || 23528c2ecf20Sopenharmony_ci (src_ci->i_layout.stripe_count != 1) || 23538c2ecf20Sopenharmony_ci (dst_ci->i_layout.stripe_count != 1) || 23548c2ecf20Sopenharmony_ci (src_ci->i_layout.object_size != dst_ci->i_layout.object_size)) { 23558c2ecf20Sopenharmony_ci dout("Invalid src/dst files layout\n"); 23568c2ecf20Sopenharmony_ci return -EOPNOTSUPP; 23578c2ecf20Sopenharmony_ci } 23588c2ecf20Sopenharmony_ci 23598c2ecf20Sopenharmony_ci if (len < src_ci->i_layout.object_size) 23608c2ecf20Sopenharmony_ci return -EOPNOTSUPP; /* no remote copy will be done */ 23618c2ecf20Sopenharmony_ci 23628c2ecf20Sopenharmony_ci prealloc_cf = ceph_alloc_cap_flush(); 23638c2ecf20Sopenharmony_ci if (!prealloc_cf) 23648c2ecf20Sopenharmony_ci return -ENOMEM; 23658c2ecf20Sopenharmony_ci 23668c2ecf20Sopenharmony_ci /* Start by sync'ing the source and destination files */ 23678c2ecf20Sopenharmony_ci ret = file_write_and_wait_range(src_file, src_off, (src_off + len)); 23688c2ecf20Sopenharmony_ci if (ret < 0) { 23698c2ecf20Sopenharmony_ci dout("failed to write src file (%zd)\n", ret); 23708c2ecf20Sopenharmony_ci goto out; 23718c2ecf20Sopenharmony_ci } 23728c2ecf20Sopenharmony_ci ret = file_write_and_wait_range(dst_file, dst_off, (dst_off + len)); 23738c2ecf20Sopenharmony_ci if (ret < 0) { 23748c2ecf20Sopenharmony_ci dout("failed to write dst file (%zd)\n", ret); 23758c2ecf20Sopenharmony_ci goto out; 23768c2ecf20Sopenharmony_ci } 23778c2ecf20Sopenharmony_ci 23788c2ecf20Sopenharmony_ci /* 23798c2ecf20Sopenharmony_ci * We need FILE_WR caps for dst_ci and FILE_RD for src_ci as other 23808c2ecf20Sopenharmony_ci * clients may have dirty data in their caches. And OSDs know nothing 23818c2ecf20Sopenharmony_ci * about caps, so they can't safely do the remote object copies. 23828c2ecf20Sopenharmony_ci */ 23838c2ecf20Sopenharmony_ci err = get_rd_wr_caps(src_file, &src_got, 23848c2ecf20Sopenharmony_ci dst_file, (dst_off + len), &dst_got); 23858c2ecf20Sopenharmony_ci if (err < 0) { 23868c2ecf20Sopenharmony_ci dout("get_rd_wr_caps returned %d\n", err); 23878c2ecf20Sopenharmony_ci ret = -EOPNOTSUPP; 23888c2ecf20Sopenharmony_ci goto out; 23898c2ecf20Sopenharmony_ci } 23908c2ecf20Sopenharmony_ci 23918c2ecf20Sopenharmony_ci ret = is_file_size_ok(src_inode, dst_inode, src_off, dst_off, len); 23928c2ecf20Sopenharmony_ci if (ret < 0) 23938c2ecf20Sopenharmony_ci goto out_caps; 23948c2ecf20Sopenharmony_ci 23958c2ecf20Sopenharmony_ci /* Drop dst file cached pages */ 23968c2ecf20Sopenharmony_ci ret = invalidate_inode_pages2_range(dst_inode->i_mapping, 23978c2ecf20Sopenharmony_ci dst_off >> PAGE_SHIFT, 23988c2ecf20Sopenharmony_ci (dst_off + len) >> PAGE_SHIFT); 23998c2ecf20Sopenharmony_ci if (ret < 0) { 24008c2ecf20Sopenharmony_ci dout("Failed to invalidate inode pages (%zd)\n", ret); 24018c2ecf20Sopenharmony_ci ret = 0; /* XXX */ 24028c2ecf20Sopenharmony_ci } 24038c2ecf20Sopenharmony_ci ceph_calc_file_object_mapping(&src_ci->i_layout, src_off, 24048c2ecf20Sopenharmony_ci src_ci->i_layout.object_size, 24058c2ecf20Sopenharmony_ci &src_objnum, &src_objoff, &src_objlen); 24068c2ecf20Sopenharmony_ci ceph_calc_file_object_mapping(&dst_ci->i_layout, dst_off, 24078c2ecf20Sopenharmony_ci dst_ci->i_layout.object_size, 24088c2ecf20Sopenharmony_ci &dst_objnum, &dst_objoff, &dst_objlen); 24098c2ecf20Sopenharmony_ci /* object-level offsets need to the same */ 24108c2ecf20Sopenharmony_ci if (src_objoff != dst_objoff) { 24118c2ecf20Sopenharmony_ci ret = -EOPNOTSUPP; 24128c2ecf20Sopenharmony_ci goto out_caps; 24138c2ecf20Sopenharmony_ci } 24148c2ecf20Sopenharmony_ci 24158c2ecf20Sopenharmony_ci /* 24168c2ecf20Sopenharmony_ci * Do a manual copy if the object offset isn't object aligned. 24178c2ecf20Sopenharmony_ci * 'src_objlen' contains the bytes left until the end of the object, 24188c2ecf20Sopenharmony_ci * starting at the src_off 24198c2ecf20Sopenharmony_ci */ 24208c2ecf20Sopenharmony_ci if (src_objoff) { 24218c2ecf20Sopenharmony_ci dout("Initial partial copy of %u bytes\n", src_objlen); 24228c2ecf20Sopenharmony_ci 24238c2ecf20Sopenharmony_ci /* 24248c2ecf20Sopenharmony_ci * we need to temporarily drop all caps as we'll be calling 24258c2ecf20Sopenharmony_ci * {read,write}_iter, which will get caps again. 24268c2ecf20Sopenharmony_ci */ 24278c2ecf20Sopenharmony_ci put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got); 24288c2ecf20Sopenharmony_ci ret = do_splice_direct(src_file, &src_off, dst_file, 24298c2ecf20Sopenharmony_ci &dst_off, src_objlen, flags); 24308c2ecf20Sopenharmony_ci /* Abort on short copies or on error */ 24318c2ecf20Sopenharmony_ci if (ret < (long)src_objlen) { 24328c2ecf20Sopenharmony_ci dout("Failed partial copy (%zd)\n", ret); 24338c2ecf20Sopenharmony_ci goto out; 24348c2ecf20Sopenharmony_ci } 24358c2ecf20Sopenharmony_ci len -= ret; 24368c2ecf20Sopenharmony_ci err = get_rd_wr_caps(src_file, &src_got, 24378c2ecf20Sopenharmony_ci dst_file, (dst_off + len), &dst_got); 24388c2ecf20Sopenharmony_ci if (err < 0) 24398c2ecf20Sopenharmony_ci goto out; 24408c2ecf20Sopenharmony_ci err = is_file_size_ok(src_inode, dst_inode, 24418c2ecf20Sopenharmony_ci src_off, dst_off, len); 24428c2ecf20Sopenharmony_ci if (err < 0) 24438c2ecf20Sopenharmony_ci goto out_caps; 24448c2ecf20Sopenharmony_ci } 24458c2ecf20Sopenharmony_ci 24468c2ecf20Sopenharmony_ci size = i_size_read(dst_inode); 24478c2ecf20Sopenharmony_ci bytes = ceph_do_objects_copy(src_ci, &src_off, dst_ci, &dst_off, 24488c2ecf20Sopenharmony_ci src_fsc, len, flags); 24498c2ecf20Sopenharmony_ci if (bytes <= 0) { 24508c2ecf20Sopenharmony_ci if (!ret) 24518c2ecf20Sopenharmony_ci ret = bytes; 24528c2ecf20Sopenharmony_ci goto out_caps; 24538c2ecf20Sopenharmony_ci } 24548c2ecf20Sopenharmony_ci dout("Copied %zu bytes out of %zu\n", bytes, len); 24558c2ecf20Sopenharmony_ci len -= bytes; 24568c2ecf20Sopenharmony_ci ret += bytes; 24578c2ecf20Sopenharmony_ci 24588c2ecf20Sopenharmony_ci file_update_time(dst_file); 24598c2ecf20Sopenharmony_ci inode_inc_iversion_raw(dst_inode); 24608c2ecf20Sopenharmony_ci 24618c2ecf20Sopenharmony_ci if (dst_off > size) { 24628c2ecf20Sopenharmony_ci /* Let the MDS know about dst file size change */ 24638c2ecf20Sopenharmony_ci if (ceph_inode_set_size(dst_inode, dst_off) || 24648c2ecf20Sopenharmony_ci ceph_quota_is_max_bytes_approaching(dst_inode, dst_off)) 24658c2ecf20Sopenharmony_ci ceph_check_caps(dst_ci, CHECK_CAPS_AUTHONLY, NULL); 24668c2ecf20Sopenharmony_ci } 24678c2ecf20Sopenharmony_ci /* Mark Fw dirty */ 24688c2ecf20Sopenharmony_ci spin_lock(&dst_ci->i_ceph_lock); 24698c2ecf20Sopenharmony_ci dst_ci->i_inline_version = CEPH_INLINE_NONE; 24708c2ecf20Sopenharmony_ci dirty = __ceph_mark_dirty_caps(dst_ci, CEPH_CAP_FILE_WR, &prealloc_cf); 24718c2ecf20Sopenharmony_ci spin_unlock(&dst_ci->i_ceph_lock); 24728c2ecf20Sopenharmony_ci if (dirty) 24738c2ecf20Sopenharmony_ci __mark_inode_dirty(dst_inode, dirty); 24748c2ecf20Sopenharmony_ci 24758c2ecf20Sopenharmony_ciout_caps: 24768c2ecf20Sopenharmony_ci put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got); 24778c2ecf20Sopenharmony_ci 24788c2ecf20Sopenharmony_ci /* 24798c2ecf20Sopenharmony_ci * Do the final manual copy if we still have some bytes left, unless 24808c2ecf20Sopenharmony_ci * there were errors in remote object copies (len >= object_size). 24818c2ecf20Sopenharmony_ci */ 24828c2ecf20Sopenharmony_ci if (len && (len < src_ci->i_layout.object_size)) { 24838c2ecf20Sopenharmony_ci dout("Final partial copy of %zu bytes\n", len); 24848c2ecf20Sopenharmony_ci bytes = do_splice_direct(src_file, &src_off, dst_file, 24858c2ecf20Sopenharmony_ci &dst_off, len, flags); 24868c2ecf20Sopenharmony_ci if (bytes > 0) 24878c2ecf20Sopenharmony_ci ret += bytes; 24888c2ecf20Sopenharmony_ci else 24898c2ecf20Sopenharmony_ci dout("Failed partial copy (%zd)\n", bytes); 24908c2ecf20Sopenharmony_ci } 24918c2ecf20Sopenharmony_ci 24928c2ecf20Sopenharmony_ciout: 24938c2ecf20Sopenharmony_ci ceph_free_cap_flush(prealloc_cf); 24948c2ecf20Sopenharmony_ci 24958c2ecf20Sopenharmony_ci return ret; 24968c2ecf20Sopenharmony_ci} 24978c2ecf20Sopenharmony_ci 24988c2ecf20Sopenharmony_cistatic ssize_t ceph_copy_file_range(struct file *src_file, loff_t src_off, 24998c2ecf20Sopenharmony_ci struct file *dst_file, loff_t dst_off, 25008c2ecf20Sopenharmony_ci size_t len, unsigned int flags) 25018c2ecf20Sopenharmony_ci{ 25028c2ecf20Sopenharmony_ci ssize_t ret; 25038c2ecf20Sopenharmony_ci 25048c2ecf20Sopenharmony_ci ret = __ceph_copy_file_range(src_file, src_off, dst_file, dst_off, 25058c2ecf20Sopenharmony_ci len, flags); 25068c2ecf20Sopenharmony_ci 25078c2ecf20Sopenharmony_ci if (ret == -EOPNOTSUPP || ret == -EXDEV) 25088c2ecf20Sopenharmony_ci ret = generic_copy_file_range(src_file, src_off, dst_file, 25098c2ecf20Sopenharmony_ci dst_off, len, flags); 25108c2ecf20Sopenharmony_ci return ret; 25118c2ecf20Sopenharmony_ci} 25128c2ecf20Sopenharmony_ci 25138c2ecf20Sopenharmony_ciconst struct file_operations ceph_file_fops = { 25148c2ecf20Sopenharmony_ci .open = ceph_open, 25158c2ecf20Sopenharmony_ci .release = ceph_release, 25168c2ecf20Sopenharmony_ci .llseek = ceph_llseek, 25178c2ecf20Sopenharmony_ci .read_iter = ceph_read_iter, 25188c2ecf20Sopenharmony_ci .write_iter = ceph_write_iter, 25198c2ecf20Sopenharmony_ci .mmap = ceph_mmap, 25208c2ecf20Sopenharmony_ci .fsync = ceph_fsync, 25218c2ecf20Sopenharmony_ci .lock = ceph_lock, 25228c2ecf20Sopenharmony_ci .setlease = simple_nosetlease, 25238c2ecf20Sopenharmony_ci .flock = ceph_flock, 25248c2ecf20Sopenharmony_ci .splice_read = generic_file_splice_read, 25258c2ecf20Sopenharmony_ci .splice_write = iter_file_splice_write, 25268c2ecf20Sopenharmony_ci .unlocked_ioctl = ceph_ioctl, 25278c2ecf20Sopenharmony_ci .compat_ioctl = compat_ptr_ioctl, 25288c2ecf20Sopenharmony_ci .fallocate = ceph_fallocate, 25298c2ecf20Sopenharmony_ci .copy_file_range = ceph_copy_file_range, 25308c2ecf20Sopenharmony_ci}; 2531