// SPDX-License-Identifier: GPL-2.0-or-later
/* -*- mode: c; c-basic-offset: 8; -*-
 * vim: noexpandtab sw=8 ts=8 sts=0:
 *
 * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
 */
78c2ecf20Sopenharmony_ci
88c2ecf20Sopenharmony_ci#include <linux/fs.h>
98c2ecf20Sopenharmony_ci#include <linux/slab.h>
108c2ecf20Sopenharmony_ci#include <linux/highmem.h>
118c2ecf20Sopenharmony_ci#include <linux/pagemap.h>
128c2ecf20Sopenharmony_ci#include <asm/byteorder.h>
138c2ecf20Sopenharmony_ci#include <linux/swap.h>
148c2ecf20Sopenharmony_ci#include <linux/mpage.h>
158c2ecf20Sopenharmony_ci#include <linux/quotaops.h>
168c2ecf20Sopenharmony_ci#include <linux/blkdev.h>
178c2ecf20Sopenharmony_ci#include <linux/uio.h>
188c2ecf20Sopenharmony_ci#include <linux/mm.h>
198c2ecf20Sopenharmony_ci
208c2ecf20Sopenharmony_ci#include <cluster/masklog.h>
218c2ecf20Sopenharmony_ci
228c2ecf20Sopenharmony_ci#include "ocfs2.h"
238c2ecf20Sopenharmony_ci
248c2ecf20Sopenharmony_ci#include "alloc.h"
258c2ecf20Sopenharmony_ci#include "aops.h"
268c2ecf20Sopenharmony_ci#include "dlmglue.h"
278c2ecf20Sopenharmony_ci#include "extent_map.h"
288c2ecf20Sopenharmony_ci#include "file.h"
298c2ecf20Sopenharmony_ci#include "inode.h"
308c2ecf20Sopenharmony_ci#include "journal.h"
318c2ecf20Sopenharmony_ci#include "suballoc.h"
328c2ecf20Sopenharmony_ci#include "super.h"
338c2ecf20Sopenharmony_ci#include "symlink.h"
348c2ecf20Sopenharmony_ci#include "refcounttree.h"
358c2ecf20Sopenharmony_ci#include "ocfs2_trace.h"
368c2ecf20Sopenharmony_ci
378c2ecf20Sopenharmony_ci#include "buffer_head_io.h"
388c2ecf20Sopenharmony_ci#include "dir.h"
398c2ecf20Sopenharmony_ci#include "namei.h"
408c2ecf20Sopenharmony_ci#include "sysfile.h"
418c2ecf20Sopenharmony_ci
428c2ecf20Sopenharmony_cistatic int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
438c2ecf20Sopenharmony_ci				   struct buffer_head *bh_result, int create)
448c2ecf20Sopenharmony_ci{
458c2ecf20Sopenharmony_ci	int err = -EIO;
468c2ecf20Sopenharmony_ci	int status;
478c2ecf20Sopenharmony_ci	struct ocfs2_dinode *fe = NULL;
488c2ecf20Sopenharmony_ci	struct buffer_head *bh = NULL;
498c2ecf20Sopenharmony_ci	struct buffer_head *buffer_cache_bh = NULL;
508c2ecf20Sopenharmony_ci	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
518c2ecf20Sopenharmony_ci	void *kaddr;
528c2ecf20Sopenharmony_ci
538c2ecf20Sopenharmony_ci	trace_ocfs2_symlink_get_block(
548c2ecf20Sopenharmony_ci			(unsigned long long)OCFS2_I(inode)->ip_blkno,
558c2ecf20Sopenharmony_ci			(unsigned long long)iblock, bh_result, create);
568c2ecf20Sopenharmony_ci
578c2ecf20Sopenharmony_ci	BUG_ON(ocfs2_inode_is_fast_symlink(inode));
588c2ecf20Sopenharmony_ci
598c2ecf20Sopenharmony_ci	if ((iblock << inode->i_sb->s_blocksize_bits) > PATH_MAX + 1) {
608c2ecf20Sopenharmony_ci		mlog(ML_ERROR, "block offset > PATH_MAX: %llu",
618c2ecf20Sopenharmony_ci		     (unsigned long long)iblock);
628c2ecf20Sopenharmony_ci		goto bail;
638c2ecf20Sopenharmony_ci	}
648c2ecf20Sopenharmony_ci
658c2ecf20Sopenharmony_ci	status = ocfs2_read_inode_block(inode, &bh);
668c2ecf20Sopenharmony_ci	if (status < 0) {
678c2ecf20Sopenharmony_ci		mlog_errno(status);
688c2ecf20Sopenharmony_ci		goto bail;
698c2ecf20Sopenharmony_ci	}
708c2ecf20Sopenharmony_ci	fe = (struct ocfs2_dinode *) bh->b_data;
718c2ecf20Sopenharmony_ci
728c2ecf20Sopenharmony_ci	if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
738c2ecf20Sopenharmony_ci						    le32_to_cpu(fe->i_clusters))) {
748c2ecf20Sopenharmony_ci		err = -ENOMEM;
758c2ecf20Sopenharmony_ci		mlog(ML_ERROR, "block offset is outside the allocated size: "
768c2ecf20Sopenharmony_ci		     "%llu\n", (unsigned long long)iblock);
778c2ecf20Sopenharmony_ci		goto bail;
788c2ecf20Sopenharmony_ci	}
798c2ecf20Sopenharmony_ci
808c2ecf20Sopenharmony_ci	/* We don't use the page cache to create symlink data, so if
818c2ecf20Sopenharmony_ci	 * need be, copy it over from the buffer cache. */
828c2ecf20Sopenharmony_ci	if (!buffer_uptodate(bh_result) && ocfs2_inode_is_new(inode)) {
838c2ecf20Sopenharmony_ci		u64 blkno = le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) +
848c2ecf20Sopenharmony_ci			    iblock;
858c2ecf20Sopenharmony_ci		buffer_cache_bh = sb_getblk(osb->sb, blkno);
868c2ecf20Sopenharmony_ci		if (!buffer_cache_bh) {
878c2ecf20Sopenharmony_ci			err = -ENOMEM;
888c2ecf20Sopenharmony_ci			mlog(ML_ERROR, "couldn't getblock for symlink!\n");
898c2ecf20Sopenharmony_ci			goto bail;
908c2ecf20Sopenharmony_ci		}
918c2ecf20Sopenharmony_ci
928c2ecf20Sopenharmony_ci		/* we haven't locked out transactions, so a commit
938c2ecf20Sopenharmony_ci		 * could've happened. Since we've got a reference on
948c2ecf20Sopenharmony_ci		 * the bh, even if it commits while we're doing the
958c2ecf20Sopenharmony_ci		 * copy, the data is still good. */
968c2ecf20Sopenharmony_ci		if (buffer_jbd(buffer_cache_bh)
978c2ecf20Sopenharmony_ci		    && ocfs2_inode_is_new(inode)) {
988c2ecf20Sopenharmony_ci			kaddr = kmap_atomic(bh_result->b_page);
998c2ecf20Sopenharmony_ci			if (!kaddr) {
1008c2ecf20Sopenharmony_ci				mlog(ML_ERROR, "couldn't kmap!\n");
1018c2ecf20Sopenharmony_ci				goto bail;
1028c2ecf20Sopenharmony_ci			}
1038c2ecf20Sopenharmony_ci			memcpy(kaddr + (bh_result->b_size * iblock),
1048c2ecf20Sopenharmony_ci			       buffer_cache_bh->b_data,
1058c2ecf20Sopenharmony_ci			       bh_result->b_size);
1068c2ecf20Sopenharmony_ci			kunmap_atomic(kaddr);
1078c2ecf20Sopenharmony_ci			set_buffer_uptodate(bh_result);
1088c2ecf20Sopenharmony_ci		}
1098c2ecf20Sopenharmony_ci		brelse(buffer_cache_bh);
1108c2ecf20Sopenharmony_ci	}
1118c2ecf20Sopenharmony_ci
1128c2ecf20Sopenharmony_ci	map_bh(bh_result, inode->i_sb,
1138c2ecf20Sopenharmony_ci	       le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) + iblock);
1148c2ecf20Sopenharmony_ci
1158c2ecf20Sopenharmony_ci	err = 0;
1168c2ecf20Sopenharmony_ci
1178c2ecf20Sopenharmony_cibail:
1188c2ecf20Sopenharmony_ci	brelse(bh);
1198c2ecf20Sopenharmony_ci
1208c2ecf20Sopenharmony_ci	return err;
1218c2ecf20Sopenharmony_ci}
1228c2ecf20Sopenharmony_ci
1238c2ecf20Sopenharmony_cistatic int ocfs2_lock_get_block(struct inode *inode, sector_t iblock,
1248c2ecf20Sopenharmony_ci		    struct buffer_head *bh_result, int create)
1258c2ecf20Sopenharmony_ci{
1268c2ecf20Sopenharmony_ci	int ret = 0;
1278c2ecf20Sopenharmony_ci	struct ocfs2_inode_info *oi = OCFS2_I(inode);
1288c2ecf20Sopenharmony_ci
1298c2ecf20Sopenharmony_ci	down_read(&oi->ip_alloc_sem);
1308c2ecf20Sopenharmony_ci	ret = ocfs2_get_block(inode, iblock, bh_result, create);
1318c2ecf20Sopenharmony_ci	up_read(&oi->ip_alloc_sem);
1328c2ecf20Sopenharmony_ci
1338c2ecf20Sopenharmony_ci	return ret;
1348c2ecf20Sopenharmony_ci}
1358c2ecf20Sopenharmony_ci
1368c2ecf20Sopenharmony_ciint ocfs2_get_block(struct inode *inode, sector_t iblock,
1378c2ecf20Sopenharmony_ci		    struct buffer_head *bh_result, int create)
1388c2ecf20Sopenharmony_ci{
1398c2ecf20Sopenharmony_ci	int err = 0;
1408c2ecf20Sopenharmony_ci	unsigned int ext_flags;
1418c2ecf20Sopenharmony_ci	u64 max_blocks = bh_result->b_size >> inode->i_blkbits;
1428c2ecf20Sopenharmony_ci	u64 p_blkno, count, past_eof;
1438c2ecf20Sopenharmony_ci	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1448c2ecf20Sopenharmony_ci
1458c2ecf20Sopenharmony_ci	trace_ocfs2_get_block((unsigned long long)OCFS2_I(inode)->ip_blkno,
1468c2ecf20Sopenharmony_ci			      (unsigned long long)iblock, bh_result, create);
1478c2ecf20Sopenharmony_ci
1488c2ecf20Sopenharmony_ci	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE)
1498c2ecf20Sopenharmony_ci		mlog(ML_NOTICE, "get_block on system inode 0x%p (%lu)\n",
1508c2ecf20Sopenharmony_ci		     inode, inode->i_ino);
1518c2ecf20Sopenharmony_ci
1528c2ecf20Sopenharmony_ci	if (S_ISLNK(inode->i_mode)) {
1538c2ecf20Sopenharmony_ci		/* this always does I/O for some reason. */
1548c2ecf20Sopenharmony_ci		err = ocfs2_symlink_get_block(inode, iblock, bh_result, create);
1558c2ecf20Sopenharmony_ci		goto bail;
1568c2ecf20Sopenharmony_ci	}
1578c2ecf20Sopenharmony_ci
1588c2ecf20Sopenharmony_ci	err = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, &count,
1598c2ecf20Sopenharmony_ci					  &ext_flags);
1608c2ecf20Sopenharmony_ci	if (err) {
1618c2ecf20Sopenharmony_ci		mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, "
1628c2ecf20Sopenharmony_ci		     "%llu, NULL)\n", err, inode, (unsigned long long)iblock,
1638c2ecf20Sopenharmony_ci		     (unsigned long long)p_blkno);
1648c2ecf20Sopenharmony_ci		goto bail;
1658c2ecf20Sopenharmony_ci	}
1668c2ecf20Sopenharmony_ci
1678c2ecf20Sopenharmony_ci	if (max_blocks < count)
1688c2ecf20Sopenharmony_ci		count = max_blocks;
1698c2ecf20Sopenharmony_ci
1708c2ecf20Sopenharmony_ci	/*
1718c2ecf20Sopenharmony_ci	 * ocfs2 never allocates in this function - the only time we
1728c2ecf20Sopenharmony_ci	 * need to use BH_New is when we're extending i_size on a file
1738c2ecf20Sopenharmony_ci	 * system which doesn't support holes, in which case BH_New
1748c2ecf20Sopenharmony_ci	 * allows __block_write_begin() to zero.
1758c2ecf20Sopenharmony_ci	 *
1768c2ecf20Sopenharmony_ci	 * If we see this on a sparse file system, then a truncate has
1778c2ecf20Sopenharmony_ci	 * raced us and removed the cluster. In this case, we clear
1788c2ecf20Sopenharmony_ci	 * the buffers dirty and uptodate bits and let the buffer code
1798c2ecf20Sopenharmony_ci	 * ignore it as a hole.
1808c2ecf20Sopenharmony_ci	 */
1818c2ecf20Sopenharmony_ci	if (create && p_blkno == 0 && ocfs2_sparse_alloc(osb)) {
1828c2ecf20Sopenharmony_ci		clear_buffer_dirty(bh_result);
1838c2ecf20Sopenharmony_ci		clear_buffer_uptodate(bh_result);
1848c2ecf20Sopenharmony_ci		goto bail;
1858c2ecf20Sopenharmony_ci	}
1868c2ecf20Sopenharmony_ci
1878c2ecf20Sopenharmony_ci	/* Treat the unwritten extent as a hole for zeroing purposes. */
1888c2ecf20Sopenharmony_ci	if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
1898c2ecf20Sopenharmony_ci		map_bh(bh_result, inode->i_sb, p_blkno);
1908c2ecf20Sopenharmony_ci
1918c2ecf20Sopenharmony_ci	bh_result->b_size = count << inode->i_blkbits;
1928c2ecf20Sopenharmony_ci
1938c2ecf20Sopenharmony_ci	if (!ocfs2_sparse_alloc(osb)) {
1948c2ecf20Sopenharmony_ci		if (p_blkno == 0) {
1958c2ecf20Sopenharmony_ci			err = -EIO;
1968c2ecf20Sopenharmony_ci			mlog(ML_ERROR,
1978c2ecf20Sopenharmony_ci			     "iblock = %llu p_blkno = %llu blkno=(%llu)\n",
1988c2ecf20Sopenharmony_ci			     (unsigned long long)iblock,
1998c2ecf20Sopenharmony_ci			     (unsigned long long)p_blkno,
2008c2ecf20Sopenharmony_ci			     (unsigned long long)OCFS2_I(inode)->ip_blkno);
2018c2ecf20Sopenharmony_ci			mlog(ML_ERROR, "Size %llu, clusters %u\n", (unsigned long long)i_size_read(inode), OCFS2_I(inode)->ip_clusters);
2028c2ecf20Sopenharmony_ci			dump_stack();
2038c2ecf20Sopenharmony_ci			goto bail;
2048c2ecf20Sopenharmony_ci		}
2058c2ecf20Sopenharmony_ci	}
2068c2ecf20Sopenharmony_ci
2078c2ecf20Sopenharmony_ci	past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
2088c2ecf20Sopenharmony_ci
2098c2ecf20Sopenharmony_ci	trace_ocfs2_get_block_end((unsigned long long)OCFS2_I(inode)->ip_blkno,
2108c2ecf20Sopenharmony_ci				  (unsigned long long)past_eof);
2118c2ecf20Sopenharmony_ci	if (create && (iblock >= past_eof))
2128c2ecf20Sopenharmony_ci		set_buffer_new(bh_result);
2138c2ecf20Sopenharmony_ci
2148c2ecf20Sopenharmony_cibail:
2158c2ecf20Sopenharmony_ci	if (err < 0)
2168c2ecf20Sopenharmony_ci		err = -EIO;
2178c2ecf20Sopenharmony_ci
2188c2ecf20Sopenharmony_ci	return err;
2198c2ecf20Sopenharmony_ci}
2208c2ecf20Sopenharmony_ci
2218c2ecf20Sopenharmony_ciint ocfs2_read_inline_data(struct inode *inode, struct page *page,
2228c2ecf20Sopenharmony_ci			   struct buffer_head *di_bh)
2238c2ecf20Sopenharmony_ci{
2248c2ecf20Sopenharmony_ci	void *kaddr;
2258c2ecf20Sopenharmony_ci	loff_t size;
2268c2ecf20Sopenharmony_ci	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
2278c2ecf20Sopenharmony_ci
2288c2ecf20Sopenharmony_ci	if (!(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL)) {
2298c2ecf20Sopenharmony_ci		ocfs2_error(inode->i_sb, "Inode %llu lost inline data flag\n",
2308c2ecf20Sopenharmony_ci			    (unsigned long long)OCFS2_I(inode)->ip_blkno);
2318c2ecf20Sopenharmony_ci		return -EROFS;
2328c2ecf20Sopenharmony_ci	}
2338c2ecf20Sopenharmony_ci
2348c2ecf20Sopenharmony_ci	size = i_size_read(inode);
2358c2ecf20Sopenharmony_ci
2368c2ecf20Sopenharmony_ci	if (size > PAGE_SIZE ||
2378c2ecf20Sopenharmony_ci	    size > ocfs2_max_inline_data_with_xattr(inode->i_sb, di)) {
2388c2ecf20Sopenharmony_ci		ocfs2_error(inode->i_sb,
2398c2ecf20Sopenharmony_ci			    "Inode %llu has with inline data has bad size: %Lu\n",
2408c2ecf20Sopenharmony_ci			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
2418c2ecf20Sopenharmony_ci			    (unsigned long long)size);
2428c2ecf20Sopenharmony_ci		return -EROFS;
2438c2ecf20Sopenharmony_ci	}
2448c2ecf20Sopenharmony_ci
2458c2ecf20Sopenharmony_ci	kaddr = kmap_atomic(page);
2468c2ecf20Sopenharmony_ci	if (size)
2478c2ecf20Sopenharmony_ci		memcpy(kaddr, di->id2.i_data.id_data, size);
2488c2ecf20Sopenharmony_ci	/* Clear the remaining part of the page */
2498c2ecf20Sopenharmony_ci	memset(kaddr + size, 0, PAGE_SIZE - size);
2508c2ecf20Sopenharmony_ci	flush_dcache_page(page);
2518c2ecf20Sopenharmony_ci	kunmap_atomic(kaddr);
2528c2ecf20Sopenharmony_ci
2538c2ecf20Sopenharmony_ci	SetPageUptodate(page);
2548c2ecf20Sopenharmony_ci
2558c2ecf20Sopenharmony_ci	return 0;
2568c2ecf20Sopenharmony_ci}
2578c2ecf20Sopenharmony_ci
2588c2ecf20Sopenharmony_cistatic int ocfs2_readpage_inline(struct inode *inode, struct page *page)
2598c2ecf20Sopenharmony_ci{
2608c2ecf20Sopenharmony_ci	int ret;
2618c2ecf20Sopenharmony_ci	struct buffer_head *di_bh = NULL;
2628c2ecf20Sopenharmony_ci
2638c2ecf20Sopenharmony_ci	BUG_ON(!PageLocked(page));
2648c2ecf20Sopenharmony_ci	BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL));
2658c2ecf20Sopenharmony_ci
2668c2ecf20Sopenharmony_ci	ret = ocfs2_read_inode_block(inode, &di_bh);
2678c2ecf20Sopenharmony_ci	if (ret) {
2688c2ecf20Sopenharmony_ci		mlog_errno(ret);
2698c2ecf20Sopenharmony_ci		goto out;
2708c2ecf20Sopenharmony_ci	}
2718c2ecf20Sopenharmony_ci
2728c2ecf20Sopenharmony_ci	ret = ocfs2_read_inline_data(inode, page, di_bh);
2738c2ecf20Sopenharmony_ciout:
2748c2ecf20Sopenharmony_ci	unlock_page(page);
2758c2ecf20Sopenharmony_ci
2768c2ecf20Sopenharmony_ci	brelse(di_bh);
2778c2ecf20Sopenharmony_ci	return ret;
2788c2ecf20Sopenharmony_ci}
2798c2ecf20Sopenharmony_ci
2808c2ecf20Sopenharmony_cistatic int ocfs2_readpage(struct file *file, struct page *page)
2818c2ecf20Sopenharmony_ci{
2828c2ecf20Sopenharmony_ci	struct inode *inode = page->mapping->host;
2838c2ecf20Sopenharmony_ci	struct ocfs2_inode_info *oi = OCFS2_I(inode);
2848c2ecf20Sopenharmony_ci	loff_t start = (loff_t)page->index << PAGE_SHIFT;
2858c2ecf20Sopenharmony_ci	int ret, unlock = 1;
2868c2ecf20Sopenharmony_ci
2878c2ecf20Sopenharmony_ci	trace_ocfs2_readpage((unsigned long long)oi->ip_blkno,
2888c2ecf20Sopenharmony_ci			     (page ? page->index : 0));
2898c2ecf20Sopenharmony_ci
2908c2ecf20Sopenharmony_ci	ret = ocfs2_inode_lock_with_page(inode, NULL, 0, page);
2918c2ecf20Sopenharmony_ci	if (ret != 0) {
2928c2ecf20Sopenharmony_ci		if (ret == AOP_TRUNCATED_PAGE)
2938c2ecf20Sopenharmony_ci			unlock = 0;
2948c2ecf20Sopenharmony_ci		mlog_errno(ret);
2958c2ecf20Sopenharmony_ci		goto out;
2968c2ecf20Sopenharmony_ci	}
2978c2ecf20Sopenharmony_ci
2988c2ecf20Sopenharmony_ci	if (down_read_trylock(&oi->ip_alloc_sem) == 0) {
2998c2ecf20Sopenharmony_ci		/*
3008c2ecf20Sopenharmony_ci		 * Unlock the page and cycle ip_alloc_sem so that we don't
3018c2ecf20Sopenharmony_ci		 * busyloop waiting for ip_alloc_sem to unlock
3028c2ecf20Sopenharmony_ci		 */
3038c2ecf20Sopenharmony_ci		ret = AOP_TRUNCATED_PAGE;
3048c2ecf20Sopenharmony_ci		unlock_page(page);
3058c2ecf20Sopenharmony_ci		unlock = 0;
3068c2ecf20Sopenharmony_ci		down_read(&oi->ip_alloc_sem);
3078c2ecf20Sopenharmony_ci		up_read(&oi->ip_alloc_sem);
3088c2ecf20Sopenharmony_ci		goto out_inode_unlock;
3098c2ecf20Sopenharmony_ci	}
3108c2ecf20Sopenharmony_ci
3118c2ecf20Sopenharmony_ci	/*
3128c2ecf20Sopenharmony_ci	 * i_size might have just been updated as we grabed the meta lock.  We
3138c2ecf20Sopenharmony_ci	 * might now be discovering a truncate that hit on another node.
3148c2ecf20Sopenharmony_ci	 * block_read_full_page->get_block freaks out if it is asked to read
3158c2ecf20Sopenharmony_ci	 * beyond the end of a file, so we check here.  Callers
3168c2ecf20Sopenharmony_ci	 * (generic_file_read, vm_ops->fault) are clever enough to check i_size
3178c2ecf20Sopenharmony_ci	 * and notice that the page they just read isn't needed.
3188c2ecf20Sopenharmony_ci	 *
3198c2ecf20Sopenharmony_ci	 * XXX sys_readahead() seems to get that wrong?
3208c2ecf20Sopenharmony_ci	 */
3218c2ecf20Sopenharmony_ci	if (start >= i_size_read(inode)) {
3228c2ecf20Sopenharmony_ci		zero_user(page, 0, PAGE_SIZE);
3238c2ecf20Sopenharmony_ci		SetPageUptodate(page);
3248c2ecf20Sopenharmony_ci		ret = 0;
3258c2ecf20Sopenharmony_ci		goto out_alloc;
3268c2ecf20Sopenharmony_ci	}
3278c2ecf20Sopenharmony_ci
3288c2ecf20Sopenharmony_ci	if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
3298c2ecf20Sopenharmony_ci		ret = ocfs2_readpage_inline(inode, page);
3308c2ecf20Sopenharmony_ci	else
3318c2ecf20Sopenharmony_ci		ret = block_read_full_page(page, ocfs2_get_block);
3328c2ecf20Sopenharmony_ci	unlock = 0;
3338c2ecf20Sopenharmony_ci
3348c2ecf20Sopenharmony_ciout_alloc:
3358c2ecf20Sopenharmony_ci	up_read(&oi->ip_alloc_sem);
3368c2ecf20Sopenharmony_ciout_inode_unlock:
3378c2ecf20Sopenharmony_ci	ocfs2_inode_unlock(inode, 0);
3388c2ecf20Sopenharmony_ciout:
3398c2ecf20Sopenharmony_ci	if (unlock)
3408c2ecf20Sopenharmony_ci		unlock_page(page);
3418c2ecf20Sopenharmony_ci	return ret;
3428c2ecf20Sopenharmony_ci}
3438c2ecf20Sopenharmony_ci
3448c2ecf20Sopenharmony_ci/*
3458c2ecf20Sopenharmony_ci * This is used only for read-ahead. Failures or difficult to handle
3468c2ecf20Sopenharmony_ci * situations are safe to ignore.
3478c2ecf20Sopenharmony_ci *
3488c2ecf20Sopenharmony_ci * Right now, we don't bother with BH_Boundary - in-inode extent lists
3498c2ecf20Sopenharmony_ci * are quite large (243 extents on 4k blocks), so most inodes don't
3508c2ecf20Sopenharmony_ci * grow out to a tree. If need be, detecting boundary extents could
3518c2ecf20Sopenharmony_ci * trivially be added in a future version of ocfs2_get_block().
3528c2ecf20Sopenharmony_ci */
3538c2ecf20Sopenharmony_cistatic void ocfs2_readahead(struct readahead_control *rac)
3548c2ecf20Sopenharmony_ci{
3558c2ecf20Sopenharmony_ci	int ret;
3568c2ecf20Sopenharmony_ci	struct inode *inode = rac->mapping->host;
3578c2ecf20Sopenharmony_ci	struct ocfs2_inode_info *oi = OCFS2_I(inode);
3588c2ecf20Sopenharmony_ci
3598c2ecf20Sopenharmony_ci	/*
3608c2ecf20Sopenharmony_ci	 * Use the nonblocking flag for the dlm code to avoid page
3618c2ecf20Sopenharmony_ci	 * lock inversion, but don't bother with retrying.
3628c2ecf20Sopenharmony_ci	 */
3638c2ecf20Sopenharmony_ci	ret = ocfs2_inode_lock_full(inode, NULL, 0, OCFS2_LOCK_NONBLOCK);
3648c2ecf20Sopenharmony_ci	if (ret)
3658c2ecf20Sopenharmony_ci		return;
3668c2ecf20Sopenharmony_ci
3678c2ecf20Sopenharmony_ci	if (down_read_trylock(&oi->ip_alloc_sem) == 0)
3688c2ecf20Sopenharmony_ci		goto out_unlock;
3698c2ecf20Sopenharmony_ci
3708c2ecf20Sopenharmony_ci	/*
3718c2ecf20Sopenharmony_ci	 * Don't bother with inline-data. There isn't anything
3728c2ecf20Sopenharmony_ci	 * to read-ahead in that case anyway...
3738c2ecf20Sopenharmony_ci	 */
3748c2ecf20Sopenharmony_ci	if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
3758c2ecf20Sopenharmony_ci		goto out_up;
3768c2ecf20Sopenharmony_ci
3778c2ecf20Sopenharmony_ci	/*
3788c2ecf20Sopenharmony_ci	 * Check whether a remote node truncated this file - we just
3798c2ecf20Sopenharmony_ci	 * drop out in that case as it's not worth handling here.
3808c2ecf20Sopenharmony_ci	 */
3818c2ecf20Sopenharmony_ci	if (readahead_pos(rac) >= i_size_read(inode))
3828c2ecf20Sopenharmony_ci		goto out_up;
3838c2ecf20Sopenharmony_ci
3848c2ecf20Sopenharmony_ci	mpage_readahead(rac, ocfs2_get_block);
3858c2ecf20Sopenharmony_ci
3868c2ecf20Sopenharmony_ciout_up:
3878c2ecf20Sopenharmony_ci	up_read(&oi->ip_alloc_sem);
3888c2ecf20Sopenharmony_ciout_unlock:
3898c2ecf20Sopenharmony_ci	ocfs2_inode_unlock(inode, 0);
3908c2ecf20Sopenharmony_ci}
3918c2ecf20Sopenharmony_ci
3928c2ecf20Sopenharmony_ci/* Note: Because we don't support holes, our allocation has
3938c2ecf20Sopenharmony_ci * already happened (allocation writes zeros to the file data)
3948c2ecf20Sopenharmony_ci * so we don't have to worry about ordered writes in
3958c2ecf20Sopenharmony_ci * ocfs2_writepage.
3968c2ecf20Sopenharmony_ci *
3978c2ecf20Sopenharmony_ci * ->writepage is called during the process of invalidating the page cache
3988c2ecf20Sopenharmony_ci * during blocked lock processing.  It can't block on any cluster locks
3998c2ecf20Sopenharmony_ci * to during block mapping.  It's relying on the fact that the block
4008c2ecf20Sopenharmony_ci * mapping can't have disappeared under the dirty pages that it is
4018c2ecf20Sopenharmony_ci * being asked to write back.
4028c2ecf20Sopenharmony_ci */
4038c2ecf20Sopenharmony_cistatic int ocfs2_writepage(struct page *page, struct writeback_control *wbc)
4048c2ecf20Sopenharmony_ci{
4058c2ecf20Sopenharmony_ci	trace_ocfs2_writepage(
4068c2ecf20Sopenharmony_ci		(unsigned long long)OCFS2_I(page->mapping->host)->ip_blkno,
4078c2ecf20Sopenharmony_ci		page->index);
4088c2ecf20Sopenharmony_ci
4098c2ecf20Sopenharmony_ci	return block_write_full_page(page, ocfs2_get_block, wbc);
4108c2ecf20Sopenharmony_ci}
4118c2ecf20Sopenharmony_ci
4128c2ecf20Sopenharmony_ci/* Taken from ext3. We don't necessarily need the full blown
4138c2ecf20Sopenharmony_ci * functionality yet, but IMHO it's better to cut and paste the whole
4148c2ecf20Sopenharmony_ci * thing so we can avoid introducing our own bugs (and easily pick up
4158c2ecf20Sopenharmony_ci * their fixes when they happen) --Mark */
4168c2ecf20Sopenharmony_ciint walk_page_buffers(	handle_t *handle,
4178c2ecf20Sopenharmony_ci			struct buffer_head *head,
4188c2ecf20Sopenharmony_ci			unsigned from,
4198c2ecf20Sopenharmony_ci			unsigned to,
4208c2ecf20Sopenharmony_ci			int *partial,
4218c2ecf20Sopenharmony_ci			int (*fn)(	handle_t *handle,
4228c2ecf20Sopenharmony_ci					struct buffer_head *bh))
4238c2ecf20Sopenharmony_ci{
4248c2ecf20Sopenharmony_ci	struct buffer_head *bh;
4258c2ecf20Sopenharmony_ci	unsigned block_start, block_end;
4268c2ecf20Sopenharmony_ci	unsigned blocksize = head->b_size;
4278c2ecf20Sopenharmony_ci	int err, ret = 0;
4288c2ecf20Sopenharmony_ci	struct buffer_head *next;
4298c2ecf20Sopenharmony_ci
4308c2ecf20Sopenharmony_ci	for (	bh = head, block_start = 0;
4318c2ecf20Sopenharmony_ci		ret == 0 && (bh != head || !block_start);
4328c2ecf20Sopenharmony_ci	    	block_start = block_end, bh = next)
4338c2ecf20Sopenharmony_ci	{
4348c2ecf20Sopenharmony_ci		next = bh->b_this_page;
4358c2ecf20Sopenharmony_ci		block_end = block_start + blocksize;
4368c2ecf20Sopenharmony_ci		if (block_end <= from || block_start >= to) {
4378c2ecf20Sopenharmony_ci			if (partial && !buffer_uptodate(bh))
4388c2ecf20Sopenharmony_ci				*partial = 1;
4398c2ecf20Sopenharmony_ci			continue;
4408c2ecf20Sopenharmony_ci		}
4418c2ecf20Sopenharmony_ci		err = (*fn)(handle, bh);
4428c2ecf20Sopenharmony_ci		if (!ret)
4438c2ecf20Sopenharmony_ci			ret = err;
4448c2ecf20Sopenharmony_ci	}
4458c2ecf20Sopenharmony_ci	return ret;
4468c2ecf20Sopenharmony_ci}
4478c2ecf20Sopenharmony_ci
4488c2ecf20Sopenharmony_cistatic sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
4498c2ecf20Sopenharmony_ci{
4508c2ecf20Sopenharmony_ci	sector_t status;
4518c2ecf20Sopenharmony_ci	u64 p_blkno = 0;
4528c2ecf20Sopenharmony_ci	int err = 0;
4538c2ecf20Sopenharmony_ci	struct inode *inode = mapping->host;
4548c2ecf20Sopenharmony_ci
4558c2ecf20Sopenharmony_ci	trace_ocfs2_bmap((unsigned long long)OCFS2_I(inode)->ip_blkno,
4568c2ecf20Sopenharmony_ci			 (unsigned long long)block);
4578c2ecf20Sopenharmony_ci
4588c2ecf20Sopenharmony_ci	/*
4598c2ecf20Sopenharmony_ci	 * The swap code (ab-)uses ->bmap to get a block mapping and then
4608c2ecf20Sopenharmony_ci	 * bypasseѕ the file system for actual I/O.  We really can't allow
4618c2ecf20Sopenharmony_ci	 * that on refcounted inodes, so we have to skip out here.  And yes,
4628c2ecf20Sopenharmony_ci	 * 0 is the magic code for a bmap error..
4638c2ecf20Sopenharmony_ci	 */
4648c2ecf20Sopenharmony_ci	if (ocfs2_is_refcount_inode(inode))
4658c2ecf20Sopenharmony_ci		return 0;
4668c2ecf20Sopenharmony_ci
4678c2ecf20Sopenharmony_ci	/* We don't need to lock journal system files, since they aren't
4688c2ecf20Sopenharmony_ci	 * accessed concurrently from multiple nodes.
4698c2ecf20Sopenharmony_ci	 */
4708c2ecf20Sopenharmony_ci	if (!INODE_JOURNAL(inode)) {
4718c2ecf20Sopenharmony_ci		err = ocfs2_inode_lock(inode, NULL, 0);
4728c2ecf20Sopenharmony_ci		if (err) {
4738c2ecf20Sopenharmony_ci			if (err != -ENOENT)
4748c2ecf20Sopenharmony_ci				mlog_errno(err);
4758c2ecf20Sopenharmony_ci			goto bail;
4768c2ecf20Sopenharmony_ci		}
4778c2ecf20Sopenharmony_ci		down_read(&OCFS2_I(inode)->ip_alloc_sem);
4788c2ecf20Sopenharmony_ci	}
4798c2ecf20Sopenharmony_ci
4808c2ecf20Sopenharmony_ci	if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL))
4818c2ecf20Sopenharmony_ci		err = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL,
4828c2ecf20Sopenharmony_ci						  NULL);
4838c2ecf20Sopenharmony_ci
4848c2ecf20Sopenharmony_ci	if (!INODE_JOURNAL(inode)) {
4858c2ecf20Sopenharmony_ci		up_read(&OCFS2_I(inode)->ip_alloc_sem);
4868c2ecf20Sopenharmony_ci		ocfs2_inode_unlock(inode, 0);
4878c2ecf20Sopenharmony_ci	}
4888c2ecf20Sopenharmony_ci
4898c2ecf20Sopenharmony_ci	if (err) {
4908c2ecf20Sopenharmony_ci		mlog(ML_ERROR, "get_blocks() failed, block = %llu\n",
4918c2ecf20Sopenharmony_ci		     (unsigned long long)block);
4928c2ecf20Sopenharmony_ci		mlog_errno(err);
4938c2ecf20Sopenharmony_ci		goto bail;
4948c2ecf20Sopenharmony_ci	}
4958c2ecf20Sopenharmony_ci
4968c2ecf20Sopenharmony_cibail:
4978c2ecf20Sopenharmony_ci	status = err ? 0 : p_blkno;
4988c2ecf20Sopenharmony_ci
4998c2ecf20Sopenharmony_ci	return status;
5008c2ecf20Sopenharmony_ci}
5018c2ecf20Sopenharmony_ci
5028c2ecf20Sopenharmony_cistatic int ocfs2_releasepage(struct page *page, gfp_t wait)
5038c2ecf20Sopenharmony_ci{
5048c2ecf20Sopenharmony_ci	if (!page_has_buffers(page))
5058c2ecf20Sopenharmony_ci		return 0;
5068c2ecf20Sopenharmony_ci	return try_to_free_buffers(page);
5078c2ecf20Sopenharmony_ci}
5088c2ecf20Sopenharmony_ci
5098c2ecf20Sopenharmony_cistatic void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb,
5108c2ecf20Sopenharmony_ci					    u32 cpos,
5118c2ecf20Sopenharmony_ci					    unsigned int *start,
5128c2ecf20Sopenharmony_ci					    unsigned int *end)
5138c2ecf20Sopenharmony_ci{
5148c2ecf20Sopenharmony_ci	unsigned int cluster_start = 0, cluster_end = PAGE_SIZE;
5158c2ecf20Sopenharmony_ci
5168c2ecf20Sopenharmony_ci	if (unlikely(PAGE_SHIFT > osb->s_clustersize_bits)) {
5178c2ecf20Sopenharmony_ci		unsigned int cpp;
5188c2ecf20Sopenharmony_ci
5198c2ecf20Sopenharmony_ci		cpp = 1 << (PAGE_SHIFT - osb->s_clustersize_bits);
5208c2ecf20Sopenharmony_ci
5218c2ecf20Sopenharmony_ci		cluster_start = cpos % cpp;
5228c2ecf20Sopenharmony_ci		cluster_start = cluster_start << osb->s_clustersize_bits;
5238c2ecf20Sopenharmony_ci
5248c2ecf20Sopenharmony_ci		cluster_end = cluster_start + osb->s_clustersize;
5258c2ecf20Sopenharmony_ci	}
5268c2ecf20Sopenharmony_ci
5278c2ecf20Sopenharmony_ci	BUG_ON(cluster_start > PAGE_SIZE);
5288c2ecf20Sopenharmony_ci	BUG_ON(cluster_end > PAGE_SIZE);
5298c2ecf20Sopenharmony_ci
5308c2ecf20Sopenharmony_ci	if (start)
5318c2ecf20Sopenharmony_ci		*start = cluster_start;
5328c2ecf20Sopenharmony_ci	if (end)
5338c2ecf20Sopenharmony_ci		*end = cluster_end;
5348c2ecf20Sopenharmony_ci}
5358c2ecf20Sopenharmony_ci
5368c2ecf20Sopenharmony_ci/*
5378c2ecf20Sopenharmony_ci * 'from' and 'to' are the region in the page to avoid zeroing.
5388c2ecf20Sopenharmony_ci *
5398c2ecf20Sopenharmony_ci * If pagesize > clustersize, this function will avoid zeroing outside
5408c2ecf20Sopenharmony_ci * of the cluster boundary.
5418c2ecf20Sopenharmony_ci *
5428c2ecf20Sopenharmony_ci * from == to == 0 is code for "zero the entire cluster region"
5438c2ecf20Sopenharmony_ci */
5448c2ecf20Sopenharmony_cistatic void ocfs2_clear_page_regions(struct page *page,
5458c2ecf20Sopenharmony_ci				     struct ocfs2_super *osb, u32 cpos,
5468c2ecf20Sopenharmony_ci				     unsigned from, unsigned to)
5478c2ecf20Sopenharmony_ci{
5488c2ecf20Sopenharmony_ci	void *kaddr;
5498c2ecf20Sopenharmony_ci	unsigned int cluster_start, cluster_end;
5508c2ecf20Sopenharmony_ci
5518c2ecf20Sopenharmony_ci	ocfs2_figure_cluster_boundaries(osb, cpos, &cluster_start, &cluster_end);
5528c2ecf20Sopenharmony_ci
5538c2ecf20Sopenharmony_ci	kaddr = kmap_atomic(page);
5548c2ecf20Sopenharmony_ci
5558c2ecf20Sopenharmony_ci	if (from || to) {
5568c2ecf20Sopenharmony_ci		if (from > cluster_start)
5578c2ecf20Sopenharmony_ci			memset(kaddr + cluster_start, 0, from - cluster_start);
5588c2ecf20Sopenharmony_ci		if (to < cluster_end)
5598c2ecf20Sopenharmony_ci			memset(kaddr + to, 0, cluster_end - to);
5608c2ecf20Sopenharmony_ci	} else {
5618c2ecf20Sopenharmony_ci		memset(kaddr + cluster_start, 0, cluster_end - cluster_start);
5628c2ecf20Sopenharmony_ci	}
5638c2ecf20Sopenharmony_ci
5648c2ecf20Sopenharmony_ci	kunmap_atomic(kaddr);
5658c2ecf20Sopenharmony_ci}
5668c2ecf20Sopenharmony_ci
5678c2ecf20Sopenharmony_ci/*
5688c2ecf20Sopenharmony_ci * Nonsparse file systems fully allocate before we get to the write
5698c2ecf20Sopenharmony_ci * code. This prevents ocfs2_write() from tagging the write as an
5708c2ecf20Sopenharmony_ci * allocating one, which means ocfs2_map_page_blocks() might try to
5718c2ecf20Sopenharmony_ci * read-in the blocks at the tail of our file. Avoid reading them by
5728c2ecf20Sopenharmony_ci * testing i_size against each block offset.
5738c2ecf20Sopenharmony_ci */
5748c2ecf20Sopenharmony_cistatic int ocfs2_should_read_blk(struct inode *inode, struct page *page,
5758c2ecf20Sopenharmony_ci				 unsigned int block_start)
5768c2ecf20Sopenharmony_ci{
5778c2ecf20Sopenharmony_ci	u64 offset = page_offset(page) + block_start;
5788c2ecf20Sopenharmony_ci
5798c2ecf20Sopenharmony_ci	if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
5808c2ecf20Sopenharmony_ci		return 1;
5818c2ecf20Sopenharmony_ci
5828c2ecf20Sopenharmony_ci	if (i_size_read(inode) > offset)
5838c2ecf20Sopenharmony_ci		return 1;
5848c2ecf20Sopenharmony_ci
5858c2ecf20Sopenharmony_ci	return 0;
5868c2ecf20Sopenharmony_ci}
5878c2ecf20Sopenharmony_ci
/*
 * Some of this taken from __block_write_begin(). We already have our
 * mapping by now though, and the entire write will be allocating or
 * it won't, so not much need to use BH_New.
 *
 * This will also skip zeroing, which is handled externally.
 */
5958c2ecf20Sopenharmony_ciint ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
5968c2ecf20Sopenharmony_ci			  struct inode *inode, unsigned int from,
5978c2ecf20Sopenharmony_ci			  unsigned int to, int new)
5988c2ecf20Sopenharmony_ci{
5998c2ecf20Sopenharmony_ci	int ret = 0;
6008c2ecf20Sopenharmony_ci	struct buffer_head *head, *bh, *wait[2], **wait_bh = wait;
6018c2ecf20Sopenharmony_ci	unsigned int block_end, block_start;
6028c2ecf20Sopenharmony_ci	unsigned int bsize = i_blocksize(inode);
6038c2ecf20Sopenharmony_ci
6048c2ecf20Sopenharmony_ci	if (!page_has_buffers(page))
6058c2ecf20Sopenharmony_ci		create_empty_buffers(page, bsize, 0);
6068c2ecf20Sopenharmony_ci
6078c2ecf20Sopenharmony_ci	head = page_buffers(page);
6088c2ecf20Sopenharmony_ci	for (bh = head, block_start = 0; bh != head || !block_start;
6098c2ecf20Sopenharmony_ci	     bh = bh->b_this_page, block_start += bsize) {
6108c2ecf20Sopenharmony_ci		block_end = block_start + bsize;
6118c2ecf20Sopenharmony_ci
6128c2ecf20Sopenharmony_ci		clear_buffer_new(bh);
6138c2ecf20Sopenharmony_ci
6148c2ecf20Sopenharmony_ci		/*
6158c2ecf20Sopenharmony_ci		 * Ignore blocks outside of our i/o range -
6168c2ecf20Sopenharmony_ci		 * they may belong to unallocated clusters.
6178c2ecf20Sopenharmony_ci		 */
6188c2ecf20Sopenharmony_ci		if (block_start >= to || block_end <= from) {
6198c2ecf20Sopenharmony_ci			if (PageUptodate(page))
6208c2ecf20Sopenharmony_ci				set_buffer_uptodate(bh);
6218c2ecf20Sopenharmony_ci			continue;
6228c2ecf20Sopenharmony_ci		}
6238c2ecf20Sopenharmony_ci
6248c2ecf20Sopenharmony_ci		/*
6258c2ecf20Sopenharmony_ci		 * For an allocating write with cluster size >= page
6268c2ecf20Sopenharmony_ci		 * size, we always write the entire page.
6278c2ecf20Sopenharmony_ci		 */
6288c2ecf20Sopenharmony_ci		if (new)
6298c2ecf20Sopenharmony_ci			set_buffer_new(bh);
6308c2ecf20Sopenharmony_ci
6318c2ecf20Sopenharmony_ci		if (!buffer_mapped(bh)) {
6328c2ecf20Sopenharmony_ci			map_bh(bh, inode->i_sb, *p_blkno);
6338c2ecf20Sopenharmony_ci			clean_bdev_bh_alias(bh);
6348c2ecf20Sopenharmony_ci		}
6358c2ecf20Sopenharmony_ci
6368c2ecf20Sopenharmony_ci		if (PageUptodate(page)) {
6378c2ecf20Sopenharmony_ci			if (!buffer_uptodate(bh))
6388c2ecf20Sopenharmony_ci				set_buffer_uptodate(bh);
6398c2ecf20Sopenharmony_ci		} else if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
6408c2ecf20Sopenharmony_ci			   !buffer_new(bh) &&
6418c2ecf20Sopenharmony_ci			   ocfs2_should_read_blk(inode, page, block_start) &&
6428c2ecf20Sopenharmony_ci			   (block_start < from || block_end > to)) {
6438c2ecf20Sopenharmony_ci			ll_rw_block(REQ_OP_READ, 0, 1, &bh);
6448c2ecf20Sopenharmony_ci			*wait_bh++=bh;
6458c2ecf20Sopenharmony_ci		}
6468c2ecf20Sopenharmony_ci
6478c2ecf20Sopenharmony_ci		*p_blkno = *p_blkno + 1;
6488c2ecf20Sopenharmony_ci	}
6498c2ecf20Sopenharmony_ci
6508c2ecf20Sopenharmony_ci	/*
6518c2ecf20Sopenharmony_ci	 * If we issued read requests - let them complete.
6528c2ecf20Sopenharmony_ci	 */
6538c2ecf20Sopenharmony_ci	while(wait_bh > wait) {
6548c2ecf20Sopenharmony_ci		wait_on_buffer(*--wait_bh);
6558c2ecf20Sopenharmony_ci		if (!buffer_uptodate(*wait_bh))
6568c2ecf20Sopenharmony_ci			ret = -EIO;
6578c2ecf20Sopenharmony_ci	}
6588c2ecf20Sopenharmony_ci
6598c2ecf20Sopenharmony_ci	if (ret == 0 || !new)
6608c2ecf20Sopenharmony_ci		return ret;
6618c2ecf20Sopenharmony_ci
6628c2ecf20Sopenharmony_ci	/*
6638c2ecf20Sopenharmony_ci	 * If we get -EIO above, zero out any newly allocated blocks
6648c2ecf20Sopenharmony_ci	 * to avoid exposing stale data.
6658c2ecf20Sopenharmony_ci	 */
6668c2ecf20Sopenharmony_ci	bh = head;
6678c2ecf20Sopenharmony_ci	block_start = 0;
6688c2ecf20Sopenharmony_ci	do {
6698c2ecf20Sopenharmony_ci		block_end = block_start + bsize;
6708c2ecf20Sopenharmony_ci		if (block_end <= from)
6718c2ecf20Sopenharmony_ci			goto next_bh;
6728c2ecf20Sopenharmony_ci		if (block_start >= to)
6738c2ecf20Sopenharmony_ci			break;
6748c2ecf20Sopenharmony_ci
6758c2ecf20Sopenharmony_ci		zero_user(page, block_start, bh->b_size);
6768c2ecf20Sopenharmony_ci		set_buffer_uptodate(bh);
6778c2ecf20Sopenharmony_ci		mark_buffer_dirty(bh);
6788c2ecf20Sopenharmony_ci
6798c2ecf20Sopenharmony_cinext_bh:
6808c2ecf20Sopenharmony_ci		block_start = block_end;
6818c2ecf20Sopenharmony_ci		bh = bh->b_this_page;
6828c2ecf20Sopenharmony_ci	} while (bh != head);
6838c2ecf20Sopenharmony_ci
6848c2ecf20Sopenharmony_ci	return ret;
6858c2ecf20Sopenharmony_ci}
6868c2ecf20Sopenharmony_ci
/*
 * Capacity of the w_pages[] array in struct ocfs2_write_ctxt: a single
 * page when a page is at least as large as the largest cluster size,
 * otherwise the number of pages that make up the largest cluster.
 */
#if (PAGE_SIZE >= OCFS2_MAX_CLUSTERSIZE)
#define OCFS2_MAX_CTXT_PAGES	1
#else
#define OCFS2_MAX_CTXT_PAGES	(OCFS2_MAX_CLUSTERSIZE / PAGE_SIZE)
#endif

/*
 * Capacity of the w_desc[] array in struct ocfs2_write_ctxt: how many
 * clusters of the smallest size fit in one page.
 */
#define OCFS2_MAX_CLUSTERS_PER_PAGE	(PAGE_SIZE / OCFS2_MIN_CLUSTERSIZE)
6948c2ecf20Sopenharmony_ci
/*
 * One entry of a write context's w_unwritten_list. Entries are
 * kfree()d by ocfs2_free_unwritten_list().
 */
struct ocfs2_unwritten_extent {
	/* Linkage on the list walked by ocfs2_free_unwritten_list(). */
	struct list_head	ue_node;
	/* Second linkage; it is unlinked while holding the inode's ip_lock. */
	struct list_head	ue_ip_node;
	/* presumably the logical cluster offset of the extent - TODO confirm */
	u32			ue_cpos;
	/* presumably the physical cluster of the extent - TODO confirm */
	u32			ue_phys;
};
7018c2ecf20Sopenharmony_ci
/*
 * Describe the state of a single cluster to be written to.
 *
 * ocfs2_write_cluster_by_desc() hands each field of a descriptor to
 * ocfs2_write_cluster() for one cluster of the write.
 */
struct ocfs2_write_cluster_desc {
	/* Logical cluster offset, passed on as 'cpos'. */
	u32		c_cpos;
	/* Physical cluster; ocfs2_write_cluster() (re)fills it via its 'phys' pointer. */
	u32		c_phys;
	/*
	 * Give this a unique field because c_phys eventually gets
	 * filled.
	 */
	/* Non-zero: the cluster is allocated during the write. */
	unsigned	c_new;
	/* Non-zero: the extent is marked written (when not c_new). */
	unsigned	c_clear_unwritten;
	/* Passed on as 'should_zero' to ocfs2_write_cluster(). */
	unsigned	c_needs_zero;
};
7168c2ecf20Sopenharmony_ci
/*
 * Per-write state carried between the write_begin and write_end
 * halves. Allocated zeroed by ocfs2_alloc_write_ctxt() and released by
 * ocfs2_free_write_ctxt().
 */
struct ocfs2_write_ctxt {
	/* Logical cluster position / len of write */
	u32				w_cpos;
	u32				w_clen;

	/* First cluster allocated in a nonsparse extend */
	u32				w_first_new_cpos;

	/* Type of caller. Must be one of buffer, mmap, direct.  */
	ocfs2_write_type_t		w_type;

	/* One descriptor per cluster of the write; the first w_clen are used. */
	struct ocfs2_write_cluster_desc	w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE];

	/*
	 * This is true if page_size > cluster_size.
	 *
	 * It triggers a set of special cases during write which might
	 * have to deal with allocating writes to partial pages.
	 */
	unsigned int			w_large_pages;

	/*
	 * Pages involved in this write.
	 *
	 * w_target_page is the page being written to by the user.
	 *
	 * w_pages is an array of pages which always contains
	 * w_target_page, and in the case of an allocating write with
	 * page_size < cluster size, it will contain zero'd and mapped
	 * pages adjacent to w_target_page which need to be written
	 * out so that future reads from that region will get
	 * zeros.
	 */
	unsigned int			w_num_pages;
	struct page			*w_pages[OCFS2_MAX_CTXT_PAGES];
	struct page			*w_target_page;

	/*
	 * w_target_locked is used for page_mkwrite path indicating no unlocking
	 * against w_target_page in ocfs2_write_end_nolock.
	 */
	unsigned int			w_target_locked:1;

	/*
	 * ocfs2_write_end() uses this to know what the real range to
	 * write in the target should be.
	 */
	unsigned int			w_target_from;
	unsigned int			w_target_to;

	/*
	 * We could use journal_current_handle() but this is cleaner,
	 * IMHO -Mark
	 */
	handle_t			*w_handle;

	/* Inode's dinode buffer; a reference is held for the context's lifetime. */
	struct buffer_head		*w_di_bh;

	/* Dealloc context handed to ocfs2_mark_extent_written(). */
	struct ocfs2_cached_dealloc_ctxt w_dealloc;

	/* List of struct ocfs2_unwritten_extent, linked through ue_node. */
	struct list_head		w_unwritten_list;
	/* presumably the number of entries on w_unwritten_list - TODO confirm */
	unsigned int			w_unwritten_count;
};
7808c2ecf20Sopenharmony_ci
/*
 * Release an array of locked pages: every non-NULL entry is unlocked,
 * marked accessed and has one reference dropped. NULL slots are
 * skipped. The array itself is not modified.
 */
void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages)
{
	int idx = 0;

	while (idx < num_pages) {
		struct page *page = pages[idx++];

		if (!page)
			continue;

		unlock_page(page);
		mark_page_accessed(page);
		put_page(page);
	}
}
7938c2ecf20Sopenharmony_ci
7948c2ecf20Sopenharmony_cistatic void ocfs2_unlock_pages(struct ocfs2_write_ctxt *wc)
7958c2ecf20Sopenharmony_ci{
7968c2ecf20Sopenharmony_ci	int i;
7978c2ecf20Sopenharmony_ci
7988c2ecf20Sopenharmony_ci	/*
7998c2ecf20Sopenharmony_ci	 * w_target_locked is only set to true in the page_mkwrite() case.
8008c2ecf20Sopenharmony_ci	 * The intent is to allow us to lock the target page from write_begin()
8018c2ecf20Sopenharmony_ci	 * to write_end(). The caller must hold a ref on w_target_page.
8028c2ecf20Sopenharmony_ci	 */
8038c2ecf20Sopenharmony_ci	if (wc->w_target_locked) {
8048c2ecf20Sopenharmony_ci		BUG_ON(!wc->w_target_page);
8058c2ecf20Sopenharmony_ci		for (i = 0; i < wc->w_num_pages; i++) {
8068c2ecf20Sopenharmony_ci			if (wc->w_target_page == wc->w_pages[i]) {
8078c2ecf20Sopenharmony_ci				wc->w_pages[i] = NULL;
8088c2ecf20Sopenharmony_ci				break;
8098c2ecf20Sopenharmony_ci			}
8108c2ecf20Sopenharmony_ci		}
8118c2ecf20Sopenharmony_ci		mark_page_accessed(wc->w_target_page);
8128c2ecf20Sopenharmony_ci		put_page(wc->w_target_page);
8138c2ecf20Sopenharmony_ci	}
8148c2ecf20Sopenharmony_ci	ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages);
8158c2ecf20Sopenharmony_ci}
8168c2ecf20Sopenharmony_ci
8178c2ecf20Sopenharmony_cistatic void ocfs2_free_unwritten_list(struct inode *inode,
8188c2ecf20Sopenharmony_ci				 struct list_head *head)
8198c2ecf20Sopenharmony_ci{
8208c2ecf20Sopenharmony_ci	struct ocfs2_inode_info *oi = OCFS2_I(inode);
8218c2ecf20Sopenharmony_ci	struct ocfs2_unwritten_extent *ue = NULL, *tmp = NULL;
8228c2ecf20Sopenharmony_ci
8238c2ecf20Sopenharmony_ci	list_for_each_entry_safe(ue, tmp, head, ue_node) {
8248c2ecf20Sopenharmony_ci		list_del(&ue->ue_node);
8258c2ecf20Sopenharmony_ci		spin_lock(&oi->ip_lock);
8268c2ecf20Sopenharmony_ci		list_del(&ue->ue_ip_node);
8278c2ecf20Sopenharmony_ci		spin_unlock(&oi->ip_lock);
8288c2ecf20Sopenharmony_ci		kfree(ue);
8298c2ecf20Sopenharmony_ci	}
8308c2ecf20Sopenharmony_ci}
8318c2ecf20Sopenharmony_ci
/*
 * Tear down a write context: free its unwritten-extent list, unlock
 * and release its pages, drop the reference on the dinode buffer and
 * free the context itself. @wc must not be used afterwards.
 */
static void ocfs2_free_write_ctxt(struct inode *inode,
				  struct ocfs2_write_ctxt *wc)
{
	ocfs2_free_unwritten_list(inode, &wc->w_unwritten_list);
	ocfs2_unlock_pages(wc);
	/* Pairs with the get_bh() in ocfs2_alloc_write_ctxt(). */
	brelse(wc->w_di_bh);
	kfree(wc);
}
8408c2ecf20Sopenharmony_ci
8418c2ecf20Sopenharmony_cistatic int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
8428c2ecf20Sopenharmony_ci				  struct ocfs2_super *osb, loff_t pos,
8438c2ecf20Sopenharmony_ci				  unsigned len, ocfs2_write_type_t type,
8448c2ecf20Sopenharmony_ci				  struct buffer_head *di_bh)
8458c2ecf20Sopenharmony_ci{
8468c2ecf20Sopenharmony_ci	u32 cend;
8478c2ecf20Sopenharmony_ci	struct ocfs2_write_ctxt *wc;
8488c2ecf20Sopenharmony_ci
8498c2ecf20Sopenharmony_ci	wc = kzalloc(sizeof(struct ocfs2_write_ctxt), GFP_NOFS);
8508c2ecf20Sopenharmony_ci	if (!wc)
8518c2ecf20Sopenharmony_ci		return -ENOMEM;
8528c2ecf20Sopenharmony_ci
8538c2ecf20Sopenharmony_ci	wc->w_cpos = pos >> osb->s_clustersize_bits;
8548c2ecf20Sopenharmony_ci	wc->w_first_new_cpos = UINT_MAX;
8558c2ecf20Sopenharmony_ci	cend = (pos + len - 1) >> osb->s_clustersize_bits;
8568c2ecf20Sopenharmony_ci	wc->w_clen = cend - wc->w_cpos + 1;
8578c2ecf20Sopenharmony_ci	get_bh(di_bh);
8588c2ecf20Sopenharmony_ci	wc->w_di_bh = di_bh;
8598c2ecf20Sopenharmony_ci	wc->w_type = type;
8608c2ecf20Sopenharmony_ci
8618c2ecf20Sopenharmony_ci	if (unlikely(PAGE_SHIFT > osb->s_clustersize_bits))
8628c2ecf20Sopenharmony_ci		wc->w_large_pages = 1;
8638c2ecf20Sopenharmony_ci	else
8648c2ecf20Sopenharmony_ci		wc->w_large_pages = 0;
8658c2ecf20Sopenharmony_ci
8668c2ecf20Sopenharmony_ci	ocfs2_init_dealloc_ctxt(&wc->w_dealloc);
8678c2ecf20Sopenharmony_ci	INIT_LIST_HEAD(&wc->w_unwritten_list);
8688c2ecf20Sopenharmony_ci
8698c2ecf20Sopenharmony_ci	*wcp = wc;
8708c2ecf20Sopenharmony_ci
8718c2ecf20Sopenharmony_ci	return 0;
8728c2ecf20Sopenharmony_ci}
8738c2ecf20Sopenharmony_ci
8748c2ecf20Sopenharmony_ci/*
8758c2ecf20Sopenharmony_ci * If a page has any new buffers, zero them out here, and mark them uptodate
8768c2ecf20Sopenharmony_ci * and dirty so they'll be written out (in order to prevent uninitialised
8778c2ecf20Sopenharmony_ci * block data from leaking). And clear the new bit.
8788c2ecf20Sopenharmony_ci */
8798c2ecf20Sopenharmony_cistatic void ocfs2_zero_new_buffers(struct page *page, unsigned from, unsigned to)
8808c2ecf20Sopenharmony_ci{
8818c2ecf20Sopenharmony_ci	unsigned int block_start, block_end;
8828c2ecf20Sopenharmony_ci	struct buffer_head *head, *bh;
8838c2ecf20Sopenharmony_ci
8848c2ecf20Sopenharmony_ci	BUG_ON(!PageLocked(page));
8858c2ecf20Sopenharmony_ci	if (!page_has_buffers(page))
8868c2ecf20Sopenharmony_ci		return;
8878c2ecf20Sopenharmony_ci
8888c2ecf20Sopenharmony_ci	bh = head = page_buffers(page);
8898c2ecf20Sopenharmony_ci	block_start = 0;
8908c2ecf20Sopenharmony_ci	do {
8918c2ecf20Sopenharmony_ci		block_end = block_start + bh->b_size;
8928c2ecf20Sopenharmony_ci
8938c2ecf20Sopenharmony_ci		if (buffer_new(bh)) {
8948c2ecf20Sopenharmony_ci			if (block_end > from && block_start < to) {
8958c2ecf20Sopenharmony_ci				if (!PageUptodate(page)) {
8968c2ecf20Sopenharmony_ci					unsigned start, end;
8978c2ecf20Sopenharmony_ci
8988c2ecf20Sopenharmony_ci					start = max(from, block_start);
8998c2ecf20Sopenharmony_ci					end = min(to, block_end);
9008c2ecf20Sopenharmony_ci
9018c2ecf20Sopenharmony_ci					zero_user_segment(page, start, end);
9028c2ecf20Sopenharmony_ci					set_buffer_uptodate(bh);
9038c2ecf20Sopenharmony_ci				}
9048c2ecf20Sopenharmony_ci
9058c2ecf20Sopenharmony_ci				clear_buffer_new(bh);
9068c2ecf20Sopenharmony_ci				mark_buffer_dirty(bh);
9078c2ecf20Sopenharmony_ci			}
9088c2ecf20Sopenharmony_ci		}
9098c2ecf20Sopenharmony_ci
9108c2ecf20Sopenharmony_ci		block_start = block_end;
9118c2ecf20Sopenharmony_ci		bh = bh->b_this_page;
9128c2ecf20Sopenharmony_ci	} while (bh != head);
9138c2ecf20Sopenharmony_ci}
9148c2ecf20Sopenharmony_ci
/*
 * Only called when we have a failure during allocating write to write
 * zeros to the newly allocated region.
 *
 * New buffers of the target page that overlap the write range are
 * zeroed via ocfs2_zero_new_buffers(); then every page held by @wc that
 * has buffers is committed with block_commit_write(), after being
 * added to the handle's ordered-data tracking when
 * ocfs2_should_order_data() says so.
 */
static void ocfs2_write_failure(struct inode *inode,
				struct ocfs2_write_ctxt *wc,
				loff_t user_pos, unsigned user_len)
{
	int i;
	/*
	 * NOTE(review): 'from' is an offset inside the page, while 'to'
	 * is the absolute end position of the write truncated to
	 * unsigned. The two only line up for a write in the first page
	 * of the file - confirm whether 'from + user_len' was intended.
	 */
	unsigned from = user_pos & (PAGE_SIZE - 1),
		to = user_pos + user_len;
	struct page *tmppage;

	if (wc->w_target_page)
		ocfs2_zero_new_buffers(wc->w_target_page, from, to);

	for(i = 0; i < wc->w_num_pages; i++) {
		tmppage = wc->w_pages[i];

		/* NULL slots and pages without buffers are skipped. */
		if (tmppage && page_has_buffers(tmppage)) {
			if (ocfs2_should_order_data(inode))
				ocfs2_jbd2_inode_add_write(wc->w_handle, inode,
							   user_pos, user_len);

			block_commit_write(tmppage, from, to);
		}
	}
}
9438c2ecf20Sopenharmony_ci
9448c2ecf20Sopenharmony_cistatic int ocfs2_prepare_page_for_write(struct inode *inode, u64 *p_blkno,
9458c2ecf20Sopenharmony_ci					struct ocfs2_write_ctxt *wc,
9468c2ecf20Sopenharmony_ci					struct page *page, u32 cpos,
9478c2ecf20Sopenharmony_ci					loff_t user_pos, unsigned user_len,
9488c2ecf20Sopenharmony_ci					int new)
9498c2ecf20Sopenharmony_ci{
9508c2ecf20Sopenharmony_ci	int ret;
9518c2ecf20Sopenharmony_ci	unsigned int map_from = 0, map_to = 0;
9528c2ecf20Sopenharmony_ci	unsigned int cluster_start, cluster_end;
9538c2ecf20Sopenharmony_ci	unsigned int user_data_from = 0, user_data_to = 0;
9548c2ecf20Sopenharmony_ci
9558c2ecf20Sopenharmony_ci	ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), cpos,
9568c2ecf20Sopenharmony_ci					&cluster_start, &cluster_end);
9578c2ecf20Sopenharmony_ci
9588c2ecf20Sopenharmony_ci	/* treat the write as new if the a hole/lseek spanned across
9598c2ecf20Sopenharmony_ci	 * the page boundary.
9608c2ecf20Sopenharmony_ci	 */
9618c2ecf20Sopenharmony_ci	new = new | ((i_size_read(inode) <= page_offset(page)) &&
9628c2ecf20Sopenharmony_ci			(page_offset(page) <= user_pos));
9638c2ecf20Sopenharmony_ci
9648c2ecf20Sopenharmony_ci	if (page == wc->w_target_page) {
9658c2ecf20Sopenharmony_ci		map_from = user_pos & (PAGE_SIZE - 1);
9668c2ecf20Sopenharmony_ci		map_to = map_from + user_len;
9678c2ecf20Sopenharmony_ci
9688c2ecf20Sopenharmony_ci		if (new)
9698c2ecf20Sopenharmony_ci			ret = ocfs2_map_page_blocks(page, p_blkno, inode,
9708c2ecf20Sopenharmony_ci						    cluster_start, cluster_end,
9718c2ecf20Sopenharmony_ci						    new);
9728c2ecf20Sopenharmony_ci		else
9738c2ecf20Sopenharmony_ci			ret = ocfs2_map_page_blocks(page, p_blkno, inode,
9748c2ecf20Sopenharmony_ci						    map_from, map_to, new);
9758c2ecf20Sopenharmony_ci		if (ret) {
9768c2ecf20Sopenharmony_ci			mlog_errno(ret);
9778c2ecf20Sopenharmony_ci			goto out;
9788c2ecf20Sopenharmony_ci		}
9798c2ecf20Sopenharmony_ci
9808c2ecf20Sopenharmony_ci		user_data_from = map_from;
9818c2ecf20Sopenharmony_ci		user_data_to = map_to;
9828c2ecf20Sopenharmony_ci		if (new) {
9838c2ecf20Sopenharmony_ci			map_from = cluster_start;
9848c2ecf20Sopenharmony_ci			map_to = cluster_end;
9858c2ecf20Sopenharmony_ci		}
9868c2ecf20Sopenharmony_ci	} else {
9878c2ecf20Sopenharmony_ci		/*
9888c2ecf20Sopenharmony_ci		 * If we haven't allocated the new page yet, we
9898c2ecf20Sopenharmony_ci		 * shouldn't be writing it out without copying user
9908c2ecf20Sopenharmony_ci		 * data. This is likely a math error from the caller.
9918c2ecf20Sopenharmony_ci		 */
9928c2ecf20Sopenharmony_ci		BUG_ON(!new);
9938c2ecf20Sopenharmony_ci
9948c2ecf20Sopenharmony_ci		map_from = cluster_start;
9958c2ecf20Sopenharmony_ci		map_to = cluster_end;
9968c2ecf20Sopenharmony_ci
9978c2ecf20Sopenharmony_ci		ret = ocfs2_map_page_blocks(page, p_blkno, inode,
9988c2ecf20Sopenharmony_ci					    cluster_start, cluster_end, new);
9998c2ecf20Sopenharmony_ci		if (ret) {
10008c2ecf20Sopenharmony_ci			mlog_errno(ret);
10018c2ecf20Sopenharmony_ci			goto out;
10028c2ecf20Sopenharmony_ci		}
10038c2ecf20Sopenharmony_ci	}
10048c2ecf20Sopenharmony_ci
10058c2ecf20Sopenharmony_ci	/*
10068c2ecf20Sopenharmony_ci	 * Parts of newly allocated pages need to be zero'd.
10078c2ecf20Sopenharmony_ci	 *
10088c2ecf20Sopenharmony_ci	 * Above, we have also rewritten 'to' and 'from' - as far as
10098c2ecf20Sopenharmony_ci	 * the rest of the function is concerned, the entire cluster
10108c2ecf20Sopenharmony_ci	 * range inside of a page needs to be written.
10118c2ecf20Sopenharmony_ci	 *
10128c2ecf20Sopenharmony_ci	 * We can skip this if the page is up to date - it's already
10138c2ecf20Sopenharmony_ci	 * been zero'd from being read in as a hole.
10148c2ecf20Sopenharmony_ci	 */
10158c2ecf20Sopenharmony_ci	if (new && !PageUptodate(page))
10168c2ecf20Sopenharmony_ci		ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb),
10178c2ecf20Sopenharmony_ci					 cpos, user_data_from, user_data_to);
10188c2ecf20Sopenharmony_ci
10198c2ecf20Sopenharmony_ci	flush_dcache_page(page);
10208c2ecf20Sopenharmony_ci
10218c2ecf20Sopenharmony_ciout:
10228c2ecf20Sopenharmony_ci	return ret;
10238c2ecf20Sopenharmony_ci}
10248c2ecf20Sopenharmony_ci
/*
 * This function will only grab one cluster's worth of pages.
 *
 * Fills wc->w_pages[] / wc->w_num_pages with the pages needed to write
 * cluster @cpos and records the page containing @user_pos in
 * wc->w_target_page.
 *
 * @new:       non-zero for an allocating write, which needs every page
 *             of the cluster (clipped as described below); otherwise
 *             only the target page is taken.
 * @mmap_page: the page to use for the user's range when
 *             wc->w_type == OCFS2_WRITE_MMAP.
 *
 * Returns 0 on success, -ENOMEM if a page could not be found or
 * created, or -EAGAIN if @mmap_page no longer belongs to @mapping. On
 * error, pages already stored in wc->w_pages[] are not released here.
 */
static int ocfs2_grab_pages_for_write(struct address_space *mapping,
				      struct ocfs2_write_ctxt *wc,
				      u32 cpos, loff_t user_pos,
				      unsigned user_len, int new,
				      struct page *mmap_page)
{
	int ret = 0, i;
	unsigned long start, target_index, end_index, index;
	struct inode *inode = mapping->host;
	loff_t last_byte;

	target_index = user_pos >> PAGE_SHIFT;

	/*
	 * Figure out how many pages we'll be manipulating here. For
	 * non allocating write, we just change the one
	 * page. Otherwise, we'll need a whole cluster's worth.  If we're
	 * writing past i_size, we only need enough pages to cover the
	 * last page of the write.
	 */
	if (new) {
		wc->w_num_pages = ocfs2_pages_per_cluster(inode->i_sb);
		start = ocfs2_align_clusters_to_page_index(inode->i_sb, cpos);
		/*
		 * We need the index *past* the last page we could possibly
		 * touch.  This is the page past the end of the write or
		 * i_size, whichever is greater.
		 */
		last_byte = max(user_pos + user_len, i_size_read(inode));
		BUG_ON(last_byte < 1);
		end_index = ((last_byte - 1) >> PAGE_SHIFT) + 1;
		if ((start + wc->w_num_pages) > end_index)
			wc->w_num_pages = end_index - start;
	} else {
		wc->w_num_pages = 1;
		start = target_index;
	}
	/* From here on end_index is the last page index of the user's range. */
	end_index = (user_pos + user_len - 1) >> PAGE_SHIFT;

	for(i = 0; i < wc->w_num_pages; i++) {
		index = start + i;

		if (index >= target_index && index <= end_index &&
		    wc->w_type == OCFS2_WRITE_MMAP) {
			/*
			 * ocfs2_pagemkwrite() is a little different
			 * and wants us to directly use the page
			 * passed in.
			 */
			lock_page(mmap_page);

			/* Exit and let the caller retry */
			if (mmap_page->mapping != mapping) {
				WARN_ON(mmap_page->mapping);
				unlock_page(mmap_page);
				ret = -EAGAIN;
				goto out;
			}

			/* Take our own reference for the w_pages[] slot. */
			get_page(mmap_page);
			wc->w_pages[i] = mmap_page;
			wc->w_target_locked = true;
		} else if (index >= target_index && index <= end_index &&
			   wc->w_type == OCFS2_WRITE_DIRECT) {
			/* Direct write has no mapping page. */
			wc->w_pages[i] = NULL;
			continue;
		} else {
			wc->w_pages[i] = find_or_create_page(mapping, index,
							     GFP_NOFS);
			if (!wc->w_pages[i]) {
				ret = -ENOMEM;
				mlog_errno(ret);
				goto out;
			}
		}
		wait_for_stable_page(wc->w_pages[i]);

		if (index == target_index)
			wc->w_target_page = wc->w_pages[i];
	}
out:
	/* On failure the target page is not left locked by us. */
	if (ret)
		wc->w_target_locked = false;
	return ret;
}
11148c2ecf20Sopenharmony_ci
11158c2ecf20Sopenharmony_ci/*
11168c2ecf20Sopenharmony_ci * Prepare a single cluster for write one cluster into the file.
11178c2ecf20Sopenharmony_ci */
11188c2ecf20Sopenharmony_cistatic int ocfs2_write_cluster(struct address_space *mapping,
11198c2ecf20Sopenharmony_ci			       u32 *phys, unsigned int new,
11208c2ecf20Sopenharmony_ci			       unsigned int clear_unwritten,
11218c2ecf20Sopenharmony_ci			       unsigned int should_zero,
11228c2ecf20Sopenharmony_ci			       struct ocfs2_alloc_context *data_ac,
11238c2ecf20Sopenharmony_ci			       struct ocfs2_alloc_context *meta_ac,
11248c2ecf20Sopenharmony_ci			       struct ocfs2_write_ctxt *wc, u32 cpos,
11258c2ecf20Sopenharmony_ci			       loff_t user_pos, unsigned user_len)
11268c2ecf20Sopenharmony_ci{
11278c2ecf20Sopenharmony_ci	int ret, i;
11288c2ecf20Sopenharmony_ci	u64 p_blkno;
11298c2ecf20Sopenharmony_ci	struct inode *inode = mapping->host;
11308c2ecf20Sopenharmony_ci	struct ocfs2_extent_tree et;
11318c2ecf20Sopenharmony_ci	int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
11328c2ecf20Sopenharmony_ci
11338c2ecf20Sopenharmony_ci	if (new) {
11348c2ecf20Sopenharmony_ci		u32 tmp_pos;
11358c2ecf20Sopenharmony_ci
11368c2ecf20Sopenharmony_ci		/*
11378c2ecf20Sopenharmony_ci		 * This is safe to call with the page locks - it won't take
11388c2ecf20Sopenharmony_ci		 * any additional semaphores or cluster locks.
11398c2ecf20Sopenharmony_ci		 */
11408c2ecf20Sopenharmony_ci		tmp_pos = cpos;
11418c2ecf20Sopenharmony_ci		ret = ocfs2_add_inode_data(OCFS2_SB(inode->i_sb), inode,
11428c2ecf20Sopenharmony_ci					   &tmp_pos, 1, !clear_unwritten,
11438c2ecf20Sopenharmony_ci					   wc->w_di_bh, wc->w_handle,
11448c2ecf20Sopenharmony_ci					   data_ac, meta_ac, NULL);
11458c2ecf20Sopenharmony_ci		/*
11468c2ecf20Sopenharmony_ci		 * This shouldn't happen because we must have already
11478c2ecf20Sopenharmony_ci		 * calculated the correct meta data allocation required. The
11488c2ecf20Sopenharmony_ci		 * internal tree allocation code should know how to increase
11498c2ecf20Sopenharmony_ci		 * transaction credits itself.
11508c2ecf20Sopenharmony_ci		 *
11518c2ecf20Sopenharmony_ci		 * If need be, we could handle -EAGAIN for a
11528c2ecf20Sopenharmony_ci		 * RESTART_TRANS here.
11538c2ecf20Sopenharmony_ci		 */
11548c2ecf20Sopenharmony_ci		mlog_bug_on_msg(ret == -EAGAIN,
11558c2ecf20Sopenharmony_ci				"Inode %llu: EAGAIN return during allocation.\n",
11568c2ecf20Sopenharmony_ci				(unsigned long long)OCFS2_I(inode)->ip_blkno);
11578c2ecf20Sopenharmony_ci		if (ret < 0) {
11588c2ecf20Sopenharmony_ci			mlog_errno(ret);
11598c2ecf20Sopenharmony_ci			goto out;
11608c2ecf20Sopenharmony_ci		}
11618c2ecf20Sopenharmony_ci	} else if (clear_unwritten) {
11628c2ecf20Sopenharmony_ci		ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode),
11638c2ecf20Sopenharmony_ci					      wc->w_di_bh);
11648c2ecf20Sopenharmony_ci		ret = ocfs2_mark_extent_written(inode, &et,
11658c2ecf20Sopenharmony_ci						wc->w_handle, cpos, 1, *phys,
11668c2ecf20Sopenharmony_ci						meta_ac, &wc->w_dealloc);
11678c2ecf20Sopenharmony_ci		if (ret < 0) {
11688c2ecf20Sopenharmony_ci			mlog_errno(ret);
11698c2ecf20Sopenharmony_ci			goto out;
11708c2ecf20Sopenharmony_ci		}
11718c2ecf20Sopenharmony_ci	}
11728c2ecf20Sopenharmony_ci
11738c2ecf20Sopenharmony_ci	/*
11748c2ecf20Sopenharmony_ci	 * The only reason this should fail is due to an inability to
11758c2ecf20Sopenharmony_ci	 * find the extent added.
11768c2ecf20Sopenharmony_ci	 */
11778c2ecf20Sopenharmony_ci	ret = ocfs2_get_clusters(inode, cpos, phys, NULL, NULL);
11788c2ecf20Sopenharmony_ci	if (ret < 0) {
11798c2ecf20Sopenharmony_ci		mlog(ML_ERROR, "Get physical blkno failed for inode %llu, "
11808c2ecf20Sopenharmony_ci			    "at logical cluster %u",
11818c2ecf20Sopenharmony_ci			    (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
11828c2ecf20Sopenharmony_ci		goto out;
11838c2ecf20Sopenharmony_ci	}
11848c2ecf20Sopenharmony_ci
11858c2ecf20Sopenharmony_ci	BUG_ON(*phys == 0);
11868c2ecf20Sopenharmony_ci
11878c2ecf20Sopenharmony_ci	p_blkno = ocfs2_clusters_to_blocks(inode->i_sb, *phys);
11888c2ecf20Sopenharmony_ci	if (!should_zero)
11898c2ecf20Sopenharmony_ci		p_blkno += (user_pos >> inode->i_sb->s_blocksize_bits) & (u64)(bpc - 1);
11908c2ecf20Sopenharmony_ci
11918c2ecf20Sopenharmony_ci	for(i = 0; i < wc->w_num_pages; i++) {
11928c2ecf20Sopenharmony_ci		int tmpret;
11938c2ecf20Sopenharmony_ci
11948c2ecf20Sopenharmony_ci		/* This is the direct io target page. */
11958c2ecf20Sopenharmony_ci		if (wc->w_pages[i] == NULL) {
11968c2ecf20Sopenharmony_ci			p_blkno++;
11978c2ecf20Sopenharmony_ci			continue;
11988c2ecf20Sopenharmony_ci		}
11998c2ecf20Sopenharmony_ci
12008c2ecf20Sopenharmony_ci		tmpret = ocfs2_prepare_page_for_write(inode, &p_blkno, wc,
12018c2ecf20Sopenharmony_ci						      wc->w_pages[i], cpos,
12028c2ecf20Sopenharmony_ci						      user_pos, user_len,
12038c2ecf20Sopenharmony_ci						      should_zero);
12048c2ecf20Sopenharmony_ci		if (tmpret) {
12058c2ecf20Sopenharmony_ci			mlog_errno(tmpret);
12068c2ecf20Sopenharmony_ci			if (ret == 0)
12078c2ecf20Sopenharmony_ci				ret = tmpret;
12088c2ecf20Sopenharmony_ci		}
12098c2ecf20Sopenharmony_ci	}
12108c2ecf20Sopenharmony_ci
12118c2ecf20Sopenharmony_ci	/*
12128c2ecf20Sopenharmony_ci	 * We only have cleanup to do in case of allocating write.
12138c2ecf20Sopenharmony_ci	 */
12148c2ecf20Sopenharmony_ci	if (ret && new)
12158c2ecf20Sopenharmony_ci		ocfs2_write_failure(inode, wc, user_pos, user_len);
12168c2ecf20Sopenharmony_ci
12178c2ecf20Sopenharmony_ciout:
12188c2ecf20Sopenharmony_ci
12198c2ecf20Sopenharmony_ci	return ret;
12208c2ecf20Sopenharmony_ci}
12218c2ecf20Sopenharmony_ci
12228c2ecf20Sopenharmony_cistatic int ocfs2_write_cluster_by_desc(struct address_space *mapping,
12238c2ecf20Sopenharmony_ci				       struct ocfs2_alloc_context *data_ac,
12248c2ecf20Sopenharmony_ci				       struct ocfs2_alloc_context *meta_ac,
12258c2ecf20Sopenharmony_ci				       struct ocfs2_write_ctxt *wc,
12268c2ecf20Sopenharmony_ci				       loff_t pos, unsigned len)
12278c2ecf20Sopenharmony_ci{
12288c2ecf20Sopenharmony_ci	int ret, i;
12298c2ecf20Sopenharmony_ci	loff_t cluster_off;
12308c2ecf20Sopenharmony_ci	unsigned int local_len = len;
12318c2ecf20Sopenharmony_ci	struct ocfs2_write_cluster_desc *desc;
12328c2ecf20Sopenharmony_ci	struct ocfs2_super *osb = OCFS2_SB(mapping->host->i_sb);
12338c2ecf20Sopenharmony_ci
12348c2ecf20Sopenharmony_ci	for (i = 0; i < wc->w_clen; i++) {
12358c2ecf20Sopenharmony_ci		desc = &wc->w_desc[i];
12368c2ecf20Sopenharmony_ci
12378c2ecf20Sopenharmony_ci		/*
12388c2ecf20Sopenharmony_ci		 * We have to make sure that the total write passed in
12398c2ecf20Sopenharmony_ci		 * doesn't extend past a single cluster.
12408c2ecf20Sopenharmony_ci		 */
12418c2ecf20Sopenharmony_ci		local_len = len;
12428c2ecf20Sopenharmony_ci		cluster_off = pos & (osb->s_clustersize - 1);
12438c2ecf20Sopenharmony_ci		if ((cluster_off + local_len) > osb->s_clustersize)
12448c2ecf20Sopenharmony_ci			local_len = osb->s_clustersize - cluster_off;
12458c2ecf20Sopenharmony_ci
12468c2ecf20Sopenharmony_ci		ret = ocfs2_write_cluster(mapping, &desc->c_phys,
12478c2ecf20Sopenharmony_ci					  desc->c_new,
12488c2ecf20Sopenharmony_ci					  desc->c_clear_unwritten,
12498c2ecf20Sopenharmony_ci					  desc->c_needs_zero,
12508c2ecf20Sopenharmony_ci					  data_ac, meta_ac,
12518c2ecf20Sopenharmony_ci					  wc, desc->c_cpos, pos, local_len);
12528c2ecf20Sopenharmony_ci		if (ret) {
12538c2ecf20Sopenharmony_ci			mlog_errno(ret);
12548c2ecf20Sopenharmony_ci			goto out;
12558c2ecf20Sopenharmony_ci		}
12568c2ecf20Sopenharmony_ci
12578c2ecf20Sopenharmony_ci		len -= local_len;
12588c2ecf20Sopenharmony_ci		pos += local_len;
12598c2ecf20Sopenharmony_ci	}
12608c2ecf20Sopenharmony_ci
12618c2ecf20Sopenharmony_ci	ret = 0;
12628c2ecf20Sopenharmony_ciout:
12638c2ecf20Sopenharmony_ci	return ret;
12648c2ecf20Sopenharmony_ci}
12658c2ecf20Sopenharmony_ci
12668c2ecf20Sopenharmony_ci/*
12678c2ecf20Sopenharmony_ci * ocfs2_write_end() wants to know which parts of the target page it
12688c2ecf20Sopenharmony_ci * should complete the write on. It's easiest to compute them ahead of
12698c2ecf20Sopenharmony_ci * time when a more complete view of the write is available.
12708c2ecf20Sopenharmony_ci */
12718c2ecf20Sopenharmony_cistatic void ocfs2_set_target_boundaries(struct ocfs2_super *osb,
12728c2ecf20Sopenharmony_ci					struct ocfs2_write_ctxt *wc,
12738c2ecf20Sopenharmony_ci					loff_t pos, unsigned len, int alloc)
12748c2ecf20Sopenharmony_ci{
12758c2ecf20Sopenharmony_ci	struct ocfs2_write_cluster_desc *desc;
12768c2ecf20Sopenharmony_ci
12778c2ecf20Sopenharmony_ci	wc->w_target_from = pos & (PAGE_SIZE - 1);
12788c2ecf20Sopenharmony_ci	wc->w_target_to = wc->w_target_from + len;
12798c2ecf20Sopenharmony_ci
12808c2ecf20Sopenharmony_ci	if (alloc == 0)
12818c2ecf20Sopenharmony_ci		return;
12828c2ecf20Sopenharmony_ci
12838c2ecf20Sopenharmony_ci	/*
12848c2ecf20Sopenharmony_ci	 * Allocating write - we may have different boundaries based
12858c2ecf20Sopenharmony_ci	 * on page size and cluster size.
12868c2ecf20Sopenharmony_ci	 *
12878c2ecf20Sopenharmony_ci	 * NOTE: We can no longer compute one value from the other as
12888c2ecf20Sopenharmony_ci	 * the actual write length and user provided length may be
12898c2ecf20Sopenharmony_ci	 * different.
12908c2ecf20Sopenharmony_ci	 */
12918c2ecf20Sopenharmony_ci
12928c2ecf20Sopenharmony_ci	if (wc->w_large_pages) {
12938c2ecf20Sopenharmony_ci		/*
12948c2ecf20Sopenharmony_ci		 * We only care about the 1st and last cluster within
12958c2ecf20Sopenharmony_ci		 * our range and whether they should be zero'd or not. Either
12968c2ecf20Sopenharmony_ci		 * value may be extended out to the start/end of a
12978c2ecf20Sopenharmony_ci		 * newly allocated cluster.
12988c2ecf20Sopenharmony_ci		 */
12998c2ecf20Sopenharmony_ci		desc = &wc->w_desc[0];
13008c2ecf20Sopenharmony_ci		if (desc->c_needs_zero)
13018c2ecf20Sopenharmony_ci			ocfs2_figure_cluster_boundaries(osb,
13028c2ecf20Sopenharmony_ci							desc->c_cpos,
13038c2ecf20Sopenharmony_ci							&wc->w_target_from,
13048c2ecf20Sopenharmony_ci							NULL);
13058c2ecf20Sopenharmony_ci
13068c2ecf20Sopenharmony_ci		desc = &wc->w_desc[wc->w_clen - 1];
13078c2ecf20Sopenharmony_ci		if (desc->c_needs_zero)
13088c2ecf20Sopenharmony_ci			ocfs2_figure_cluster_boundaries(osb,
13098c2ecf20Sopenharmony_ci							desc->c_cpos,
13108c2ecf20Sopenharmony_ci							NULL,
13118c2ecf20Sopenharmony_ci							&wc->w_target_to);
13128c2ecf20Sopenharmony_ci	} else {
13138c2ecf20Sopenharmony_ci		wc->w_target_from = 0;
13148c2ecf20Sopenharmony_ci		wc->w_target_to = PAGE_SIZE;
13158c2ecf20Sopenharmony_ci	}
13168c2ecf20Sopenharmony_ci}
13178c2ecf20Sopenharmony_ci
13188c2ecf20Sopenharmony_ci/*
13198c2ecf20Sopenharmony_ci * Check if this extent is marked UNWRITTEN by direct io. If so, we need not to
13208c2ecf20Sopenharmony_ci * do the zero work. And should not to clear UNWRITTEN since it will be cleared
13218c2ecf20Sopenharmony_ci * by the direct io procedure.
13228c2ecf20Sopenharmony_ci * If this is a new extent that allocated by direct io, we should mark it in
13238c2ecf20Sopenharmony_ci * the ip_unwritten_list.
13248c2ecf20Sopenharmony_ci */
13258c2ecf20Sopenharmony_cistatic int ocfs2_unwritten_check(struct inode *inode,
13268c2ecf20Sopenharmony_ci				 struct ocfs2_write_ctxt *wc,
13278c2ecf20Sopenharmony_ci				 struct ocfs2_write_cluster_desc *desc)
13288c2ecf20Sopenharmony_ci{
13298c2ecf20Sopenharmony_ci	struct ocfs2_inode_info *oi = OCFS2_I(inode);
13308c2ecf20Sopenharmony_ci	struct ocfs2_unwritten_extent *ue = NULL, *new = NULL;
13318c2ecf20Sopenharmony_ci	int ret = 0;
13328c2ecf20Sopenharmony_ci
13338c2ecf20Sopenharmony_ci	if (!desc->c_needs_zero)
13348c2ecf20Sopenharmony_ci		return 0;
13358c2ecf20Sopenharmony_ci
13368c2ecf20Sopenharmony_ciretry:
13378c2ecf20Sopenharmony_ci	spin_lock(&oi->ip_lock);
13388c2ecf20Sopenharmony_ci	/* Needs not to zero no metter buffer or direct. The one who is zero
13398c2ecf20Sopenharmony_ci	 * the cluster is doing zero. And he will clear unwritten after all
13408c2ecf20Sopenharmony_ci	 * cluster io finished. */
13418c2ecf20Sopenharmony_ci	list_for_each_entry(ue, &oi->ip_unwritten_list, ue_ip_node) {
13428c2ecf20Sopenharmony_ci		if (desc->c_cpos == ue->ue_cpos) {
13438c2ecf20Sopenharmony_ci			BUG_ON(desc->c_new);
13448c2ecf20Sopenharmony_ci			desc->c_needs_zero = 0;
13458c2ecf20Sopenharmony_ci			desc->c_clear_unwritten = 0;
13468c2ecf20Sopenharmony_ci			goto unlock;
13478c2ecf20Sopenharmony_ci		}
13488c2ecf20Sopenharmony_ci	}
13498c2ecf20Sopenharmony_ci
13508c2ecf20Sopenharmony_ci	if (wc->w_type != OCFS2_WRITE_DIRECT)
13518c2ecf20Sopenharmony_ci		goto unlock;
13528c2ecf20Sopenharmony_ci
13538c2ecf20Sopenharmony_ci	if (new == NULL) {
13548c2ecf20Sopenharmony_ci		spin_unlock(&oi->ip_lock);
13558c2ecf20Sopenharmony_ci		new = kmalloc(sizeof(struct ocfs2_unwritten_extent),
13568c2ecf20Sopenharmony_ci			     GFP_NOFS);
13578c2ecf20Sopenharmony_ci		if (new == NULL) {
13588c2ecf20Sopenharmony_ci			ret = -ENOMEM;
13598c2ecf20Sopenharmony_ci			goto out;
13608c2ecf20Sopenharmony_ci		}
13618c2ecf20Sopenharmony_ci		goto retry;
13628c2ecf20Sopenharmony_ci	}
13638c2ecf20Sopenharmony_ci	/* This direct write will doing zero. */
13648c2ecf20Sopenharmony_ci	new->ue_cpos = desc->c_cpos;
13658c2ecf20Sopenharmony_ci	new->ue_phys = desc->c_phys;
13668c2ecf20Sopenharmony_ci	desc->c_clear_unwritten = 0;
13678c2ecf20Sopenharmony_ci	list_add_tail(&new->ue_ip_node, &oi->ip_unwritten_list);
13688c2ecf20Sopenharmony_ci	list_add_tail(&new->ue_node, &wc->w_unwritten_list);
13698c2ecf20Sopenharmony_ci	wc->w_unwritten_count++;
13708c2ecf20Sopenharmony_ci	new = NULL;
13718c2ecf20Sopenharmony_ciunlock:
13728c2ecf20Sopenharmony_ci	spin_unlock(&oi->ip_lock);
13738c2ecf20Sopenharmony_ciout:
13748c2ecf20Sopenharmony_ci	kfree(new);
13758c2ecf20Sopenharmony_ci	return ret;
13768c2ecf20Sopenharmony_ci}
13778c2ecf20Sopenharmony_ci
13788c2ecf20Sopenharmony_ci/*
13798c2ecf20Sopenharmony_ci * Populate each single-cluster write descriptor in the write context
13808c2ecf20Sopenharmony_ci * with information about the i/o to be done.
13818c2ecf20Sopenharmony_ci *
13828c2ecf20Sopenharmony_ci * Returns the number of clusters that will have to be allocated, as
13838c2ecf20Sopenharmony_ci * well as a worst case estimate of the number of extent records that
13848c2ecf20Sopenharmony_ci * would have to be created during a write to an unwritten region.
13858c2ecf20Sopenharmony_ci */
13868c2ecf20Sopenharmony_cistatic int ocfs2_populate_write_desc(struct inode *inode,
13878c2ecf20Sopenharmony_ci				     struct ocfs2_write_ctxt *wc,
13888c2ecf20Sopenharmony_ci				     unsigned int *clusters_to_alloc,
13898c2ecf20Sopenharmony_ci				     unsigned int *extents_to_split)
13908c2ecf20Sopenharmony_ci{
13918c2ecf20Sopenharmony_ci	int ret;
13928c2ecf20Sopenharmony_ci	struct ocfs2_write_cluster_desc *desc;
13938c2ecf20Sopenharmony_ci	unsigned int num_clusters = 0;
13948c2ecf20Sopenharmony_ci	unsigned int ext_flags = 0;
13958c2ecf20Sopenharmony_ci	u32 phys = 0;
13968c2ecf20Sopenharmony_ci	int i;
13978c2ecf20Sopenharmony_ci
13988c2ecf20Sopenharmony_ci	*clusters_to_alloc = 0;
13998c2ecf20Sopenharmony_ci	*extents_to_split = 0;
14008c2ecf20Sopenharmony_ci
14018c2ecf20Sopenharmony_ci	for (i = 0; i < wc->w_clen; i++) {
14028c2ecf20Sopenharmony_ci		desc = &wc->w_desc[i];
14038c2ecf20Sopenharmony_ci		desc->c_cpos = wc->w_cpos + i;
14048c2ecf20Sopenharmony_ci
14058c2ecf20Sopenharmony_ci		if (num_clusters == 0) {
14068c2ecf20Sopenharmony_ci			/*
14078c2ecf20Sopenharmony_ci			 * Need to look up the next extent record.
14088c2ecf20Sopenharmony_ci			 */
14098c2ecf20Sopenharmony_ci			ret = ocfs2_get_clusters(inode, desc->c_cpos, &phys,
14108c2ecf20Sopenharmony_ci						 &num_clusters, &ext_flags);
14118c2ecf20Sopenharmony_ci			if (ret) {
14128c2ecf20Sopenharmony_ci				mlog_errno(ret);
14138c2ecf20Sopenharmony_ci				goto out;
14148c2ecf20Sopenharmony_ci			}
14158c2ecf20Sopenharmony_ci
14168c2ecf20Sopenharmony_ci			/* We should already CoW the refcountd extent. */
14178c2ecf20Sopenharmony_ci			BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED);
14188c2ecf20Sopenharmony_ci
14198c2ecf20Sopenharmony_ci			/*
14208c2ecf20Sopenharmony_ci			 * Assume worst case - that we're writing in
14218c2ecf20Sopenharmony_ci			 * the middle of the extent.
14228c2ecf20Sopenharmony_ci			 *
14238c2ecf20Sopenharmony_ci			 * We can assume that the write proceeds from
14248c2ecf20Sopenharmony_ci			 * left to right, in which case the extent
14258c2ecf20Sopenharmony_ci			 * insert code is smart enough to coalesce the
14268c2ecf20Sopenharmony_ci			 * next splits into the previous records created.
14278c2ecf20Sopenharmony_ci			 */
14288c2ecf20Sopenharmony_ci			if (ext_flags & OCFS2_EXT_UNWRITTEN)
14298c2ecf20Sopenharmony_ci				*extents_to_split = *extents_to_split + 2;
14308c2ecf20Sopenharmony_ci		} else if (phys) {
14318c2ecf20Sopenharmony_ci			/*
14328c2ecf20Sopenharmony_ci			 * Only increment phys if it doesn't describe
14338c2ecf20Sopenharmony_ci			 * a hole.
14348c2ecf20Sopenharmony_ci			 */
14358c2ecf20Sopenharmony_ci			phys++;
14368c2ecf20Sopenharmony_ci		}
14378c2ecf20Sopenharmony_ci
14388c2ecf20Sopenharmony_ci		/*
14398c2ecf20Sopenharmony_ci		 * If w_first_new_cpos is < UINT_MAX, we have a non-sparse
14408c2ecf20Sopenharmony_ci		 * file that got extended.  w_first_new_cpos tells us
14418c2ecf20Sopenharmony_ci		 * where the newly allocated clusters are so we can
14428c2ecf20Sopenharmony_ci		 * zero them.
14438c2ecf20Sopenharmony_ci		 */
14448c2ecf20Sopenharmony_ci		if (desc->c_cpos >= wc->w_first_new_cpos) {
14458c2ecf20Sopenharmony_ci			BUG_ON(phys == 0);
14468c2ecf20Sopenharmony_ci			desc->c_needs_zero = 1;
14478c2ecf20Sopenharmony_ci		}
14488c2ecf20Sopenharmony_ci
14498c2ecf20Sopenharmony_ci		desc->c_phys = phys;
14508c2ecf20Sopenharmony_ci		if (phys == 0) {
14518c2ecf20Sopenharmony_ci			desc->c_new = 1;
14528c2ecf20Sopenharmony_ci			desc->c_needs_zero = 1;
14538c2ecf20Sopenharmony_ci			desc->c_clear_unwritten = 1;
14548c2ecf20Sopenharmony_ci			*clusters_to_alloc = *clusters_to_alloc + 1;
14558c2ecf20Sopenharmony_ci		}
14568c2ecf20Sopenharmony_ci
14578c2ecf20Sopenharmony_ci		if (ext_flags & OCFS2_EXT_UNWRITTEN) {
14588c2ecf20Sopenharmony_ci			desc->c_clear_unwritten = 1;
14598c2ecf20Sopenharmony_ci			desc->c_needs_zero = 1;
14608c2ecf20Sopenharmony_ci		}
14618c2ecf20Sopenharmony_ci
14628c2ecf20Sopenharmony_ci		ret = ocfs2_unwritten_check(inode, wc, desc);
14638c2ecf20Sopenharmony_ci		if (ret) {
14648c2ecf20Sopenharmony_ci			mlog_errno(ret);
14658c2ecf20Sopenharmony_ci			goto out;
14668c2ecf20Sopenharmony_ci		}
14678c2ecf20Sopenharmony_ci
14688c2ecf20Sopenharmony_ci		num_clusters--;
14698c2ecf20Sopenharmony_ci	}
14708c2ecf20Sopenharmony_ci
14718c2ecf20Sopenharmony_ci	ret = 0;
14728c2ecf20Sopenharmony_ciout:
14738c2ecf20Sopenharmony_ci	return ret;
14748c2ecf20Sopenharmony_ci}
14758c2ecf20Sopenharmony_ci
14768c2ecf20Sopenharmony_cistatic int ocfs2_write_begin_inline(struct address_space *mapping,
14778c2ecf20Sopenharmony_ci				    struct inode *inode,
14788c2ecf20Sopenharmony_ci				    struct ocfs2_write_ctxt *wc)
14798c2ecf20Sopenharmony_ci{
14808c2ecf20Sopenharmony_ci	int ret;
14818c2ecf20Sopenharmony_ci	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
14828c2ecf20Sopenharmony_ci	struct page *page;
14838c2ecf20Sopenharmony_ci	handle_t *handle;
14848c2ecf20Sopenharmony_ci	struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
14858c2ecf20Sopenharmony_ci
14868c2ecf20Sopenharmony_ci	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
14878c2ecf20Sopenharmony_ci	if (IS_ERR(handle)) {
14888c2ecf20Sopenharmony_ci		ret = PTR_ERR(handle);
14898c2ecf20Sopenharmony_ci		mlog_errno(ret);
14908c2ecf20Sopenharmony_ci		goto out;
14918c2ecf20Sopenharmony_ci	}
14928c2ecf20Sopenharmony_ci
14938c2ecf20Sopenharmony_ci	page = find_or_create_page(mapping, 0, GFP_NOFS);
14948c2ecf20Sopenharmony_ci	if (!page) {
14958c2ecf20Sopenharmony_ci		ocfs2_commit_trans(osb, handle);
14968c2ecf20Sopenharmony_ci		ret = -ENOMEM;
14978c2ecf20Sopenharmony_ci		mlog_errno(ret);
14988c2ecf20Sopenharmony_ci		goto out;
14998c2ecf20Sopenharmony_ci	}
15008c2ecf20Sopenharmony_ci	/*
15018c2ecf20Sopenharmony_ci	 * If we don't set w_num_pages then this page won't get unlocked
15028c2ecf20Sopenharmony_ci	 * and freed on cleanup of the write context.
15038c2ecf20Sopenharmony_ci	 */
15048c2ecf20Sopenharmony_ci	wc->w_pages[0] = wc->w_target_page = page;
15058c2ecf20Sopenharmony_ci	wc->w_num_pages = 1;
15068c2ecf20Sopenharmony_ci
15078c2ecf20Sopenharmony_ci	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh,
15088c2ecf20Sopenharmony_ci				      OCFS2_JOURNAL_ACCESS_WRITE);
15098c2ecf20Sopenharmony_ci	if (ret) {
15108c2ecf20Sopenharmony_ci		ocfs2_commit_trans(osb, handle);
15118c2ecf20Sopenharmony_ci
15128c2ecf20Sopenharmony_ci		mlog_errno(ret);
15138c2ecf20Sopenharmony_ci		goto out;
15148c2ecf20Sopenharmony_ci	}
15158c2ecf20Sopenharmony_ci
15168c2ecf20Sopenharmony_ci	if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL))
15178c2ecf20Sopenharmony_ci		ocfs2_set_inode_data_inline(inode, di);
15188c2ecf20Sopenharmony_ci
15198c2ecf20Sopenharmony_ci	if (!PageUptodate(page)) {
15208c2ecf20Sopenharmony_ci		ret = ocfs2_read_inline_data(inode, page, wc->w_di_bh);
15218c2ecf20Sopenharmony_ci		if (ret) {
15228c2ecf20Sopenharmony_ci			ocfs2_commit_trans(osb, handle);
15238c2ecf20Sopenharmony_ci
15248c2ecf20Sopenharmony_ci			goto out;
15258c2ecf20Sopenharmony_ci		}
15268c2ecf20Sopenharmony_ci	}
15278c2ecf20Sopenharmony_ci
15288c2ecf20Sopenharmony_ci	wc->w_handle = handle;
15298c2ecf20Sopenharmony_ciout:
15308c2ecf20Sopenharmony_ci	return ret;
15318c2ecf20Sopenharmony_ci}
15328c2ecf20Sopenharmony_ci
15338c2ecf20Sopenharmony_ciint ocfs2_size_fits_inline_data(struct buffer_head *di_bh, u64 new_size)
15348c2ecf20Sopenharmony_ci{
15358c2ecf20Sopenharmony_ci	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
15368c2ecf20Sopenharmony_ci
15378c2ecf20Sopenharmony_ci	if (new_size <= le16_to_cpu(di->id2.i_data.id_count))
15388c2ecf20Sopenharmony_ci		return 1;
15398c2ecf20Sopenharmony_ci	return 0;
15408c2ecf20Sopenharmony_ci}
15418c2ecf20Sopenharmony_ci
15428c2ecf20Sopenharmony_cistatic int ocfs2_try_to_write_inline_data(struct address_space *mapping,
15438c2ecf20Sopenharmony_ci					  struct inode *inode, loff_t pos,
15448c2ecf20Sopenharmony_ci					  unsigned len, struct page *mmap_page,
15458c2ecf20Sopenharmony_ci					  struct ocfs2_write_ctxt *wc)
15468c2ecf20Sopenharmony_ci{
15478c2ecf20Sopenharmony_ci	int ret, written = 0;
15488c2ecf20Sopenharmony_ci	loff_t end = pos + len;
15498c2ecf20Sopenharmony_ci	struct ocfs2_inode_info *oi = OCFS2_I(inode);
15508c2ecf20Sopenharmony_ci	struct ocfs2_dinode *di = NULL;
15518c2ecf20Sopenharmony_ci
15528c2ecf20Sopenharmony_ci	trace_ocfs2_try_to_write_inline_data((unsigned long long)oi->ip_blkno,
15538c2ecf20Sopenharmony_ci					     len, (unsigned long long)pos,
15548c2ecf20Sopenharmony_ci					     oi->ip_dyn_features);
15558c2ecf20Sopenharmony_ci
15568c2ecf20Sopenharmony_ci	/*
15578c2ecf20Sopenharmony_ci	 * Handle inodes which already have inline data 1st.
15588c2ecf20Sopenharmony_ci	 */
15598c2ecf20Sopenharmony_ci	if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
15608c2ecf20Sopenharmony_ci		if (mmap_page == NULL &&
15618c2ecf20Sopenharmony_ci		    ocfs2_size_fits_inline_data(wc->w_di_bh, end))
15628c2ecf20Sopenharmony_ci			goto do_inline_write;
15638c2ecf20Sopenharmony_ci
15648c2ecf20Sopenharmony_ci		/*
15658c2ecf20Sopenharmony_ci		 * The write won't fit - we have to give this inode an
15668c2ecf20Sopenharmony_ci		 * inline extent list now.
15678c2ecf20Sopenharmony_ci		 */
15688c2ecf20Sopenharmony_ci		ret = ocfs2_convert_inline_data_to_extents(inode, wc->w_di_bh);
15698c2ecf20Sopenharmony_ci		if (ret)
15708c2ecf20Sopenharmony_ci			mlog_errno(ret);
15718c2ecf20Sopenharmony_ci		goto out;
15728c2ecf20Sopenharmony_ci	}
15738c2ecf20Sopenharmony_ci
15748c2ecf20Sopenharmony_ci	/*
15758c2ecf20Sopenharmony_ci	 * Check whether the inode can accept inline data.
15768c2ecf20Sopenharmony_ci	 */
15778c2ecf20Sopenharmony_ci	if (oi->ip_clusters != 0 || i_size_read(inode) != 0)
15788c2ecf20Sopenharmony_ci		return 0;
15798c2ecf20Sopenharmony_ci
15808c2ecf20Sopenharmony_ci	/*
15818c2ecf20Sopenharmony_ci	 * Check whether the write can fit.
15828c2ecf20Sopenharmony_ci	 */
15838c2ecf20Sopenharmony_ci	di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
15848c2ecf20Sopenharmony_ci	if (mmap_page ||
15858c2ecf20Sopenharmony_ci	    end > ocfs2_max_inline_data_with_xattr(inode->i_sb, di))
15868c2ecf20Sopenharmony_ci		return 0;
15878c2ecf20Sopenharmony_ci
15888c2ecf20Sopenharmony_cido_inline_write:
15898c2ecf20Sopenharmony_ci	ret = ocfs2_write_begin_inline(mapping, inode, wc);
15908c2ecf20Sopenharmony_ci	if (ret) {
15918c2ecf20Sopenharmony_ci		mlog_errno(ret);
15928c2ecf20Sopenharmony_ci		goto out;
15938c2ecf20Sopenharmony_ci	}
15948c2ecf20Sopenharmony_ci
15958c2ecf20Sopenharmony_ci	/*
15968c2ecf20Sopenharmony_ci	 * This signals to the caller that the data can be written
15978c2ecf20Sopenharmony_ci	 * inline.
15988c2ecf20Sopenharmony_ci	 */
15998c2ecf20Sopenharmony_ci	written = 1;
16008c2ecf20Sopenharmony_ciout:
16018c2ecf20Sopenharmony_ci	return written ? written : ret;
16028c2ecf20Sopenharmony_ci}
16038c2ecf20Sopenharmony_ci
16048c2ecf20Sopenharmony_ci/*
16058c2ecf20Sopenharmony_ci * This function only does anything for file systems which can't
16068c2ecf20Sopenharmony_ci * handle sparse files.
16078c2ecf20Sopenharmony_ci *
16088c2ecf20Sopenharmony_ci * What we want to do here is fill in any hole between the current end
16098c2ecf20Sopenharmony_ci * of allocation and the end of our write. That way the rest of the
16108c2ecf20Sopenharmony_ci * write path can treat it as an non-allocating write, which has no
16118c2ecf20Sopenharmony_ci * special case code for sparse/nonsparse files.
16128c2ecf20Sopenharmony_ci */
16138c2ecf20Sopenharmony_cistatic int ocfs2_expand_nonsparse_inode(struct inode *inode,
16148c2ecf20Sopenharmony_ci					struct buffer_head *di_bh,
16158c2ecf20Sopenharmony_ci					loff_t pos, unsigned len,
16168c2ecf20Sopenharmony_ci					struct ocfs2_write_ctxt *wc)
16178c2ecf20Sopenharmony_ci{
16188c2ecf20Sopenharmony_ci	int ret;
16198c2ecf20Sopenharmony_ci	loff_t newsize = pos + len;
16208c2ecf20Sopenharmony_ci
16218c2ecf20Sopenharmony_ci	BUG_ON(ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)));
16228c2ecf20Sopenharmony_ci
16238c2ecf20Sopenharmony_ci	if (newsize <= i_size_read(inode))
16248c2ecf20Sopenharmony_ci		return 0;
16258c2ecf20Sopenharmony_ci
16268c2ecf20Sopenharmony_ci	ret = ocfs2_extend_no_holes(inode, di_bh, newsize, pos);
16278c2ecf20Sopenharmony_ci	if (ret)
16288c2ecf20Sopenharmony_ci		mlog_errno(ret);
16298c2ecf20Sopenharmony_ci
16308c2ecf20Sopenharmony_ci	/* There is no wc if this is call from direct. */
16318c2ecf20Sopenharmony_ci	if (wc)
16328c2ecf20Sopenharmony_ci		wc->w_first_new_cpos =
16338c2ecf20Sopenharmony_ci			ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode));
16348c2ecf20Sopenharmony_ci
16358c2ecf20Sopenharmony_ci	return ret;
16368c2ecf20Sopenharmony_ci}
16378c2ecf20Sopenharmony_ci
16388c2ecf20Sopenharmony_cistatic int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh,
16398c2ecf20Sopenharmony_ci			   loff_t pos)
16408c2ecf20Sopenharmony_ci{
16418c2ecf20Sopenharmony_ci	int ret = 0;
16428c2ecf20Sopenharmony_ci
16438c2ecf20Sopenharmony_ci	BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)));
16448c2ecf20Sopenharmony_ci	if (pos > i_size_read(inode))
16458c2ecf20Sopenharmony_ci		ret = ocfs2_zero_extend(inode, di_bh, pos);
16468c2ecf20Sopenharmony_ci
16478c2ecf20Sopenharmony_ci	return ret;
16488c2ecf20Sopenharmony_ci}
16498c2ecf20Sopenharmony_ci
16508c2ecf20Sopenharmony_ciint ocfs2_write_begin_nolock(struct address_space *mapping,
16518c2ecf20Sopenharmony_ci			     loff_t pos, unsigned len, ocfs2_write_type_t type,
16528c2ecf20Sopenharmony_ci			     struct page **pagep, void **fsdata,
16538c2ecf20Sopenharmony_ci			     struct buffer_head *di_bh, struct page *mmap_page)
16548c2ecf20Sopenharmony_ci{
16558c2ecf20Sopenharmony_ci	int ret, cluster_of_pages, credits = OCFS2_INODE_UPDATE_CREDITS;
16568c2ecf20Sopenharmony_ci	unsigned int clusters_to_alloc, extents_to_split, clusters_need = 0;
16578c2ecf20Sopenharmony_ci	struct ocfs2_write_ctxt *wc;
16588c2ecf20Sopenharmony_ci	struct inode *inode = mapping->host;
16598c2ecf20Sopenharmony_ci	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
16608c2ecf20Sopenharmony_ci	struct ocfs2_dinode *di;
16618c2ecf20Sopenharmony_ci	struct ocfs2_alloc_context *data_ac = NULL;
16628c2ecf20Sopenharmony_ci	struct ocfs2_alloc_context *meta_ac = NULL;
16638c2ecf20Sopenharmony_ci	handle_t *handle;
16648c2ecf20Sopenharmony_ci	struct ocfs2_extent_tree et;
16658c2ecf20Sopenharmony_ci	int try_free = 1, ret1;
16668c2ecf20Sopenharmony_ci
16678c2ecf20Sopenharmony_citry_again:
16688c2ecf20Sopenharmony_ci	ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, type, di_bh);
16698c2ecf20Sopenharmony_ci	if (ret) {
16708c2ecf20Sopenharmony_ci		mlog_errno(ret);
16718c2ecf20Sopenharmony_ci		return ret;
16728c2ecf20Sopenharmony_ci	}
16738c2ecf20Sopenharmony_ci
16748c2ecf20Sopenharmony_ci	if (ocfs2_supports_inline_data(osb)) {
16758c2ecf20Sopenharmony_ci		ret = ocfs2_try_to_write_inline_data(mapping, inode, pos, len,
16768c2ecf20Sopenharmony_ci						     mmap_page, wc);
16778c2ecf20Sopenharmony_ci		if (ret == 1) {
16788c2ecf20Sopenharmony_ci			ret = 0;
16798c2ecf20Sopenharmony_ci			goto success;
16808c2ecf20Sopenharmony_ci		}
16818c2ecf20Sopenharmony_ci		if (ret < 0) {
16828c2ecf20Sopenharmony_ci			mlog_errno(ret);
16838c2ecf20Sopenharmony_ci			goto out;
16848c2ecf20Sopenharmony_ci		}
16858c2ecf20Sopenharmony_ci	}
16868c2ecf20Sopenharmony_ci
16878c2ecf20Sopenharmony_ci	/* Direct io change i_size late, should not zero tail here. */
16888c2ecf20Sopenharmony_ci	if (type != OCFS2_WRITE_DIRECT) {
16898c2ecf20Sopenharmony_ci		if (ocfs2_sparse_alloc(osb))
16908c2ecf20Sopenharmony_ci			ret = ocfs2_zero_tail(inode, di_bh, pos);
16918c2ecf20Sopenharmony_ci		else
16928c2ecf20Sopenharmony_ci			ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos,
16938c2ecf20Sopenharmony_ci							   len, wc);
16948c2ecf20Sopenharmony_ci		if (ret) {
16958c2ecf20Sopenharmony_ci			mlog_errno(ret);
16968c2ecf20Sopenharmony_ci			goto out;
16978c2ecf20Sopenharmony_ci		}
16988c2ecf20Sopenharmony_ci	}
16998c2ecf20Sopenharmony_ci
17008c2ecf20Sopenharmony_ci	ret = ocfs2_check_range_for_refcount(inode, pos, len);
17018c2ecf20Sopenharmony_ci	if (ret < 0) {
17028c2ecf20Sopenharmony_ci		mlog_errno(ret);
17038c2ecf20Sopenharmony_ci		goto out;
17048c2ecf20Sopenharmony_ci	} else if (ret == 1) {
17058c2ecf20Sopenharmony_ci		clusters_need = wc->w_clen;
17068c2ecf20Sopenharmony_ci		ret = ocfs2_refcount_cow(inode, di_bh,
17078c2ecf20Sopenharmony_ci					 wc->w_cpos, wc->w_clen, UINT_MAX);
17088c2ecf20Sopenharmony_ci		if (ret) {
17098c2ecf20Sopenharmony_ci			mlog_errno(ret);
17108c2ecf20Sopenharmony_ci			goto out;
17118c2ecf20Sopenharmony_ci		}
17128c2ecf20Sopenharmony_ci	}
17138c2ecf20Sopenharmony_ci
17148c2ecf20Sopenharmony_ci	ret = ocfs2_populate_write_desc(inode, wc, &clusters_to_alloc,
17158c2ecf20Sopenharmony_ci					&extents_to_split);
17168c2ecf20Sopenharmony_ci	if (ret) {
17178c2ecf20Sopenharmony_ci		mlog_errno(ret);
17188c2ecf20Sopenharmony_ci		goto out;
17198c2ecf20Sopenharmony_ci	}
17208c2ecf20Sopenharmony_ci	clusters_need += clusters_to_alloc;
17218c2ecf20Sopenharmony_ci
17228c2ecf20Sopenharmony_ci	di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
17238c2ecf20Sopenharmony_ci
17248c2ecf20Sopenharmony_ci	trace_ocfs2_write_begin_nolock(
17258c2ecf20Sopenharmony_ci			(unsigned long long)OCFS2_I(inode)->ip_blkno,
17268c2ecf20Sopenharmony_ci			(long long)i_size_read(inode),
17278c2ecf20Sopenharmony_ci			le32_to_cpu(di->i_clusters),
17288c2ecf20Sopenharmony_ci			pos, len, type, mmap_page,
17298c2ecf20Sopenharmony_ci			clusters_to_alloc, extents_to_split);
17308c2ecf20Sopenharmony_ci
17318c2ecf20Sopenharmony_ci	/*
17328c2ecf20Sopenharmony_ci	 * We set w_target_from, w_target_to here so that
17338c2ecf20Sopenharmony_ci	 * ocfs2_write_end() knows which range in the target page to
17348c2ecf20Sopenharmony_ci	 * write out. An allocation requires that we write the entire
17358c2ecf20Sopenharmony_ci	 * cluster range.
17368c2ecf20Sopenharmony_ci	 */
17378c2ecf20Sopenharmony_ci	if (clusters_to_alloc || extents_to_split) {
17388c2ecf20Sopenharmony_ci		/*
17398c2ecf20Sopenharmony_ci		 * XXX: We are stretching the limits of
17408c2ecf20Sopenharmony_ci		 * ocfs2_lock_allocators(). It greatly over-estimates
17418c2ecf20Sopenharmony_ci		 * the work to be done.
17428c2ecf20Sopenharmony_ci		 */
17438c2ecf20Sopenharmony_ci		ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode),
17448c2ecf20Sopenharmony_ci					      wc->w_di_bh);
17458c2ecf20Sopenharmony_ci		ret = ocfs2_lock_allocators(inode, &et,
17468c2ecf20Sopenharmony_ci					    clusters_to_alloc, extents_to_split,
17478c2ecf20Sopenharmony_ci					    &data_ac, &meta_ac);
17488c2ecf20Sopenharmony_ci		if (ret) {
17498c2ecf20Sopenharmony_ci			mlog_errno(ret);
17508c2ecf20Sopenharmony_ci			goto out;
17518c2ecf20Sopenharmony_ci		}
17528c2ecf20Sopenharmony_ci
17538c2ecf20Sopenharmony_ci		if (data_ac)
17548c2ecf20Sopenharmony_ci			data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
17558c2ecf20Sopenharmony_ci
17568c2ecf20Sopenharmony_ci		credits = ocfs2_calc_extend_credits(inode->i_sb,
17578c2ecf20Sopenharmony_ci						    &di->id2.i_list);
17588c2ecf20Sopenharmony_ci	} else if (type == OCFS2_WRITE_DIRECT)
17598c2ecf20Sopenharmony_ci		/* direct write needs not to start trans if no extents alloc. */
17608c2ecf20Sopenharmony_ci		goto success;
17618c2ecf20Sopenharmony_ci
17628c2ecf20Sopenharmony_ci	/*
17638c2ecf20Sopenharmony_ci	 * We have to zero sparse allocated clusters, unwritten extent clusters,
17648c2ecf20Sopenharmony_ci	 * and non-sparse clusters we just extended.  For non-sparse writes,
17658c2ecf20Sopenharmony_ci	 * we know zeros will only be needed in the first and/or last cluster.
17668c2ecf20Sopenharmony_ci	 */
17678c2ecf20Sopenharmony_ci	if (wc->w_clen && (wc->w_desc[0].c_needs_zero ||
17688c2ecf20Sopenharmony_ci			   wc->w_desc[wc->w_clen - 1].c_needs_zero))
17698c2ecf20Sopenharmony_ci		cluster_of_pages = 1;
17708c2ecf20Sopenharmony_ci	else
17718c2ecf20Sopenharmony_ci		cluster_of_pages = 0;
17728c2ecf20Sopenharmony_ci
17738c2ecf20Sopenharmony_ci	ocfs2_set_target_boundaries(osb, wc, pos, len, cluster_of_pages);
17748c2ecf20Sopenharmony_ci
17758c2ecf20Sopenharmony_ci	handle = ocfs2_start_trans(osb, credits);
17768c2ecf20Sopenharmony_ci	if (IS_ERR(handle)) {
17778c2ecf20Sopenharmony_ci		ret = PTR_ERR(handle);
17788c2ecf20Sopenharmony_ci		mlog_errno(ret);
17798c2ecf20Sopenharmony_ci		goto out;
17808c2ecf20Sopenharmony_ci	}
17818c2ecf20Sopenharmony_ci
17828c2ecf20Sopenharmony_ci	wc->w_handle = handle;
17838c2ecf20Sopenharmony_ci
17848c2ecf20Sopenharmony_ci	if (clusters_to_alloc) {
17858c2ecf20Sopenharmony_ci		ret = dquot_alloc_space_nodirty(inode,
17868c2ecf20Sopenharmony_ci			ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc));
17878c2ecf20Sopenharmony_ci		if (ret)
17888c2ecf20Sopenharmony_ci			goto out_commit;
17898c2ecf20Sopenharmony_ci	}
17908c2ecf20Sopenharmony_ci
17918c2ecf20Sopenharmony_ci	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh,
17928c2ecf20Sopenharmony_ci				      OCFS2_JOURNAL_ACCESS_WRITE);
17938c2ecf20Sopenharmony_ci	if (ret) {
17948c2ecf20Sopenharmony_ci		mlog_errno(ret);
17958c2ecf20Sopenharmony_ci		goto out_quota;
17968c2ecf20Sopenharmony_ci	}
17978c2ecf20Sopenharmony_ci
17988c2ecf20Sopenharmony_ci	/*
17998c2ecf20Sopenharmony_ci	 * Fill our page array first. That way we've grabbed enough so
18008c2ecf20Sopenharmony_ci	 * that we can zero and flush if we error after adding the
18018c2ecf20Sopenharmony_ci	 * extent.
18028c2ecf20Sopenharmony_ci	 */
18038c2ecf20Sopenharmony_ci	ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, len,
18048c2ecf20Sopenharmony_ci					 cluster_of_pages, mmap_page);
18058c2ecf20Sopenharmony_ci	if (ret && ret != -EAGAIN) {
18068c2ecf20Sopenharmony_ci		mlog_errno(ret);
18078c2ecf20Sopenharmony_ci		goto out_quota;
18088c2ecf20Sopenharmony_ci	}
18098c2ecf20Sopenharmony_ci
18108c2ecf20Sopenharmony_ci	/*
18118c2ecf20Sopenharmony_ci	 * ocfs2_grab_pages_for_write() returns -EAGAIN if it could not lock
18128c2ecf20Sopenharmony_ci	 * the target page. In this case, we exit with no error and no target
18138c2ecf20Sopenharmony_ci	 * page. This will trigger the caller, page_mkwrite(), to re-try
18148c2ecf20Sopenharmony_ci	 * the operation.
18158c2ecf20Sopenharmony_ci	 */
18168c2ecf20Sopenharmony_ci	if (ret == -EAGAIN) {
18178c2ecf20Sopenharmony_ci		BUG_ON(wc->w_target_page);
18188c2ecf20Sopenharmony_ci		ret = 0;
18198c2ecf20Sopenharmony_ci		goto out_quota;
18208c2ecf20Sopenharmony_ci	}
18218c2ecf20Sopenharmony_ci
18228c2ecf20Sopenharmony_ci	ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos,
18238c2ecf20Sopenharmony_ci					  len);
18248c2ecf20Sopenharmony_ci	if (ret) {
18258c2ecf20Sopenharmony_ci		mlog_errno(ret);
18268c2ecf20Sopenharmony_ci		goto out_quota;
18278c2ecf20Sopenharmony_ci	}
18288c2ecf20Sopenharmony_ci
18298c2ecf20Sopenharmony_ci	if (data_ac)
18308c2ecf20Sopenharmony_ci		ocfs2_free_alloc_context(data_ac);
18318c2ecf20Sopenharmony_ci	if (meta_ac)
18328c2ecf20Sopenharmony_ci		ocfs2_free_alloc_context(meta_ac);
18338c2ecf20Sopenharmony_ci
18348c2ecf20Sopenharmony_cisuccess:
18358c2ecf20Sopenharmony_ci	if (pagep)
18368c2ecf20Sopenharmony_ci		*pagep = wc->w_target_page;
18378c2ecf20Sopenharmony_ci	*fsdata = wc;
18388c2ecf20Sopenharmony_ci	return 0;
18398c2ecf20Sopenharmony_ciout_quota:
18408c2ecf20Sopenharmony_ci	if (clusters_to_alloc)
18418c2ecf20Sopenharmony_ci		dquot_free_space(inode,
18428c2ecf20Sopenharmony_ci			  ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc));
18438c2ecf20Sopenharmony_ciout_commit:
18448c2ecf20Sopenharmony_ci	ocfs2_commit_trans(osb, handle);
18458c2ecf20Sopenharmony_ci
18468c2ecf20Sopenharmony_ciout:
18478c2ecf20Sopenharmony_ci	/*
18488c2ecf20Sopenharmony_ci	 * The mmapped page won't be unlocked in ocfs2_free_write_ctxt(),
18498c2ecf20Sopenharmony_ci	 * even in case of error here like ENOSPC and ENOMEM. So, we need
18508c2ecf20Sopenharmony_ci	 * to unlock the target page manually to prevent deadlocks when
18518c2ecf20Sopenharmony_ci	 * retrying again on ENOSPC, or when returning non-VM_FAULT_LOCKED
18528c2ecf20Sopenharmony_ci	 * to VM code.
18538c2ecf20Sopenharmony_ci	 */
18548c2ecf20Sopenharmony_ci	if (wc->w_target_locked)
18558c2ecf20Sopenharmony_ci		unlock_page(mmap_page);
18568c2ecf20Sopenharmony_ci
18578c2ecf20Sopenharmony_ci	ocfs2_free_write_ctxt(inode, wc);
18588c2ecf20Sopenharmony_ci
18598c2ecf20Sopenharmony_ci	if (data_ac) {
18608c2ecf20Sopenharmony_ci		ocfs2_free_alloc_context(data_ac);
18618c2ecf20Sopenharmony_ci		data_ac = NULL;
18628c2ecf20Sopenharmony_ci	}
18638c2ecf20Sopenharmony_ci	if (meta_ac) {
18648c2ecf20Sopenharmony_ci		ocfs2_free_alloc_context(meta_ac);
18658c2ecf20Sopenharmony_ci		meta_ac = NULL;
18668c2ecf20Sopenharmony_ci	}
18678c2ecf20Sopenharmony_ci
18688c2ecf20Sopenharmony_ci	if (ret == -ENOSPC && try_free) {
18698c2ecf20Sopenharmony_ci		/*
18708c2ecf20Sopenharmony_ci		 * Try to free some truncate log so that we can have enough
18718c2ecf20Sopenharmony_ci		 * clusters to allocate.
18728c2ecf20Sopenharmony_ci		 */
18738c2ecf20Sopenharmony_ci		try_free = 0;
18748c2ecf20Sopenharmony_ci
18758c2ecf20Sopenharmony_ci		ret1 = ocfs2_try_to_free_truncate_log(osb, clusters_need);
18768c2ecf20Sopenharmony_ci		if (ret1 == 1)
18778c2ecf20Sopenharmony_ci			goto try_again;
18788c2ecf20Sopenharmony_ci
18798c2ecf20Sopenharmony_ci		if (ret1 < 0)
18808c2ecf20Sopenharmony_ci			mlog_errno(ret1);
18818c2ecf20Sopenharmony_ci	}
18828c2ecf20Sopenharmony_ci
18838c2ecf20Sopenharmony_ci	return ret;
18848c2ecf20Sopenharmony_ci}
18858c2ecf20Sopenharmony_ci
18868c2ecf20Sopenharmony_cistatic int ocfs2_write_begin(struct file *file, struct address_space *mapping,
18878c2ecf20Sopenharmony_ci			     loff_t pos, unsigned len, unsigned flags,
18888c2ecf20Sopenharmony_ci			     struct page **pagep, void **fsdata)
18898c2ecf20Sopenharmony_ci{
18908c2ecf20Sopenharmony_ci	int ret;
18918c2ecf20Sopenharmony_ci	struct buffer_head *di_bh = NULL;
18928c2ecf20Sopenharmony_ci	struct inode *inode = mapping->host;
18938c2ecf20Sopenharmony_ci
18948c2ecf20Sopenharmony_ci	ret = ocfs2_inode_lock(inode, &di_bh, 1);
18958c2ecf20Sopenharmony_ci	if (ret) {
18968c2ecf20Sopenharmony_ci		mlog_errno(ret);
18978c2ecf20Sopenharmony_ci		return ret;
18988c2ecf20Sopenharmony_ci	}
18998c2ecf20Sopenharmony_ci
19008c2ecf20Sopenharmony_ci	/*
19018c2ecf20Sopenharmony_ci	 * Take alloc sem here to prevent concurrent lookups. That way
19028c2ecf20Sopenharmony_ci	 * the mapping, zeroing and tree manipulation within
19038c2ecf20Sopenharmony_ci	 * ocfs2_write() will be safe against ->readpage(). This
19048c2ecf20Sopenharmony_ci	 * should also serve to lock out allocation from a shared
19058c2ecf20Sopenharmony_ci	 * writeable region.
19068c2ecf20Sopenharmony_ci	 */
19078c2ecf20Sopenharmony_ci	down_write(&OCFS2_I(inode)->ip_alloc_sem);
19088c2ecf20Sopenharmony_ci
19098c2ecf20Sopenharmony_ci	ret = ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_BUFFER,
19108c2ecf20Sopenharmony_ci				       pagep, fsdata, di_bh, NULL);
19118c2ecf20Sopenharmony_ci	if (ret) {
19128c2ecf20Sopenharmony_ci		mlog_errno(ret);
19138c2ecf20Sopenharmony_ci		goto out_fail;
19148c2ecf20Sopenharmony_ci	}
19158c2ecf20Sopenharmony_ci
19168c2ecf20Sopenharmony_ci	brelse(di_bh);
19178c2ecf20Sopenharmony_ci
19188c2ecf20Sopenharmony_ci	return 0;
19198c2ecf20Sopenharmony_ci
19208c2ecf20Sopenharmony_ciout_fail:
19218c2ecf20Sopenharmony_ci	up_write(&OCFS2_I(inode)->ip_alloc_sem);
19228c2ecf20Sopenharmony_ci
19238c2ecf20Sopenharmony_ci	brelse(di_bh);
19248c2ecf20Sopenharmony_ci	ocfs2_inode_unlock(inode, 1);
19258c2ecf20Sopenharmony_ci
19268c2ecf20Sopenharmony_ci	return ret;
19278c2ecf20Sopenharmony_ci}
19288c2ecf20Sopenharmony_ci
19298c2ecf20Sopenharmony_cistatic void ocfs2_write_end_inline(struct inode *inode, loff_t pos,
19308c2ecf20Sopenharmony_ci				   unsigned len, unsigned *copied,
19318c2ecf20Sopenharmony_ci				   struct ocfs2_dinode *di,
19328c2ecf20Sopenharmony_ci				   struct ocfs2_write_ctxt *wc)
19338c2ecf20Sopenharmony_ci{
19348c2ecf20Sopenharmony_ci	void *kaddr;
19358c2ecf20Sopenharmony_ci
19368c2ecf20Sopenharmony_ci	if (unlikely(*copied < len)) {
19378c2ecf20Sopenharmony_ci		if (!PageUptodate(wc->w_target_page)) {
19388c2ecf20Sopenharmony_ci			*copied = 0;
19398c2ecf20Sopenharmony_ci			return;
19408c2ecf20Sopenharmony_ci		}
19418c2ecf20Sopenharmony_ci	}
19428c2ecf20Sopenharmony_ci
19438c2ecf20Sopenharmony_ci	kaddr = kmap_atomic(wc->w_target_page);
19448c2ecf20Sopenharmony_ci	memcpy(di->id2.i_data.id_data + pos, kaddr + pos, *copied);
19458c2ecf20Sopenharmony_ci	kunmap_atomic(kaddr);
19468c2ecf20Sopenharmony_ci
19478c2ecf20Sopenharmony_ci	trace_ocfs2_write_end_inline(
19488c2ecf20Sopenharmony_ci	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
19498c2ecf20Sopenharmony_ci	     (unsigned long long)pos, *copied,
19508c2ecf20Sopenharmony_ci	     le16_to_cpu(di->id2.i_data.id_count),
19518c2ecf20Sopenharmony_ci	     le16_to_cpu(di->i_dyn_features));
19528c2ecf20Sopenharmony_ci}
19538c2ecf20Sopenharmony_ci
19548c2ecf20Sopenharmony_ciint ocfs2_write_end_nolock(struct address_space *mapping,
19558c2ecf20Sopenharmony_ci			   loff_t pos, unsigned len, unsigned copied, void *fsdata)
19568c2ecf20Sopenharmony_ci{
19578c2ecf20Sopenharmony_ci	int i, ret;
19588c2ecf20Sopenharmony_ci	unsigned from, to, start = pos & (PAGE_SIZE - 1);
19598c2ecf20Sopenharmony_ci	struct inode *inode = mapping->host;
19608c2ecf20Sopenharmony_ci	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
19618c2ecf20Sopenharmony_ci	struct ocfs2_write_ctxt *wc = fsdata;
19628c2ecf20Sopenharmony_ci	struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
19638c2ecf20Sopenharmony_ci	handle_t *handle = wc->w_handle;
19648c2ecf20Sopenharmony_ci	struct page *tmppage;
19658c2ecf20Sopenharmony_ci
19668c2ecf20Sopenharmony_ci	BUG_ON(!list_empty(&wc->w_unwritten_list));
19678c2ecf20Sopenharmony_ci
19688c2ecf20Sopenharmony_ci	if (handle) {
19698c2ecf20Sopenharmony_ci		ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode),
19708c2ecf20Sopenharmony_ci				wc->w_di_bh, OCFS2_JOURNAL_ACCESS_WRITE);
19718c2ecf20Sopenharmony_ci		if (ret) {
19728c2ecf20Sopenharmony_ci			copied = ret;
19738c2ecf20Sopenharmony_ci			mlog_errno(ret);
19748c2ecf20Sopenharmony_ci			goto out;
19758c2ecf20Sopenharmony_ci		}
19768c2ecf20Sopenharmony_ci	}
19778c2ecf20Sopenharmony_ci
19788c2ecf20Sopenharmony_ci	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
19798c2ecf20Sopenharmony_ci		ocfs2_write_end_inline(inode, pos, len, &copied, di, wc);
19808c2ecf20Sopenharmony_ci		goto out_write_size;
19818c2ecf20Sopenharmony_ci	}
19828c2ecf20Sopenharmony_ci
19838c2ecf20Sopenharmony_ci	if (unlikely(copied < len) && wc->w_target_page) {
19848c2ecf20Sopenharmony_ci		loff_t new_isize;
19858c2ecf20Sopenharmony_ci
19868c2ecf20Sopenharmony_ci		if (!PageUptodate(wc->w_target_page))
19878c2ecf20Sopenharmony_ci			copied = 0;
19888c2ecf20Sopenharmony_ci
19898c2ecf20Sopenharmony_ci		new_isize = max_t(loff_t, i_size_read(inode), pos + copied);
19908c2ecf20Sopenharmony_ci		if (new_isize > page_offset(wc->w_target_page))
19918c2ecf20Sopenharmony_ci			ocfs2_zero_new_buffers(wc->w_target_page, start+copied,
19928c2ecf20Sopenharmony_ci					       start+len);
19938c2ecf20Sopenharmony_ci		else {
19948c2ecf20Sopenharmony_ci			/*
19958c2ecf20Sopenharmony_ci			 * When page is fully beyond new isize (data copy
19968c2ecf20Sopenharmony_ci			 * failed), do not bother zeroing the page. Invalidate
19978c2ecf20Sopenharmony_ci			 * it instead so that writeback does not get confused
19988c2ecf20Sopenharmony_ci			 * put page & buffer dirty bits into inconsistent
19998c2ecf20Sopenharmony_ci			 * state.
20008c2ecf20Sopenharmony_ci			 */
20018c2ecf20Sopenharmony_ci			block_invalidatepage(wc->w_target_page, 0, PAGE_SIZE);
20028c2ecf20Sopenharmony_ci		}
20038c2ecf20Sopenharmony_ci	}
20048c2ecf20Sopenharmony_ci	if (wc->w_target_page)
20058c2ecf20Sopenharmony_ci		flush_dcache_page(wc->w_target_page);
20068c2ecf20Sopenharmony_ci
20078c2ecf20Sopenharmony_ci	for(i = 0; i < wc->w_num_pages; i++) {
20088c2ecf20Sopenharmony_ci		tmppage = wc->w_pages[i];
20098c2ecf20Sopenharmony_ci
20108c2ecf20Sopenharmony_ci		/* This is the direct io target page. */
20118c2ecf20Sopenharmony_ci		if (tmppage == NULL)
20128c2ecf20Sopenharmony_ci			continue;
20138c2ecf20Sopenharmony_ci
20148c2ecf20Sopenharmony_ci		if (tmppage == wc->w_target_page) {
20158c2ecf20Sopenharmony_ci			from = wc->w_target_from;
20168c2ecf20Sopenharmony_ci			to = wc->w_target_to;
20178c2ecf20Sopenharmony_ci
20188c2ecf20Sopenharmony_ci			BUG_ON(from > PAGE_SIZE ||
20198c2ecf20Sopenharmony_ci			       to > PAGE_SIZE ||
20208c2ecf20Sopenharmony_ci			       to < from);
20218c2ecf20Sopenharmony_ci		} else {
20228c2ecf20Sopenharmony_ci			/*
20238c2ecf20Sopenharmony_ci			 * Pages adjacent to the target (if any) imply
20248c2ecf20Sopenharmony_ci			 * a hole-filling write in which case we want
20258c2ecf20Sopenharmony_ci			 * to flush their entire range.
20268c2ecf20Sopenharmony_ci			 */
20278c2ecf20Sopenharmony_ci			from = 0;
20288c2ecf20Sopenharmony_ci			to = PAGE_SIZE;
20298c2ecf20Sopenharmony_ci		}
20308c2ecf20Sopenharmony_ci
20318c2ecf20Sopenharmony_ci		if (page_has_buffers(tmppage)) {
20328c2ecf20Sopenharmony_ci			if (handle && ocfs2_should_order_data(inode)) {
20338c2ecf20Sopenharmony_ci				loff_t start_byte =
20348c2ecf20Sopenharmony_ci					((loff_t)tmppage->index << PAGE_SHIFT) +
20358c2ecf20Sopenharmony_ci					from;
20368c2ecf20Sopenharmony_ci				loff_t length = to - from;
20378c2ecf20Sopenharmony_ci				ocfs2_jbd2_inode_add_write(handle, inode,
20388c2ecf20Sopenharmony_ci							   start_byte, length);
20398c2ecf20Sopenharmony_ci			}
20408c2ecf20Sopenharmony_ci			block_commit_write(tmppage, from, to);
20418c2ecf20Sopenharmony_ci		}
20428c2ecf20Sopenharmony_ci	}
20438c2ecf20Sopenharmony_ci
20448c2ecf20Sopenharmony_ciout_write_size:
20458c2ecf20Sopenharmony_ci	/* Direct io do not update i_size here. */
20468c2ecf20Sopenharmony_ci	if (wc->w_type != OCFS2_WRITE_DIRECT) {
20478c2ecf20Sopenharmony_ci		pos += copied;
20488c2ecf20Sopenharmony_ci		if (pos > i_size_read(inode)) {
20498c2ecf20Sopenharmony_ci			i_size_write(inode, pos);
20508c2ecf20Sopenharmony_ci			mark_inode_dirty(inode);
20518c2ecf20Sopenharmony_ci		}
20528c2ecf20Sopenharmony_ci		inode->i_blocks = ocfs2_inode_sector_count(inode);
20538c2ecf20Sopenharmony_ci		di->i_size = cpu_to_le64((u64)i_size_read(inode));
20548c2ecf20Sopenharmony_ci		inode->i_mtime = inode->i_ctime = current_time(inode);
20558c2ecf20Sopenharmony_ci		di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
20568c2ecf20Sopenharmony_ci		di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
20578c2ecf20Sopenharmony_ci		if (handle)
20588c2ecf20Sopenharmony_ci			ocfs2_update_inode_fsync_trans(handle, inode, 1);
20598c2ecf20Sopenharmony_ci	}
20608c2ecf20Sopenharmony_ci	if (handle)
20618c2ecf20Sopenharmony_ci		ocfs2_journal_dirty(handle, wc->w_di_bh);
20628c2ecf20Sopenharmony_ci
20638c2ecf20Sopenharmony_ciout:
20648c2ecf20Sopenharmony_ci	/* unlock pages before dealloc since it needs acquiring j_trans_barrier
20658c2ecf20Sopenharmony_ci	 * lock, or it will cause a deadlock since journal commit threads holds
20668c2ecf20Sopenharmony_ci	 * this lock and will ask for the page lock when flushing the data.
20678c2ecf20Sopenharmony_ci	 * put it here to preserve the unlock order.
20688c2ecf20Sopenharmony_ci	 */
20698c2ecf20Sopenharmony_ci	ocfs2_unlock_pages(wc);
20708c2ecf20Sopenharmony_ci
20718c2ecf20Sopenharmony_ci	if (handle)
20728c2ecf20Sopenharmony_ci		ocfs2_commit_trans(osb, handle);
20738c2ecf20Sopenharmony_ci
20748c2ecf20Sopenharmony_ci	ocfs2_run_deallocs(osb, &wc->w_dealloc);
20758c2ecf20Sopenharmony_ci
20768c2ecf20Sopenharmony_ci	brelse(wc->w_di_bh);
20778c2ecf20Sopenharmony_ci	kfree(wc);
20788c2ecf20Sopenharmony_ci
20798c2ecf20Sopenharmony_ci	return copied;
20808c2ecf20Sopenharmony_ci}
20818c2ecf20Sopenharmony_ci
20828c2ecf20Sopenharmony_cistatic int ocfs2_write_end(struct file *file, struct address_space *mapping,
20838c2ecf20Sopenharmony_ci			   loff_t pos, unsigned len, unsigned copied,
20848c2ecf20Sopenharmony_ci			   struct page *page, void *fsdata)
20858c2ecf20Sopenharmony_ci{
20868c2ecf20Sopenharmony_ci	int ret;
20878c2ecf20Sopenharmony_ci	struct inode *inode = mapping->host;
20888c2ecf20Sopenharmony_ci
20898c2ecf20Sopenharmony_ci	ret = ocfs2_write_end_nolock(mapping, pos, len, copied, fsdata);
20908c2ecf20Sopenharmony_ci
20918c2ecf20Sopenharmony_ci	up_write(&OCFS2_I(inode)->ip_alloc_sem);
20928c2ecf20Sopenharmony_ci	ocfs2_inode_unlock(inode, 1);
20938c2ecf20Sopenharmony_ci
20948c2ecf20Sopenharmony_ci	return ret;
20958c2ecf20Sopenharmony_ci}
20968c2ecf20Sopenharmony_ci
20978c2ecf20Sopenharmony_cistruct ocfs2_dio_write_ctxt {
20988c2ecf20Sopenharmony_ci	struct list_head	dw_zero_list;
20998c2ecf20Sopenharmony_ci	unsigned		dw_zero_count;
21008c2ecf20Sopenharmony_ci	int			dw_orphaned;
21018c2ecf20Sopenharmony_ci	pid_t			dw_writer_pid;
21028c2ecf20Sopenharmony_ci};
21038c2ecf20Sopenharmony_ci
21048c2ecf20Sopenharmony_cistatic struct ocfs2_dio_write_ctxt *
21058c2ecf20Sopenharmony_ciocfs2_dio_alloc_write_ctx(struct buffer_head *bh, int *alloc)
21068c2ecf20Sopenharmony_ci{
21078c2ecf20Sopenharmony_ci	struct ocfs2_dio_write_ctxt *dwc = NULL;
21088c2ecf20Sopenharmony_ci
21098c2ecf20Sopenharmony_ci	if (bh->b_private)
21108c2ecf20Sopenharmony_ci		return bh->b_private;
21118c2ecf20Sopenharmony_ci
21128c2ecf20Sopenharmony_ci	dwc = kmalloc(sizeof(struct ocfs2_dio_write_ctxt), GFP_NOFS);
21138c2ecf20Sopenharmony_ci	if (dwc == NULL)
21148c2ecf20Sopenharmony_ci		return NULL;
21158c2ecf20Sopenharmony_ci	INIT_LIST_HEAD(&dwc->dw_zero_list);
21168c2ecf20Sopenharmony_ci	dwc->dw_zero_count = 0;
21178c2ecf20Sopenharmony_ci	dwc->dw_orphaned = 0;
21188c2ecf20Sopenharmony_ci	dwc->dw_writer_pid = task_pid_nr(current);
21198c2ecf20Sopenharmony_ci	bh->b_private = dwc;
21208c2ecf20Sopenharmony_ci	*alloc = 1;
21218c2ecf20Sopenharmony_ci
21228c2ecf20Sopenharmony_ci	return dwc;
21238c2ecf20Sopenharmony_ci}
21248c2ecf20Sopenharmony_ci
21258c2ecf20Sopenharmony_cistatic void ocfs2_dio_free_write_ctx(struct inode *inode,
21268c2ecf20Sopenharmony_ci				     struct ocfs2_dio_write_ctxt *dwc)
21278c2ecf20Sopenharmony_ci{
21288c2ecf20Sopenharmony_ci	ocfs2_free_unwritten_list(inode, &dwc->dw_zero_list);
21298c2ecf20Sopenharmony_ci	kfree(dwc);
21308c2ecf20Sopenharmony_ci}
21318c2ecf20Sopenharmony_ci
21328c2ecf20Sopenharmony_ci/*
21338c2ecf20Sopenharmony_ci * TODO: Make this into a generic get_blocks function.
21348c2ecf20Sopenharmony_ci *
21358c2ecf20Sopenharmony_ci * From do_direct_io in direct-io.c:
21368c2ecf20Sopenharmony_ci *  "So what we do is to permit the ->get_blocks function to populate
21378c2ecf20Sopenharmony_ci *   bh.b_size with the size of IO which is permitted at this offset and
21388c2ecf20Sopenharmony_ci *   this i_blkbits."
21398c2ecf20Sopenharmony_ci *
21408c2ecf20Sopenharmony_ci * This function is called directly from get_more_blocks in direct-io.c.
21418c2ecf20Sopenharmony_ci *
21428c2ecf20Sopenharmony_ci * called like this: dio->get_blocks(dio->inode, fs_startblk,
21438c2ecf20Sopenharmony_ci * 					fs_count, map_bh, dio->rw == WRITE);
21448c2ecf20Sopenharmony_ci */
21458c2ecf20Sopenharmony_cistatic int ocfs2_dio_wr_get_block(struct inode *inode, sector_t iblock,
21468c2ecf20Sopenharmony_ci			       struct buffer_head *bh_result, int create)
21478c2ecf20Sopenharmony_ci{
21488c2ecf20Sopenharmony_ci	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
21498c2ecf20Sopenharmony_ci	struct ocfs2_inode_info *oi = OCFS2_I(inode);
21508c2ecf20Sopenharmony_ci	struct ocfs2_write_ctxt *wc;
21518c2ecf20Sopenharmony_ci	struct ocfs2_write_cluster_desc *desc = NULL;
21528c2ecf20Sopenharmony_ci	struct ocfs2_dio_write_ctxt *dwc = NULL;
21538c2ecf20Sopenharmony_ci	struct buffer_head *di_bh = NULL;
21548c2ecf20Sopenharmony_ci	u64 p_blkno;
21558c2ecf20Sopenharmony_ci	unsigned int i_blkbits = inode->i_sb->s_blocksize_bits;
21568c2ecf20Sopenharmony_ci	loff_t pos = iblock << i_blkbits;
21578c2ecf20Sopenharmony_ci	sector_t endblk = (i_size_read(inode) - 1) >> i_blkbits;
21588c2ecf20Sopenharmony_ci	unsigned len, total_len = bh_result->b_size;
21598c2ecf20Sopenharmony_ci	int ret = 0, first_get_block = 0;
21608c2ecf20Sopenharmony_ci
21618c2ecf20Sopenharmony_ci	len = osb->s_clustersize - (pos & (osb->s_clustersize - 1));
21628c2ecf20Sopenharmony_ci	len = min(total_len, len);
21638c2ecf20Sopenharmony_ci
21648c2ecf20Sopenharmony_ci	/*
21658c2ecf20Sopenharmony_ci	 * bh_result->b_size is count in get_more_blocks according to write
21668c2ecf20Sopenharmony_ci	 * "pos" and "end", we need map twice to return different buffer state:
21678c2ecf20Sopenharmony_ci	 * 1. area in file size, not set NEW;
21688c2ecf20Sopenharmony_ci	 * 2. area out file size, set  NEW.
21698c2ecf20Sopenharmony_ci	 *
21708c2ecf20Sopenharmony_ci	 *		   iblock    endblk
21718c2ecf20Sopenharmony_ci	 * |--------|---------|---------|---------
21728c2ecf20Sopenharmony_ci	 * |<-------area in file------->|
21738c2ecf20Sopenharmony_ci	 */
21748c2ecf20Sopenharmony_ci
21758c2ecf20Sopenharmony_ci	if ((iblock <= endblk) &&
21768c2ecf20Sopenharmony_ci	    ((iblock + ((len - 1) >> i_blkbits)) > endblk))
21778c2ecf20Sopenharmony_ci		len = (endblk - iblock + 1) << i_blkbits;
21788c2ecf20Sopenharmony_ci
21798c2ecf20Sopenharmony_ci	mlog(0, "get block of %lu at %llu:%u req %u\n",
21808c2ecf20Sopenharmony_ci			inode->i_ino, pos, len, total_len);
21818c2ecf20Sopenharmony_ci
21828c2ecf20Sopenharmony_ci	/*
21838c2ecf20Sopenharmony_ci	 * Because we need to change file size in ocfs2_dio_end_io_write(), or
21848c2ecf20Sopenharmony_ci	 * we may need to add it to orphan dir. So can not fall to fast path
21858c2ecf20Sopenharmony_ci	 * while file size will be changed.
21868c2ecf20Sopenharmony_ci	 */
21878c2ecf20Sopenharmony_ci	if (pos + total_len <= i_size_read(inode)) {
21888c2ecf20Sopenharmony_ci
21898c2ecf20Sopenharmony_ci		/* This is the fast path for re-write. */
21908c2ecf20Sopenharmony_ci		ret = ocfs2_lock_get_block(inode, iblock, bh_result, create);
21918c2ecf20Sopenharmony_ci		if (buffer_mapped(bh_result) &&
21928c2ecf20Sopenharmony_ci		    !buffer_new(bh_result) &&
21938c2ecf20Sopenharmony_ci		    ret == 0)
21948c2ecf20Sopenharmony_ci			goto out;
21958c2ecf20Sopenharmony_ci
21968c2ecf20Sopenharmony_ci		/* Clear state set by ocfs2_get_block. */
21978c2ecf20Sopenharmony_ci		bh_result->b_state = 0;
21988c2ecf20Sopenharmony_ci	}
21998c2ecf20Sopenharmony_ci
22008c2ecf20Sopenharmony_ci	dwc = ocfs2_dio_alloc_write_ctx(bh_result, &first_get_block);
22018c2ecf20Sopenharmony_ci	if (unlikely(dwc == NULL)) {
22028c2ecf20Sopenharmony_ci		ret = -ENOMEM;
22038c2ecf20Sopenharmony_ci		mlog_errno(ret);
22048c2ecf20Sopenharmony_ci		goto out;
22058c2ecf20Sopenharmony_ci	}
22068c2ecf20Sopenharmony_ci
22078c2ecf20Sopenharmony_ci	if (ocfs2_clusters_for_bytes(inode->i_sb, pos + total_len) >
22088c2ecf20Sopenharmony_ci	    ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode)) &&
22098c2ecf20Sopenharmony_ci	    !dwc->dw_orphaned) {
22108c2ecf20Sopenharmony_ci		/*
22118c2ecf20Sopenharmony_ci		 * when we are going to alloc extents beyond file size, add the
22128c2ecf20Sopenharmony_ci		 * inode to orphan dir, so we can recall those spaces when
22138c2ecf20Sopenharmony_ci		 * system crashed during write.
22148c2ecf20Sopenharmony_ci		 */
22158c2ecf20Sopenharmony_ci		ret = ocfs2_add_inode_to_orphan(osb, inode);
22168c2ecf20Sopenharmony_ci		if (ret < 0) {
22178c2ecf20Sopenharmony_ci			mlog_errno(ret);
22188c2ecf20Sopenharmony_ci			goto out;
22198c2ecf20Sopenharmony_ci		}
22208c2ecf20Sopenharmony_ci		dwc->dw_orphaned = 1;
22218c2ecf20Sopenharmony_ci	}
22228c2ecf20Sopenharmony_ci
22238c2ecf20Sopenharmony_ci	ret = ocfs2_inode_lock(inode, &di_bh, 1);
22248c2ecf20Sopenharmony_ci	if (ret) {
22258c2ecf20Sopenharmony_ci		mlog_errno(ret);
22268c2ecf20Sopenharmony_ci		goto out;
22278c2ecf20Sopenharmony_ci	}
22288c2ecf20Sopenharmony_ci
22298c2ecf20Sopenharmony_ci	down_write(&oi->ip_alloc_sem);
22308c2ecf20Sopenharmony_ci
22318c2ecf20Sopenharmony_ci	if (first_get_block) {
22328c2ecf20Sopenharmony_ci		if (ocfs2_sparse_alloc(osb))
22338c2ecf20Sopenharmony_ci			ret = ocfs2_zero_tail(inode, di_bh, pos);
22348c2ecf20Sopenharmony_ci		else
22358c2ecf20Sopenharmony_ci			ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos,
22368c2ecf20Sopenharmony_ci							   total_len, NULL);
22378c2ecf20Sopenharmony_ci		if (ret < 0) {
22388c2ecf20Sopenharmony_ci			mlog_errno(ret);
22398c2ecf20Sopenharmony_ci			goto unlock;
22408c2ecf20Sopenharmony_ci		}
22418c2ecf20Sopenharmony_ci	}
22428c2ecf20Sopenharmony_ci
22438c2ecf20Sopenharmony_ci	ret = ocfs2_write_begin_nolock(inode->i_mapping, pos, len,
22448c2ecf20Sopenharmony_ci				       OCFS2_WRITE_DIRECT, NULL,
22458c2ecf20Sopenharmony_ci				       (void **)&wc, di_bh, NULL);
22468c2ecf20Sopenharmony_ci	if (ret) {
22478c2ecf20Sopenharmony_ci		mlog_errno(ret);
22488c2ecf20Sopenharmony_ci		goto unlock;
22498c2ecf20Sopenharmony_ci	}
22508c2ecf20Sopenharmony_ci
22518c2ecf20Sopenharmony_ci	desc = &wc->w_desc[0];
22528c2ecf20Sopenharmony_ci
22538c2ecf20Sopenharmony_ci	p_blkno = ocfs2_clusters_to_blocks(inode->i_sb, desc->c_phys);
22548c2ecf20Sopenharmony_ci	BUG_ON(p_blkno == 0);
22558c2ecf20Sopenharmony_ci	p_blkno += iblock & (u64)(ocfs2_clusters_to_blocks(inode->i_sb, 1) - 1);
22568c2ecf20Sopenharmony_ci
22578c2ecf20Sopenharmony_ci	map_bh(bh_result, inode->i_sb, p_blkno);
22588c2ecf20Sopenharmony_ci	bh_result->b_size = len;
22598c2ecf20Sopenharmony_ci	if (desc->c_needs_zero)
22608c2ecf20Sopenharmony_ci		set_buffer_new(bh_result);
22618c2ecf20Sopenharmony_ci
22628c2ecf20Sopenharmony_ci	if (iblock > endblk)
22638c2ecf20Sopenharmony_ci		set_buffer_new(bh_result);
22648c2ecf20Sopenharmony_ci
22658c2ecf20Sopenharmony_ci	/* May sleep in end_io. It should not happen in a irq context. So defer
22668c2ecf20Sopenharmony_ci	 * it to dio work queue. */
22678c2ecf20Sopenharmony_ci	set_buffer_defer_completion(bh_result);
22688c2ecf20Sopenharmony_ci
22698c2ecf20Sopenharmony_ci	if (!list_empty(&wc->w_unwritten_list)) {
22708c2ecf20Sopenharmony_ci		struct ocfs2_unwritten_extent *ue = NULL;
22718c2ecf20Sopenharmony_ci
22728c2ecf20Sopenharmony_ci		ue = list_first_entry(&wc->w_unwritten_list,
22738c2ecf20Sopenharmony_ci				      struct ocfs2_unwritten_extent,
22748c2ecf20Sopenharmony_ci				      ue_node);
22758c2ecf20Sopenharmony_ci		BUG_ON(ue->ue_cpos != desc->c_cpos);
22768c2ecf20Sopenharmony_ci		/* The physical address may be 0, fill it. */
22778c2ecf20Sopenharmony_ci		ue->ue_phys = desc->c_phys;
22788c2ecf20Sopenharmony_ci
22798c2ecf20Sopenharmony_ci		list_splice_tail_init(&wc->w_unwritten_list, &dwc->dw_zero_list);
22808c2ecf20Sopenharmony_ci		dwc->dw_zero_count += wc->w_unwritten_count;
22818c2ecf20Sopenharmony_ci	}
22828c2ecf20Sopenharmony_ci
22838c2ecf20Sopenharmony_ci	ret = ocfs2_write_end_nolock(inode->i_mapping, pos, len, len, wc);
22848c2ecf20Sopenharmony_ci	BUG_ON(ret != len);
22858c2ecf20Sopenharmony_ci	ret = 0;
22868c2ecf20Sopenharmony_ciunlock:
22878c2ecf20Sopenharmony_ci	up_write(&oi->ip_alloc_sem);
22888c2ecf20Sopenharmony_ci	ocfs2_inode_unlock(inode, 1);
22898c2ecf20Sopenharmony_ci	brelse(di_bh);
22908c2ecf20Sopenharmony_ciout:
22918c2ecf20Sopenharmony_ci	if (ret < 0)
22928c2ecf20Sopenharmony_ci		ret = -EIO;
22938c2ecf20Sopenharmony_ci	return ret;
22948c2ecf20Sopenharmony_ci}
22958c2ecf20Sopenharmony_ci
22968c2ecf20Sopenharmony_cistatic int ocfs2_dio_end_io_write(struct inode *inode,
22978c2ecf20Sopenharmony_ci				  struct ocfs2_dio_write_ctxt *dwc,
22988c2ecf20Sopenharmony_ci				  loff_t offset,
22998c2ecf20Sopenharmony_ci				  ssize_t bytes)
23008c2ecf20Sopenharmony_ci{
23018c2ecf20Sopenharmony_ci	struct ocfs2_cached_dealloc_ctxt dealloc;
23028c2ecf20Sopenharmony_ci	struct ocfs2_extent_tree et;
23038c2ecf20Sopenharmony_ci	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
23048c2ecf20Sopenharmony_ci	struct ocfs2_inode_info *oi = OCFS2_I(inode);
23058c2ecf20Sopenharmony_ci	struct ocfs2_unwritten_extent *ue = NULL;
23068c2ecf20Sopenharmony_ci	struct buffer_head *di_bh = NULL;
23078c2ecf20Sopenharmony_ci	struct ocfs2_dinode *di;
23088c2ecf20Sopenharmony_ci	struct ocfs2_alloc_context *data_ac = NULL;
23098c2ecf20Sopenharmony_ci	struct ocfs2_alloc_context *meta_ac = NULL;
23108c2ecf20Sopenharmony_ci	handle_t *handle = NULL;
23118c2ecf20Sopenharmony_ci	loff_t end = offset + bytes;
23128c2ecf20Sopenharmony_ci	int ret = 0, credits = 0;
23138c2ecf20Sopenharmony_ci
23148c2ecf20Sopenharmony_ci	ocfs2_init_dealloc_ctxt(&dealloc);
23158c2ecf20Sopenharmony_ci
23168c2ecf20Sopenharmony_ci	/* We do clear unwritten, delete orphan, change i_size here. If neither
23178c2ecf20Sopenharmony_ci	 * of these happen, we can skip all this. */
23188c2ecf20Sopenharmony_ci	if (list_empty(&dwc->dw_zero_list) &&
23198c2ecf20Sopenharmony_ci	    end <= i_size_read(inode) &&
23208c2ecf20Sopenharmony_ci	    !dwc->dw_orphaned)
23218c2ecf20Sopenharmony_ci		goto out;
23228c2ecf20Sopenharmony_ci
23238c2ecf20Sopenharmony_ci	ret = ocfs2_inode_lock(inode, &di_bh, 1);
23248c2ecf20Sopenharmony_ci	if (ret < 0) {
23258c2ecf20Sopenharmony_ci		mlog_errno(ret);
23268c2ecf20Sopenharmony_ci		goto out;
23278c2ecf20Sopenharmony_ci	}
23288c2ecf20Sopenharmony_ci
23298c2ecf20Sopenharmony_ci	down_write(&oi->ip_alloc_sem);
23308c2ecf20Sopenharmony_ci
23318c2ecf20Sopenharmony_ci	/* Delete orphan before acquire i_mutex. */
23328c2ecf20Sopenharmony_ci	if (dwc->dw_orphaned) {
23338c2ecf20Sopenharmony_ci		BUG_ON(dwc->dw_writer_pid != task_pid_nr(current));
23348c2ecf20Sopenharmony_ci
23358c2ecf20Sopenharmony_ci		end = end > i_size_read(inode) ? end : 0;
23368c2ecf20Sopenharmony_ci
23378c2ecf20Sopenharmony_ci		ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh,
23388c2ecf20Sopenharmony_ci				!!end, end);
23398c2ecf20Sopenharmony_ci		if (ret < 0)
23408c2ecf20Sopenharmony_ci			mlog_errno(ret);
23418c2ecf20Sopenharmony_ci	}
23428c2ecf20Sopenharmony_ci
23438c2ecf20Sopenharmony_ci	di = (struct ocfs2_dinode *)di_bh->b_data;
23448c2ecf20Sopenharmony_ci
23458c2ecf20Sopenharmony_ci	ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
23468c2ecf20Sopenharmony_ci
23478c2ecf20Sopenharmony_ci	/* Attach dealloc with extent tree in case that we may reuse extents
23488c2ecf20Sopenharmony_ci	 * which are already unlinked from current extent tree due to extent
23498c2ecf20Sopenharmony_ci	 * rotation and merging.
23508c2ecf20Sopenharmony_ci	 */
23518c2ecf20Sopenharmony_ci	et.et_dealloc = &dealloc;
23528c2ecf20Sopenharmony_ci
23538c2ecf20Sopenharmony_ci	ret = ocfs2_lock_allocators(inode, &et, 0, dwc->dw_zero_count*2,
23548c2ecf20Sopenharmony_ci				    &data_ac, &meta_ac);
23558c2ecf20Sopenharmony_ci	if (ret) {
23568c2ecf20Sopenharmony_ci		mlog_errno(ret);
23578c2ecf20Sopenharmony_ci		goto unlock;
23588c2ecf20Sopenharmony_ci	}
23598c2ecf20Sopenharmony_ci
23608c2ecf20Sopenharmony_ci	credits = ocfs2_calc_extend_credits(inode->i_sb, &di->id2.i_list);
23618c2ecf20Sopenharmony_ci
23628c2ecf20Sopenharmony_ci	handle = ocfs2_start_trans(osb, credits);
23638c2ecf20Sopenharmony_ci	if (IS_ERR(handle)) {
23648c2ecf20Sopenharmony_ci		ret = PTR_ERR(handle);
23658c2ecf20Sopenharmony_ci		mlog_errno(ret);
23668c2ecf20Sopenharmony_ci		goto unlock;
23678c2ecf20Sopenharmony_ci	}
23688c2ecf20Sopenharmony_ci	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
23698c2ecf20Sopenharmony_ci				      OCFS2_JOURNAL_ACCESS_WRITE);
23708c2ecf20Sopenharmony_ci	if (ret) {
23718c2ecf20Sopenharmony_ci		mlog_errno(ret);
23728c2ecf20Sopenharmony_ci		goto commit;
23738c2ecf20Sopenharmony_ci	}
23748c2ecf20Sopenharmony_ci
23758c2ecf20Sopenharmony_ci	list_for_each_entry(ue, &dwc->dw_zero_list, ue_node) {
23768c2ecf20Sopenharmony_ci		ret = ocfs2_mark_extent_written(inode, &et, handle,
23778c2ecf20Sopenharmony_ci						ue->ue_cpos, 1,
23788c2ecf20Sopenharmony_ci						ue->ue_phys,
23798c2ecf20Sopenharmony_ci						meta_ac, &dealloc);
23808c2ecf20Sopenharmony_ci		if (ret < 0) {
23818c2ecf20Sopenharmony_ci			mlog_errno(ret);
23828c2ecf20Sopenharmony_ci			break;
23838c2ecf20Sopenharmony_ci		}
23848c2ecf20Sopenharmony_ci	}
23858c2ecf20Sopenharmony_ci
23868c2ecf20Sopenharmony_ci	if (end > i_size_read(inode)) {
23878c2ecf20Sopenharmony_ci		ret = ocfs2_set_inode_size(handle, inode, di_bh, end);
23888c2ecf20Sopenharmony_ci		if (ret < 0)
23898c2ecf20Sopenharmony_ci			mlog_errno(ret);
23908c2ecf20Sopenharmony_ci	}
23918c2ecf20Sopenharmony_cicommit:
23928c2ecf20Sopenharmony_ci	ocfs2_commit_trans(osb, handle);
23938c2ecf20Sopenharmony_ciunlock:
23948c2ecf20Sopenharmony_ci	up_write(&oi->ip_alloc_sem);
23958c2ecf20Sopenharmony_ci	ocfs2_inode_unlock(inode, 1);
23968c2ecf20Sopenharmony_ci	brelse(di_bh);
23978c2ecf20Sopenharmony_ciout:
23988c2ecf20Sopenharmony_ci	if (data_ac)
23998c2ecf20Sopenharmony_ci		ocfs2_free_alloc_context(data_ac);
24008c2ecf20Sopenharmony_ci	if (meta_ac)
24018c2ecf20Sopenharmony_ci		ocfs2_free_alloc_context(meta_ac);
24028c2ecf20Sopenharmony_ci	ocfs2_run_deallocs(osb, &dealloc);
24038c2ecf20Sopenharmony_ci	ocfs2_dio_free_write_ctx(inode, dwc);
24048c2ecf20Sopenharmony_ci
24058c2ecf20Sopenharmony_ci	return ret;
24068c2ecf20Sopenharmony_ci}
24078c2ecf20Sopenharmony_ci
24088c2ecf20Sopenharmony_ci/*
24098c2ecf20Sopenharmony_ci * ocfs2_dio_end_io is called by the dio core when a dio is finished.  We're
24108c2ecf20Sopenharmony_ci * particularly interested in the aio/dio case.  We use the rw_lock DLM lock
24118c2ecf20Sopenharmony_ci * to protect io on one node from truncation on another.
24128c2ecf20Sopenharmony_ci */
24138c2ecf20Sopenharmony_cistatic int ocfs2_dio_end_io(struct kiocb *iocb,
24148c2ecf20Sopenharmony_ci			    loff_t offset,
24158c2ecf20Sopenharmony_ci			    ssize_t bytes,
24168c2ecf20Sopenharmony_ci			    void *private)
24178c2ecf20Sopenharmony_ci{
24188c2ecf20Sopenharmony_ci	struct inode *inode = file_inode(iocb->ki_filp);
24198c2ecf20Sopenharmony_ci	int level;
24208c2ecf20Sopenharmony_ci	int ret = 0;
24218c2ecf20Sopenharmony_ci
24228c2ecf20Sopenharmony_ci	/* this io's submitter should not have unlocked this before we could */
24238c2ecf20Sopenharmony_ci	BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
24248c2ecf20Sopenharmony_ci
24258c2ecf20Sopenharmony_ci	if (bytes <= 0)
24268c2ecf20Sopenharmony_ci		mlog_ratelimited(ML_ERROR, "Direct IO failed, bytes = %lld",
24278c2ecf20Sopenharmony_ci				 (long long)bytes);
24288c2ecf20Sopenharmony_ci	if (private) {
24298c2ecf20Sopenharmony_ci		if (bytes > 0)
24308c2ecf20Sopenharmony_ci			ret = ocfs2_dio_end_io_write(inode, private, offset,
24318c2ecf20Sopenharmony_ci						     bytes);
24328c2ecf20Sopenharmony_ci		else
24338c2ecf20Sopenharmony_ci			ocfs2_dio_free_write_ctx(inode, private);
24348c2ecf20Sopenharmony_ci	}
24358c2ecf20Sopenharmony_ci
24368c2ecf20Sopenharmony_ci	ocfs2_iocb_clear_rw_locked(iocb);
24378c2ecf20Sopenharmony_ci
24388c2ecf20Sopenharmony_ci	level = ocfs2_iocb_rw_locked_level(iocb);
24398c2ecf20Sopenharmony_ci	ocfs2_rw_unlock(inode, level);
24408c2ecf20Sopenharmony_ci	return ret;
24418c2ecf20Sopenharmony_ci}
24428c2ecf20Sopenharmony_ci
24438c2ecf20Sopenharmony_cistatic ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
24448c2ecf20Sopenharmony_ci{
24458c2ecf20Sopenharmony_ci	struct file *file = iocb->ki_filp;
24468c2ecf20Sopenharmony_ci	struct inode *inode = file->f_mapping->host;
24478c2ecf20Sopenharmony_ci	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
24488c2ecf20Sopenharmony_ci	get_block_t *get_block;
24498c2ecf20Sopenharmony_ci
24508c2ecf20Sopenharmony_ci	/*
24518c2ecf20Sopenharmony_ci	 * Fallback to buffered I/O if we see an inode without
24528c2ecf20Sopenharmony_ci	 * extents.
24538c2ecf20Sopenharmony_ci	 */
24548c2ecf20Sopenharmony_ci	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
24558c2ecf20Sopenharmony_ci		return 0;
24568c2ecf20Sopenharmony_ci
24578c2ecf20Sopenharmony_ci	/* Fallback to buffered I/O if we do not support append dio. */
24588c2ecf20Sopenharmony_ci	if (iocb->ki_pos + iter->count > i_size_read(inode) &&
24598c2ecf20Sopenharmony_ci	    !ocfs2_supports_append_dio(osb))
24608c2ecf20Sopenharmony_ci		return 0;
24618c2ecf20Sopenharmony_ci
24628c2ecf20Sopenharmony_ci	if (iov_iter_rw(iter) == READ)
24638c2ecf20Sopenharmony_ci		get_block = ocfs2_lock_get_block;
24648c2ecf20Sopenharmony_ci	else
24658c2ecf20Sopenharmony_ci		get_block = ocfs2_dio_wr_get_block;
24668c2ecf20Sopenharmony_ci
24678c2ecf20Sopenharmony_ci	return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
24688c2ecf20Sopenharmony_ci				    iter, get_block,
24698c2ecf20Sopenharmony_ci				    ocfs2_dio_end_io, NULL, 0);
24708c2ecf20Sopenharmony_ci}
24718c2ecf20Sopenharmony_ci
24728c2ecf20Sopenharmony_ciconst struct address_space_operations ocfs2_aops = {
24738c2ecf20Sopenharmony_ci	.readpage		= ocfs2_readpage,
24748c2ecf20Sopenharmony_ci	.readahead		= ocfs2_readahead,
24758c2ecf20Sopenharmony_ci	.writepage		= ocfs2_writepage,
24768c2ecf20Sopenharmony_ci	.write_begin		= ocfs2_write_begin,
24778c2ecf20Sopenharmony_ci	.write_end		= ocfs2_write_end,
24788c2ecf20Sopenharmony_ci	.bmap			= ocfs2_bmap,
24798c2ecf20Sopenharmony_ci	.direct_IO		= ocfs2_direct_IO,
24808c2ecf20Sopenharmony_ci	.invalidatepage		= block_invalidatepage,
24818c2ecf20Sopenharmony_ci	.releasepage		= ocfs2_releasepage,
24828c2ecf20Sopenharmony_ci	.migratepage		= buffer_migrate_page,
24838c2ecf20Sopenharmony_ci	.is_partially_uptodate	= block_is_partially_uptodate,
24848c2ecf20Sopenharmony_ci	.error_remove_page	= generic_error_remove_page,
24858c2ecf20Sopenharmony_ci};
2486