// SPDX-License-Identifier: GPL-2.0
/*
 * fs/hmdfs/file_remote.c
 *
 * Copyright (c) 2020-2021 Huawei Device Co., Ltd.
 */

#include <linux/backing-dev.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/page-flags.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/wait.h>

#include "file_remote.h"

#include "comm/socket_adapter.h"
#include "hmdfs.h"
#include "hmdfs_client.h"
#include "hmdfs_dentryfile.h"
#include "hmdfs_trace.h"

static inline bool hmdfs_remote_write_cache_expired(
		struct hmdfs_inode_info *info)
{
	return time_after(jiffies, info->writecache_expire);
}

enum expire_reason {
	ALL_GOOD = 0,
	INO_DISMATCH = 1,
	SIZE_OR_CTIME_DISMATCH = 2,
	TIMER_EXPIRE = 3,
	TIMER_WORKING = 4,
	STABLE_CTIME_DISMATCH = 5,
	KEEP_CACHE = 6,
};

/*
 * hmdfs_open_final_remote - Do the final steps of opening a remote file:
 * update the local inode cache and decide whether or not to truncate the
 * inode pages.
 *
 * @info: hmdfs inode info
 * @open_ret: values returned from the remote when opening a remote file
 * @file: the file being opened
 * @keep_cache: keep local cache & i_size
 */
static int hmdfs_open_final_remote(struct hmdfs_inode_info *info,
				   struct hmdfs_open_ret *open_ret,
				   struct file *file, bool keep_cache)
{
	struct inode *inode = &info->vfs_inode;
	bool truncate = false;
	enum expire_reason reason = ALL_GOOD;
	int ret = 0;

	/*
	 * If the remote inode number has changed, we have looked up stale
	 * data; return -ESTALE so that the file is reopened with metadata
	 * from remote getattr.
	 */
	if (info->remote_ino != open_ret->ino) {
		hmdfs_debug(
			"got stale local inode, ino in local %llu, ino from open %llu",
			info->remote_ino, open_ret->ino);
		hmdfs_send_close(info->conn, &open_ret->fid);
		reason = INO_DISMATCH;
		ret = -ESTALE;
		goto out;
	}

	if (keep_cache) {
		reason = KEEP_CACHE;
		trace_hmdfs_open_final_remote(info, open_ret, file, reason);
		goto set_fid_out;
	}

	/*
	 * Truncate if the remote size does not match the local inode, or the
	 * remote ctime does not match the one recorded the last time this
	 * file was opened.
	 */
	if (inode->i_size != open_ret->file_size ||
	    hmdfs_time_compare(&info->remote_ctime, &open_ret->remote_ctime)) {
		truncate = true;
		reason = SIZE_OR_CTIME_DISMATCH;
		goto out;
	}

	/*
	 * If 'writecache_expire' is set, check whether it has expired and
	 * skip the stable_ctime check.
	 */
	if (info->writecache_expire) {
		truncate = hmdfs_remote_write_cache_expired(info);
		if (truncate)
			reason = TIMER_EXPIRE;
		else
			reason = TIMER_WORKING;
		goto out;
	}

	/* the first open, or the remote ctime was ahead of the remote time */
	if (info->stable_ctime.tv_sec == 0 && info->stable_ctime.tv_nsec == 0) {
		truncate = true;
		reason = STABLE_CTIME_DISMATCH;
		goto out;
	}

	/*
	 * - if last stable_ctime == stable_ctime, we do nothing.
	 *   a. if ctime < stable_ctime, data is ensured to be uptodate,
	 *   b. if ctime == stable_ctime, stale data might be accessed. This is
	 *      acceptable since pagecache will be dropped later.
	 *   c. ctime > stable_ctime is impossible.
	 * - if last stable_ctime < stable_ctime, we clear the cache.
	 *   d. ctime != last stable_ctime is impossible.
	 *   e. if ctime == last stable_ctime, stale data might have been read
	 *      in case b, thus we need to drop the cache.
	 * - if last stable_ctime > stable_ctime, we clear the cache.
	 *   stable_ctime must be zero in this case, which is possible because
	 *   the system time might have been changed.
	 */
	if (hmdfs_time_compare(&info->stable_ctime, &open_ret->stable_ctime)) {
		truncate = true;
		reason = STABLE_CTIME_DISMATCH;
		goto out;
	}

out:
	trace_hmdfs_open_final_remote(info, open_ret, file, reason);
	if (ret)
		return ret;

	if (reason == SIZE_OR_CTIME_DISMATCH) {
		inode->__i_ctime = open_ret->remote_ctime;
		info->remote_ctime = open_ret->remote_ctime;
	}

	if (truncate) {
		info->writecache_expire = 0;
		truncate_inode_pages(inode->i_mapping, 0);
	}

	atomic64_set(&info->write_counter, 0);
	info->stable_ctime = open_ret->stable_ctime;
	i_size_write(inode, open_ret->file_size);
	info->getattr_isize = HMDFS_STALE_REMOTE_ISIZE;
set_fid_out:
	spin_lock(&info->fid_lock);
	info->fid = open_ret->fid;
	spin_unlock(&info->fid_lock);
	return 0;
}

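/*
 * hmdfs_do_open_remote - open the file on the remote peer.
 *
 * Build the hmdfs-relative path for the dentry, send an open request to
 * the peer, then finalize the local inode state (pagecache truncation,
 * fid update) through hmdfs_open_final_remote().
 */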
int hmdfs_do_open_remote(struct file *file, bool keep_cache)
{
	struct hmdfs_inode_info *info = hmdfs_i(file_inode(file));
	struct hmdfs_peer *conn = info->conn;
	struct hmdfs_open_ret open_ret;
	__u8 file_type = hmdfs_d(file->f_path.dentry)->file_type;
	char *send_buf;
	int err = 0;

	send_buf = hmdfs_get_dentry_relative_path(file->f_path.dentry);
	if (!send_buf) {
		err = -ENOMEM;
		goto out_free;
	}
	err = hmdfs_send_open(conn, send_buf, file_type, &open_ret);
	if (err) {
		hmdfs_err("hmdfs_send_open failed with %d", err);
		goto out_free;
	}

	err = hmdfs_open_final_remote(info, &open_ret, file, keep_cache);

out_free:
	kfree(send_buf);
	return err;
}

static inline bool hmdfs_remote_need_reopen(struct hmdfs_inode_info *info)
{
	return test_bit(HMDFS_FID_NEED_OPEN, &info->fid_flags);
}

static inline bool hmdfs_remote_is_opening_file(struct hmdfs_inode_info *info)
{
	return test_bit(HMDFS_FID_OPENING, &info->fid_flags);
}

static int hmdfs_remote_wait_opening_file(struct hmdfs_inode_info *info)
{
	int err;

	if (!hmdfs_remote_is_opening_file(info))
		return 0;

	err = ___wait_event(info->fid_wq, !hmdfs_remote_is_opening_file(info),
			    TASK_INTERRUPTIBLE, 0, 0,
			    spin_unlock(&info->fid_lock);
			    schedule();
			    spin_lock(&info->fid_lock));
	if (err)
		err = -EINTR;

	return err;
}

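/*
 * hmdfs_remote_file_reopen - re-open a remote file whose fid has gone
 * stale.
 *
 * Only one task performs the reopen at a time (HMDFS_FID_OPENING); others
 * wait on fid_wq. The old fid, if still valid, is closed first so it is
 * not leaked on the server.
 */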
static int hmdfs_remote_file_reopen(struct hmdfs_inode_info *info,
				    struct file *filp)
{
	int err = 0;
	struct hmdfs_peer *conn = info->conn;
	struct inode *inode = NULL;
	struct hmdfs_fid fid;

	if (conn->status == NODE_STAT_OFFLINE)
		return -EAGAIN;

	spin_lock(&info->fid_lock);
	err = hmdfs_remote_wait_opening_file(info);
	if (err || !hmdfs_remote_need_reopen(info)) {
		spin_unlock(&info->fid_lock);
		goto out;
	}

	set_bit(HMDFS_FID_OPENING, &info->fid_flags);
	fid = info->fid;
	spin_unlock(&info->fid_lock);

	inode = &info->vfs_inode;
	inode_lock(inode);
	/*
	 * Most closing cases are meaningless, except for one:
	 *        read process A         read process B
	 *    err = -EBADF              err = -EBADF       (caused by re-online)
	 *    set_need_reopen
	 *    do reopen
	 *    fid = new fid_1 [server hold fid_1]
	 *                              set need_reopen
	 *                              do reopen
	 *                                send close (fid_1) // In case of leak
	 *                              fid = new fid_2
	 */
	if (fid.id != HMDFS_INODE_INVALID_FILE_ID)
		hmdfs_send_close(conn, &fid);
	err = hmdfs_do_open_remote(filp, true);
	inode_unlock(inode);

	spin_lock(&info->fid_lock);
	/*
	 * This may lose a bit set by the offline handler, but the server
	 * will tell us whether or not the newly-opened file id was
	 * generated before going offline. If it was opened before offline,
	 * operations on the file id will return -EBADF and the
	 * HMDFS_FID_NEED_OPEN bit will be set again.
	 */
	if (!err)
		clear_bit(HMDFS_FID_NEED_OPEN, &info->fid_flags);
	clear_bit(HMDFS_FID_OPENING, &info->fid_flags);
	spin_unlock(&info->fid_lock);

	wake_up_interruptible_all(&info->fid_wq);
out:
	return err;
}

static int hmdfs_remote_check_and_reopen(struct hmdfs_inode_info *info,
					 struct file *filp)
{
	if (!hmdfs_remote_need_reopen(info))
		return 0;

	return hmdfs_remote_file_reopen(info, filp);
}

void hmdfs_do_close_remote(struct kref *kref)
{
	struct hmdfs_inode_info *info =
		container_of(kref, struct hmdfs_inode_info, ref);
	struct hmdfs_fid fid;

	hmdfs_remote_fetch_fid(info, &fid);
	/* This function can return asynchronously */
	hmdfs_send_close(info->conn, &fid);
}

static inline bool hmdfs_remote_need_track_file(const struct hmdfs_sb_info *sbi,
						fmode_t mode)
{
	return (hmdfs_is_stash_enabled(sbi) && (mode & FMODE_WRITE));
}

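/*
 * When stashing is enabled, writably-opened inodes are tracked on
 * conn->wr_opened_inode_list; wr_opened_cnt keeps a single list entry per
 * inode across concurrent opens.
 */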
static void
hmdfs_remote_del_wr_opened_inode_nolock(struct hmdfs_inode_info *info)
{
	WARN_ON(list_empty(&info->wr_opened_node));
	if (atomic_dec_and_test(&info->wr_opened_cnt))
		list_del_init(&info->wr_opened_node);
}

void hmdfs_remote_del_wr_opened_inode(struct hmdfs_peer *conn,
				      struct hmdfs_inode_info *info)
{
	spin_lock(&conn->wr_opened_inode_lock);
	hmdfs_remote_del_wr_opened_inode_nolock(info);
	spin_unlock(&conn->wr_opened_inode_lock);
}

void hmdfs_remote_add_wr_opened_inode_nolock(struct hmdfs_peer *conn,
					     struct hmdfs_inode_info *info)
{
	if (list_empty(&info->wr_opened_node)) {
		atomic_set(&info->wr_opened_cnt, 1);
		list_add_tail(&info->wr_opened_node,
			      &conn->wr_opened_inode_list);
	} else {
		atomic_inc(&info->wr_opened_cnt);
	}
}

static void hmdfs_remote_add_wr_opened_inode(struct hmdfs_peer *conn,
					     struct hmdfs_inode_info *info)
{
	spin_lock(&conn->wr_opened_inode_lock);
	hmdfs_remote_add_wr_opened_inode_nolock(conn, info);
	spin_unlock(&conn->wr_opened_inode_lock);
}

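/*
 * All openers of a remote inode share one remote file id: the first open
 * sends the open request and initializes info->ref, later opens just take
 * another reference.
 */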
int hmdfs_file_open_remote(struct inode *inode, struct file *file)
{
	struct hmdfs_inode_info *info = hmdfs_i(inode);
	struct kref *ref = &(info->ref);
	int err = 0;

	inode_lock(inode);
	if (kref_read(ref) == 0) {
		err = hmdfs_do_open_remote(file, false);
		if (err == 0)
			kref_init(ref);
	} else {
		kref_get(ref);
	}
	inode_unlock(inode);

	if (!err && hmdfs_remote_need_track_file(hmdfs_sb(inode->i_sb),
						 file->f_mode))
		hmdfs_remote_add_wr_opened_inode(info->conn, info);

	return err;
}

static void hmdfs_set_writecache_expire(struct hmdfs_inode_info *info,
					unsigned int seconds)
{
	unsigned long new_expire = jiffies + (unsigned long)seconds * HZ;

	/*
	 * When the file has been written before closing, set the pagecache
	 * expire time if it has not been set yet. This is necessary because
	 * ctime might stay the same after an overwrite.
	 */
	if (info->writecache_expire &&
	    time_after(new_expire, info->writecache_expire))
		return;

	info->writecache_expire = new_expire;
}

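/*
 * hmdfs_remote_keep_writecache - try to keep the pagecache of a written
 * file alive across the last close.
 *
 * Refresh the cached ctime from a remote getattr and arm the writecache
 * expire timer, so the next open does not needlessly truncate pages we
 * have just written back.
 */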
static void hmdfs_remote_keep_writecache(struct inode *inode, struct file *file)
{
	struct hmdfs_inode_info *info = NULL;
	struct kref *ref = NULL;
	struct hmdfs_getattr_ret *getattr_ret = NULL;
	unsigned int write_cache_timeout =
		hmdfs_sb(inode->i_sb)->write_cache_timeout;
	int err;

	if (!write_cache_timeout)
		return;

	info = hmdfs_i(inode);
	ref = &(info->ref);
	/*
	 * Don't do anything if the file is still open elsewhere or hasn't
	 * been written.
	 */
	if (kref_read(ref) > 0 || !atomic64_read(&info->write_counter))
		return;

	/*
	 * If remote getattr fails, we don't update ctime, and the pagecache
	 * will be truncated the next time the file is opened.
	 */
	err = hmdfs_remote_getattr(info->conn, file_dentry(file), 0,
				   &getattr_ret);
	if (err) {
		hmdfs_err("remote getattr failed with err %d", err);
		return;
	}

	if (!(getattr_ret->stat.result_mask & STATX_CTIME)) {
		hmdfs_err("get remote ctime failed with mask 0x%x",
			  getattr_ret->stat.result_mask);
		kfree(getattr_ret);
		return;
	}
	/*
	 * Update ctime from the remote, so that the pagecache will not be
	 * truncated on the next open.
	 */
	inode->__i_ctime = getattr_ret->stat.ctime;
	info->remote_ctime = getattr_ret->stat.ctime;
	hmdfs_set_writecache_expire(info, write_cache_timeout);
	kfree(getattr_ret);
}

int hmdfs_file_release_remote(struct inode *inode, struct file *file)
{
	struct hmdfs_inode_info *info = hmdfs_i(inode);

	if (hmdfs_remote_need_track_file(hmdfs_sb(inode->i_sb), file->f_mode))
		hmdfs_remote_del_wr_opened_inode(info->conn, info);

	inode_lock(inode);
	kref_put(&info->ref, hmdfs_do_close_remote);
	hmdfs_remote_keep_writecache(inode, file);
	inode_unlock(inode);

	return 0;
}

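/*
 * hmdfs_file_flush - write back dirty pages when a writable fd is closed.
 *
 * Kick off async writeback before taking wpage_sem, so that as much
 * writeback as possible is already in flight before we block.
 */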
static int hmdfs_file_flush(struct file *file, fl_owner_t id)
{
	int err = 0;
	struct inode *inode = file_inode(file);

	if (!(file->f_mode & FMODE_WRITE))
		return 0;

	/*
	 * Continue regardless of whether the file reopen fails or not,
	 * because there may be no dirty pages at all.
	 */
	hmdfs_remote_check_and_reopen(hmdfs_i(inode), file);

	/*
	 * Waiting on wpage_sem here would hurt performance greatly, so we
	 * first issue as many writebacks as we can, expecting that the async
	 * writebacks are mostly finished by the time we wait on them.
	 */
	filemap_fdatawrite(inode->i_mapping);
	down_write(&hmdfs_i(inode)->wpage_sem);
	err = filemap_write_and_wait(inode->i_mapping);
	up_write(&hmdfs_i(inode)->wpage_sem);
	return err;
}

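/*
 * Scale the readahead window with the measured TCP RTT to the peer: the
 * longer the round trip, the more pages we want in flight. A read that
 * fails on a stale fid is retried once after reopening the file.
 */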
static ssize_t hmdfs_file_read_iter_remote(struct kiocb *iocb,
					   struct iov_iter *iter)
{
	struct file *filp = iocb->ki_filp;
	struct hmdfs_inode_info *info = hmdfs_i(file_inode(filp));
	struct file_ra_state *ra = NULL;
	unsigned int rtt;
	int err;
	bool tried = false;

retry:
	err = hmdfs_remote_check_and_reopen(info, filp);
	if (err)
		return err;

	ra = &filp->f_ra;
	/* rtt is measured in units of 10 msecs */
	rtt = hmdfs_tcpi_rtt(info->conn) / 10000;
	switch (rtt) {
	case 0:
		break;
	case 1:
		ra->ra_pages = 256;
		break;
	case 2:
		ra->ra_pages = 512;
		break;
	default:
		ra->ra_pages = 1024;
		break;
	}

	err = generic_file_read_iter(iocb, iter);
	if (err < 0 && !tried && hmdfs_remote_need_reopen(info)) {
		/* Read from a stale fid; try the read again just once. */
		tried = true;
		goto retry;
	}

	return err;
}

static inline bool hmdfs_is_file_unwritable(const struct hmdfs_inode_info *info,
					    bool check_stash)
{
	return (check_stash && hmdfs_inode_is_stashing(info)) ||
	       !hmdfs_is_node_online(info->conn);
}

static ssize_t __hmdfs_file_write_iter_remote(struct kiocb *iocb,
					      struct iov_iter *iter,
					      bool check_stash)
{
	struct file *filp = iocb->ki_filp;
	struct inode *inode = file_inode(filp);
	struct hmdfs_inode_info *info = hmdfs_i(inode);
	ssize_t ret;

	if (hmdfs_is_file_unwritable(info, check_stash))
		return -EAGAIN;

	ret = hmdfs_remote_check_and_reopen(info, filp);
	if (ret)
		return ret;

	inode_lock(inode);
	if (hmdfs_is_file_unwritable(info, check_stash)) {
		ret = -EAGAIN;
		goto out;
	}
	ret = generic_write_checks(iocb, iter);
	if (ret > 0)
		ret = __generic_file_write_iter(iocb, iter);
out:
	inode_unlock(inode);

	if (ret > 0)
		ret = generic_write_sync(iocb, ret);
	return ret;
}

ssize_t hmdfs_file_write_iter_remote_nocheck(struct kiocb *iocb,
					     struct iov_iter *iter)
{
	return __hmdfs_file_write_iter_remote(iocb, iter, false);
}

static ssize_t hmdfs_file_write_iter_remote(struct kiocb *iocb,
					    struct iov_iter *iter)
{
	return __hmdfs_file_write_iter_remote(iocb, iter, true);
}

/* hmdfs does not support mmap-based writes to remote files */
static vm_fault_t hmdfs_page_mkwrite(struct vm_fault *vmf)
{
	return VM_FAULT_SIGBUS;
}

static const struct vm_operations_struct hmdfs_file_vm_ops = {
	.fault = filemap_fault,
	.map_pages = filemap_map_pages,
	.page_mkwrite = hmdfs_page_mkwrite,
};

static int hmdfs_file_mmap_remote(struct file *file, struct vm_area_struct *vma)
{
	vma->vm_ops = &hmdfs_file_vm_ops;
	file_accessed(file);

	return 0;
}

static int hmdfs_file_fsync_remote(struct file *file, loff_t start, loff_t end,
				   int datasync)
{
	struct hmdfs_inode_info *info = hmdfs_i(file_inode(file));
	struct hmdfs_peer *conn = info->conn;
	struct hmdfs_fid fid;
	int err;

	trace_hmdfs_fsync_enter_remote(conn->sbi, conn->device_id,
				       info->remote_ino, datasync);
	/*
	 * Continue regardless of whether the file reopen fails or not,
	 * because there may be no dirty pages at all.
	 */
	hmdfs_remote_check_and_reopen(info, file);

	filemap_fdatawrite(file->f_mapping);
	down_write(&info->wpage_sem);
	err = file_write_and_wait_range(file, start, end);
	up_write(&info->wpage_sem);
	if (err) {
		hmdfs_err("local fsync fail with %d", err);
		goto out;
	}

	hmdfs_remote_fetch_fid(info, &fid);
	err = hmdfs_send_fsync(conn, &fid, start, end, datasync);
	if (err)
		hmdfs_err("send fsync fail with %d", err);

out:
	trace_hmdfs_fsync_exit_remote(conn->sbi, conn->device_id,
				      info->remote_ino,
				      get_cmd_timeout(conn->sbi, F_FSYNC), err);

	/* Compatible with POSIX retcode */
	if (err == -ETIME)
		err = -EIO;

	return err;
}

const struct file_operations hmdfs_dev_file_fops_remote = {
	.owner = THIS_MODULE,
	.llseek = generic_file_llseek,
	.read_iter = hmdfs_file_read_iter_remote,
	.write_iter = hmdfs_file_write_iter_remote,
	.mmap = hmdfs_file_mmap_remote,
	.open = hmdfs_file_open_remote,
	.release = hmdfs_file_release_remote,
	.flush = hmdfs_file_flush,
	.fsync = hmdfs_file_fsync_remote,
	.splice_read = copy_splice_read,
	.splice_write = iter_file_splice_write,
};

static void hmdfs_fill_page_zero(struct page *page)
{
	void *addr = NULL;

	addr = kmap(page);
	memset(addr, 0, PAGE_SIZE);
	kunmap(page);
	SetPageUptodate(page);
	unlock_page(page);
}

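/*
 * Read one page from the remote peer. Pages entirely beyond i_size are
 * zero-filled locally instead of being fetched.
 */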
static int hmdfs_readpage_remote(struct file *file, struct page *page)
{
	struct inode *inode = file_inode(file);
	struct hmdfs_inode_info *info = hmdfs_i(inode);
	loff_t isize = i_size_read(inode);
	pgoff_t end_index = (isize - 1) >> PAGE_SHIFT;
	struct hmdfs_fid fid;

	if (!isize || page->index > end_index) {
		hmdfs_fill_page_zero(page);
		return 0;
	}

	hmdfs_remote_fetch_fid(info, &fid);
	return hmdfs_client_readpage(info->conn, &fid, page);
}

static int hmdfs_read_folio(struct file *file, struct folio *folio)
{
	struct page *page = &folio->page;

	return hmdfs_readpage_remote(file, page);
}

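/*
 * hmdfs_get_writecount - how many bytes of @page may be sent in a remote
 * writepage, clamped so that we never write past i_size.
 */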
uint32_t hmdfs_get_writecount(struct page *page)
{
	uint32_t count = 0;
	loff_t pos = (loff_t)page->index << HMDFS_PAGE_OFFSET;
	struct inode *inode = page->mapping->host;
	loff_t size = i_size_read(inode);
	/*
	 * If the page offset is greater than i_size, which is possible when
	 * writepage runs concurrently with truncate, we don't need to do a
	 * remote writepage since the page will be truncated after it is
	 * unlocked.
	 */
	if (pos >= size)
		count = 0;
	/*
	 * If the page to be written extends beyond i_size, we must not write
	 * past i_size, or the remote file size would be wrong.
	 */
	else if (size < pos + HMDFS_PAGE_SIZE)
		count = size - pos;
	/* It's safe to write the whole page */
	else
		count = HMDFS_PAGE_SIZE;

	return count;
}

static bool allow_cur_thread_wpage(struct hmdfs_inode_info *info,
				   bool *rsem_held, bool sync_all)
{
	WARN_ON(!rsem_held);

	if (sync_all) {
		*rsem_held = false;
		return true;
	}
	*rsem_held = down_read_trylock(&info->wpage_sem);
	return *rsem_held;
}

/**
 * hmdfs_writepage_remote - write back a dirty page to the remote
 *
 * INFO:
 * When asked to WB_SYNC_ALL, this function should leave with both the page and
 * the radix tree node clean to achieve close-to-open consistency. Moreover,
 * it shall never return -EIO, to help filemap iterate over all dirty pages.
 *
 * INFO:
 * When asked to WB_SYNC_NONE, this function should be lenient if faults (oom
 * or a bad pipe) happen, to enable subsequent r/w & wb.
 */
static int hmdfs_writepage_remote(struct page *page,
				  struct writeback_control *wbc)
{
	struct inode *inode = page->mapping->host;
	struct hmdfs_inode_info *info = hmdfs_i(inode);
	struct hmdfs_sb_info *sbi = hmdfs_sb(inode->i_sb);
	int ret = 0;
	bool rsem_held = false;
	bool sync = wbc->sync_mode == WB_SYNC_ALL;
	struct hmdfs_writepage_context *param = NULL;

	if (!allow_cur_thread_wpage(info, &rsem_held, sync))
		goto out_unlock;

	set_page_writeback(page);

	param = kzalloc(sizeof(*param), GFP_NOFS);
	if (!param) {
		ret = -ENOMEM;
		goto out_endwb;
	}

	if (sync && hmdfs_usr_sig_pending(current)) {
		ClearPageUptodate(page);
		goto out_free;
	}
	param->count = hmdfs_get_writecount(page);
	if (!param->count)
		goto out_free;
	param->rsem_held = rsem_held;
	hmdfs_remote_fetch_fid(info, &param->fid);
	param->sync_all = sync;
	param->caller = current;
	get_task_struct(current);
	param->page = page;
	param->timeout = jiffies + msecs_to_jiffies(sbi->wb_timeout_ms);
	INIT_DELAYED_WORK(&param->retry_dwork, hmdfs_remote_writepage_retry);
	ret = hmdfs_remote_do_writepage(info->conn, param);
	if (likely(!ret))
		return 0;

	put_task_struct(current);
out_free:
	kfree(param);
out_endwb:
	end_page_writeback(page);
	if (rsem_held)
		up_read(&info->wpage_sem);
out_unlock:
	if (sync || !hmdfs_need_redirty_page(info, ret)) {
		SetPageError(page);
		mapping_set_error(page->mapping, ret);
	} else {
		redirty_page_for_writepage(wbc, page);
	}
	unlock_page(page);
	return ret;
}

static void hmdfs_account_dirty_pages(struct address_space *mapping)
{
	struct hmdfs_sb_info *sbi = mapping->host->i_sb->s_fs_info;

	if (!sbi->h_wb->dirty_writeback_control)
		return;

	this_cpu_inc(*sbi->h_wb->bdp_ratelimits);
}

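/*
 * .write_begin for remote files: grab (and, when the write does not cover
 * the whole page, read in) the target page. The page is returned locked
 * with an extra reference, as the VFS expects.
 */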
static int hmdfs_write_begin_remote(struct file *file,
				    struct address_space *mapping, loff_t pos,
				    unsigned int len,
				    struct page **pagep, void **fsdata)
{
	pgoff_t index = ((unsigned long long)pos) >> PAGE_SHIFT;
	struct inode *inode = file_inode(file);
	struct page *page = NULL;
	int ret = 0;

start:
	page = grab_cache_page_write_begin(mapping, index);
	if (!page)
		return -ENOMEM;
	*pagep = page;
	wait_on_page_writeback(page);

	// If this page will be covered completely.
	if (len == HMDFS_PAGE_SIZE || PageUptodate(page))
		return 0;

	/*
	 * If the write starts at the beginning of the page and reaches (or
	 * passes) i_size, all existing data in the page will be covered;
	 * we just need to zero the part beyond the write.
	 */
	if (!((unsigned long long)pos & (HMDFS_PAGE_SIZE - 1)) &&
	    (pos + len) >= i_size_read(inode)) {
		zero_user_segment(page, len, HMDFS_PAGE_SIZE);
		return 0;
	}
	/*
	 * We need to read the page before writing data to it.
	 */
	ret = hmdfs_readpage_remote(file, page);
	if (!ret) {
		if (PageLocked(page)) {
			ret = folio_lock_killable(page_folio(page));
			if (!ret)
				unlock_page(page);
		}

		if (!ret && PageUptodate(page)) {
			put_page(page);
			goto start;
		}
		if (!ret)
			ret = -EIO;
	}
	put_page(page);
	return ret;
}

static int hmdfs_write_end_remote(struct file *file,
				  struct address_space *mapping, loff_t pos,
				  unsigned int len, unsigned int copied,
				  struct page *page, void *fsdata)
{
	struct inode *inode = page->mapping->host;

	if (!PageUptodate(page)) {
		if (unlikely(copied != len))
			copied = 0;
		else
			SetPageUptodate(page);
	}
	if (!copied)
		goto unlock_out;

	if (!PageDirty(page)) {
		hmdfs_account_dirty_pages(mapping);
		set_page_dirty(page);
	}

	if (pos + copied > i_size_read(inode)) {
		i_size_write(inode, pos + copied);
		hmdfs_i(inode)->getattr_isize = HMDFS_STALE_REMOTE_ISIZE;
	}
unlock_out:
	unlock_page(page);
	put_page(page);

	/* hmdfs private writeback control */
	hmdfs_balance_dirty_pages_ratelimited(mapping);
	return copied;
}

const struct address_space_operations hmdfs_dev_file_aops_remote = {
	.read_folio = hmdfs_read_folio,
	.write_begin = hmdfs_write_begin_remote,
	.write_end = hmdfs_write_end_remote,
	.writepage = hmdfs_writepage_remote,
	.dirty_folio = filemap_dirty_folio,
};

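/*
 * Remote readdir positions pack a flag bit, the device id, the
 * dentry-group index and the in-group slot into one loff_t:
 *
 *	[1 bit: remote flag][DEV_ID_BIT_NUM][GROUP_ID_BIT_NUM][OFFSET_BIT_NUM]
 *
 * analysis_dentry_file_from_con() decodes it accordingly:
 *	dev_id   = (pos << 1) >> (POS_BIT_NUM - DEV_ID_BIT_NUM);
 *	group_id = (pos << (1 + DEV_ID_BIT_NUM)) >> (POS_BIT_NUM - GROUP_ID_BIT_NUM);
 *	offset   = pos & OFFSET_BIT_MASK;
 */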
loff_t hmdfs_set_pos(unsigned long dev_id, unsigned long group_id,
			    unsigned long offset)
{
	loff_t pos;

	pos = ((loff_t)dev_id << (POS_BIT_NUM - 1 - DEV_ID_BIT_NUM)) +
	      ((loff_t)group_id << OFFSET_BIT_NUM) + offset;
	if (dev_id)
		pos |= ((loff_t)1 << (POS_BIT_NUM - 1));
	return pos;
}

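/*
 * analysis_dentry_file_from_con - emit directory entries from the cached
 * dentry file @handler, resuming at the group/offset encoded in ctx->pos.
 *
 * Returns 1 when dir_emit() asks to stop (ctx->pos is left at the entry
 * to resume from), 0 once the dentry file is exhausted, or a negative
 * errno on setup failure.
 */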
int analysis_dentry_file_from_con(struct hmdfs_sb_info *sbi,
					 struct file *file,
					 struct file *handler,
					 struct dir_context *ctx)
{
	struct hmdfs_dentry_group *dentry_group = NULL;
	loff_t pos = ctx->pos;
	unsigned long dev_id = (unsigned long)((pos << 1) >> (POS_BIT_NUM - DEV_ID_BIT_NUM));
	unsigned long group_id = (unsigned long)((pos << (1 + DEV_ID_BIT_NUM)) >>
				 (POS_BIT_NUM - GROUP_ID_BIT_NUM));
	loff_t offset = pos & OFFSET_BIT_MASK;
	int group_num = 0;
	char *dentry_name = NULL;
	int iterate_result = 0;
	int i, j;

	dentry_group = kzalloc(sizeof(*dentry_group), GFP_KERNEL);
	if (!dentry_group)
		return -ENOMEM;

	if (IS_ERR_OR_NULL(handler)) {
		kfree(dentry_group);
		return -ENOENT;
	}

	group_num = get_dentry_group_cnt(file_inode(handler));
	dentry_name = kzalloc(DENTRY_NAME_MAX_LEN, GFP_KERNEL);
	if (!dentry_name) {
		kfree(dentry_group);
		return -ENOMEM;
	}

	for (i = group_id; i < group_num; i++) {
		int ret = hmdfs_metainfo_read(sbi, handler, dentry_group,
					      sizeof(struct hmdfs_dentry_group),
					      i);
		if (ret != sizeof(struct hmdfs_dentry_group)) {
			hmdfs_err("read dentry group failed ret:%d", ret);
			goto done;
		}

		for (j = offset; j < DENTRY_PER_GROUP; j++) {
			int len;
			int file_type = DT_UNKNOWN;
			bool is_continue;

			len = le16_to_cpu(dentry_group->nsl[j].namelen);
			if (!test_bit_le(j, dentry_group->bitmap) || len == 0)
				continue;

			memset(dentry_name, 0, DENTRY_NAME_MAX_LEN);
			// TODO: Support more file_type
			if (S_ISDIR(le16_to_cpu(dentry_group->nsl[j].i_mode)))
				file_type = DT_DIR;
			else if (S_ISREG(le16_to_cpu(
					 dentry_group->nsl[j].i_mode)))
				file_type = DT_REG;
			else if (S_ISLNK(le16_to_cpu(
					 dentry_group->nsl[j].i_mode)))
				file_type = DT_LNK;

			strncat(dentry_name, dentry_group->filename[j], len);
			pos = hmdfs_set_pos(dev_id, i, j);
			is_continue =
				dir_emit(ctx, dentry_name, len,
					 pos + INUNUMBER_START, file_type);
			if (!is_continue) {
				ctx->pos = pos;
				iterate_result = 1;
				goto done;
			}
		}
		offset = 0;
	}

done:
	kfree(dentry_name);
	kfree(dentry_group);
	return iterate_result;
}

int hmdfs_dev_readdir_from_con(struct hmdfs_peer *con, struct file *file,
			       struct dir_context *ctx)
{
	return analysis_dentry_file_from_con(con->sbi, file,
					     file->private_data, ctx);
}

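/*
 * .iterate_shared for remote directories. A ctx->pos of -1 marks the end
 * of iteration; a position still in the local range (top flag bit clear)
 * is first rebased into this peer's position space.
 */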
static int hmdfs_iterate_remote(struct file *file, struct dir_context *ctx)
{
	int err = 0;
	loff_t start_pos = ctx->pos;
	struct hmdfs_peer *con = NULL;
	struct hmdfs_dentry_info *di = hmdfs_d(file->f_path.dentry);
	bool is_local = !((ctx->pos) >> (POS_BIT_NUM - 1));
	uint64_t dev_id = di->device_id;

	if (ctx->pos == -1)
		return 0;
	if (is_local)
		ctx->pos = hmdfs_set_pos(dev_id, 0, 0);

	con = hmdfs_lookup_from_devid(file->f_inode->i_sb->s_fs_info, dev_id);
	if (con) {
		err = hmdfs_dev_readdir_from_con(con, file, ctx);
		peer_put(con);
		if (err)
			goto done;
	}

done:
	if (err <= 0)
		ctx->pos = -1;

	trace_hmdfs_iterate_remote(file->f_path.dentry, start_pos, ctx->pos,
				   err);
	return err;
}

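/*
 * Opening a remote directory pins this peer's cached dentry file as
 * file->private_data for later iteration, refreshing the cache from the
 * peer first if it is stale.
 */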
int hmdfs_dir_open_remote(struct inode *inode, struct file *file)
{
	struct hmdfs_inode_info *info = hmdfs_i(inode);
	struct clearcache_item *cache_item = NULL;

	if (!info->conn)
		return -ENOENT;

	if (!hmdfs_cache_revalidate(READ_ONCE(info->conn->conn_time),
				    info->conn->device_id,
				    file->f_path.dentry))
		get_remote_dentry_file_sync(file->f_path.dentry, info->conn);
	cache_item = hmdfs_find_cache_item(info->conn->device_id,
					   file->f_path.dentry);
	if (!cache_item)
		return -ENOENT;

	file->private_data = cache_item->filp;
	get_file(file->private_data);
	kref_put(&cache_item->ref, release_cache_item);
	return 0;
}

static int hmdfs_dir_release_remote(struct inode *inode, struct file *file)
{
	if (file->private_data)
		fput(file->private_data);
	file->private_data = NULL;
	return 0;
}

const struct file_operations hmdfs_dev_dir_ops_remote = {
	.owner = THIS_MODULE,
	.iterate_shared = hmdfs_iterate_remote,
	.open = hmdfs_dir_open_remote,
	.release = hmdfs_dir_release_remote,
	.fsync = __generic_file_fsync,
};
1064