1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * fs/hmdfs/stash.c
4  *
5  * Copyright (c) 2020-2021 Huawei Device Co., Ltd.
6  */
7 
8 #include <linux/kernel.h>
9 #include <linux/fs.h>
10 #include <linux/file.h>
11 #include <linux/dcache.h>
12 #include <linux/namei.h>
13 #include <linux/mount.h>
14 #include <linux/slab.h>
15 #include <linux/list.h>
16 #include <linux/pagemap.h>
17 #include <linux/sched/mm.h>
18 #include <linux/sched/task.h>
19 #include <linux/errseq.h>
20 #include <linux/crc32.h>
21 
22 #include "stash.h"
23 #include "comm/node_cb.h"
24 #include "comm/protocol.h"
25 #include "comm/connection.h"
26 #include "file_remote.h"
27 #include "hmdfs_dentryfile.h"
28 #include "authority/authentication.h"
29 
/* Head magic used to identify a stash file */
#define HMDFS_STASH_FILE_HEAD_MAGIC 0xF7AB06C3
/* Head and path in stash file are aligned with HMDFS_STASH_BLK_SIZE */
#define HMDFS_STASH_BLK_SIZE 4096
#define HMDFS_STASH_BLK_SHIFT 12
/* log2(PAGE_SIZE / 512): converts page count to 512-byte sectors */
#define HMDFS_STASH_PAGE_TO_SECTOR_SHIFT 3
/* Stash files live in "<work dir>/stash/v1/<cid>/" */
#define HMDFS_STASH_DIR_NAME "stash"
#define HMDFS_STASH_FMT_DIR_NAME "v1"
#define HMDFS_STASH_WORK_DIR_NAME \
	(HMDFS_STASH_DIR_NAME "/" HMDFS_STASH_FMT_DIR_NAME)

/* Big enough for "0x%llx" of a 64-bit inode number plus the NUL */
#define HMDFS_STASH_FILE_NAME_LEN 20

/* filemap_write_and_wait() passes per inode during stash flush */
#define HMDFS_STASH_FLUSH_CNT 2

/* "<cid>/<file name>" plus the separating '/' */
#define HMDFS_STASH_PATH_LEN (HMDFS_CID_SIZE + HMDFS_STASH_FILE_NAME_LEN + 1)
46 
/*
 * On-disk header of a stash file (little-endian, CRC-protected).
 * Written at offset 0; the stashed path and data follow at the
 * block-aligned offsets recorded below.
 */
struct hmdfs_cache_file_head {
	__le32 magic;		/* HMDFS_STASH_FILE_HEAD_MAGIC */
	__le32 crc_offset;	/* offset of crc32 field; CRC covers [0, crc_offset) */
	__le64 ino;		/* remote inode number being stashed */
	__le64 size;		/* i_size of the cache file at stash time */
	__le64 blocks;		/* written pages expressed in 512-byte sectors */
	__le64 last_write_pos;
	__le64 ctime;
	__le32 ctime_nsec;
	__le32 change_detect_cap;
	__le64 ichange_count;
	__le32 path_offs;	/* path location, in HMDFS_STASH_BLK_SIZE blocks */
	__le32 path_len;	/* path length including trailing NUL */
	__le32 path_cnt;
	__le32 data_offs;	/* start of page data, in blocks */
	/* Attention: expand new fields in here to compatible with old ver */
	__le32 crc32;
} __packed;
65 
/*
 * On-stack work item used to initialize stash caches for a list of
 * inodes on the system workqueue; the submitter waits on @done.
 */
struct hmdfs_stash_work {
	struct hmdfs_peer *conn;	/* peer whose inodes are stashed */
	struct list_head *list;		/* inodes queued for stashing */
	struct work_struct work;
	struct completion done;		/* signalled when the work finishes */
};
72 
73 struct hmdfs_inode_tbl {
74 	unsigned int cnt;
75 	unsigned int max;
76 	uint64_t inodes[0];
77 };
78 
/*
 * Directory iteration context used while scanning a peer's stash dir;
 * embeds the VFS dir_context so actors can container_of() back.
 */
struct hmdfs_stash_dir_context {
	struct dir_context dctx;
	char name[NAME_MAX + 1];	/* scratch buffer to NUL-terminate entry names */
	struct hmdfs_inode_tbl *tbl;	/* collected inode numbers */
};

/* Counters accumulated while restoring stashed files to a peer */
struct hmdfs_restore_stats {
	unsigned int succeed;
	unsigned int fail;
	unsigned int keep;		/* files kept for a later retry */
	unsigned long long ok_pages;
	unsigned long long fail_pages;
};

/* Counters accumulated while stashing dirty inodes of a peer */
struct hmdfs_stash_stats {
	unsigned int succeed;
	unsigned int donothing;		/* nothing dirty, no stash file created */
	unsigned int fail;
	unsigned long long ok_pages;
	unsigned long long fail_pages;
};
100 
/* Per-file state while restoring one stash file back to the remote */
struct hmdfs_file_restore_ctx {
	struct hmdfs_peer *conn;
	struct path src_dir_path;	/* peer's stash dir */
	struct path dst_root_path;	/* root the stashed path is relative to */
	char *dst;			/* destination path buffer */
	char *page;			/* transfer buffer */
	struct file *src_filp;		/* opened stash file */
	uint64_t inum;			/* remote inode number */
	uint64_t pages;
	unsigned int seq;		/* connection sequence the restore targets */
	unsigned int data_offs;		/* data start in stash blocks */
	/* output */
	bool keep;			/* true: keep stash file for a later retry */
};

/* Immutable arguments for one stash-file copy job */
struct hmdfs_copy_args {
	struct file *src;
	struct file *dst;
	void *buf;
	size_t buf_len;
	unsigned int seq;
	unsigned int data_offs;
	uint64_t inum;
};

/* Mutable cursor/result state threaded through a copy job */
struct hmdfs_copy_ctx {
	struct hmdfs_copy_args args;
	loff_t src_pos;
	loff_t dst_pos;
	/* output */
	size_t copied;			/* bytes copied in the last step */
	bool eof;			/* source exhausted */
};
134 
/* Counters for rebuilding the in-memory stash inode list from disk */
struct hmdfs_rebuild_stats {
	unsigned int succeed;
	unsigned int total;
	unsigned int fail;
	unsigned int invalid;		/* malformed or unparsable stash files */
};

/* On-stack work item for checking a peer's stash state; waited via @done */
struct hmdfs_check_work {
	struct hmdfs_peer *conn;
	struct work_struct work;
	struct completion done;
};

/*
 * Operation applied to each batch of stashed inode numbers:
 * (peer, seq, stash dir path, inode table, private cookie).
 */
typedef int (*stash_operation_func)(struct hmdfs_peer *,
				    unsigned int,
				    struct path *,
				    const struct hmdfs_inode_tbl *,
				    void *);
153 
/*
 * Look up @name under @parent and create it as a directory when absent.
 *
 * An already-existing directory is reused; an existing non-directory
 * yields ERR_PTR(-EINVAL). Returns a held dentry on success which the
 * caller must dput(). The parent inode lock is taken with
 * I_MUTEX_PARENT nesting as required for child lookup/creation.
 */
static struct dentry *hmdfs_do_vfs_mkdir(struct dentry *parent,
					 const char *name, int namelen,
					 umode_t mode)
{
	struct inode *dir = d_inode(parent);
	struct dentry *child = NULL;
	int err;

	inode_lock_nested(dir, I_MUTEX_PARENT);

	child = lookup_one_len(name, parent, namelen);
	if (IS_ERR(child))
		goto out;

	if (d_is_positive(child)) {
		/* Existing directory: reuse it as-is */
		if (d_can_lookup(child))
			goto out;

		/* Existing non-directory: refuse to clobber it */
		dput(child);
		child = ERR_PTR(-EINVAL);
		goto out;
	}

	err = vfs_mkdir(&nop_mnt_idmap, dir, child, mode);
	if (err) {
		dput(child);
		child = ERR_PTR(err);
		goto out;
	}

out:
	inode_unlock(dir);
	return child;
}
188 
hmdfs_stash_new_work_dir(struct dentry *parent)189 struct dentry *hmdfs_stash_new_work_dir(struct dentry *parent)
190 {
191 	struct dentry *base = NULL;
192 	struct dentry *work = NULL;
193 
194 	base = hmdfs_do_vfs_mkdir(parent, HMDFS_STASH_DIR_NAME,
195 				   strlen(HMDFS_STASH_DIR_NAME), 0700);
196 	if (IS_ERR(base))
197 		return base;
198 
199 	work = hmdfs_do_vfs_mkdir(base, HMDFS_STASH_FMT_DIR_NAME,
200 				  strlen(HMDFS_STASH_FMT_DIR_NAME), 0700);
201 	dput(base);
202 
203 	return work;
204 }
205 
/*
 * Create an anonymous (O_TMPFILE-style) stash file under
 * "<d_path>/<cid>/", creating the per-peer cid directory on demand.
 * The file stays unlinked until hmdfs_enable_stash_file() links it
 * under its final name. Returns the opened struct file or an ERR_PTR.
 */
static struct file *hmdfs_new_stash_file(struct path *d_path, const char *cid)
{
	struct dentry *parent = NULL;
	struct file *filp = NULL;
	struct path stash;
	int err;

	parent = hmdfs_do_vfs_mkdir(d_path->dentry, cid, strlen(cid), 0700);
	if (IS_ERR(parent)) {
		err = PTR_ERR(parent);
		hmdfs_err("mkdir error %d", err);
		goto mkdir_err;
	}

	stash.mnt = d_path->mnt;
	stash.dentry = parent;
	filp = kernel_tmpfile_open(&nop_mnt_idmap, &stash, S_IFREG | 0600,
	                         O_LARGEFILE | O_WRONLY, current_cred());
	if (IS_ERR(filp)) {
		err = PTR_ERR(filp);
		hmdfs_err("open stash file error %d", err);
		goto open_err;
	}

	dput(parent);

	return filp;

open_err:
	dput(parent);
mkdir_err:
	return ERR_PTR(err);
}
239 
hmdfs_is_dir(struct dentry *child)240 static inline bool hmdfs_is_dir(struct dentry *child)
241 {
242 	return d_is_positive(child) && d_can_lookup(child);
243 }
244 
hmdfs_is_reg(struct dentry *child)245 static inline bool hmdfs_is_reg(struct dentry *child)
246 {
247 	return d_is_positive(child) && d_is_reg(child);
248 }
249 
/*
 * Fill the on-disk header for a stash file from @cache. Fields not set
 * here (ctime, last_write_pos, ...) stay zero from the memset. The
 * CRC32 covers the header bytes up to (not including) the crc32 field.
 */
static void hmdfs_set_stash_file_head(const struct hmdfs_cache_info *cache,
				      uint64_t ino,
				      struct hmdfs_cache_file_head *head)
{
	long long blocks;
	unsigned int crc_offset;

	memset(head, 0, sizeof(*head));
	head->magic = cpu_to_le32(HMDFS_STASH_FILE_HEAD_MAGIC);
	head->ino = cpu_to_le64(ino);
	head->size = cpu_to_le64(i_size_read(file_inode(cache->cache_file)));
	/* Pages written so far, converted to 512-byte sectors */
	blocks = atomic64_read(&cache->written_pgs) <<
			       HMDFS_STASH_PAGE_TO_SECTOR_SHIFT;
	head->blocks = cpu_to_le64(blocks);
	head->path_offs = cpu_to_le32(cache->path_offs);
	head->path_len = cpu_to_le32(cache->path_len);
	head->path_cnt = cpu_to_le32(cache->path_cnt);
	head->data_offs = cpu_to_le32(cache->data_offs);
	crc_offset = offsetof(struct hmdfs_cache_file_head, crc32);
	head->crc_offset = cpu_to_le32(crc_offset);
	head->crc32 = cpu_to_le32(crc32(0, head, crc_offset));
}
272 
/*
 * Write the stash file header and the stashed file's path into the
 * cache file. Returns 0 on success (or when there is provably nothing
 * to stash), -EINVAL when no cache info/path exists, -EIO on a short
 * write.
 */
static int hmdfs_flush_stash_file_metadata(struct hmdfs_inode_info *info)
{
	struct hmdfs_cache_info *cache = NULL;
	struct hmdfs_peer *conn = info->conn;
	struct hmdfs_cache_file_head cache_head;
	size_t written;
	loff_t pos;
	unsigned int head_size;

	/* No metadata if no cache file info */
	cache = info->cache;
	if (!cache)
		return -EINVAL;

	if (strlen(cache->path) == 0) {
		/* No path could be resolved at cache-creation time */
		long long to_write_pgs = atomic64_read(&cache->to_write_pgs);

		/* Nothing to stash. No need to flush meta data. */
		if (to_write_pgs == 0)
			return 0;

		hmdfs_err("peer 0x%x:0x%llx inode 0x%llx lost %lld pages due to no path",
			  conn->owner, conn->device_id,
			  info->remote_ino, to_write_pgs);
		return -EINVAL;
	}

	hmdfs_set_stash_file_head(cache, info->remote_ino, &cache_head);

	/* Write head */
	pos = 0;
	head_size = sizeof(cache_head);
	written = kernel_write(cache->cache_file, &cache_head, head_size, &pos);
	if (written != head_size) {
		hmdfs_err("stash peer 0x%x:0x%llx ino 0x%llx write head len %u err %zd",
			   conn->owner, conn->device_id, info->remote_ino,
			   head_size, written);
		return -EIO;
	}
	/* Write path */
	pos = (loff_t)cache->path_offs << HMDFS_STASH_BLK_SHIFT;
	written = kernel_write(cache->cache_file, cache->path, cache->path_len,
			       &pos);
	if (written != cache->path_len) {
		hmdfs_err("stash peer 0x%x:0x%llx ino 0x%llx write path len %u err %zd",
			   conn->owner, conn->device_id, info->remote_ino,
			   cache->path_len, written);
		return -EIO;
	}

	return 0;
}
325 
/* Mainly from inode_wait_for_writeback() */
/*
 * If the inode is currently under writeback (I_SYNC set), block until
 * that single writeback pass completes. Only waits once; a writeback
 * that starts after the sampled check is not waited for.
 */
static void hmdfs_wait_remote_writeback_once(struct hmdfs_peer *conn,
					     struct hmdfs_inode_info *info)
{
	struct inode *inode = &info->vfs_inode;
	DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
	wait_queue_head_t *wq_head = NULL;
	bool in_sync = false;

	/* Sample I_SYNC under i_lock; it may clear right after we drop it */
	spin_lock(&inode->i_lock);
	in_sync = inode->i_state & I_SYNC;
	spin_unlock(&inode->i_lock);

	if (!in_sync)
		return;

	hmdfs_info("peer 0x%x:0x%llx ino 0x%llx wait for wb once",
		   conn->owner, conn->device_id, info->remote_ino);

	wq_head = bit_waitqueue(&inode->i_state, __I_SYNC);
	__wait_on_bit(wq_head, &wq, bit_wait, TASK_UNINTERRUPTIBLE);
}
348 
/*
 * Clear any pending writeback error state on the inode's mapping
 * (both the AS_EIO/AS_ENOSPC flags and the errseq cursor), logging a
 * warning if an error had been recorded before stashing started.
 */
static void hmdfs_reset_remote_write_err(struct hmdfs_peer *conn,
					 struct hmdfs_inode_info *info)
{
	struct address_space *mapping = info->vfs_inode.i_mapping;
	int flags_err;
	errseq_t old;
	int wb_err;

	/* Reads and clears AS_EIO / AS_ENOSPC */
	flags_err = filemap_check_errors(mapping);

	/* Advance the errseq so old errors are not reported again */
	old = errseq_sample(&mapping->wb_err);
	wb_err = errseq_check_and_advance(&mapping->wb_err, &old);
	if (flags_err || wb_err)
		hmdfs_warning("peer 0x%x:0x%llx inode 0x%llx wb error %d %d before stash",
			      conn->owner, conn->device_id, info->remote_ino,
			      flags_err, wb_err);
}
366 
/*
 * Return true when the mapping has neither dirty nor under-writeback
 * pages. The tag check is done under the mapping's tree/xarray lock so
 * both tags are observed atomically with respect to tag updates.
 */
static bool hmdfs_is_mapping_clean(struct address_space *mapping)
{
	bool clean = false;

	/* b93b016313b3b ("page cache: use xa_lock") introduces i_pages */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 17, 0)
	xa_lock_irq(&mapping->i_pages);
#else
	spin_lock_irq(&mapping->tree_lock);
#endif
	clean = !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
		!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK);
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 17, 0)
	xa_unlock_irq(&mapping->i_pages);
#else
	spin_unlock_irq(&mapping->tree_lock);
#endif
	return clean;
}
386 
/*
 * Push all dirty pages of a remote inode into the stash cache file.
 * Once stash_status is HMDFS_REMOTE_INODE_STASHING, writeback of this
 * inode is redirected to local storage, so flushing here writes into
 * the stash file. Returns 0 on success or a writeback error.
 */
static int hmdfs_flush_stash_file_data(struct hmdfs_peer *conn,
				       struct hmdfs_inode_info *info)
{
	struct inode *inode = &info->vfs_inode;
	struct address_space *mapping = inode->i_mapping;
	bool all_clean = true;
	int err = 0;
	int i;

	/* Wait for the completion of write syscall */
	inode_lock(inode);
	inode_unlock(inode);

	all_clean = hmdfs_is_mapping_clean(mapping);
	if (all_clean) {
		hmdfs_reset_remote_write_err(conn, info);
		return 0;
	}

	/*
	 * No-sync_all writeback during offline may have not seen
	 * the setting of stash_status as HMDFS_REMOTE_INODE_STASHING
	 * and will call mapping_set_error() after we just reset
	 * the previous error. So waiting for these writeback once,
	 * and the following writeback will do local write.
	 */
	hmdfs_wait_remote_writeback_once(conn, info);

	/* Need to clear previous error ? */
	hmdfs_reset_remote_write_err(conn, info);

	/*
	 * 1. dirty page: do write back
	 * 2. writeback page: wait for its completion
	 * 3. writeback -> redirty page: do filemap_write_and_wait()
	 *    twice, so 2th writeback should not allow
	 *    writeback -> redirty transition
	 */
	for (i = 0; i < HMDFS_STASH_FLUSH_CNT; i++) {
		err = filemap_write_and_wait(mapping);
		if (err) {
			hmdfs_err("peer 0x%x:0x%llx inode 0x%llx #%d stash flush error %d",
				  conn->owner, conn->device_id,
				  info->remote_ino, i, err);
			return err;
		}
	}

	/* Should not happen after two clean flush passes; log if it does */
	if (!hmdfs_is_mapping_clean(mapping))
		hmdfs_err("peer 0x%x:0x%llx inode 0x%llx is still dirty dt %d wb %d",
			  conn->owner, conn->device_id, info->remote_ino,
			  !!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY),
			  !!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK));

	return 0;
}
443 
hmdfs_flush_stash_file(struct hmdfs_inode_info *info)444 static int hmdfs_flush_stash_file(struct hmdfs_inode_info *info)
445 {
446 	int err;
447 
448 	err = hmdfs_flush_stash_file_data(info->conn, info);
449 	if (!err)
450 		err = hmdfs_flush_stash_file_metadata(info);
451 
452 	return err;
453 }
454 
/*
 * Make the anonymous stash tmpfile visible by hard-linking it into its
 * parent directory under the name "0x<remote_ino>". A stale entry with
 * that name is unlinked and the lookup retried once; a second
 * collision fails with -EEXIST to avoid looping forever.
 */
static int hmdfs_enable_stash_file(struct hmdfs_inode_info *info,
				   struct dentry *stash)
{
	char name[HMDFS_STASH_FILE_NAME_LEN];
	struct dentry *parent = NULL;
	struct inode *dir = NULL;
	struct dentry *child = NULL;
	int err = 0;
	bool retried = false;

	snprintf(name, sizeof(name), "0x%llx", info->remote_ino);

	parent = lock_parent(stash);
	dir = d_inode(parent);

lookup_again:
	child = lookup_one_len(name, parent, strlen(name));
	if (IS_ERR(child)) {
		err = PTR_ERR(child);
		child = NULL;
		hmdfs_err("lookup %s err %d", name, err);
		goto out;
	}

	if (d_is_positive(child)) {
		/* Stale entry from an earlier stash: remove and retry once */
		hmdfs_warning("%s exists (mode 0%o)",
			      name, d_inode(child)->i_mode);

		err = vfs_unlink(&nop_mnt_idmap, dir, child, NULL);
		if (err) {
			hmdfs_err("unlink %s err %d", name, err);
			goto out;
		}
		if (retried) {
			/* Name keeps reappearing: give up */
			err = -EEXIST;
			goto out;
		}

		retried = true;
		dput(child);
		goto lookup_again;
	}

	/* Link the unlinked tmpfile into the directory under @name */
	err = vfs_link(stash, &nop_mnt_idmap, dir, child, NULL);
	if (err) {
		hmdfs_err("link stash file to %s err %d", name, err);
		goto out;
	}

out:
	unlock_dir(parent);
	if (child)
		dput(child);

	return err;
}
511 
/* Return 1 if stash is done, 0 if nothing is stashed */
/*
 * Finalize a stash: if any pages were queued for writing, fsync the
 * cache file and link it under its permanent name. Returns a negative
 * errno on failure.
 */
static int hmdfs_close_stash_file(struct hmdfs_peer *conn,
				  struct hmdfs_inode_info *info)
{
	struct file *cache_file = info->cache->cache_file;
	struct dentry *c_dentry = file_dentry(cache_file);
	struct inode *c_inode = d_inode(c_dentry);
	long long to_write_pgs = atomic64_read(&info->cache->to_write_pgs);
	int err;

	hmdfs_info("peer 0x%x:0x%llx inode 0x%llx stashed bytes %lld pages %lld",
		   conn->owner, conn->device_id, info->remote_ino,
		   i_size_read(c_inode), to_write_pgs);

	/* Nothing was written: the tmpfile is simply dropped on fput */
	if (to_write_pgs == 0)
		return 0;

	err = vfs_fsync(cache_file, 0);
	if (!err)
		err = hmdfs_enable_stash_file(info, c_dentry);
	else
		hmdfs_err("fsync stash file err %d", err);

	return err < 0 ? err : 1;
}
537 
hmdfs_del_file_cache(struct hmdfs_cache_info *cache)538 static void hmdfs_del_file_cache(struct hmdfs_cache_info *cache)
539 {
540 	if (!cache)
541 		return;
542 
543 	fput(cache->cache_file);
544 	kfree(cache->path_buf);
545 	kfree(cache);
546 }
547 
/*
 * Allocate and initialize the per-inode stash cache: resolve the
 * inode's hmdfs path (empty if no dentry alias remains), compute the
 * block-aligned header/path/data layout, and open the backing tmpfile.
 * Returns the cache or an ERR_PTR.
 */
static struct hmdfs_cache_info *
hmdfs_new_file_cache(struct hmdfs_peer *conn, struct hmdfs_inode_info *info)
{
	struct hmdfs_cache_info *cache = NULL;
	struct dentry *stash_dentry = NULL;
	int err;

	cache = kzalloc(sizeof(*cache), GFP_KERNEL);
	if (!cache)
		return ERR_PTR(-ENOMEM);

	atomic64_set(&cache->to_write_pgs, 0);
	atomic64_set(&cache->written_pgs, 0);
	cache->path_buf = kmalloc(PATH_MAX, GFP_KERNEL);
	if (!cache->path_buf) {
		err = -ENOMEM;
		goto free_cache;
	}

	/* Need to handle "hardlink" ? */
	stash_dentry = d_find_any_alias(&info->vfs_inode);
	if (stash_dentry) {
		/* Needs full path in hmdfs, will be a device-view path */
		cache->path = dentry_path_raw(stash_dentry, cache->path_buf,
					      PATH_MAX);
		dput(stash_dentry);
		if (IS_ERR(cache->path)) {
			err = PTR_ERR(cache->path);
			hmdfs_err("peer 0x%x:0x%llx inode 0x%llx gen path err %d",
				  conn->owner, conn->device_id,
				  info->remote_ino, err);
			goto free_path;
		}
	} else {
		/* Write-opened file was closed before finding dentry */
		hmdfs_info("peer 0x%x:0x%llx inode 0x%llx no dentry found",
			   conn->owner, conn->device_id, info->remote_ino);
		cache->path_buf[0] = '\0';
		cache->path = cache->path_buf;
	}

	cache->path_cnt = 1;
	cache->path_len = strlen(cache->path) + 1;
	/* Path starts at the first block boundary after the header */
	cache->path_offs = DIV_ROUND_UP(sizeof(struct hmdfs_cache_file_head),
					HMDFS_STASH_BLK_SIZE);
	/* Data starts at the first block boundary after the path */
	cache->data_offs = cache->path_offs + DIV_ROUND_UP(cache->path_len,
					HMDFS_STASH_BLK_SIZE);
	cache->cache_file = hmdfs_new_stash_file(&conn->sbi->stash_work_dir,
						 conn->cid);
	if (IS_ERR(cache->cache_file)) {
		err = PTR_ERR(cache->cache_file);
		goto free_path;
	}

	return cache;

free_path:
	kfree(cache->path_buf);
free_cache:
	kfree(cache);
	return ERR_PTR(err);
}
610 
/*
 * Create the stash cache for @info and publish it together with the
 * STASHING status under stash_lock, so concurrent writers observe a
 * consistent (cache, status) pair. Creation failure is tolerated: the
 * status is still set so pending writes can complete.
 */
static void hmdfs_init_stash_file_cache(struct hmdfs_peer *conn,
					struct hmdfs_inode_info *info)
{
	struct hmdfs_cache_info *cache = NULL;

	cache = hmdfs_new_file_cache(conn, info);
	if (IS_ERR(cache))
		/*
		 * Continue even creating stash info failed.
		 * We need to ensure there is no dirty pages
		 * after stash completes
		 */
		cache = NULL;

	/* Make write() returns */
	spin_lock(&info->stash_lock);
	info->cache = cache;
	info->stash_status = HMDFS_REMOTE_INODE_STASHING;
	spin_unlock(&info->stash_lock);
}
631 
hmdfs_update_stash_stats(struct hmdfs_stash_stats *stats, const struct hmdfs_cache_info *cache, int err)632 static void hmdfs_update_stash_stats(struct hmdfs_stash_stats *stats,
633 				     const struct hmdfs_cache_info *cache,
634 				     int err)
635 {
636 	unsigned long long ok_pages, fail_pages;
637 
638 	if (cache) {
639 		ok_pages = err > 0 ? atomic64_read(&cache->written_pgs) : 0;
640 		fail_pages = atomic64_read(&cache->to_write_pgs) - ok_pages;
641 		stats->ok_pages += ok_pages;
642 		stats->fail_pages += fail_pages;
643 	}
644 
645 	if (err > 0)
646 		stats->succeed++;
647 	else if (!err)
648 		stats->donothing++;
649 	else
650 		stats->fail++;
651 }
652 
/* Return 1 if stash is done, 0 if nothing is stashed */
/*
 * Stash one remote inode: flush its data and metadata, finalize the
 * stash file, then transition stash_status to RESTORING (stash done)
 * or NONE (nothing stashed / failed). The cache is consumed here.
 */
static int hmdfs_stash_remote_inode(struct hmdfs_inode_info *info,
				    struct hmdfs_stash_stats *stats)
{
	struct hmdfs_cache_info *cache = info->cache;
	struct hmdfs_peer *conn = info->conn;
	unsigned int status;
	int err = 0;

	hmdfs_info("stash peer 0x%x:0x%llx ino 0x%llx",
		   conn->owner, conn->device_id, info->remote_ino);

	err = hmdfs_flush_stash_file(info);
	if (!err)
		err = hmdfs_close_stash_file(conn, info);

	/* Not stashed: the remote fid must be reopened on reconnect */
	if (err <= 0)
		set_bit(HMDFS_FID_NEED_OPEN, &info->fid_flags);
	status = err > 0 ? HMDFS_REMOTE_INODE_RESTORING :
			   HMDFS_REMOTE_INODE_NONE;
	spin_lock(&info->stash_lock);
	info->cache = NULL;
	/*
	 * Use smp_store_release() to ensure order between HMDFS_FID_NEED_OPEN
	 * and HMDFS_REMOTE_INODE_NONE.
	 */
	smp_store_release(&info->stash_status, status);
	spin_unlock(&info->stash_lock);

	hmdfs_update_stash_stats(stats, cache, err);
	hmdfs_del_file_cache(cache);

	return err;
}
687 
hmdfs_init_cache_for_stash_files(struct hmdfs_peer *conn, struct list_head *list)688 static void hmdfs_init_cache_for_stash_files(struct hmdfs_peer *conn,
689 					     struct list_head *list)
690 {
691 	const struct cred *old_cred = NULL;
692 	struct hmdfs_inode_info *info = NULL;
693 
694 	/* For file creation under stash_work_dir */
695 	old_cred = hmdfs_override_creds(conn->sbi->cred);
696 	list_for_each_entry(info, list, stash_node)
697 		hmdfs_init_stash_file_cache(conn, info);
698 	hmdfs_revert_creds(old_cred);
699 }
700 
hmdfs_init_stash_cache_work_fn(struct work_struct *base)701 static void hmdfs_init_stash_cache_work_fn(struct work_struct *base)
702 {
703 	struct hmdfs_stash_work *work =
704 		container_of(base, struct hmdfs_stash_work, work);
705 
706 	hmdfs_init_cache_for_stash_files(work->conn, work->list);
707 	complete(&work->done);
708 }
709 
hmdfs_init_cache_for_stash_files_by_work(struct hmdfs_peer *conn, struct list_head *list)710 static void hmdfs_init_cache_for_stash_files_by_work(struct hmdfs_peer *conn,
711 						     struct list_head *list)
712 {
713 	struct hmdfs_stash_work work = {
714 		.conn = conn,
715 		.list = list,
716 		.done = COMPLETION_INITIALIZER_ONSTACK(work.done),
717 	};
718 
719 	INIT_WORK_ONSTACK(&work.work, hmdfs_init_stash_cache_work_fn);
720 	schedule_work(&work.work);
721 	wait_for_completion(&work.done);
722 }
723 
/*
 * Collect writeable-opened inodes that are ready to be stashed
 * (status NONE) onto @list. Each collected inode gets an extra
 * wr-opened reference (so close() cannot delist it) and an inode
 * reference (so it cannot be evicted) — both dropped later in
 * hmdfs_stash_remote_inodes(). When @check is set, inodes already in
 * STASHING state are reported as unexpected.
 */
static void hmdfs_stash_fetch_ready_files(struct hmdfs_peer *conn,
					  bool check, struct list_head *list)
{
	struct hmdfs_inode_info *info = NULL;

	spin_lock(&conn->wr_opened_inode_lock);
	list_for_each_entry(info, &conn->wr_opened_inode_list, wr_opened_node) {
		int status;

		/* Paired with *_release() in hmdfs_reset_stashed_inode() */
		status = smp_load_acquire(&info->stash_status);
		if (status == HMDFS_REMOTE_INODE_NONE) {
			list_add_tail(&info->stash_node, list);
			/*
			 * Prevent close() removing the inode from
			 * writeable-opened inode list
			 */
			hmdfs_remote_add_wr_opened_inode_nolock(conn, info);
			/* Prevent the inode from eviction */
			ihold(&info->vfs_inode);
		} else if (check && status == HMDFS_REMOTE_INODE_STASHING) {
			hmdfs_warning("peer 0x%x:0x%llx inode 0x%llx unexpected stash status %d",
				      conn->owner, conn->device_id,
				      info->remote_ino, status);
		}
	}
	spin_unlock(&conn->wr_opened_inode_lock);
}
752 
/*
 * Offline (sync) callback, phase 1: mark every ready inode as
 * STASHING and create its cache file, serialized against the async
 * offline callback via offline_cb_lock. Cache creation is pushed to a
 * workqueue so it runs outside the caller's context.
 */
static void hmdfs_stash_offline_prepare(struct hmdfs_peer *conn, int evt,
					unsigned int seq)
{
	LIST_HEAD(preparing);

	if (!hmdfs_is_stash_enabled(conn->sbi))
		return;

	mutex_lock(&conn->offline_cb_lock);

	hmdfs_stash_fetch_ready_files(conn, true, &preparing);

	if (list_empty(&preparing))
		goto out;

	hmdfs_init_cache_for_stash_files_by_work(conn, &preparing);
out:
	mutex_unlock(&conn->offline_cb_lock);
}
772 
hmdfs_track_inode_locked(struct hmdfs_peer *conn, struct hmdfs_inode_info *info)773 static void hmdfs_track_inode_locked(struct hmdfs_peer *conn,
774 				     struct hmdfs_inode_info *info)
775 {
776 	spin_lock(&conn->stashed_inode_lock);
777 	list_add_tail(&info->stash_node, &conn->stashed_inode_list);
778 	conn->stashed_inode_nr++;
779 	spin_unlock(&conn->stashed_inode_lock);
780 }
781 
782 static void
hmdfs_update_peer_stash_stats(struct hmdfs_stash_statistics *stash_stats, const struct hmdfs_stash_stats *stats)783 hmdfs_update_peer_stash_stats(struct hmdfs_stash_statistics *stash_stats,
784 			      const struct hmdfs_stash_stats *stats)
785 {
786 	stash_stats->cur_ok = stats->succeed;
787 	stash_stats->cur_nothing = stats->donothing;
788 	stash_stats->cur_fail = stats->fail;
789 	stash_stats->total_ok += stats->succeed;
790 	stash_stats->total_nothing += stats->donothing;
791 	stash_stats->total_fail += stats->fail;
792 	stash_stats->ok_pages += stats->ok_pages;
793 	stash_stats->fail_pages += stats->fail_pages;
794 }
795 
/*
 * Stash every inode on @list. On success the inode (and the reference
 * taken in hmdfs_stash_fetch_ready_files()) moves to the peer's
 * stashed list; on failure/no-op the reference is dropped here. The
 * extra wr-opened reference is always released.
 */
static void hmdfs_stash_remote_inodes(struct hmdfs_peer *conn,
				      struct list_head *list)
{
	const struct cred *old_cred = NULL;
	struct hmdfs_inode_info *info = NULL;
	struct hmdfs_inode_info *next = NULL;
	struct hmdfs_stash_stats stats;

	/* For file creation, write and relink under stash_work_dir */
	old_cred = hmdfs_override_creds(conn->sbi->cred);

	memset(&stats, 0, sizeof(stats));
	list_for_each_entry_safe(info, next, list, stash_node) {
		int err;

		list_del_init(&info->stash_node);

		err = hmdfs_stash_remote_inode(info, &stats);
		if (err > 0)
			/* Stashed: keep the inode ref via the stashed list */
			hmdfs_track_inode_locked(conn, info);

		hmdfs_remote_del_wr_opened_inode(conn, info);
		if (err <= 0)
			/* Not stashed: drop the ref taken at fetch time */
			iput(&info->vfs_inode);
	}
	hmdfs_revert_creds(old_cred);

	hmdfs_update_peer_stash_stats(&conn->stats.stash, &stats);
	hmdfs_info("peer 0x%x:0x%llx total stashed %u cur ok %u none %u fail %u",
		   conn->owner, conn->device_id, conn->stashed_inode_nr,
		   stats.succeed, stats.donothing, stats.fail);
}
828 
/*
 * Offline (async) callback, phase 2: prepare any late-arriving inodes,
 * then stash everything currently in STASHING state. The caller holds
 * seq_lock; it is dropped while holding offline_cb_lock so other
 * non-offline sync callbacks are not blocked, and re-taken before
 * returning as the caller expects.
 */
static void hmdfs_stash_offline_do_stash(struct hmdfs_peer *conn, int evt,
					 unsigned int seq)
{
	struct hmdfs_inode_info *info = NULL;
	LIST_HEAD(preparing);
	LIST_HEAD(stashing);

	if (!hmdfs_is_stash_enabled(conn->sbi))
		return;

	/* release seq_lock to prevent blocking no-offline sync cb */
	mutex_unlock(&conn->seq_lock);
	/* acquire offline_cb_lock to serialized with offline sync cb */
	mutex_lock(&conn->offline_cb_lock);

	/* Catch inodes that became ready after the prepare phase */
	hmdfs_stash_fetch_ready_files(conn, false, &preparing);
	if (!list_empty(&preparing))
		hmdfs_init_cache_for_stash_files(conn, &preparing);

	spin_lock(&conn->wr_opened_inode_lock);
	list_for_each_entry(info, &conn->wr_opened_inode_list, wr_opened_node) {
		int status = READ_ONCE(info->stash_status);

		if (status == HMDFS_REMOTE_INODE_STASHING)
			list_add_tail(&info->stash_node, &stashing);
	}
	spin_unlock(&conn->wr_opened_inode_lock);

	if (list_empty(&stashing))
		goto unlock;

	hmdfs_stash_remote_inodes(conn, &stashing);

unlock:
	mutex_unlock(&conn->offline_cb_lock);
	mutex_lock(&conn->seq_lock);
}
866 
867 static struct hmdfs_inode_info *
hmdfs_lookup_stash_inode(struct hmdfs_peer *conn, uint64_t inum)868 hmdfs_lookup_stash_inode(struct hmdfs_peer *conn, uint64_t inum)
869 {
870 	struct hmdfs_inode_info *info = NULL;
871 
872 	list_for_each_entry(info, &conn->stashed_inode_list, stash_node) {
873 		if (info->remote_ino == inum)
874 			return info;
875 	}
876 
877 	return NULL;
878 }
879 
/*
 * Remove @info from the peer's stashed list and drop the inode
 * reference that the stashed list held.
 */
static void hmdfs_untrack_stashed_inode(struct hmdfs_peer *conn,
					struct hmdfs_inode_info *info)
{
	list_del_init(&info->stash_node);
	iput(&info->vfs_inode);

	conn->stashed_inode_nr--;
}
888 
/*
 * Untrack @info and return its stash_status to NONE so it becomes
 * eligible for stashing again. A temporary inode reference keeps the
 * inode alive across the status update.
 */
static void hmdfs_reset_stashed_inode(struct hmdfs_peer *conn,
				      struct hmdfs_inode_info *info)
{
	struct inode *ino = &info->vfs_inode;

	/*
	 * For updating stash_status after iput()
	 * in hmdfs_untrack_stashed_inode()
	 */
	ihold(ino);
	hmdfs_untrack_stashed_inode(conn, info);
	/*
	 * Ensure the order of stash_node and stash_status:
	 * only update stash_status to NONE after removal of
	 * stash_node is completed.
	 */
	smp_store_release(&info->stash_status,
			  HMDFS_REMOTE_INODE_NONE);
	iput(ino);
}
909 
hmdfs_drop_stashed_inodes(struct hmdfs_peer *conn)910 static void hmdfs_drop_stashed_inodes(struct hmdfs_peer *conn)
911 {
912 	struct hmdfs_inode_info *info = NULL;
913 	struct hmdfs_inode_info *next = NULL;
914 
915 	if (list_empty(&conn->stashed_inode_list))
916 		return;
917 
918 	hmdfs_warning("peer 0x%x:0x%llx drop unrestorable file %u",
919 		      conn->owner, conn->device_id, conn->stashed_inode_nr);
920 
921 	list_for_each_entry_safe(info, next,
922 				 &conn->stashed_inode_list, stash_node) {
923 		hmdfs_warning("peer 0x%x:0x%llx inode 0x%llx unrestorable status %u",
924 			      conn->owner, conn->device_id, info->remote_ino,
925 			      READ_ONCE(info->stash_status));
926 
927 		hmdfs_reset_stashed_inode(conn, info);
928 	}
929 }
930 
/*
 * Open the per-peer stash directory "<d_path>/<cid>" read-only.
 * Returns the opened file or an ERR_PTR: -ENOENT when the cid entry
 * does not exist, -EINVAL when it exists but is not a directory.
 */
static struct file *hmdfs_open_stash_dir(struct path *d_path, const char *cid)
{
	int err = 0;
	struct dentry *parent = d_path->dentry;
	struct inode *dir = d_inode(parent);
	struct dentry *child = NULL;
	struct path peer_path;
	struct file *filp = NULL;

	inode_lock_nested(dir, I_MUTEX_PARENT);
	child = lookup_one_len(cid, parent, strlen(cid));
	if (!IS_ERR(child)) {
		if (!hmdfs_is_dir(child)) {
			if (d_is_positive(child)) {
				hmdfs_err("invalid stash dir mode 0%o", d_inode(child)->i_mode);
				err = -EINVAL;
			} else {
				err = -ENOENT;
			}
			/* child ref only dropped on the failure paths here */
			dput(child);
		}
	} else {
		err = PTR_ERR(child);
		hmdfs_err("lookup stash dir err %d", err);
	}
	inode_unlock(dir);

	if (err)
		return ERR_PTR(err);

	peer_path.mnt = d_path->mnt;
	peer_path.dentry = child;
	filp = dentry_open(&peer_path, O_RDONLY | O_DIRECTORY, current_cred());
	if (IS_ERR(filp))
		hmdfs_err("open err %d", (int)PTR_ERR(filp));

	/* dentry_open() took its own reference; drop the lookup ref */
	dput(child);

	return filp;
}
971 
hmdfs_new_inode_tbl(struct hmdfs_inode_tbl **tbl)972 static int hmdfs_new_inode_tbl(struct hmdfs_inode_tbl **tbl)
973 {
974 	struct hmdfs_inode_tbl *new = NULL;
975 
976 	new = kmalloc(PAGE_SIZE, GFP_KERNEL);
977 	if (!new)
978 		return -ENOMEM;
979 
980 	new->cnt = 0;
981 	new->max = (PAGE_SIZE - offsetof(struct hmdfs_inode_tbl, inodes)) /
982 		   sizeof(new->inodes[0]);
983 	*tbl = new;
984 
985 	return 0;
986 }
987 
/*
 * Try to parse a directory entry name as a hexadecimal remote inode
 * number (stash files are named "0x<ino>"). Returns 1 and fills
 * *stash_inum on success; returns 0 for entries that should simply be
 * skipped (wrong type, oversized or unparsable names).
 */
static int hmdfs_parse_stash_file_name(struct dir_context *dctx,
					const char *name,
					int namelen,
					unsigned int d_type,
					uint64_t *stash_inum)
{
	struct hmdfs_stash_dir_context *ctx = NULL;
	int err;

	/* Only regular files (or unknown, when the fs reports no type) */
	if (d_type != DT_UNKNOWN && d_type != DT_REG)
		return 0;
	if (namelen > NAME_MAX)
		return 0;

	ctx = container_of(dctx, struct hmdfs_stash_dir_context, dctx);
	/* Copy into ctx->name so the entry name is NUL-terminated */
	memcpy(ctx->name, name, namelen);
	ctx->name[namelen] = '\0';
	err = kstrtoull(ctx->name, 16, stash_inum);
	if (err) {
		hmdfs_err("unexpected stash file err %d", err);
		return 0;
	}
	return 1;
}
1012 
hmdfs_has_stash_file(struct dir_context *dctx, const char *name, int namelen, loff_t offset, u64 inum, unsigned int d_type)1013 static bool hmdfs_has_stash_file(struct dir_context *dctx, const char *name,
1014 				int namelen, loff_t offset,
1015 				u64 inum, unsigned int d_type)
1016 {
1017 	struct hmdfs_stash_dir_context *ctx = NULL;
1018 	uint64_t stash_inum;
1019 	int err;
1020 
1021 	ctx = container_of(dctx, struct hmdfs_stash_dir_context, dctx);
1022 	err = hmdfs_parse_stash_file_name(dctx, name, namelen,
1023 					   d_type, &stash_inum);
1024 	if (!err)
1025 		return true;
1026 
1027 	ctx->tbl->cnt++;
1028 	return false;
1029 }
1030 
hmdfs_fill_stash_file(struct dir_context *dctx, const char *name, int namelen, loff_t offset, u64 inum, unsigned int d_type)1031 static bool hmdfs_fill_stash_file(struct dir_context *dctx, const char *name,
1032 				 int namelen, loff_t offset,
1033 				 u64 inum, unsigned int d_type)
1034 {
1035 	struct hmdfs_stash_dir_context *ctx = NULL;
1036 	uint64_t stash_inum;
1037 	int err;
1038 
1039 	ctx = container_of(dctx, struct hmdfs_stash_dir_context, dctx);
1040 	err = hmdfs_parse_stash_file_name(dctx, name, namelen,
1041 					   d_type, &stash_inum);
1042 	if (!err)
1043 		return true;
1044 	if (ctx->tbl->cnt >= ctx->tbl->max)
1045 		return false;
1046 
1047 	ctx->tbl->inodes[ctx->tbl->cnt++] = stash_inum;
1048 
1049 	return true;
1050 }
1051 
/*
 * Unlink the stash file @child from stash directory @parent.
 *
 * Returns 0 on success or the negative error from vfs_unlink().
 */
static int hmdfs_del_stash_file(struct dentry *parent, struct dentry *child)
{
	struct inode *dir = d_inode(parent);
	int err = 0;

	/* Prevent d_delete() from calling dentry_unlink_inode() */
	dget(child);

	/* Directory modification requires the parent lock class */
	inode_lock_nested(dir, I_MUTEX_PARENT);
	err = vfs_unlink(&nop_mnt_idmap, dir, child, NULL);
	if (err)
		hmdfs_err("remove stash file err %d", err);
	inode_unlock(dir);

	dput(child);

	return err;
}
1070 
/*
 * Return true if the peer's event sequence has moved on since @seq was
 * sampled, i.e. the node went offline (or offline+online) meanwhile.
 */
static inline bool hmdfs_is_node_offlined(const struct hmdfs_peer *conn,
					  unsigned int seq)
{
	/*
	 * open()/fsync() may fail due to "status = NODE_STAT_OFFLINE"
	 * in hmdfs_disconnect_node().
	 * Pair with smp_mb() in hmdfs_disconnect_node() to ensure
	 * getting the newest event sequence.
	 */
	smp_mb__before_atomic();
	return hmdfs_node_evt_seq(conn) != seq;
}
1083 
/*
 * Validate the on-disk head of a stash file before using it to restore.
 *
 * Checks, in order: head magic, CRC32 over the head, the stashed inode
 * number against ctx->inum, the path/data offsets against the stash
 * file size, and the path count. Returns 0 when the head looks sane,
 * -EUCLEAN on any sign of corruption.
 *
 * NOTE: head->crc_offset has already been bounds-checked by the caller
 * (hmdfs_get_restore_file_metadata), so the CRC read below is safe.
 */
static int hmdfs_verify_restore_file_head(struct hmdfs_file_restore_ctx *ctx,
				    const struct hmdfs_cache_file_head *head)
{
	struct inode *inode = file_inode(ctx->src_filp);
	struct hmdfs_peer *conn = ctx->conn;
	unsigned int crc, read_crc, crc_offset;
	loff_t path_offs, data_offs, isize;
	int err = 0;

	if (le32_to_cpu(head->magic) != HMDFS_STASH_FILE_HEAD_MAGIC) {
		err = -EUCLEAN;
		hmdfs_err("peer 0x%x:0x%llx ino 0x%llx invalid magic: got 0x%x, exp 0x%x",
			  conn->owner, conn->device_id, ctx->inum,
			  le32_to_cpu(head->magic),
			  HMDFS_STASH_FILE_HEAD_MAGIC);
		goto out;
	}

	/* CRC is stored at crc_offset and covers all bytes before it */
	crc_offset = le32_to_cpu(head->crc_offset);
	read_crc = le32_to_cpu(*((__le32 *)((char *)head + crc_offset)));
	crc = crc32(0, head, crc_offset);
	if (read_crc != crc) {
		err = -EUCLEAN;
		hmdfs_err("peer 0x%x:0x%llx ino 0x%llx invalid crc: got 0x%x, exp 0x%x",
			  conn->owner, conn->device_id, ctx->inum,
			  read_crc, crc);
		goto out;
	}

	/* The stashed inode number must match the file we looked up */
	if (le64_to_cpu(head->ino) != ctx->inum) {
		err = -EUCLEAN;
		hmdfs_err("peer 0x%x:0x%llx ino 0x%llx invalid ino: got %llu, exp %llu",
			  conn->owner, conn->device_id, ctx->inum,
			  le64_to_cpu(head->ino), ctx->inum);
		goto out;
	}

	/* Offsets are stored in units of HMDFS_STASH_BLK_SIZE blocks */
	path_offs = (loff_t)le32_to_cpu(head->path_offs) <<
		    HMDFS_STASH_BLK_SHIFT;
	if (path_offs <= 0 || path_offs >= i_size_read(inode)) {
		err = -EUCLEAN;
		hmdfs_err("peer 0x%x:0x%llx ino 0x%llx invalid path_offs %d, stash file size %llu",
			  conn->owner, conn->device_id, ctx->inum,
			  le32_to_cpu(head->path_offs), i_size_read(inode));
		goto out;
	}

	/* Layout is head, then path, then data: path must precede data */
	data_offs = (loff_t)le32_to_cpu(head->data_offs) <<
		    HMDFS_STASH_BLK_SHIFT;
	if (path_offs >= data_offs) {
		err = -EUCLEAN;
		hmdfs_err("peer 0x%x:0x%llx ino 0x%llx invalid data_offs %d, path_offs %d",
			  conn->owner, conn->device_id, ctx->inum,
			  le32_to_cpu(head->data_offs),
			  le32_to_cpu(head->path_offs));
		goto out;
	}
	if (data_offs <= 0 || data_offs >= i_size_read(inode)) {
		err = -EUCLEAN;
		hmdfs_err("peer 0x%x:0x%llx ino 0x%llx invalid data_offs %d, stash file size %llu",
			  conn->owner, conn->device_id, ctx->inum,
			  le32_to_cpu(head->data_offs), i_size_read(inode));
		goto out;
	}

	/* Recorded size must agree with the actual stash file size */
	isize = le64_to_cpu(head->size);
	if (isize != i_size_read(inode)) {
		err = -EUCLEAN;
		hmdfs_err("peer 0x%x:0x%llx ino 0x%llx invalid isize: got %llu, exp %llu",
			  conn->owner, conn->device_id, ctx->inum,
			  le64_to_cpu(head->size), i_size_read(inode));
		goto out;
	}

	/* At least one path must have been recorded */
	if (le32_to_cpu(head->path_cnt) < 1) {
		err = -EUCLEAN;
		hmdfs_err("peer 0x%x:0x%llx ino 0x%llx invalid path_cnt %d",
			  conn->owner, conn->device_id, ctx->inum,
			  le32_to_cpu(head->path_cnt));
		goto out;
	}

out:
	return err;
}
1169 
/*
 * Read and validate a stash file's metadata (head + remote path).
 *
 * Reads the head in two passes: first just enough to learn crc_offset,
 * then (after bounds-checking it) the full head up to and including the
 * CRC. On success ctx->pages, ctx->data_offs and ctx->dst (the remote
 * path, NUL-terminated) are populated. Returns 0 or a negative errno
 * (-ENODATA on short read, -EUCLEAN on corruption).
 */
static int hmdfs_get_restore_file_metadata(struct hmdfs_file_restore_ctx *ctx)
{
	struct hmdfs_cache_file_head head;
	struct hmdfs_peer *conn = ctx->conn;
	unsigned int head_size, read_size, head_crc_offset;
	loff_t pos;
	ssize_t rd;
	int err = 0;

	head_size = sizeof(struct hmdfs_cache_file_head);
	memset(&head, 0, head_size);
	/* Read part head */
	pos = 0;
	read_size = offsetof(struct hmdfs_cache_file_head, crc_offset) +
		    sizeof(head.crc_offset);
	rd = kernel_read(ctx->src_filp, &head, read_size, &pos);
	if (rd != read_size) {
		err = rd < 0 ? rd : -ENODATA;
		hmdfs_err("peer 0x%x:0x%llx ino 0x%llx read part head err %d",
			  conn->owner, conn->device_id, ctx->inum, err);
		goto out;
	}
	/*
	 * Bounds-check crc_offset before trusting it: the first clause
	 * catches unsigned wrap-around, the second an offset that would
	 * place the CRC past the end of the in-memory head.
	 */
	head_crc_offset = le32_to_cpu(head.crc_offset);
	if (head_crc_offset + sizeof(head.crc32) < head_crc_offset ||
	    head_crc_offset + sizeof(head.crc32) > head_size) {
		err = -EUCLEAN;
		hmdfs_err("peer 0x%x:0x%llx ino 0x%llx got bad head: Too long crc_offset %u which exceeds head size %u",
			  conn->owner, conn->device_id, ctx->inum,
			  head_crc_offset, head_size);
		goto out;
	}

	/* Read full head */
	pos = 0;
	read_size = le32_to_cpu(head.crc_offset) + sizeof(head.crc32);
	rd = kernel_read(ctx->src_filp, &head, read_size, &pos);
	if (rd != read_size) {
		err = rd < 0 ? rd : -ENODATA;
		hmdfs_err("peer 0x%x:0x%llx ino 0x%llx read full head err %d",
			  conn->owner, conn->device_id, ctx->inum, err);
		goto out;
	}

	err = hmdfs_verify_restore_file_head(ctx, &head);
	if (err)
		goto out;

	/* blocks is in 512-byte sectors; convert to pages */
	ctx->pages = le64_to_cpu(head.blocks) >>
		     HMDFS_STASH_PAGE_TO_SECTOR_SHIFT;
	ctx->data_offs = le32_to_cpu(head.data_offs);
	/* Read path */
	read_size = min_t(unsigned int, le32_to_cpu(head.path_len), PATH_MAX);
	pos = (loff_t)le32_to_cpu(head.path_offs) << HMDFS_STASH_BLK_SHIFT;
	rd = kernel_read(ctx->src_filp, ctx->dst, read_size, &pos);
	if (rd != read_size) {
		err = rd < 0 ? rd : -ENODATA;
		hmdfs_err("peer 0x%x:0x%llx ino 0x%llx read path err %d",
			  conn->owner, conn->device_id, ctx->inum, err);
		goto out;
	}
	/* The stored path must be NUL-terminated within read_size bytes */
	if (strnlen(ctx->dst, read_size) >= read_size) {
		err = -EUCLEAN;
		hmdfs_err("peer 0x%x:0x%llx ino 0x%llx read path not end with \\0",
			  conn->owner, conn->device_id, ctx->inum);
		goto out;
	}
	/* TODO: Pick a valid path from all paths */

out:
	return err;
}
1241 
hmdfs_open_restore_dst_file(struct hmdfs_file_restore_ctx *ctx, unsigned int rw_flag, struct file **filp)1242 static int hmdfs_open_restore_dst_file(struct hmdfs_file_restore_ctx *ctx,
1243 				       unsigned int rw_flag, struct file **filp)
1244 {
1245 	struct hmdfs_peer *conn = ctx->conn;
1246 	struct file *dst = NULL;
1247 	int err = 0;
1248 
1249 	err = hmdfs_get_restore_file_metadata(ctx);
1250 	if (err)
1251 		goto out;
1252 
1253 	/* Error comes from connection or server ? */
1254 	dst = file_open_root(&ctx->dst_root_path,
1255 			     ctx->dst, O_LARGEFILE | rw_flag, 0);
1256 	if (IS_ERR(dst)) {
1257 		err = PTR_ERR(dst);
1258 		hmdfs_err("open remote file ino 0x%llx err %d", ctx->inum, err);
1259 		if (hmdfs_is_node_offlined(conn, ctx->seq))
1260 			err = -ESHUTDOWN;
1261 		goto out;
1262 	}
1263 
1264 	*filp = dst;
1265 out:
1266 	return err;
1267 }
1268 
hmdfs_need_abort_restore(struct hmdfs_file_restore_ctx *ctx, struct hmdfs_inode_info *pinned, struct file *opened_file)1269 static bool hmdfs_need_abort_restore(struct hmdfs_file_restore_ctx *ctx,
1270 				     struct hmdfs_inode_info *pinned,
1271 				     struct file *opened_file)
1272 {
1273 	struct hmdfs_inode_info *opened = hmdfs_i(file_inode(opened_file));
1274 
1275 	if (opened->inode_type != HMDFS_LAYER_OTHER_REMOTE)
1276 		goto abort;
1277 
1278 	if (opened == pinned)
1279 		return false;
1280 
1281 abort:
1282 	hmdfs_warning("peer 0x%x:0x%llx inode 0x%llx invalid remote file",
1283 		      ctx->conn->owner, ctx->conn->device_id, ctx->inum);
1284 	hmdfs_warning("got: peer 0x%x:0x%llx inode 0x%llx type %d status %d",
1285 		      opened->conn ? opened->conn->owner : 0,
1286 		      opened->conn ? opened->conn->device_id : 0,
1287 		      opened->remote_ino, opened->inode_type,
1288 		      opened->stash_status);
1289 	hmdfs_warning("pinned: peer 0x%x:0x%llx inode 0x%llx type %d status %d",
1290 		      pinned->conn->owner, pinned->conn->device_id,
1291 		      pinned->remote_ino, pinned->inode_type,
1292 		      pinned->stash_status);
1293 	return true;
1294 }
1295 
hmdfs_init_copy_args(const struct hmdfs_file_restore_ctx *ctx, struct file *dst, struct hmdfs_copy_args *args)1296 static void hmdfs_init_copy_args(const struct hmdfs_file_restore_ctx *ctx,
1297 				 struct file *dst, struct hmdfs_copy_args *args)
1298 {
1299 	args->src = ctx->src_filp;
1300 	args->dst = dst;
1301 	args->buf = ctx->page;
1302 	args->buf_len = PAGE_SIZE;
1303 	args->seq = ctx->seq;
1304 	args->data_offs = ctx->data_offs;
1305 	args->inum = ctx->inum;
1306 }
1307 
/*
 * Write @len bytes from @buf to the remote file @filp at @pos.
 *
 * Builds a synchronous kiocb/iov_iter and uses the unchecked remote
 * write path. Returns 0 on a full write, or a negative errno (the
 * short-write error from the write path, or -EFAULT if the write
 * returned a non-negative count smaller than @len).
 */
static ssize_t hmdfs_write_dst(struct hmdfs_peer *conn, struct file *filp,
			       void *buf, size_t len, loff_t pos)
{
	struct kiocb kiocb;
	struct iovec iov;
	struct iov_iter iter;
	ssize_t wr;
	int err = 0;

	/* Take write access on the superblock for the duration of the write */
	file_start_write(filp);

	init_sync_kiocb(&kiocb, filp);
	kiocb.ki_pos = pos;

	iov.iov_base = buf;
	iov.iov_len = len;
	iov_iter_init(&iter, WRITE, &iov, 1, len);

	wr = hmdfs_file_write_iter_remote_nocheck(&kiocb, &iter);

	file_end_write(filp);

	if (wr != len) {
		struct hmdfs_inode_info *info = hmdfs_i(file_inode(filp));

		hmdfs_err("peer 0x%x:0x%llx ino 0x%llx short write ret %zd exp %zu",
			  conn->owner, conn->device_id, info->remote_ino,
			  wr, len);
		err = wr < 0 ? (int)wr : -EFAULT;
	}

	return err;
}
1341 
/*
 * Copy one buffer-load from the stash file to the remote file.
 *
 * Reads up to buf_len bytes from ctx->src_pos and writes them at
 * ctx->dst_pos. Sets ctx->eof when the source is exhausted and
 * ctx->copied to the number of bytes transferred. Returns 0 or a
 * negative errno; -ESHUTDOWN if the write failed because the peer
 * went offline.
 */
static int hmdfs_rd_src_wr_dst(struct hmdfs_peer *conn,
			       struct hmdfs_copy_ctx *ctx)
{
	const struct hmdfs_copy_args *args = NULL;
	int err = 0;
	loff_t rd_pos;
	ssize_t rd;

	ctx->eof = false;
	ctx->copied = 0;

	args = &ctx->args;
	rd_pos = ctx->src_pos;
	rd = kernel_read(args->src, args->buf, args->buf_len, &rd_pos);
	if (rd < 0) {
		err = (int)rd;
		hmdfs_err("peer 0x%x:0x%llx ino 0x%llx short read err %d",
			  conn->owner, conn->device_id, args->inum, err);
		goto out;
	} else if (rd == 0) {
		/* End of stash data */
		ctx->eof = true;
		goto out;
	}

	err = hmdfs_write_dst(conn, args->dst, args->buf, rd, ctx->dst_pos);
	if (!err)
		ctx->copied = rd;
	else if (hmdfs_is_node_offlined(conn, args->seq))
		err = -ESHUTDOWN;
out:
	return err;
}
1374 
/*
 * Copy all stashed data back to the remote file, skipping holes.
 *
 * Starting at the data area (data_offs blocks into the stash file),
 * repeatedly seeks the next data extent with SEEK_DATA, copies it one
 * buffer at a time, and stops at EOF, at the end of the source, or at
 * -ENXIO (no more data past seek_pos). The destination offset is the
 * source offset minus the data area start, so data lands at its
 * original position in the remote file. Returns 0 or a negative errno.
 */
static int hmdfs_copy_src_to_dst(struct hmdfs_peer *conn,
				 const struct hmdfs_copy_args *args)
{
	int err = 0;
	struct file *src = NULL;
	struct hmdfs_copy_ctx ctx;
	loff_t seek_pos, data_init_pos;
	loff_t src_size;

	ctx.args = *args;

	src = ctx.args.src;
	data_init_pos = (loff_t)ctx.args.data_offs << HMDFS_STASH_BLK_SHIFT;
	seek_pos = data_init_pos;
	src_size = i_size_read(file_inode(src));
	while (true) {
		loff_t data_pos;

		data_pos = vfs_llseek(src, seek_pos, SEEK_DATA);
		if (data_pos > seek_pos) {
			/* Skipped over a hole; retry from the data start */
			seek_pos = data_pos;
			continue;
		} else if (data_pos < 0) {
			if (data_pos == -ENXIO) {
				/* No data beyond seek_pos: normal completion */
				loff_t src_blks = file_inode(src)->i_blocks;

				hmdfs_info("peer 0x%x:0x%llx ino 0x%llx end at 0x%llx (sz 0x%llx blk 0x%llx)",
					   conn->owner, conn->device_id,
					   args->inum, seek_pos,
					   src_size, src_blks);
			} else {
				err = (int)data_pos;
				hmdfs_err("peer 0x%x:0x%llx ino 0x%llx seek pos 0x%llx err %d",
					  conn->owner, conn->device_id,
					  args->inum, seek_pos, err);
			}
			break;
		}

		hmdfs_debug("peer 0x%x:0x%llx ino 0x%llx seek to 0x%llx",
			    conn->owner, conn->device_id, args->inum, data_pos);

		ctx.src_pos = data_pos;
		ctx.dst_pos = data_pos - data_init_pos;
		err = hmdfs_rd_src_wr_dst(conn, &ctx);
		if (err || ctx.eof)
			break;

		seek_pos += ctx.copied;
		if (seek_pos >= src_size)
			break;
	}

	return err;
}
1430 
/*
 * Restore stashed data into the remote file @dst and fsync it.
 *
 * On any failure the destination's page cache is truncated so no
 * partially-restored pages survive; the stash file's cache pages are
 * dropped in all cases since they are no longer needed. Returns 0 or
 * a negative errno (-ESHUTDOWN if the peer went offline mid-way).
 */
static int hmdfs_restore_src_to_dst(struct hmdfs_file_restore_ctx *ctx,
				    struct file *dst)
{
	struct file *src = ctx->src_filp;
	struct hmdfs_copy_args args;
	int err;

	hmdfs_init_copy_args(ctx, dst, &args);
	err = hmdfs_copy_src_to_dst(ctx->conn, &args);
	if (err)
		goto out;

	err = vfs_fsync(dst, 0);
	if (err) {
		hmdfs_err("fsync remote file ino 0x%llx err %d", ctx->inum, err);
		if (hmdfs_is_node_offlined(ctx->conn, ctx->seq))
			err = -ESHUTDOWN;
	}

out:
	if (err)
		truncate_inode_pages(file_inode(dst)->i_mapping, 0);

	/* Remove the unnecessary cache */
	invalidate_mapping_pages(file_inode(src)->i_mapping, 0, -1);

	return err;
}
1459 
1460 
/*
 * Restore a single stashed file back to its remote peer.
 *
 * The inode must already be pinned in HMDFS_REMOTE_INODE_RESTORING
 * state. On -ESHUTDOWN (peer went offline again) the stash file and
 * the pinned state are kept so restore can be retried next online;
 * on any other outcome the pinned inode is reset. ctx->keep reports
 * whether the stash file must be preserved. Returns 0 or a negative
 * errno.
 */
static int hmdfs_restore_file(struct hmdfs_file_restore_ctx *ctx)
{
	struct hmdfs_peer *conn = ctx->conn;
	uint64_t inum = ctx->inum;
	struct hmdfs_inode_info *pinned_info = NULL;
	struct file *dst_filp = NULL;
	int err = 0;
	bool keep = false;

	hmdfs_info("peer 0x%x:0x%llx ino 0x%llx do restore",
		   conn->owner, conn->device_id, inum);

	pinned_info = hmdfs_lookup_stash_inode(conn, inum);
	if (pinned_info) {
		unsigned int status = READ_ONCE(pinned_info->stash_status);

		if (status != HMDFS_REMOTE_INODE_RESTORING) {
			hmdfs_err("peer 0x%x:0x%llx ino 0x%llx invalid status %u",
				  conn->owner, conn->device_id, inum, status);
			err = -EINVAL;
			goto clean;
		}
	} else {
		hmdfs_warning("peer 0x%x:0x%llx ino 0x%llx doesn't being pinned",
			      conn->owner, conn->device_id, inum);
		err = -EINVAL;
		goto clean;
	}

	/* Force a fresh remote open instead of reusing a stale fid */
	set_bit(HMDFS_FID_NEED_OPEN, &pinned_info->fid_flags);
	err = hmdfs_open_restore_dst_file(ctx, O_RDWR, &dst_filp);
	if (err) {
		if (err == -ESHUTDOWN)
			keep = true;
		goto clean;
	}

	/* Abort if the opened inode is not the one that was pinned */
	if (hmdfs_need_abort_restore(ctx, pinned_info, dst_filp))
		goto abort;

	err = hmdfs_restore_src_to_dst(ctx, dst_filp);
	if (err == -ESHUTDOWN)
		keep = true;
abort:
	fput(dst_filp);
clean:
	if (pinned_info && !keep)
		hmdfs_reset_stashed_inode(conn, pinned_info);
	ctx->keep = keep;

	hmdfs_info("peer 0x%x:0x%llx ino 0x%llx restore err %d keep %d",
		   conn->owner, conn->device_id, inum, err, ctx->keep);

	return err;
}
1516 
/*
 * Initialize a restore context: resolve the destination root path and
 * allocate the path buffer (ctx->dst) and the copy scratch page.
 *
 * On success the caller owns the path reference and both buffers and
 * must release them via hmdfs_exit_file_restore_ctx(). Returns 0 or a
 * negative errno; on failure everything acquired so far is released.
 */
static int hmdfs_init_file_restore_ctx(struct hmdfs_peer *conn,
				       unsigned int seq, struct path *src_dir,
				       struct hmdfs_file_restore_ctx *ctx)
{
	struct hmdfs_sb_info *sbi = conn->sbi;
	struct path dst_root;
	char *dst = NULL;
	char *page = NULL;
	int err = 0;

	err = hmdfs_get_path_in_sb(sbi->sb, sbi->real_dst, LOOKUP_DIRECTORY,
				   &dst_root);
	if (err)
		return err;

	dst = kmalloc(PATH_MAX, GFP_KERNEL);
	if (!dst) {
		err = -ENOMEM;
		goto put_path;
	}

	page = kmalloc(PAGE_SIZE, GFP_KERNEL);
	if (!page) {
		err = -ENOMEM;
		goto free_dst;
	}

	ctx->conn = conn;
	ctx->src_dir_path = *src_dir;
	ctx->dst_root_path = dst_root;
	ctx->dst = dst;
	ctx->page = page;
	ctx->seq = seq;

	return 0;
free_dst:
	kfree(dst);
put_path:
	path_put(&dst_root);
	return err;
}
1558 
hmdfs_exit_file_restore_ctx(struct hmdfs_file_restore_ctx *ctx)1559 static void hmdfs_exit_file_restore_ctx(struct hmdfs_file_restore_ctx *ctx)
1560 {
1561 	path_put(&ctx->dst_root_path);
1562 	kfree(ctx->dst);
1563 	kfree(ctx->page);
1564 }
1565 
/*
 * Look up and open the stash file @name inside stash directory @p_path.
 *
 * The lookup is done under the parent's inode lock; the file must be a
 * regular file. Returns the opened file (read-only) or an ERR_PTR:
 * -ENOENT if missing, -EINVAL if the entry is not a regular file, or
 * the lookup/open error.
 */
static struct file *hmdfs_open_stash_file(struct path *p_path, char *name)
{
	struct dentry *parent = NULL;
	struct inode *dir = NULL;
	struct dentry *child = NULL;
	struct file *filp = NULL;
	struct path c_path;
	int err = 0;

	parent = p_path->dentry;
	dir = d_inode(parent);
	inode_lock_nested(dir, I_MUTEX_PARENT);
	child = lookup_one_len(name, parent, strlen(name));
	if (!IS_ERR(child) && !hmdfs_is_reg(child)) {
		if (d_is_positive(child)) {
			/* Exists but is not a regular file */
			hmdfs_err("invalid stash file (mode 0%o)",
				  d_inode(child)->i_mode);
			err = -EINVAL;
		} else {
			hmdfs_err("missing stash file");
			err = -ENOENT;
		}
		dput(child);
	} else if (IS_ERR(child)) {
		err = PTR_ERR(child);
		hmdfs_err("lookup stash file err %d", err);
	}
	inode_unlock(dir);

	if (err)
		return ERR_PTR(err);

	/* Open on the same mount as the stash dir; drop the lookup ref after */
	c_path.mnt = p_path->mnt;
	c_path.dentry = child;
	filp = dentry_open(&c_path, O_RDONLY | O_LARGEFILE, current_cred());
	if (IS_ERR(filp))
		hmdfs_err("open stash file err %d", (int)PTR_ERR(filp));

	dput(child);

	return filp;
}
1608 
/*
 * Account one restore attempt: success, kept-for-retry, or failure
 * (with the page counts for success/failure).
 */
static void hmdfs_update_restore_stats(struct hmdfs_restore_stats *stats,
				       bool keep, uint64_t pages, int err)
{
	if (err) {
		if (keep) {
			stats->keep++;
		} else {
			stats->fail++;
			stats->fail_pages += pages;
		}
		return;
	}

	stats->succeed++;
	stats->ok_pages += pages;
}
1622 
/*
 * Stash-dir iteration callback: restore every inode listed in @tbl.
 *
 * Failures on individual files are counted in @priv (struct
 * hmdfs_restore_stats) and do not stop the loop; only -ESHUTDOWN
 * (peer offline again) aborts early. Stash files are deleted unless
 * ctx.keep asked to preserve them for a retry. Returns 0 or
 * -ESHUTDOWN.
 */
static int hmdfs_restore_files(struct hmdfs_peer *conn,
			       unsigned int seq, struct path *dir,
			       const struct hmdfs_inode_tbl *tbl,
			       void *priv)
{
	unsigned int i;
	struct hmdfs_file_restore_ctx ctx;
	int err = 0;
	struct hmdfs_restore_stats *stats = priv;

	err = hmdfs_init_file_restore_ctx(conn, seq, dir, &ctx);
	if (err)
		return err;

	for (i = 0; i < tbl->cnt; i++) {
		char name[HMDFS_STASH_FILE_NAME_LEN];
		struct file *filp = NULL;

		/* Stash files are named by hex inode number */
		snprintf(name, sizeof(name), "0x%llx", tbl->inodes[i]);
		filp = hmdfs_open_stash_file(dir, name);
		/* Continue to restore if any error */
		if (IS_ERR(filp)) {
			stats->fail++;
			continue;
		}

		ctx.inum = tbl->inodes[i];
		ctx.src_filp = filp;
		ctx.keep = false;
		ctx.pages = 0;
		err = hmdfs_restore_file(&ctx);
		hmdfs_update_restore_stats(stats, ctx.keep, ctx.pages, err);

		/* Only keep the stash file if a retry is expected */
		if (!ctx.keep)
			hmdfs_del_stash_file(dir->dentry,
					     file_dentry(ctx.src_filp));
		fput(ctx.src_filp);

		/* Continue to restore */
		if (err == -ESHUTDOWN)
			break;
		err = 0;
	}

	hmdfs_exit_file_restore_ctx(&ctx);

	return err;
}
1671 
hmdfs_is_valid_stash_status(struct hmdfs_inode_info *inode_info, uint64_t ino)1672 static bool hmdfs_is_valid_stash_status(struct hmdfs_inode_info *inode_info,
1673 					uint64_t ino)
1674 {
1675 	return (inode_info->inode_type == HMDFS_LAYER_OTHER_REMOTE &&
1676 		inode_info->stash_status == HMDFS_REMOTE_INODE_RESTORING &&
1677 		inode_info->remote_ino == ino);
1678 }
1679 
/*
 * Stash-dir iteration callback: re-pin every stashed inode after a
 * remount/reboot by opening its remote counterpart read-only.
 *
 * Opening the remote file pulls the inode back into memory where it
 * should end up in RESTORING state; each entry is counted in @priv
 * (struct hmdfs_rebuild_stats) as succeed/fail/invalid. Only
 * -ESHUTDOWN aborts the loop. Returns 0 or -ESHUTDOWN.
 */
static int hmdfs_rebuild_stash_list(struct hmdfs_peer *conn,
				    unsigned int seq,
				    struct path *dir,
				    const struct hmdfs_inode_tbl *tbl,
				    void *priv)
{
	struct hmdfs_file_restore_ctx ctx;
	unsigned int i;
	int err;
	struct hmdfs_rebuild_stats *stats = priv;

	err = hmdfs_init_file_restore_ctx(conn, seq, dir, &ctx);
	if (err)
		return err;

	stats->total += tbl->cnt;

	for (i = 0; i < tbl->cnt; i++) {
		char name[HMDFS_STASH_FILE_NAME_LEN];
		struct file *src_filp = NULL;
		struct file *dst_filp = NULL;
		struct hmdfs_inode_info *inode_info = NULL;
		bool is_valid = true;

		snprintf(name, sizeof(name), "0x%llx", tbl->inodes[i]);
		src_filp = hmdfs_open_stash_file(dir, name);
		if (IS_ERR(src_filp)) {
			stats->fail++;
			continue;
		}
		ctx.inum = tbl->inodes[i];
		ctx.src_filp = src_filp;

		/* No need to track the open which only needs meta info */
		err = hmdfs_open_restore_dst_file(&ctx, O_RDONLY, &dst_filp);
		if (err) {
			fput(src_filp);
			if (err == -ESHUTDOWN)
				break;
			stats->fail++;
			err = 0;
			continue;
		}

		/* The opened inode should now be pinned in RESTORING state */
		inode_info = hmdfs_i(file_inode(dst_filp));
		is_valid = hmdfs_is_valid_stash_status(inode_info,
						       ctx.inum);
		if (is_valid) {
			stats->succeed++;
		} else {
			hmdfs_err("peer 0x%x:0x%llx inode 0x%llx invalid state: type: %d, status: %u, inode: %llu",
				  conn->owner, conn->device_id, ctx.inum,
				  inode_info->inode_type,
				  READ_ONCE(inode_info->stash_status),
				  inode_info->remote_ino);
			stats->invalid++;
		}

		fput(ctx.src_filp);
		fput(dst_filp);
	}

	hmdfs_exit_file_restore_ctx(&ctx);
	return err;
}
1745 
/*
 * Iterate over a peer's stash directory, batching stash file inode
 * numbers into a page-sized table and invoking @op on each batch.
 *
 * The dir_context position persists across iterate_dir() calls, so
 * successive passes continue where the previous batch ended; the loop
 * stops when a pass yields no entries or @op returns an error.
 * Returns 0 or a negative errno.
 */
static int hmdfs_iter_stash_file(struct hmdfs_peer *conn,
				 unsigned int seq,
				 struct file *filp,
				 stash_operation_func op,
				 void *priv)
{
	int err = 0;
	struct hmdfs_stash_dir_context ctx = {
		.dctx.actor = hmdfs_fill_stash_file,
	};
	struct hmdfs_inode_tbl *tbl = NULL;
	struct path dir;

	err = hmdfs_new_inode_tbl(&tbl);
	if (err)
		goto out;

	dir.mnt = filp->f_path.mnt;
	dir.dentry = file_dentry(filp);

	ctx.tbl = tbl;
	ctx.dctx.pos = 0;
	do {
		/* Reset the batch; dctx.pos keeps the directory position */
		tbl->cnt = 0;
		err = iterate_dir(filp, &ctx.dctx);
		if (err || !tbl->cnt) {
			if (err)
				hmdfs_err("iterate stash dir err %d", err);
			break;
		}
		err = op(conn, seq, &dir, tbl, priv);
	} while (!err);

out:
	kfree(tbl);
	return err;
}
1783 
/*
 * Workqueue function: check whether the peer's stash directory holds
 * any stash files and, if so, mark the peer as needing a stash-list
 * rebuild. Completes @work->done when finished so the scheduler of the
 * on-stack work item can return.
 */
static void hmdfs_rebuild_check_work_fn(struct work_struct *base)
{
	struct hmdfs_check_work *work =
		container_of(base, struct hmdfs_check_work, work);
	struct hmdfs_peer *conn = work->conn;
	struct hmdfs_sb_info *sbi = conn->sbi;
	struct file *filp = NULL;
	const struct cred *old_cred = NULL;
	struct hmdfs_stash_dir_context ctx = {
		.dctx.actor = hmdfs_has_stash_file,
	};
	struct hmdfs_inode_tbl tbl;
	int err;

	/* Use the sb's credentials for the lookup/open/iterate below */
	old_cred = hmdfs_override_creds(sbi->cred);
	filp = hmdfs_open_stash_dir(&sbi->stash_work_dir, conn->cid);
	if (IS_ERR(filp))
		goto out;

	memset(&tbl, 0, sizeof(tbl));
	ctx.tbl = &tbl;
	/* The actor stops at the first stash file and bumps tbl.cnt */
	err = iterate_dir(filp, &ctx.dctx);
	if (!err && ctx.tbl->cnt > 0)
		conn->need_rebuild_stash_list = true;

	fput(filp);
out:
	hmdfs_revert_creds(old_cred);
	hmdfs_info("peer 0x%x:0x%llx %sneed to rebuild stash list",
		   conn->owner, conn->device_id,
		   conn->need_rebuild_stash_list ? "" : "don't ");
	complete(&work->done);
}
1817 
hmdfs_stash_add_do_check(struct hmdfs_peer *conn, int evt, unsigned int seq)1818 static void hmdfs_stash_add_do_check(struct hmdfs_peer *conn, int evt,
1819 				     unsigned int seq)
1820 {
1821 	struct hmdfs_sb_info *sbi = conn->sbi;
1822 	struct hmdfs_check_work work = {
1823 		.conn = conn,
1824 		.done = COMPLETION_INITIALIZER_ONSTACK(work.done),
1825 	};
1826 
1827 	if (!hmdfs_is_stash_enabled(sbi))
1828 		return;
1829 
1830 	INIT_WORK_ONSTACK(&work.work, hmdfs_rebuild_check_work_fn);
1831 	schedule_work(&work.work);
1832 	wait_for_completion(&work.done);
1833 }
1834 
1835 static void
hmdfs_update_peer_rebuild_stats(struct hmdfs_rebuild_statistics *rebuild_stats, const struct hmdfs_rebuild_stats *stats)1836 hmdfs_update_peer_rebuild_stats(struct hmdfs_rebuild_statistics *rebuild_stats,
1837 				const struct hmdfs_rebuild_stats *stats)
1838 {
1839 	rebuild_stats->cur_ok = stats->succeed;
1840 	rebuild_stats->cur_fail = stats->fail;
1841 	rebuild_stats->cur_invalid = stats->invalid;
1842 	rebuild_stats->total_ok += stats->succeed;
1843 	rebuild_stats->total_fail += stats->fail;
1844 	rebuild_stats->total_invalid += stats->invalid;
1845 }
1846 
1847 /* rebuild stash inode list */
static void hmdfs_stash_online_prepare(struct hmdfs_peer *conn, int evt,
				       unsigned int seq)
{
	struct hmdfs_sb_info *sbi = conn->sbi;
	struct file *filp = NULL;
	const struct cred *old_cred = NULL;
	int err;
	struct hmdfs_rebuild_stats stats;

	if (!hmdfs_is_stash_enabled(sbi) ||
	    !conn->need_rebuild_stash_list)
		return;

	/* release seq_lock to prevent blocking no-online sync cb */
	mutex_unlock(&conn->seq_lock);
	old_cred = hmdfs_override_creds(sbi->cred);
	filp = hmdfs_open_stash_dir(&sbi->stash_work_dir, conn->cid);
	if (IS_ERR(filp))
		goto out;

	memset(&stats, 0, sizeof(stats));
	err = hmdfs_iter_stash_file(conn, seq, filp,
				    hmdfs_rebuild_stash_list, &stats);
	if (err == -ESHUTDOWN) {
		/* Peer went offline again: keep the rebuild flag for retry */
		hmdfs_info("peer 0x%x:0x%llx offline again during rebuild",
			   conn->owner, conn->device_id);
	} else {
		WRITE_ONCE(conn->need_rebuild_stash_list, false);
		if (err)
			hmdfs_warning("partial rebuild fail err %d", err);
	}

	hmdfs_update_peer_rebuild_stats(&conn->stats.rebuild, &stats);
	hmdfs_info("peer 0x%x:0x%llx rebuild stashed-file total %u succeed %u fail %u invalid %u",
		   conn->owner, conn->device_id, stats.total, stats.succeed,
		   stats.fail, stats.invalid);
	fput(filp);
out:
	conn->stats.rebuild.time++;
	hmdfs_revert_creds(old_cred);
	if (!READ_ONCE(conn->need_rebuild_stash_list)) {
		/*
		 * Use smp_mb__before_atomic() to ensure order between
		 * writing @conn->need_rebuild_stash_list and
		 * reading conn->rebuild_inode_status_nr.
		 */
		smp_mb__before_atomic();
		/*
		 * Wait until all inodes finish rebuilding stash status before
		 * accessing @conn->stashed_inode_list in restoring.
		 */
		wait_event(conn->rebuild_inode_status_wq,
			   !atomic_read(&conn->rebuild_inode_status_nr));
	}
	/* Reacquire seq_lock before returning to the caller (which holds it) */
	mutex_lock(&conn->seq_lock);
}
1904 
1905 static void
hmdfs_update_peer_restore_stats(struct hmdfs_restore_statistics *restore_stats, const struct hmdfs_restore_stats *stats)1906 hmdfs_update_peer_restore_stats(struct hmdfs_restore_statistics *restore_stats,
1907 				const struct hmdfs_restore_stats *stats)
1908 {
1909 	restore_stats->cur_ok = stats->succeed;
1910 	restore_stats->cur_fail = stats->fail;
1911 	restore_stats->cur_keep = stats->keep;
1912 	restore_stats->total_ok += stats->succeed;
1913 	restore_stats->total_fail += stats->fail;
1914 	restore_stats->total_keep += stats->keep;
1915 	restore_stats->ok_pages += stats->ok_pages;
1916 	restore_stats->fail_pages += stats->fail_pages;
1917 }
1918 
/*
 * Node online callback: restore all stashed files for @conn.
 *
 * Skipped when stashing is disabled or the peer still needs its stash
 * list rebuilt. conn->seq_lock (held by the caller) is dropped for the
 * duration of the restore to avoid blocking other sync callbacks, and
 * reacquired before returning. Unless the peer went offline again
 * (-ESHUTDOWN), any still-pinned stashed inodes are dropped afterwards.
 */
static void hmdfs_stash_online_do_restore(struct hmdfs_peer *conn, int evt,
					  unsigned int seq)
{
	struct hmdfs_sb_info *sbi = conn->sbi;
	struct file *filp = NULL;
	const struct cred *old_cred = NULL;
	struct hmdfs_restore_stats stats;
	int err = 0;

	if (!hmdfs_is_stash_enabled(sbi) || conn->need_rebuild_stash_list) {
		if (conn->need_rebuild_stash_list)
			hmdfs_info("peer 0x%x:0x%llx skip restoring due to rebuild-need",
				   conn->owner, conn->device_id);
		return;
	}

	/* release seq_lock to prevent blocking no-online sync cb */
	mutex_unlock(&conn->seq_lock);
	/* For dir iteration, file read and unlink */
	old_cred = hmdfs_override_creds(conn->sbi->cred);

	memset(&stats, 0, sizeof(stats));
	filp = hmdfs_open_stash_dir(&sbi->stash_work_dir, conn->cid);
	if (IS_ERR(filp)) {
		err = PTR_ERR(filp);
		goto out;
	}

	err = hmdfs_iter_stash_file(conn, seq, filp,
				    hmdfs_restore_files, &stats);

	fput(filp);
out:
	hmdfs_revert_creds(old_cred);

	/* offline again ? */
	if (err != -ESHUTDOWN)
		hmdfs_drop_stashed_inodes(conn);

	hmdfs_update_peer_restore_stats(&conn->stats.restore, &stats);
	hmdfs_info("peer 0x%x:0x%llx restore stashed-file ok %u fail %u keep %u",
		   conn->owner, conn->device_id,
		   stats.succeed, stats.fail, stats.keep);

	mutex_lock(&conn->seq_lock);
}
1965 
/*
 * Peer-delete (sync) node-event callback: release every inode still tied
 * to @conn's stash machinery — both inodes whose stash was still being
 * prepared and inodes pinned on the stashed-inode list.
 */
static void hmdfs_stash_del_do_cleanup(struct hmdfs_peer *conn, int evt,
				       unsigned int seq)
{
	struct hmdfs_inode_info *info = NULL;
	struct hmdfs_inode_info *next = NULL;
	unsigned int preparing;

	if (!hmdfs_is_stash_enabled(conn->sbi))
		return;

	/* Async cb is cancelled */
	preparing = 0;
	list_for_each_entry_safe(info, next, &conn->wr_opened_inode_list,
				 wr_opened_node) {
		int status = READ_ONCE(info->stash_status);

		if (status == HMDFS_REMOTE_INODE_STASHING) {
			struct hmdfs_cache_info *cache = NULL;

			/* Detach the cache under stash_lock, reset status */
			spin_lock(&info->stash_lock);
			cache = info->cache;
			info->cache = NULL;
			info->stash_status = HMDFS_REMOTE_INODE_NONE;
			spin_unlock(&info->stash_lock);

			hmdfs_remote_del_wr_opened_inode(conn, info);
			hmdfs_del_file_cache(cache);
			/* put inode after all access are completed */
			iput(&info->vfs_inode);
			preparing++;
		}
	}
	hmdfs_info("release %u preparing inodes", preparing);

	hmdfs_info("release %u pinned inodes", conn->stashed_inode_nr);
	if (list_empty(&conn->stashed_inode_list))
		return;

	/* Untrack (and thereby release) every remaining stashed inode */
	list_for_each_entry_safe(info, next,
				 &conn->stashed_inode_list, stash_node)
		hmdfs_untrack_stashed_inode(conn, info);
}
2008 
hmdfs_exit_stash(struct hmdfs_sb_info *sbi)2009 void hmdfs_exit_stash(struct hmdfs_sb_info *sbi)
2010 {
2011 	if (!sbi->s_offline_stash)
2012 		return;
2013 
2014 	if (sbi->stash_work_dir.dentry) {
2015 		path_put(&sbi->stash_work_dir);
2016 		sbi->stash_work_dir.dentry = NULL;
2017 	}
2018 }
2019 
hmdfs_init_stash(struct hmdfs_sb_info *sbi)2020 int hmdfs_init_stash(struct hmdfs_sb_info *sbi)
2021 {
2022 	int err = 0;
2023 	struct path parent;
2024 	struct dentry *child = NULL;
2025 
2026 	if (!sbi->s_offline_stash)
2027 		return 0;
2028 
2029 	err = kern_path(sbi->cache_dir, LOOKUP_FOLLOW | LOOKUP_DIRECTORY,
2030 			&parent);
2031 	if (err) {
2032 		hmdfs_err("invalid cache dir err %d", err);
2033 		goto out;
2034 	}
2035 
2036 	child = hmdfs_stash_new_work_dir(parent.dentry);
2037 	if (!IS_ERR(child)) {
2038 		sbi->stash_work_dir.mnt = mntget(parent.mnt);
2039 		sbi->stash_work_dir.dentry = child;
2040 	} else {
2041 		err = PTR_ERR(child);
2042 		hmdfs_err("create stash work dir err %d", err);
2043 	}
2044 
2045 	path_put(&parent);
2046 out:
2047 	return err;
2048 }
2049 
/*
 * Write one dirty page described by @ctx into the local stash file of
 * @info, at the file offset matching the page index (shifted past the
 * stash header/path area via cache->data_offs).
 *
 * Returns 0 on success, -EIO on a short or failed write.
 */
static int hmdfs_stash_write_local_file(struct hmdfs_peer *conn,
					struct hmdfs_inode_info *info,
					struct hmdfs_writepage_context *ctx,
					struct hmdfs_cache_info *cache)
{
	struct page *page = ctx->page;
	const struct cred *old_cred = NULL;
	void *buf = NULL;
	loff_t pos;
	unsigned int flags;
	ssize_t written;
	int err = 0;

	buf = kmap(page);
	/* Byte offset of this page's data within the original file */
	pos = (loff_t)page->index << PAGE_SHIFT;
	/* enable NOFS for memory allocation */
	flags = memalloc_nofs_save();
	old_cred = hmdfs_override_creds(conn->sbi->cred);
	/* Skip the stash header + path blocks at the head of the file */
	pos += cache->data_offs << HMDFS_STASH_BLK_SHIFT;
	written = kernel_write(cache->cache_file, buf, ctx->count, &pos);
	hmdfs_revert_creds(old_cred);
	memalloc_nofs_restore(flags);
	kunmap(page);

	/* A short write is treated the same as a failed one */
	if (written != ctx->count) {
		hmdfs_err("stash peer 0x%x:0x%llx ino 0x%llx page 0x%lx data_offs 0x%x len %u err %zd",
			  conn->owner, conn->device_id, info->remote_ino,
			  page->index, cache->data_offs, ctx->count, written);
		err = -EIO;
	}

	return err;
}
2083 
/*
 * Divert a remote writepage into the local stash file.
 *
 * On success the writepage context is completed and freed here (including
 * the caller task reference); on failure @ctx remains owned by the caller.
 * NOTE(review): to_write_pgs is incremented even when the write fails —
 * presumably it counts attempts rather than successes; written_pgs counts
 * only successes.  Verify against the stats consumers.
 */
int hmdfs_stash_writepage(struct hmdfs_peer *conn,
			  struct hmdfs_writepage_context *ctx)
{
	struct inode *inode = ctx->page->mapping->host;
	struct hmdfs_inode_info *info = hmdfs_i(inode);
	struct hmdfs_cache_info *cache = NULL;
	int err;

	/* e.g. fail to create stash file */
	cache = info->cache;
	if (!cache)
		return -EIO;

	err = hmdfs_stash_write_local_file(conn, info, ctx, cache);
	if (!err) {
		/* Page is safely stashed: complete and release the context */
		hmdfs_client_writepage_done(info, ctx);
		atomic64_inc(&cache->written_pgs);
		put_task_struct(ctx->caller);
		kfree(ctx);
	}
	atomic64_inc(&cache->to_write_pgs);

	return err;
}
2108 
hmdfs_stash_rebuild_status(struct hmdfs_peer *conn, struct inode *inode)2109 static void hmdfs_stash_rebuild_status(struct hmdfs_peer *conn,
2110 				       struct inode *inode)
2111 {
2112 	char *path_str = NULL;
2113 	struct hmdfs_inode_info *info = NULL;
2114 	const struct cred *old_cred = NULL;
2115 	struct path path;
2116 	struct path *stash_path = NULL;
2117 	int err = 0;
2118 
2119 	path_str = kmalloc(HMDFS_STASH_PATH_LEN, GFP_KERNEL);
2120 	if (!path_str) {
2121 		err = -ENOMEM;
2122 		return;
2123 	}
2124 
2125 	info = hmdfs_i(inode);
2126 	err = snprintf(path_str, HMDFS_STASH_PATH_LEN, "%s/0x%llx",
2127 		       conn->cid, info->remote_ino);
2128 	if (err >= HMDFS_STASH_PATH_LEN) {
2129 		kfree(path_str);
2130 		hmdfs_err("peer 0x%x:0x%llx inode 0x%llx too long name len",
2131 			  conn->owner, conn->device_id, info->remote_ino);
2132 		return;
2133 	}
2134 	old_cred = hmdfs_override_creds(conn->sbi->cred);
2135 	stash_path = &conn->sbi->stash_work_dir;
2136 	err = vfs_path_lookup(stash_path->dentry, stash_path->mnt,
2137 			      path_str, 0, &path);
2138 	hmdfs_revert_creds(old_cred);
2139 	if (!err) {
2140 		if (hmdfs_is_reg(path.dentry)) {
2141 			WRITE_ONCE(info->stash_status,
2142 				   HMDFS_REMOTE_INODE_RESTORING);
2143 			ihold(&info->vfs_inode);
2144 			hmdfs_track_inode_locked(conn, info);
2145 		} else {
2146 			hmdfs_info("peer 0x%x:0x%llx inode 0x%llx unexpected stashed file mode 0%o",
2147 				    conn->owner, conn->device_id,
2148 				    info->remote_ino,
2149 				    d_inode(path.dentry)->i_mode);
2150 		}
2151 
2152 		path_put(&path);
2153 	} else if (err && err != -ENOENT) {
2154 		hmdfs_err("peer 0x%x:0x%llx inode 0x%llx find %s err %d",
2155 			   conn->owner, conn->device_id, info->remote_ino,
2156 			   path_str, err);
2157 	}
2158 
2159 	kfree(path_str);
2160 }
2161 
2162 static inline bool
hmdfs_need_rebuild_inode_stash_status(struct hmdfs_peer *conn, umode_t mode)2163 hmdfs_need_rebuild_inode_stash_status(struct hmdfs_peer *conn, umode_t mode)
2164 {
2165 	return hmdfs_is_stash_enabled(conn->sbi) &&
2166 	       READ_ONCE(conn->need_rebuild_stash_list) &&
2167 	       (S_ISREG(mode) || S_ISLNK(mode));
2168 }
2169 
/*
 * Called when a remote inode is instantiated: if the peer's stash list
 * still needs rebuilding, look up this inode's stash file and track it.
 *
 * rebuild_inode_status_nr counts in-flight rebuilds so that the restore
 * path can wait (on rebuild_inode_status_wq) until all of them drain
 * before walking @conn->stashed_inode_list.
 */
void hmdfs_remote_init_stash_status(struct hmdfs_peer *conn,
				    struct inode *inode, umode_t mode)
{
	if (!hmdfs_need_rebuild_inode_stash_status(conn, mode))
		return;

	atomic_inc(&conn->rebuild_inode_status_nr);
	/*
	 * Use smp_mb__after_atomic() to ensure order between writing
	 * @conn->rebuild_inode_status_nr and reading
	 * @conn->need_rebuild_stash_list.
	 */
	smp_mb__after_atomic();
	if (READ_ONCE(conn->need_rebuild_stash_list))
		hmdfs_stash_rebuild_status(conn, inode);
	/* Last one out wakes any waiter in the restore path */
	if (atomic_dec_and_test(&conn->rebuild_inode_status_nr))
		wake_up(&conn->rebuild_inode_status_wq);
}
2188 
/* Node-event callbacks wiring the stash life cycle to peer state changes */
static struct hmdfs_node_cb_desc stash_cb[] = {
	{
		/* Peer going offline: synchronous preparation */
		.evt = NODE_EVT_OFFLINE,
		.sync = true,
		.fn = hmdfs_stash_offline_prepare,
	},
	{
		/* Peer offline: stash dirty pages asynchronously */
		.evt = NODE_EVT_OFFLINE,
		.sync = false,
		.fn = hmdfs_stash_offline_do_stash,
	},
	{
		/* Peer added: synchronous stash-state check */
		.evt = NODE_EVT_ADD,
		.sync = true,
		.fn = hmdfs_stash_add_do_check,
	},
	{
		/* Peer back online: async restore preparation */
		.evt = NODE_EVT_ONLINE,
		.sync = false,
		.fn = hmdfs_stash_online_prepare,
	},
	{
		/* Peer back online: async restore of stashed files */
		.evt = NODE_EVT_ONLINE,
		.sync = false,
		.fn = hmdfs_stash_online_do_restore,
	},
	{
		/* Peer removed: synchronous cleanup of all stash state */
		.evt = NODE_EVT_DEL,
		.sync = true,
		.fn = hmdfs_stash_del_do_cleanup,
	},
};
2221 
/* Register the stash node-event callback table at module init time */
void __init hmdfs_stash_add_node_evt_cb(void)
{
	hmdfs_node_add_evt_cb(stash_cb, ARRAY_SIZE(stash_cb));
}
2226 
2227