1/*
2  FUSE: Filesystem in Userspace
3  Copyright (C) 2001-2007  Miklos Szeredi <miklos@szeredi.hu>
4  Copyright (C) 2017       Nikolaus Rath <Nikolaus@rath.org>
5  Copyright (C) 2018       Valve, Inc
6
7  This program can be distributed under the terms of the GNU GPLv2.
8  See the file COPYING.
9*/
10
11/** @file
12 *
 * This is a "high-performance" version of passthrough_ll.c. While
 * passthrough_ll.c is designed to be as simple as possible, this
 * example is intended to be as efficient and correct as possible.
16 *
 * passthrough_hp.cc mirrors a specified "source" directory under a
 * specified mountpoint with as much fidelity and performance as
 * possible.
20 *
21 * If --nocache is specified, the source directory may be changed
22 * directly even while mounted and the filesystem will continue
23 * to work correctly.
24 *
25 * Without --nocache, the source directory is assumed to be modified
26 * only through the passthrough filesystem. This enables much better
27 * performance, but if changes are made directly to the source, they
28 * may not be immediately visible under the mountpoint and further
29 * access to the mountpoint may result in incorrect behavior,
30 * including data-loss.
31 *
32 * On its own, this filesystem fulfills no practical purpose. It is
33 * intended as a template upon which additional functionality can be
34 * built.
35 *
 * Unless --nocache is specified, it is only possible to write to files
 * for which the mounting user has read permissions. This is because
 * the writeback cache requires the kernel to be able to issue read
 * requests for all files (which the passthrough filesystem cannot
 * satisfy if it can't read the file in the underlying filesystem).
41 *
42 * ## Source code ##
43 * \include passthrough_hp.cc
44 */
45
46#define FUSE_USE_VERSION FUSE_MAKE_VERSION(3, 12)
47
48#ifndef _GNU_SOURCE
49#define _GNU_SOURCE
50#endif
51
52// C includes
53#include <dirent.h>
54#include <err.h>
55#include <errno.h>
56#include <ftw.h>
57#include <fuse_lowlevel.h>
58#include <inttypes.h>
59#include <string.h>
60#include <sys/file.h>
61#include <sys/resource.h>
62#include <sys/xattr.h>
63#include <time.h>
64#include <unistd.h>
65#include <pthread.h>
66#include <limits.h>
67
68// C++ includes
69#include <cstddef>
70#include <cstdio>
71#include <cstdlib>
72#include <list>
73#include "cxxopts.hpp"
74#include <mutex>
75#include <fstream>
76#include <thread>
77#include <iomanip>
78
79using namespace std;
80
81#define SFS_DEFAULT_THREADS "-1" // take libfuse value as default
82#define SFS_DEFAULT_CLONE_FD "0"
83
/* We are re-using pointers to our `struct Inode` and `struct
   DirHandle` elements as inodes and file handles. This means that we
   must be able to store a pointer in both a fuse_ino_t
   variable and a uint64_t variable (used for file handles). */
88static_assert(sizeof(fuse_ino_t) >= sizeof(void*),
89              "void* must fit into fuse_ino_t");
90static_assert(sizeof(fuse_ino_t) >= sizeof(uint64_t),
91              "fuse_ino_t must be at least 64 bits");
92
93
94/* Forward declarations */
95struct Inode;
96static Inode& get_inode(fuse_ino_t ino);
97static void forget_one(fuse_ino_t ino, uint64_t n);
98
// Identifies one file of the source directory tree by its (inode number,
// device number) pair. Since the source directory must not contain any
// mountpoints, the inode number alone would be enough; the device number
// is kept in case that constraint has to be reconsidered (relaxing it
// would mean inode numbers can no longer be re-used, so readdir() would
// need a full lookup() to report the right inode number).
using SrcId = std::pair<ino_t, dev_t>;

// Hash support so that SrcId can be used as an unordered container key
namespace std {
    template<>
    struct hash<SrcId> {
        // XOR-combines the hashes of the two pair components.
        size_t operator()(const SrcId& id) const {
            const size_t ino_hash = hash<ino_t>{}(id.first);
            const size_t dev_hash = hash<dev_t>{}(id.second);
            return ino_hash ^ dev_hash;
        }
    };
}
117
118// Maps files in the source directory tree to inodes
119typedef std::unordered_map<SrcId, Inode> InodeMap;
120
// Bookkeeping record for one file of the source tree. The address of an
// Inode is what gets handed out as fuse_ino_t, so instances must never be
// copied or moved.
struct Inode {
    int fd {-1};            // descriptor of the source file; -1 = not set up,
                            // -ENOENT = released by unlink
    dev_t src_dev {0};      // device number of the source file
    ino_t src_ino {0};      // inode number of the source file
    int generation {0};     // incremented when unlink releases the descriptor
    uint64_t nopen {0};     // count of currently open file handles
    uint64_t nlookup {0};   // lookup count
    std::mutex m;

    Inode() = default;

    // Neither copyable nor movable. Move support could be implemented
    // if it is ever needed.
    Inode(const Inode&) = delete;
    Inode& operator=(const Inode&) = delete;
    Inode(Inode&& inode) = delete;
    Inode& operator=(Inode&& inode) = delete;

    // Closes the descriptor, but only when it holds a positive value.
    ~Inode() {
        if (fd <= 0)
            return;
        close(fd);
    }
};
143
144struct Fs {
145    // Must be acquired *after* any Inode.m locks.
146    std::mutex mutex;
147    InodeMap inodes; // protected by mutex
148    Inode root;
149    double timeout;
150    bool debug;
151    bool debug_fuse;
152    bool foreground;
153    std::string source;
154    size_t blocksize;
155    dev_t src_dev;
156    bool nosplice;
157    bool nocache;
158    size_t num_threads;
159    bool clone_fd;
160    std::string fuse_mount_options;
161};
162static Fs fs{};
163
164
// Copy flags for fuse_reply_data()/fuse_buf_copy(): FUSE_BUF_NO_SPLICE
// when fs.nosplice is set, otherwise no flags at all.
#define FUSE_BUF_COPY_FLAGS \
        (fs.nosplice ? FUSE_BUF_NO_SPLICE : static_cast<fuse_buf_copy_flags>(0))
169
170
171static Inode& get_inode(fuse_ino_t ino) {
172    if (ino == FUSE_ROOT_ID)
173        return fs.root;
174
175    Inode* inode = reinterpret_cast<Inode*>(ino);
176    if(inode->fd == -1) {
177        cerr << "INTERNAL ERROR: Unknown inode " << ino << endl;
178        abort();
179    }
180    return *inode;
181}
182
183
184static int get_fs_fd(fuse_ino_t ino) {
185    int fd = get_inode(ino).fd;
186    return fd;
187}
188
189
190static void sfs_init(void *userdata, fuse_conn_info *conn) {
191    (void)userdata;
192    if (conn->capable & FUSE_CAP_EXPORT_SUPPORT)
193        conn->want |= FUSE_CAP_EXPORT_SUPPORT;
194
195    if (fs.timeout && conn->capable & FUSE_CAP_WRITEBACK_CACHE)
196        conn->want |= FUSE_CAP_WRITEBACK_CACHE;
197
198    if (conn->capable & FUSE_CAP_FLOCK_LOCKS)
199        conn->want |= FUSE_CAP_FLOCK_LOCKS;
200
201    if (fs.nosplice) {
202        // FUSE_CAP_SPLICE_READ is enabled in libfuse3 by default,
203        // see do_init() in in fuse_lowlevel.c
204        // Just unset both, in case FUSE_CAP_SPLICE_WRITE would also get enabled
205        // by default.
206        conn->want &= ~FUSE_CAP_SPLICE_READ;
207        conn->want &= ~FUSE_CAP_SPLICE_WRITE;
208    } else {
209        if (conn->capable & FUSE_CAP_SPLICE_WRITE)
210            conn->want |= FUSE_CAP_SPLICE_WRITE;
211        if (conn->capable & FUSE_CAP_SPLICE_READ)
212            conn->want |= FUSE_CAP_SPLICE_READ;
213    }
214}
215
216
217static void sfs_getattr(fuse_req_t req, fuse_ino_t ino, fuse_file_info *fi) {
218    (void)fi;
219    Inode& inode = get_inode(ino);
220    struct stat attr;
221    auto res = fstatat(inode.fd, "", &attr,
222                       AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
223    if (res == -1) {
224        fuse_reply_err(req, errno);
225        return;
226    }
227    fuse_reply_attr(req, &attr, fs.timeout);
228}
229
230
231static void do_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr,
232                       int valid, struct fuse_file_info* fi) {
233    Inode& inode = get_inode(ino);
234    int ifd = inode.fd;
235    int res;
236
237    if (valid & FUSE_SET_ATTR_MODE) {
238        if (fi) {
239            res = fchmod(fi->fh, attr->st_mode);
240        } else {
241            char procname[64];
242            sprintf(procname, "/proc/self/fd/%i", ifd);
243            res = chmod(procname, attr->st_mode);
244        }
245        if (res == -1)
246            goto out_err;
247    }
248    if (valid & (FUSE_SET_ATTR_UID | FUSE_SET_ATTR_GID)) {
249        uid_t uid = (valid & FUSE_SET_ATTR_UID) ? attr->st_uid : static_cast<uid_t>(-1);
250        gid_t gid = (valid & FUSE_SET_ATTR_GID) ? attr->st_gid : static_cast<gid_t>(-1);
251
252        res = fchownat(ifd, "", uid, gid, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
253        if (res == -1)
254            goto out_err;
255    }
256    if (valid & FUSE_SET_ATTR_SIZE) {
257        if (fi) {
258            res = ftruncate(fi->fh, attr->st_size);
259        } else {
260            char procname[64];
261            sprintf(procname, "/proc/self/fd/%i", ifd);
262            res = truncate(procname, attr->st_size);
263        }
264        if (res == -1)
265            goto out_err;
266    }
267    if (valid & (FUSE_SET_ATTR_ATIME | FUSE_SET_ATTR_MTIME)) {
268        struct timespec tv[2];
269
270        tv[0].tv_sec = 0;
271        tv[1].tv_sec = 0;
272        tv[0].tv_nsec = UTIME_OMIT;
273        tv[1].tv_nsec = UTIME_OMIT;
274
275        if (valid & FUSE_SET_ATTR_ATIME_NOW)
276            tv[0].tv_nsec = UTIME_NOW;
277        else if (valid & FUSE_SET_ATTR_ATIME)
278            tv[0] = attr->st_atim;
279
280        if (valid & FUSE_SET_ATTR_MTIME_NOW)
281            tv[1].tv_nsec = UTIME_NOW;
282        else if (valid & FUSE_SET_ATTR_MTIME)
283            tv[1] = attr->st_mtim;
284
285        if (fi)
286            res = futimens(fi->fh, tv);
287        else {
288#ifdef HAVE_UTIMENSAT
289            char procname[64];
290            sprintf(procname, "/proc/self/fd/%i", ifd);
291            res = utimensat(AT_FDCWD, procname, tv, 0);
292#else
293            res = -1;
294            errno = EOPNOTSUPP;
295#endif
296        }
297        if (res == -1)
298            goto out_err;
299    }
300    return sfs_getattr(req, ino, fi);
301
302out_err:
303    fuse_reply_err(req, errno);
304}
305
306
307static void sfs_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr,
308                        int valid, fuse_file_info *fi) {
309    (void) ino;
310    do_setattr(req, ino, attr, valid, fi);
311}
312
313
314static int do_lookup(fuse_ino_t parent, const char *name,
315                     fuse_entry_param *e) {
316    if (fs.debug)
317        cerr << "DEBUG: lookup(): name=" << name
318             << ", parent=" << parent << endl;
319    memset(e, 0, sizeof(*e));
320    e->attr_timeout = fs.timeout;
321    e->entry_timeout = fs.timeout;
322
323    auto newfd = openat(get_fs_fd(parent), name, O_PATH | O_NOFOLLOW);
324    if (newfd == -1)
325        return errno;
326
327    auto res = fstatat(newfd, "", &e->attr, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
328    if (res == -1) {
329        auto saveerr = errno;
330        close(newfd);
331        if (fs.debug)
332            cerr << "DEBUG: lookup(): fstatat failed" << endl;
333        return saveerr;
334    }
335
336    if (e->attr.st_dev != fs.src_dev) {
337        cerr << "WARNING: Mountpoints in the source directory tree will be hidden." << endl;
338        return ENOTSUP;
339    } else if (e->attr.st_ino == FUSE_ROOT_ID) {
340        cerr << "ERROR: Source directory tree must not include inode "
341             << FUSE_ROOT_ID << endl;
342        return EIO;
343    }
344
345    SrcId id {e->attr.st_ino, e->attr.st_dev};
346    unique_lock<mutex> fs_lock {fs.mutex};
347    Inode* inode_p;
348    try {
349        inode_p = &fs.inodes[id];
350    } catch (std::bad_alloc&) {
351        return ENOMEM;
352    }
353    e->ino = reinterpret_cast<fuse_ino_t>(inode_p);
354    Inode& inode {*inode_p};
355    e->generation = inode.generation;
356
357    if (inode.fd == -ENOENT) { // found unlinked inode
358        if (fs.debug)
359            cerr << "DEBUG: lookup(): inode " << e->attr.st_ino
360                 << " recycled; generation=" << inode.generation << endl;
361	/* fallthrough to new inode but keep existing inode.nlookup */
362    }
363
364    if (inode.fd > 0) { // found existing inode
365        fs_lock.unlock();
366        if (fs.debug)
367            cerr << "DEBUG: lookup(): inode " << e->attr.st_ino
368                 << " (userspace) already known; fd = " << inode.fd << endl;
369        lock_guard<mutex> g {inode.m};
370
371        inode.nlookup++;
372        if (fs.debug)
373            cerr << "DEBUG:" << __func__ << ":" << __LINE__ << " "
374                 <<  "inode " << inode.src_ino
375                 << " count " << inode.nlookup << endl;
376
377
378        close(newfd);
379    } else { // no existing inode
380        /* This is just here to make Helgrind happy. It violates the
381           lock ordering requirement (inode.m must be acquired before
382           fs.mutex), but this is of no consequence because at this
383           point no other thread has access to the inode mutex */
384        lock_guard<mutex> g {inode.m};
385        inode.src_ino = e->attr.st_ino;
386        inode.src_dev = e->attr.st_dev;
387
388        inode.nlookup++;
389        if (fs.debug)
390            cerr << "DEBUG:" << __func__ << ":" << __LINE__ << " "
391                 <<  "inode " << inode.src_ino
392                 << " count " << inode.nlookup << endl;
393
394        inode.fd = newfd;
395        fs_lock.unlock();
396
397        if (fs.debug)
398            cerr << "DEBUG: lookup(): created userspace inode " << e->attr.st_ino
399                 << "; fd = " << inode.fd << endl;
400    }
401
402    return 0;
403}
404
405
406static void sfs_lookup(fuse_req_t req, fuse_ino_t parent, const char *name) {
407    fuse_entry_param e {};
408    auto err = do_lookup(parent, name, &e);
409    if (err == ENOENT) {
410        e.attr_timeout = fs.timeout;
411        e.entry_timeout = fs.timeout;
412        e.ino = e.attr.st_ino = 0;
413        fuse_reply_entry(req, &e);
414    } else if (err) {
415        if (err == ENFILE || err == EMFILE)
416            cerr << "ERROR: Reached maximum number of file descriptors." << endl;
417        fuse_reply_err(req, err);
418    } else {
419        fuse_reply_entry(req, &e);
420    }
421}
422
423
424static void mknod_symlink(fuse_req_t req, fuse_ino_t parent,
425                              const char *name, mode_t mode, dev_t rdev,
426                              const char *link) {
427    int res;
428    Inode& inode_p = get_inode(parent);
429    auto saverr = ENOMEM;
430
431    if (S_ISDIR(mode))
432        res = mkdirat(inode_p.fd, name, mode);
433    else if (S_ISLNK(mode))
434        res = symlinkat(link, inode_p.fd, name);
435    else
436        res = mknodat(inode_p.fd, name, mode, rdev);
437    saverr = errno;
438    if (res == -1)
439        goto out;
440
441    fuse_entry_param e;
442    saverr = do_lookup(parent, name, &e);
443    if (saverr)
444        goto out;
445
446    fuse_reply_entry(req, &e);
447    return;
448
449out:
450    if (saverr == ENFILE || saverr == EMFILE)
451        cerr << "ERROR: Reached maximum number of file descriptors." << endl;
452    fuse_reply_err(req, saverr);
453}
454
455
456static void sfs_mknod(fuse_req_t req, fuse_ino_t parent, const char *name,
457                      mode_t mode, dev_t rdev) {
458    mknod_symlink(req, parent, name, mode, rdev, nullptr);
459}
460
461
462static void sfs_mkdir(fuse_req_t req, fuse_ino_t parent, const char *name,
463                      mode_t mode) {
464    mknod_symlink(req, parent, name, S_IFDIR | mode, 0, nullptr);
465}
466
467
468static void sfs_symlink(fuse_req_t req, const char *link, fuse_ino_t parent,
469                        const char *name) {
470    mknod_symlink(req, parent, name, S_IFLNK, 0, link);
471}
472
473
474static void sfs_link(fuse_req_t req, fuse_ino_t ino, fuse_ino_t parent,
475                     const char *name) {
476    Inode& inode = get_inode(ino);
477    Inode& inode_p = get_inode(parent);
478    fuse_entry_param e {};
479
480    e.attr_timeout = fs.timeout;
481    e.entry_timeout = fs.timeout;
482
483    char procname[64];
484    sprintf(procname, "/proc/self/fd/%i", inode.fd);
485    auto res = linkat(AT_FDCWD, procname, inode_p.fd, name, AT_SYMLINK_FOLLOW);
486    if (res == -1) {
487        fuse_reply_err(req, errno);
488        return;
489    }
490
491    res = fstatat(inode.fd, "", &e.attr, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
492    if (res == -1) {
493        fuse_reply_err(req, errno);
494        return;
495    }
496    e.ino = reinterpret_cast<fuse_ino_t>(&inode);
497    {
498        lock_guard<mutex> g {inode.m};
499        inode.nlookup++;
500        if (fs.debug)
501            cerr << "DEBUG:" << __func__ << ":" << __LINE__ << " "
502                 <<  "inode " << inode.src_ino
503                 << " count " << inode.nlookup << endl;
504    }
505
506    fuse_reply_entry(req, &e);
507    return;
508}
509
510
511static void sfs_rmdir(fuse_req_t req, fuse_ino_t parent, const char *name) {
512    Inode& inode_p = get_inode(parent);
513    lock_guard<mutex> g {inode_p.m};
514    auto res = unlinkat(inode_p.fd, name, AT_REMOVEDIR);
515    fuse_reply_err(req, res == -1 ? errno : 0);
516}
517
518
519static void sfs_rename(fuse_req_t req, fuse_ino_t parent, const char *name,
520                       fuse_ino_t newparent, const char *newname,
521                       unsigned int flags) {
522    Inode& inode_p = get_inode(parent);
523    Inode& inode_np = get_inode(newparent);
524    if (flags) {
525        fuse_reply_err(req, EINVAL);
526        return;
527    }
528
529    auto res = renameat(inode_p.fd, name, inode_np.fd, newname);
530    fuse_reply_err(req, res == -1 ? errno : 0);
531}
532
533
534static void sfs_unlink(fuse_req_t req, fuse_ino_t parent, const char *name) {
535    Inode& inode_p = get_inode(parent);
536    // Release inode.fd before last unlink like nfsd EXPORT_OP_CLOSE_BEFORE_UNLINK
537    // to test reused inode numbers.
538    // Skip this when inode has an open file and when writeback cache is enabled.
539    if (!fs.timeout) {
540	    fuse_entry_param e;
541	    auto err = do_lookup(parent, name, &e);
542	    if (err) {
543		    fuse_reply_err(req, err);
544		    return;
545	    }
546	    if (e.attr.st_nlink == 1) {
547		    Inode& inode = get_inode(e.ino);
548		    lock_guard<mutex> g {inode.m};
549		    if (inode.fd > 0 && !inode.nopen) {
550			    if (fs.debug)
551				    cerr << "DEBUG: unlink: release inode " << e.attr.st_ino
552					    << "; fd=" << inode.fd << endl;
553			    lock_guard<mutex> g_fs {fs.mutex};
554			    close(inode.fd);
555			    inode.fd = -ENOENT;
556			    inode.generation++;
557		    }
558	    }
559
560        // decrease the ref which lookup above had increased
561        forget_one(e.ino, 1);
562    }
563    auto res = unlinkat(inode_p.fd, name, 0);
564    fuse_reply_err(req, res == -1 ? errno : 0);
565}
566
567
568static void forget_one(fuse_ino_t ino, uint64_t n) {
569    Inode& inode = get_inode(ino);
570    unique_lock<mutex> l {inode.m};
571
572    if(n > inode.nlookup) {
573        cerr << "INTERNAL ERROR: Negative lookup count for inode "
574             << inode.src_ino << endl;
575        abort();
576    }
577    inode.nlookup -= n;
578
579    if (fs.debug)
580        cerr << "DEBUG:" << __func__ << ":" << __LINE__ << " "
581             <<  "inode " << inode.src_ino
582             << " count " << inode.nlookup << endl;
583
584    if (!inode.nlookup) {
585        if (fs.debug)
586            cerr << "DEBUG: forget: cleaning up inode " << inode.src_ino << endl;
587        {
588            lock_guard<mutex> g_fs {fs.mutex};
589            l.unlock();
590            fs.inodes.erase({inode.src_ino, inode.src_dev});
591        }
592    } else if (fs.debug)
593            cerr << "DEBUG: forget: inode " << inode.src_ino
594                 << " lookup count now " << inode.nlookup << endl;
595}
596
597static void sfs_forget(fuse_req_t req, fuse_ino_t ino, uint64_t nlookup) {
598    forget_one(ino, nlookup);
599    fuse_reply_none(req);
600}
601
602
603static void sfs_forget_multi(fuse_req_t req, size_t count,
604                             fuse_forget_data *forgets) {
605    for (int i = 0; i < count; i++)
606        forget_one(forgets[i].ino, forgets[i].nlookup);
607    fuse_reply_none(req);
608}
609
610
611static void sfs_readlink(fuse_req_t req, fuse_ino_t ino) {
612    Inode& inode = get_inode(ino);
613    char buf[PATH_MAX + 1];
614    auto res = readlinkat(inode.fd, "", buf, sizeof(buf));
615    if (res == -1)
616        fuse_reply_err(req, errno);
617    else if (res == sizeof(buf))
618        fuse_reply_err(req, ENAMETOOLONG);
619    else {
620        buf[res] = '\0';
621        fuse_reply_readlink(req, buf);
622    }
623}
624
625
// State kept for one open directory. Owns the directory stream and
// closes it on destruction; therefore not copyable.
struct DirHandle {
    DIR *dp {nullptr};   // directory stream, nullptr until opened
    off_t offset {0};    // position of the stream; starts at the beginning

    DirHandle() = default;
    DirHandle(const DirHandle&) = delete;
    DirHandle& operator=(const DirHandle&) = delete;

    ~DirHandle() {
        if(dp)
            closedir(dp);
    }
};
639
640
641static DirHandle *get_dir_handle(fuse_file_info *fi) {
642    return reinterpret_cast<DirHandle*>(fi->fh);
643}
644
645
646static void sfs_opendir(fuse_req_t req, fuse_ino_t ino, fuse_file_info *fi) {
647    Inode& inode = get_inode(ino);
648    auto d = new (nothrow) DirHandle;
649    if (d == nullptr) {
650        fuse_reply_err(req, ENOMEM);
651        return;
652    }
653
654    // Make Helgrind happy - it can't know that there's an implicit
655    // synchronization due to the fact that other threads cannot
656    // access d until we've called fuse_reply_*.
657    lock_guard<mutex> g {inode.m};
658
659    auto fd = openat(inode.fd, ".", O_RDONLY);
660    if (fd == -1)
661        goto out_errno;
662
663    // On success, dir stream takes ownership of fd, so we
664    // do not have to close it.
665    d->dp = fdopendir(fd);
666    if(d->dp == nullptr)
667        goto out_errno;
668
669    d->offset = 0;
670
671    fi->fh = reinterpret_cast<uint64_t>(d);
672    if(fs.timeout) {
673        fi->keep_cache = 1;
674        fi->cache_readdir = 1;
675    }
676    fuse_reply_open(req, fi);
677    return;
678
679out_errno:
680    auto error = errno;
681    delete d;
682    if (error == ENFILE || error == EMFILE)
683        cerr << "ERROR: Reached maximum number of file descriptors." << endl;
684    fuse_reply_err(req, error);
685}
686
687
// True exactly for the two special directory entries "." and "..".
static bool is_dot_or_dotdot(const char *name) {
    if (strcmp(name, ".") == 0)
        return true;
    return strcmp(name, "..") == 0;
}
692
693
694static void do_readdir(fuse_req_t req, fuse_ino_t ino, size_t size,
695                    off_t offset, fuse_file_info *fi, const int plus) {
696    auto d = get_dir_handle(fi);
697    Inode& inode = get_inode(ino);
698    lock_guard<mutex> g {inode.m};
699    char *p;
700    auto rem = size;
701    int err = 0, count = 0;
702
703    if (fs.debug)
704        cerr << "DEBUG: readdir(): started with offset "
705             << offset << endl;
706
707    auto buf = new (nothrow) char[size];
708    if (!buf) {
709        fuse_reply_err(req, ENOMEM);
710        return;
711    }
712    p = buf;
713
714    if (offset != d->offset) {
715        if (fs.debug)
716            cerr << "DEBUG: readdir(): seeking to " << offset << endl;
717        seekdir(d->dp, offset);
718        d->offset = offset;
719    }
720
721    while (1) {
722        struct dirent *entry;
723        errno = 0;
724        entry = readdir(d->dp);
725        if (!entry) {
726            if(errno) {
727                err = errno;
728                if (fs.debug)
729                    warn("DEBUG: readdir(): readdir failed with");
730                goto error;
731            }
732            break; // End of stream
733        }
734        d->offset = entry->d_off;
735        if (is_dot_or_dotdot(entry->d_name))
736            continue;
737
738        fuse_entry_param e{};
739        size_t entsize;
740        if (plus) {
741            err = do_lookup(ino, entry->d_name, &e);
742            if (err)
743                goto error;
744            entsize = fuse_add_direntry_plus(req, p, rem, entry->d_name, &e, entry->d_off);
745        } else {
746            e.attr.st_ino = entry->d_ino;
747            e.attr.st_mode = entry->d_type << 12;
748            entsize = fuse_add_direntry(req, p, rem, entry->d_name, &e.attr, entry->d_off);
749        }
750
751        if (entsize > rem) {
752            if (fs.debug)
753                cerr << "DEBUG: readdir(): buffer full, returning data. " << endl;
754            if (plus)
755                forget_one(e.ino, 1);
756            break;
757        }
758
759        p += entsize;
760        rem -= entsize;
761        count++;
762        if (fs.debug) {
763            cerr << "DEBUG: readdir(): added to buffer: " << entry->d_name
764                 << ", ino " << e.attr.st_ino << ", offset " << entry->d_off << endl;
765        }
766    }
767    err = 0;
768error:
769
770    // If there's an error, we can only signal it if we haven't stored
771    // any entries yet - otherwise we'd end up with wrong lookup
772    // counts for the entries that are already in the buffer. So we
773    // return what we've collected until that point.
774    if (err && rem == size) {
775        if (err == ENFILE || err == EMFILE)
776            cerr << "ERROR: Reached maximum number of file descriptors." << endl;
777        fuse_reply_err(req, err);
778    } else {
779        if (fs.debug)
780            cerr << "DEBUG: readdir(): returning " << count
781                 << " entries, curr offset " << d->offset << endl;
782        fuse_reply_buf(req, buf, size - rem);
783    }
784    delete[] buf;
785    return;
786}
787
788
789static void sfs_readdir(fuse_req_t req, fuse_ino_t ino, size_t size,
790                        off_t offset, fuse_file_info *fi) {
791    // operation logging is done in readdir to reduce code duplication
792    do_readdir(req, ino, size, offset, fi, 0);
793}
794
795
796static void sfs_readdirplus(fuse_req_t req, fuse_ino_t ino, size_t size,
797                            off_t offset, fuse_file_info *fi) {
798    // operation logging is done in readdir to reduce code duplication
799    do_readdir(req, ino, size, offset, fi, 1);
800}
801
802
803static void sfs_releasedir(fuse_req_t req, fuse_ino_t ino, fuse_file_info *fi) {
804    (void) ino;
805    auto d = get_dir_handle(fi);
806    delete d;
807    fuse_reply_err(req, 0);
808}
809
810
811static void sfs_create(fuse_req_t req, fuse_ino_t parent, const char *name,
812                       mode_t mode, fuse_file_info *fi) {
813    Inode& inode_p = get_inode(parent);
814
815    auto fd = openat(inode_p.fd, name,
816                     (fi->flags | O_CREAT) & ~O_NOFOLLOW, mode);
817    if (fd == -1) {
818        auto err = errno;
819        if (err == ENFILE || err == EMFILE)
820            cerr << "ERROR: Reached maximum number of file descriptors." << endl;
821        fuse_reply_err(req, err);
822        return;
823    }
824
825    fi->fh = fd;
826    fuse_entry_param e;
827    auto err = do_lookup(parent, name, &e);
828    if (err) {
829        if (err == ENFILE || err == EMFILE)
830            cerr << "ERROR: Reached maximum number of file descriptors." << endl;
831        fuse_reply_err(req, err);
832	return;
833    }
834
835    Inode& inode = get_inode(e.ino);
836    lock_guard<mutex> g {inode.m};
837    inode.nopen++;
838    fuse_reply_create(req, &e, fi);
839}
840
841
842static void sfs_fsyncdir(fuse_req_t req, fuse_ino_t ino, int datasync,
843                         fuse_file_info *fi) {
844    (void) ino;
845    int res;
846    int fd = dirfd(get_dir_handle(fi)->dp);
847    if (datasync)
848        res = fdatasync(fd);
849    else
850        res = fsync(fd);
851    fuse_reply_err(req, res == -1 ? errno : 0);
852}
853
854
855static void sfs_open(fuse_req_t req, fuse_ino_t ino, fuse_file_info *fi) {
856    Inode& inode = get_inode(ino);
857
858    /* With writeback cache, kernel may send read requests even
859       when userspace opened write-only */
860    if (fs.timeout && (fi->flags & O_ACCMODE) == O_WRONLY) {
861        fi->flags &= ~O_ACCMODE;
862        fi->flags |= O_RDWR;
863    }
864
865    /* With writeback cache, O_APPEND is handled by the kernel.  This
866       breaks atomicity (since the file may change in the underlying
867       filesystem, so that the kernel's idea of the end of the file
868       isn't accurate anymore). However, no process should modify the
869       file in the underlying filesystem once it has been read, so
870       this is not a problem. */
871    if (fs.timeout && fi->flags & O_APPEND)
872        fi->flags &= ~O_APPEND;
873
874    /* Unfortunately we cannot use inode.fd, because this was opened
875       with O_PATH (so it doesn't allow read/write access). */
876    char buf[64];
877    sprintf(buf, "/proc/self/fd/%i", inode.fd);
878    auto fd = open(buf, fi->flags & ~O_NOFOLLOW);
879    if (fd == -1) {
880        auto err = errno;
881        if (err == ENFILE || err == EMFILE)
882            cerr << "ERROR: Reached maximum number of file descriptors." << endl;
883        fuse_reply_err(req, err);
884        return;
885    }
886
887    lock_guard<mutex> g {inode.m};
888    inode.nopen++;
889    fi->keep_cache = (fs.timeout != 0);
890    fi->noflush = (fs.timeout == 0 && (fi->flags & O_ACCMODE) == O_RDONLY);
891    fi->fh = fd;
892    fuse_reply_open(req, fi);
893}
894
895
896static void sfs_release(fuse_req_t req, fuse_ino_t ino, fuse_file_info *fi) {
897    Inode& inode = get_inode(ino);
898    lock_guard<mutex> g {inode.m};
899    inode.nopen--;
900    close(fi->fh);
901    fuse_reply_err(req, 0);
902}
903
904
905static void sfs_flush(fuse_req_t req, fuse_ino_t ino, fuse_file_info *fi) {
906    (void) ino;
907    auto res = close(dup(fi->fh));
908    fuse_reply_err(req, res == -1 ? errno : 0);
909}
910
911
912static void sfs_fsync(fuse_req_t req, fuse_ino_t ino, int datasync,
913                      fuse_file_info *fi) {
914    (void) ino;
915    int res;
916    if (datasync)
917        res = fdatasync(fi->fh);
918    else
919        res = fsync(fi->fh);
920    fuse_reply_err(req, res == -1 ? errno : 0);
921}
922
923
924static void do_read(fuse_req_t req, size_t size, off_t off, fuse_file_info *fi) {
925
926    fuse_bufvec buf = FUSE_BUFVEC_INIT(size);
927    buf.buf[0].flags = static_cast<fuse_buf_flags>(
928        FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK);
929    buf.buf[0].fd = fi->fh;
930    buf.buf[0].pos = off;
931
932    fuse_reply_data(req, &buf, FUSE_BUF_COPY_FLAGS);
933}
934
935static void sfs_read(fuse_req_t req, fuse_ino_t ino, size_t size, off_t off,
936                     fuse_file_info *fi) {
937    (void) ino;
938    do_read(req, size, off, fi);
939}
940
941
942static void do_write_buf(fuse_req_t req, size_t size, off_t off,
943                         fuse_bufvec *in_buf, fuse_file_info *fi) {
944    fuse_bufvec out_buf = FUSE_BUFVEC_INIT(size);
945    out_buf.buf[0].flags = static_cast<fuse_buf_flags>(
946        FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK);
947    out_buf.buf[0].fd = fi->fh;
948    out_buf.buf[0].pos = off;
949
950    auto res = fuse_buf_copy(&out_buf, in_buf, FUSE_BUF_COPY_FLAGS);
951    if (res < 0)
952        fuse_reply_err(req, -res);
953    else
954        fuse_reply_write(req, (size_t)res);
955}
956
957
958static void sfs_write_buf(fuse_req_t req, fuse_ino_t ino, fuse_bufvec *in_buf,
959                          off_t off, fuse_file_info *fi) {
960    (void) ino;
961    auto size {fuse_buf_size(in_buf)};
962    do_write_buf(req, size, off, in_buf, fi);
963}
964
965
966static void sfs_statfs(fuse_req_t req, fuse_ino_t ino) {
967    struct statvfs stbuf;
968
969    auto res = fstatvfs(get_fs_fd(ino), &stbuf);
970    if (res == -1)
971        fuse_reply_err(req, errno);
972    else
973        fuse_reply_statfs(req, &stbuf);
974}
975
976
#ifdef HAVE_POSIX_FALLOCATE
// fallocate handler: only the default mode (0) is supported; it is mapped
// to posix_fallocate(), whose return value is sent as the reply code.
static void sfs_fallocate(fuse_req_t req, fuse_ino_t ino, int mode,
                          off_t offset, off_t length, fuse_file_info *fi) {
    (void) ino;

    if (mode != 0) {
        fuse_reply_err(req, EOPNOTSUPP);
        return;
    }

    const auto rc = posix_fallocate(fi->fh, offset, length);
    fuse_reply_err(req, rc);
}
#endif
990
991static void sfs_flock(fuse_req_t req, fuse_ino_t ino, fuse_file_info *fi,
992                      int op) {
993    (void) ino;
994    auto res = flock(fi->fh, op);
995    fuse_reply_err(req, res == -1 ? errno : 0);
996}
997
998
999#ifdef HAVE_SETXATTR
1000static void sfs_getxattr(fuse_req_t req, fuse_ino_t ino, const char *name,
1001                         size_t size) {
1002    char *value = nullptr;
1003    Inode& inode = get_inode(ino);
1004    ssize_t ret;
1005    int saverr;
1006
1007    char procname[64];
1008    sprintf(procname, "/proc/self/fd/%i", inode.fd);
1009
1010    if (size) {
1011        value = new (nothrow) char[size];
1012        if (value == nullptr) {
1013            saverr = ENOMEM;
1014            goto out;
1015        }
1016
1017        ret = getxattr(procname, name, value, size);
1018        if (ret == -1)
1019            goto out_err;
1020        saverr = 0;
1021        if (ret == 0)
1022            goto out;
1023
1024        fuse_reply_buf(req, value, ret);
1025    } else {
1026        ret = getxattr(procname, name, nullptr, 0);
1027        if (ret == -1)
1028            goto out_err;
1029
1030        fuse_reply_xattr(req, ret);
1031    }
1032out_free:
1033    delete[] value;
1034    return;
1035
1036out_err:
1037    saverr = errno;
1038out:
1039    fuse_reply_err(req, saverr);
1040    goto out_free;
1041}
1042
1043
1044static void sfs_listxattr(fuse_req_t req, fuse_ino_t ino, size_t size) {
1045    char *value = nullptr;
1046    Inode& inode = get_inode(ino);
1047    ssize_t ret;
1048    int saverr;
1049
1050    char procname[64];
1051    sprintf(procname, "/proc/self/fd/%i", inode.fd);
1052
1053    if (size) {
1054        value = new (nothrow) char[size];
1055        if (value == nullptr) {
1056            saverr = ENOMEM;
1057            goto out;
1058        }
1059
1060        ret = listxattr(procname, value, size);
1061        if (ret == -1)
1062            goto out_err;
1063        saverr = 0;
1064        if (ret == 0)
1065            goto out;
1066
1067        fuse_reply_buf(req, value, ret);
1068    } else {
1069        ret = listxattr(procname, nullptr, 0);
1070        if (ret == -1)
1071            goto out_err;
1072
1073        fuse_reply_xattr(req, ret);
1074    }
1075out_free:
1076    delete[] value;
1077    return;
1078out_err:
1079    saverr = errno;
1080out:
1081    fuse_reply_err(req, saverr);
1082    goto out_free;
1083}
1084
1085
1086static void sfs_setxattr(fuse_req_t req, fuse_ino_t ino, const char *name,
1087                         const char *value, size_t size, int flags) {
1088    Inode& inode = get_inode(ino);
1089    ssize_t ret;
1090    int saverr;
1091
1092    char procname[64];
1093    sprintf(procname, "/proc/self/fd/%i", inode.fd);
1094
1095    ret = setxattr(procname, name, value, size, flags);
1096    saverr = ret == -1 ? errno : 0;
1097
1098    fuse_reply_err(req, saverr);
1099}
1100
1101
1102static void sfs_removexattr(fuse_req_t req, fuse_ino_t ino, const char *name) {
1103    char procname[64];
1104    Inode& inode = get_inode(ino);
1105    ssize_t ret;
1106    int saverr;
1107
1108    sprintf(procname, "/proc/self/fd/%i", inode.fd);
1109    ret = removexattr(procname, name);
1110    saverr = ret == -1 ? errno : 0;
1111
1112    fuse_reply_err(req, saverr);
1113}
1114#endif
1115
1116
1117static void assign_operations(fuse_lowlevel_ops &sfs_oper) {
1118    sfs_oper.init = sfs_init;
1119    sfs_oper.lookup = sfs_lookup;
1120    sfs_oper.mkdir = sfs_mkdir;
1121    sfs_oper.mknod = sfs_mknod;
1122    sfs_oper.symlink = sfs_symlink;
1123    sfs_oper.link = sfs_link;
1124    sfs_oper.unlink = sfs_unlink;
1125    sfs_oper.rmdir = sfs_rmdir;
1126    sfs_oper.rename = sfs_rename;
1127    sfs_oper.forget = sfs_forget;
1128    sfs_oper.forget_multi = sfs_forget_multi;
1129    sfs_oper.getattr = sfs_getattr;
1130    sfs_oper.setattr = sfs_setattr;
1131    sfs_oper.readlink = sfs_readlink;
1132    sfs_oper.opendir = sfs_opendir;
1133    sfs_oper.readdir = sfs_readdir;
1134    sfs_oper.readdirplus = sfs_readdirplus;
1135    sfs_oper.releasedir = sfs_releasedir;
1136    sfs_oper.fsyncdir = sfs_fsyncdir;
1137    sfs_oper.create = sfs_create;
1138    sfs_oper.open = sfs_open;
1139    sfs_oper.release = sfs_release;
1140    sfs_oper.flush = sfs_flush;
1141    sfs_oper.fsync = sfs_fsync;
1142    sfs_oper.read = sfs_read;
1143    sfs_oper.write_buf = sfs_write_buf;
1144    sfs_oper.statfs = sfs_statfs;
1145#ifdef HAVE_POSIX_FALLOCATE
1146    sfs_oper.fallocate = sfs_fallocate;
1147#endif
1148    sfs_oper.flock = sfs_flock;
1149#ifdef HAVE_SETXATTR
1150    sfs_oper.setxattr = sfs_setxattr;
1151    sfs_oper.getxattr = sfs_getxattr;
1152    sfs_oper.listxattr = sfs_listxattr;
1153    sfs_oper.removexattr = sfs_removexattr;
1154#endif
1155}
1156
1157static void print_usage(char *prog_name) {
1158    cout << "Usage: " << prog_name << " --help\n"
1159         << "       " << prog_name << " [options] <source> <mountpoint>\n";
1160}
1161
1162static cxxopts::ParseResult parse_wrapper(cxxopts::Options& parser, int& argc, char**& argv) {
1163    try {
1164        return parser.parse(argc, argv);
1165    } catch (cxxopts::option_not_exists_exception& exc) {
1166        std::cout << argv[0] << ": " << exc.what() << std::endl;
1167        print_usage(argv[0]);
1168        exit(2);
1169    }
1170}
1171
1172
/**
 * Split `s` at every occurrence of `delimiter` and append the pieces, in
 * order, to `out` (existing elements of `out` are kept).
 *
 * Empty pieces are preserved: leading, trailing and consecutive delimiters
 * each produce an empty string, and an empty `s` yields one empty piece.
 * An empty `delimiter` cannot split anything, so `s` is appended unchanged.
 */
static void string_split(std::string s, std::vector<std::string>& out, std::string delimiter) {
    // find("") matches at the current offset without ever advancing it,
    // which would loop forever while growing `out`.
    if (delimiter.empty()) {
        out.push_back(s);
        return;
    }

    const size_t delim_len = delimiter.length();
    size_t pos_start = 0;
    size_t pos_end;

    while ((pos_end = s.find(delimiter, pos_start)) != std::string::npos) {
        out.push_back(s.substr(pos_start, pos_end - pos_start));
        pos_start = pos_end + delim_len;
    }

    // Whatever follows the last delimiter (possibly nothing).
    out.push_back(s.substr(pos_start));
}
1185
1186
/**
 * Concatenate `elems` in order, placing `delim` between consecutive
 * elements (never before the first or after the last). An empty vector
 * yields an empty string.
 */
static std::string string_join(const std::vector<std::string>& elems, char delim)
{
    std::string joined;
    bool is_first = true;
    for (const auto& elem : elems) {
        if (!is_first) {
            joined += delim;
        }
        joined += elem;
        is_first = false;
    }
    return joined;
}
1198
1199
1200static cxxopts::ParseResult parse_options(int argc, char **argv) {
1201    cxxopts::Options opt_parser(argv[0]);
1202    std::vector<std::string> mount_options;
1203    opt_parser.add_options()
1204        ("debug", "Enable filesystem debug messages")
1205        ("debug-fuse", "Enable libfuse debug messages")
1206        ("foreground", "Run in foreground")
1207        ("help", "Print help")
1208        ("nocache", "Disable all caching")
1209        ("nosplice", "Do not use splice(2) to transfer data")
1210        ("single", "Run single-threaded")
1211        ("o", "Mount options (see mount.fuse(5) - only use if you know what "
1212              "you are doing)", cxxopts::value(mount_options))
1213        ("num-threads", "Number of libfuse worker threads",
1214                        cxxopts::value<int>()->default_value(SFS_DEFAULT_THREADS))
1215        ("clone-fd", "use separate fuse device fd for each thread",
1216                        cxxopts::value<bool>()->implicit_value(SFS_DEFAULT_CLONE_FD));
1217
1218
1219    // FIXME: Find a better way to limit the try clause to just
1220    // opt_parser.parse() (cf. https://github.com/jarro2783/cxxopts/issues/146)
1221    auto options = parse_wrapper(opt_parser, argc, argv);
1222
1223    if (options.count("help")) {
1224        print_usage(argv[0]);
1225        // Strip everything before the option list from the
1226        // default help string.
1227        auto help = opt_parser.help();
1228        std::cout << std::endl << "options:"
1229                  << help.substr(help.find("\n\n") + 1, string::npos);
1230        exit(0);
1231
1232    } else if (argc != 3) {
1233        std::cout << argv[0] << ": invalid number of arguments\n";
1234        print_usage(argv[0]);
1235        exit(2);
1236    }
1237
1238    fs.debug = options.count("debug") != 0;
1239    fs.debug_fuse = options.count("debug-fuse") != 0;
1240
1241    fs.foreground = options.count("foreground") != 0;
1242    if (fs.debug || fs.debug_fuse)
1243        fs.foreground = true;
1244
1245    fs.nosplice = options.count("nosplice") != 0;
1246    fs.num_threads = options["num-threads"].as<int>();
1247    fs.clone_fd = options["clone-fd"].as<bool>();
1248    char* resolved_path = realpath(argv[1], NULL);
1249    if (resolved_path == NULL)
1250        warn("WARNING: realpath() failed with");
1251    fs.source = std::string {resolved_path};
1252    free(resolved_path);
1253
1254    std::vector<std::string> flattened_mount_opts;
1255    for (auto opt : mount_options) {
1256        string_split(opt, flattened_mount_opts, ",");
1257    }
1258
1259    bool found_fsname = false;
1260    for (auto opt : flattened_mount_opts) {
1261        if (opt.find("fsname=") == 0) {
1262            found_fsname = true;
1263            continue;
1264        }
1265
1266        /* Filter out some obviously incorrect options. */
1267        if (opt == "fd") {
1268            std::cout << argv[0] << ": Unsupported mount option: " << opt << "\n";
1269            print_usage(argv[0]);
1270            exit(2);
1271        }
1272    }
1273    if (!found_fsname) {
1274        flattened_mount_opts.push_back("fsname=" + fs.source);
1275    }
1276    flattened_mount_opts.push_back("default_permissions");
1277    fs.fuse_mount_options = string_join(flattened_mount_opts, ',');
1278    return options;
1279}
1280
1281
/**
 * Raise the soft RLIMIT_NOFILE limit to the hard limit.
 *
 * Best effort: if either the query or the update fails, a warning is
 * printed via warn() and the function returns without further action.
 */
static void maximize_fd_limit() {
    struct rlimit nofile {};

    if (getrlimit(RLIMIT_NOFILE, &nofile) != 0) {
        warn("WARNING: getrlimit() failed with");
        return;
    }

    nofile.rlim_cur = nofile.rlim_max;
    if (setrlimit(RLIMIT_NOFILE, &nofile) != 0)
        warn("WARNING: setrlimit() failed with");
}
1294
1295
1296int main(int argc, char *argv[]) {
1297
1298    struct fuse_loop_config *loop_config = NULL;
1299
1300    // Parse command line options
1301    auto options {parse_options(argc, argv)};
1302
1303    // We need an fd for every dentry in our the filesystem that the
1304    // kernel knows about. This is way more than most processes need,
1305    // so try to get rid of any resource softlimit.
1306    maximize_fd_limit();
1307
1308    // Initialize filesystem root
1309    fs.root.fd = -1;
1310    fs.root.nlookup = 9999;
1311    fs.timeout = options.count("nocache") ? 0 : 86400.0;
1312
1313    struct stat stat;
1314    auto ret = lstat(fs.source.c_str(), &stat);
1315    if (ret == -1)
1316        err(1, "ERROR: failed to stat source (\"%s\")", fs.source.c_str());
1317    if (!S_ISDIR(stat.st_mode))
1318        errx(1, "ERROR: source is not a directory");
1319    fs.src_dev = stat.st_dev;
1320
1321    fs.root.fd = open(fs.source.c_str(), O_PATH);
1322    if (fs.root.fd == -1)
1323        err(1, "ERROR: open(\"%s\", O_PATH)", fs.source.c_str());
1324
1325    // Initialize fuse
1326    fuse_args args = FUSE_ARGS_INIT(0, nullptr);
1327    if (fuse_opt_add_arg(&args, argv[0]) ||
1328        fuse_opt_add_arg(&args, "-o") ||
1329        fuse_opt_add_arg(&args, fs.fuse_mount_options.c_str()) ||
1330        (fs.debug_fuse && fuse_opt_add_arg(&args, "-odebug")))
1331        errx(3, "ERROR: Out of memory");
1332
1333    fuse_lowlevel_ops sfs_oper {};
1334    assign_operations(sfs_oper);
1335    auto se = fuse_session_new(&args, &sfs_oper, sizeof(sfs_oper), &fs);
1336    if (se == nullptr)
1337        goto err_out1;
1338
1339    if (fuse_set_signal_handlers(se) != 0)
1340        goto err_out2;
1341
1342    // Don't apply umask, use modes exactly as specified
1343    umask(0);
1344
1345    // Mount and run main loop
1346    loop_config = fuse_loop_cfg_create();
1347
1348    if (fs.num_threads != -1)
1349        fuse_loop_cfg_set_idle_threads(loop_config, fs.num_threads);
1350
1351    if (fuse_session_mount(se, argv[2]) != 0)
1352        goto err_out3;
1353
1354    fuse_daemonize(fs.foreground);
1355
1356    if (options.count("single"))
1357        ret = fuse_session_loop(se);
1358    else
1359        ret = fuse_session_loop_mt(se, loop_config);
1360
1361
1362    fuse_session_unmount(se);
1363
1364err_out3:
1365    fuse_remove_signal_handlers(se);
1366err_out2:
1367    fuse_session_destroy(se);
1368err_out1:
1369
1370    fuse_loop_cfg_destroy(loop_config);
1371    fuse_opt_free_args(&args);
1372
1373    return ret ? 1 : 0;
1374}
1375
1376