1// SPDX-License-Identifier: GPL-2.0-or-later
2/* Common capabilities, needed by capability.o.
3 */
4
5#include <linux/capability.h>
6#include <linux/audit.h>
7#include <linux/init.h>
8#include <linux/kernel.h>
9#include <linux/lsm_hooks.h>
10#include <linux/file.h>
11#include <linux/mm.h>
12#include <linux/mman.h>
13#include <linux/pagemap.h>
14#include <linux/swap.h>
15#include <linux/skbuff.h>
16#include <linux/netlink.h>
17#include <linux/ptrace.h>
18#include <linux/xattr.h>
19#include <linux/hugetlb.h>
20#include <linux/mount.h>
21#include <linux/sched.h>
22#include <linux/prctl.h>
23#include <linux/securebits.h>
24#include <linux/user_namespace.h>
25#include <linux/binfmts.h>
26#include <linux/personality.h>
27
28/*
29 * If a non-root user executes a setuid-root binary in
30 * !secure(SECURE_NOROOT) mode, then we raise capabilities.
31 * However if fE is also set, then the intent is for only
32 * the file capabilities to be applied, and the setuid-root
33 * bit is left on either to change the uid (plausible) or
34 * to get full privilege on a kernel without file capabilities
35 * support.  So in that case we do not raise capabilities.
36 *
37 * Warn if that happens, once per boot.
38 */
39static void warn_setuid_and_fcaps_mixed(const char *fname)
40{
41	static int warned;
42	if (!warned) {
43		printk(KERN_INFO "warning: `%s' has both setuid-root and"
44			" effective capabilities. Therefore not raising all"
45			" capabilities.\n", fname);
46		warned = 1;
47	}
48}
49
50/**
51 * cap_capable - Determine whether a task has a particular effective capability
52 * @cred: The credentials to use
53 * @ns:  The user namespace in which we need the capability
54 * @cap: The capability to check for
55 * @opts: Bitmask of options defined in include/linux/security.h
56 *
57 * Determine whether the nominated task has the specified capability amongst
58 * its effective set, returning 0 if it does, -ve if it does not.
59 *
60 * NOTE WELL: cap_has_capability() cannot be used like the kernel's capable()
61 * and has_capability() functions.  That is, it has the reverse semantics:
62 * cap_has_capability() returns 0 when a task has a capability, but the
63 * kernel's capable() and has_capability() returns 1 for this case.
64 */
65int cap_capable(const struct cred *cred, struct user_namespace *targ_ns,
66		int cap, unsigned int opts)
67{
68	struct user_namespace *ns = targ_ns;
69
70	/* See if cred has the capability in the target user namespace
71	 * by examining the target user namespace and all of the target
72	 * user namespace's parents.
73	 */
74	for (;;) {
75		/* Do we have the necessary capabilities? */
76		if (ns == cred->user_ns)
77			return cap_raised(cred->cap_effective, cap) ? 0 : -EPERM;
78
79		/*
80		 * If we're already at a lower level than we're looking for,
81		 * we're done searching.
82		 */
83		if (ns->level <= cred->user_ns->level)
84			return -EPERM;
85
86		/*
87		 * The owner of the user namespace in the parent of the
88		 * user namespace has all caps.
89		 */
90		if ((ns->parent == cred->user_ns) && uid_eq(ns->owner, cred->euid))
91			return 0;
92
93		/*
94		 * If you have a capability in a parent user ns, then you have
95		 * it over all children user namespaces as well.
96		 */
97		ns = ns->parent;
98	}
99
100	/* We never get here */
101}
102
103/**
104 * cap_settime - Determine whether the current process may set the system clock
105 * @ts: The time to set
106 * @tz: The timezone to set
107 *
108 * Determine whether the current process may set the system clock and timezone
109 * information, returning 0 if permission granted, -ve if denied.
110 */
111int cap_settime(const struct timespec64 *ts, const struct timezone *tz)
112{
113	if (!capable(CAP_SYS_TIME))
114		return -EPERM;
115	return 0;
116}
117
118/**
119 * cap_ptrace_access_check - Determine whether the current process may access
120 *			   another
121 * @child: The process to be accessed
122 * @mode: The mode of attachment.
123 *
124 * If we are in the same or an ancestor user_ns and have all the target
125 * task's capabilities, then ptrace access is allowed.
126 * If we have the ptrace capability to the target user_ns, then ptrace
127 * access is allowed.
128 * Else denied.
129 *
130 * Determine whether a process may access another, returning 0 if permission
131 * granted, -ve if denied.
132 */
133int cap_ptrace_access_check(struct task_struct *child, unsigned int mode)
134{
135	int ret = 0;
136	const struct cred *cred, *child_cred;
137	const kernel_cap_t *caller_caps;
138
139	rcu_read_lock();
140	cred = current_cred();
141	child_cred = __task_cred(child);
142	if (mode & PTRACE_MODE_FSCREDS)
143		caller_caps = &cred->cap_effective;
144	else
145		caller_caps = &cred->cap_permitted;
146	if (cred->user_ns == child_cred->user_ns &&
147	    cap_issubset(child_cred->cap_permitted, *caller_caps))
148		goto out;
149	if (ns_capable(child_cred->user_ns, CAP_SYS_PTRACE))
150		goto out;
151	ret = -EPERM;
152out:
153	rcu_read_unlock();
154	return ret;
155}
156
157/**
158 * cap_ptrace_traceme - Determine whether another process may trace the current
159 * @parent: The task proposed to be the tracer
160 *
161 * If parent is in the same or an ancestor user_ns and has all current's
162 * capabilities, then ptrace access is allowed.
163 * If parent has the ptrace capability to current's user_ns, then ptrace
164 * access is allowed.
165 * Else denied.
166 *
167 * Determine whether the nominated task is permitted to trace the current
168 * process, returning 0 if permission is granted, -ve if denied.
169 */
170int cap_ptrace_traceme(struct task_struct *parent)
171{
172	int ret = 0;
173	const struct cred *cred, *child_cred;
174
175	rcu_read_lock();
176	cred = __task_cred(parent);
177	child_cred = current_cred();
178	if (cred->user_ns == child_cred->user_ns &&
179	    cap_issubset(child_cred->cap_permitted, cred->cap_permitted))
180		goto out;
181	if (has_ns_capability(parent, child_cred->user_ns, CAP_SYS_PTRACE))
182		goto out;
183	ret = -EPERM;
184out:
185	rcu_read_unlock();
186	return ret;
187}
188
189/**
190 * cap_capget - Retrieve a task's capability sets
191 * @target: The task from which to retrieve the capability sets
192 * @effective: The place to record the effective set
193 * @inheritable: The place to record the inheritable set
194 * @permitted: The place to record the permitted set
195 *
196 * This function retrieves the capabilities of the nominated task and returns
197 * them to the caller.
198 */
199int cap_capget(struct task_struct *target, kernel_cap_t *effective,
200	       kernel_cap_t *inheritable, kernel_cap_t *permitted)
201{
202	const struct cred *cred;
203
204	/* Derived from kernel/capability.c:sys_capget. */
205	rcu_read_lock();
206	cred = __task_cred(target);
207	*effective   = cred->cap_effective;
208	*inheritable = cred->cap_inheritable;
209	*permitted   = cred->cap_permitted;
210	rcu_read_unlock();
211	return 0;
212}
213
214/*
215 * Determine whether the inheritable capabilities are limited to the old
216 * permitted set.  Returns 1 if they are limited, 0 if they are not.
217 */
218static inline int cap_inh_is_capped(void)
219{
220	/* they are so limited unless the current task has the CAP_SETPCAP
221	 * capability
222	 */
223	if (cap_capable(current_cred(), current_cred()->user_ns,
224			CAP_SETPCAP, CAP_OPT_NONE) == 0)
225		return 0;
226	return 1;
227}
228
229/**
230 * cap_capset - Validate and apply proposed changes to current's capabilities
231 * @new: The proposed new credentials; alterations should be made here
232 * @old: The current task's current credentials
233 * @effective: A pointer to the proposed new effective capabilities set
234 * @inheritable: A pointer to the proposed new inheritable capabilities set
235 * @permitted: A pointer to the proposed new permitted capabilities set
236 *
237 * This function validates and applies a proposed mass change to the current
238 * process's capability sets.  The changes are made to the proposed new
239 * credentials, and assuming no error, will be committed by the caller of LSM.
240 */
241int cap_capset(struct cred *new,
242	       const struct cred *old,
243	       const kernel_cap_t *effective,
244	       const kernel_cap_t *inheritable,
245	       const kernel_cap_t *permitted)
246{
247	if (cap_inh_is_capped() &&
248	    !cap_issubset(*inheritable,
249			  cap_combine(old->cap_inheritable,
250				      old->cap_permitted)))
251		/* incapable of using this inheritable set */
252		return -EPERM;
253
254	if (!cap_issubset(*inheritable,
255			  cap_combine(old->cap_inheritable,
256				      old->cap_bset)))
257		/* no new pI capabilities outside bounding set */
258		return -EPERM;
259
260	/* verify restrictions on target's new Permitted set */
261	if (!cap_issubset(*permitted, old->cap_permitted))
262		return -EPERM;
263
264	/* verify the _new_Effective_ is a subset of the _new_Permitted_ */
265	if (!cap_issubset(*effective, *permitted))
266		return -EPERM;
267
268	new->cap_effective   = *effective;
269	new->cap_inheritable = *inheritable;
270	new->cap_permitted   = *permitted;
271
272	/*
273	 * Mask off ambient bits that are no longer both permitted and
274	 * inheritable.
275	 */
276	new->cap_ambient = cap_intersect(new->cap_ambient,
277					 cap_intersect(*permitted,
278						       *inheritable));
279	if (WARN_ON(!cap_ambient_invariant_ok(new)))
280		return -EINVAL;
281	return 0;
282}
283
284/**
285 * cap_inode_need_killpriv - Determine if inode change affects privileges
286 * @dentry: The inode/dentry in being changed with change marked ATTR_KILL_PRIV
287 *
288 * Determine if an inode having a change applied that's marked ATTR_KILL_PRIV
289 * affects the security markings on that inode, and if it is, should
290 * inode_killpriv() be invoked or the change rejected.
291 *
292 * Returns 1 if security.capability has a value, meaning inode_killpriv()
293 * is required, 0 otherwise, meaning inode_killpriv() is not required.
294 */
295int cap_inode_need_killpriv(struct dentry *dentry)
296{
297	struct inode *inode = d_backing_inode(dentry);
298	int error;
299
300	error = __vfs_getxattr(dentry, inode, XATTR_NAME_CAPS, NULL, 0);
301	return error > 0;
302}
303
304/**
305 * cap_inode_killpriv - Erase the security markings on an inode
306 * @dentry: The inode/dentry to alter
307 *
308 * Erase the privilege-enhancing security markings on an inode.
309 *
310 * Returns 0 if successful, -ve on error.
311 */
312int cap_inode_killpriv(struct dentry *dentry)
313{
314	int error;
315
316	error = __vfs_removexattr(dentry, XATTR_NAME_CAPS);
317	if (error == -EOPNOTSUPP)
318		error = 0;
319	return error;
320}
321
322static bool rootid_owns_currentns(kuid_t kroot)
323{
324	struct user_namespace *ns;
325
326	if (!uid_valid(kroot))
327		return false;
328
329	for (ns = current_user_ns(); ; ns = ns->parent) {
330		if (from_kuid(ns, kroot) == 0)
331			return true;
332		if (ns == &init_user_ns)
333			break;
334	}
335
336	return false;
337}
338
339static __u32 sansflags(__u32 m)
340{
341	return m & ~VFS_CAP_FLAGS_EFFECTIVE;
342}
343
344static bool is_v2header(size_t size, const struct vfs_cap_data *cap)
345{
346	if (size != XATTR_CAPS_SZ_2)
347		return false;
348	return sansflags(le32_to_cpu(cap->magic_etc)) == VFS_CAP_REVISION_2;
349}
350
351static bool is_v3header(size_t size, const struct vfs_cap_data *cap)
352{
353	if (size != XATTR_CAPS_SZ_3)
354		return false;
355	return sansflags(le32_to_cpu(cap->magic_etc)) == VFS_CAP_REVISION_3;
356}
357
/*
 * getsecurity: We are called for security.* before any attempt to read the
 * xattr from the inode itself.
 *
 * This gives us a chance to read the on-disk value and convert it.  If we
 * return -EOPNOTSUPP, then vfs_getxattr() will call the i_op handler.
 *
 * Note we are not called by vfs_getxattr_alloc(), but that is only called
 * by the integrity subsystem, which really wants the unconverted values -
 * so that's good.
 *
 * @inode:  inode whose security.capability xattr is wanted
 * @name:   xattr name with the "security." prefix removed; anything other
 *          than "capability" is declined with -EOPNOTSUPP
 * @buffer: when @alloc is true, receives the converted xattr on success
 * @alloc:  whether to hand a buffer back through @buffer; when false only
 *          the size is reported
 *
 * Returns the size of the (possibly converted) xattr on success:
 * sizeof(struct vfs_ns_cap_data) when it is presented as v3, or
 * sizeof(struct vfs_cap_data) when it is presented as v2.  On failure
 * returns -EOPNOTSUPP (not our xattr), -EINVAL (no dentry alias or an
 * unrecognised header), -ENOMEM, -EOVERFLOW (the rootid neither maps to a
 * non-zero uid in the current namespace nor owns it), or the error from
 * vfs_getxattr_alloc().
 *
 * NOTE(review): the buffer stored in *@buffer is either freshly kzalloc'd
 * here or the one obtained from vfs_getxattr_alloc(); releasing it is
 * presumably the caller's job - confirm against the callers.
 */
int cap_inode_getsecurity(struct inode *inode, const char *name, void **buffer,
			  bool alloc)
{
	int size, ret;
	kuid_t kroot;
	u32 nsmagic, magic;
	uid_t root, mappedroot;
	char *tmpbuf = NULL;
	struct vfs_cap_data *cap;
	struct vfs_ns_cap_data *nscap = NULL;
	struct dentry *dentry;
	struct user_namespace *fs_ns;

	if (strcmp(name, "capability") != 0)
		return -EOPNOTSUPP;

	dentry = d_find_any_alias(inode);
	if (!dentry)
		return -EINVAL;

	/* Read the raw on-disk value; v3 is the largest layout we accept. */
	size = sizeof(struct vfs_ns_cap_data);
	ret = (int) vfs_getxattr_alloc(dentry, XATTR_NAME_CAPS,
				 &tmpbuf, size, GFP_NOFS);
	dput(dentry);

	if (ret < 0 || !tmpbuf) {
		/* Propagate the read result; tmpbuf is freed at out_free. */
		size = ret;
		goto out_free;
	}

	fs_ns = inode->i_sb->s_user_ns;
	cap = (struct vfs_cap_data *) tmpbuf;
	if (is_v2header((size_t) ret, cap)) {
		/* v2 has no rootid field; treat it as 0 in the fs namespace */
		root = 0;
	} else if (is_v3header((size_t) ret, cap)) {
		/* nscap != NULL from here on marks the on-disk value as v3 */
		nscap = (struct vfs_ns_cap_data *) tmpbuf;
		root = le32_to_cpu(nscap->rootid);
	} else {
		size = -EINVAL;
		goto out_free;
	}

	kroot = make_kuid(fs_ns, root);

	/* If the root kuid maps to a valid uid in current ns, then return
	 * this as a nscap. */
	mappedroot = from_kuid(current_user_ns(), kroot);
	if (mappedroot != (uid_t)-1 && mappedroot != (uid_t)0) {
		size = sizeof(struct vfs_ns_cap_data);
		if (alloc) {
			if (!nscap) {
				/* v2 -> v3 conversion */
				nscap = kzalloc(size, GFP_ATOMIC);
				if (!nscap) {
					size = -ENOMEM;
					goto out_free;
				}
				nsmagic = VFS_CAP_REVISION_3;
				magic = le32_to_cpu(cap->magic_etc);
				if (magic & VFS_CAP_FLAGS_EFFECTIVE)
					nsmagic |= VFS_CAP_FLAGS_EFFECTIVE;
				memcpy(&nscap->data, &cap->data, sizeof(__le32) * 2 * VFS_CAP_U32);
				nscap->magic_etc = cpu_to_le32(nsmagic);
			} else {
				/* use allocated v3 buffer */
				/* clearing tmpbuf keeps out_free from freeing it */
				tmpbuf = NULL;
			}
			/* report the rootid as seen from the current namespace */
			nscap->rootid = cpu_to_le32(mappedroot);
			*buffer = nscap;
		}
		goto out_free;
	}

	if (!rootid_owns_currentns(kroot)) {
		size = -EOVERFLOW;
		goto out_free;
	}

	/* This comes from a parent namespace.  Return as a v2 capability */
	size = sizeof(struct vfs_cap_data);
	if (alloc) {
		if (nscap) {
			/* v3 -> v2 conversion */
			cap = kzalloc(size, GFP_ATOMIC);
			if (!cap) {
				size = -ENOMEM;
				goto out_free;
			}
			magic = VFS_CAP_REVISION_2;
			nsmagic = le32_to_cpu(nscap->magic_etc);
			if (nsmagic & VFS_CAP_FLAGS_EFFECTIVE)
				magic |= VFS_CAP_FLAGS_EFFECTIVE;
			memcpy(&cap->data, &nscap->data, sizeof(__le32) * 2 * VFS_CAP_U32);
			cap->magic_etc = cpu_to_le32(magic);
		} else {
			/* use unconverted v2 */
			/* clearing tmpbuf keeps out_free from freeing it */
			tmpbuf = NULL;
		}
		*buffer = cap;
	}
out_free:
	/* tmpbuf is NULL here whenever its buffer went out via *buffer */
	kfree(tmpbuf);
	return size;
}
473
474static kuid_t rootid_from_xattr(const void *value, size_t size,
475				struct user_namespace *task_ns)
476{
477	const struct vfs_ns_cap_data *nscap = value;
478	uid_t rootid = 0;
479
480	if (size == XATTR_CAPS_SZ_3)
481		rootid = le32_to_cpu(nscap->rootid);
482
483	return make_kuid(task_ns, rootid);
484}
485
486static bool validheader(size_t size, const struct vfs_cap_data *cap)
487{
488	return is_v2header(size, cap) || is_v3header(size, cap);
489}
490
491/*
492 * User requested a write of security.capability.  If needed, update the
493 * xattr to change from v2 to v3, or to fixup the v3 rootid.
494 *
495 * If all is ok, we return the new size, on error return < 0.
496 */
497int cap_convert_nscap(struct dentry *dentry, void **ivalue, size_t size)
498{
499	struct vfs_ns_cap_data *nscap;
500	uid_t nsrootid;
501	const struct vfs_cap_data *cap = *ivalue;
502	__u32 magic, nsmagic;
503	struct inode *inode = d_backing_inode(dentry);
504	struct user_namespace *task_ns = current_user_ns(),
505		*fs_ns = inode->i_sb->s_user_ns;
506	kuid_t rootid;
507	size_t newsize;
508
509	if (!*ivalue)
510		return -EINVAL;
511	if (!validheader(size, cap))
512		return -EINVAL;
513	if (!capable_wrt_inode_uidgid(inode, CAP_SETFCAP))
514		return -EPERM;
515	if (size == XATTR_CAPS_SZ_2)
516		if (ns_capable(inode->i_sb->s_user_ns, CAP_SETFCAP))
517			/* user is privileged, just write the v2 */
518			return size;
519
520	rootid = rootid_from_xattr(*ivalue, size, task_ns);
521	if (!uid_valid(rootid))
522		return -EINVAL;
523
524	nsrootid = from_kuid(fs_ns, rootid);
525	if (nsrootid == -1)
526		return -EINVAL;
527
528	newsize = sizeof(struct vfs_ns_cap_data);
529	nscap = kmalloc(newsize, GFP_ATOMIC);
530	if (!nscap)
531		return -ENOMEM;
532	nscap->rootid = cpu_to_le32(nsrootid);
533	nsmagic = VFS_CAP_REVISION_3;
534	magic = le32_to_cpu(cap->magic_etc);
535	if (magic & VFS_CAP_FLAGS_EFFECTIVE)
536		nsmagic |= VFS_CAP_FLAGS_EFFECTIVE;
537	nscap->magic_etc = cpu_to_le32(nsmagic);
538	memcpy(&nscap->data, &cap->data, sizeof(__le32) * 2 * VFS_CAP_U32);
539
540	kvfree(*ivalue);
541	*ivalue = nscap;
542	return newsize;
543}
544
545/*
546 * Calculate the new process capability sets from the capability sets attached
547 * to a file.
548 */
549static inline int bprm_caps_from_vfs_caps(struct cpu_vfs_cap_data *caps,
550					  struct linux_binprm *bprm,
551					  bool *effective,
552					  bool *has_fcap)
553{
554	struct cred *new = bprm->cred;
555	unsigned i;
556	int ret = 0;
557
558	if (caps->magic_etc & VFS_CAP_FLAGS_EFFECTIVE)
559		*effective = true;
560
561	if (caps->magic_etc & VFS_CAP_REVISION_MASK)
562		*has_fcap = true;
563
564	CAP_FOR_EACH_U32(i) {
565		__u32 permitted = caps->permitted.cap[i];
566		__u32 inheritable = caps->inheritable.cap[i];
567
568		/*
569		 * pP' = (X & fP) | (pI & fI)
570		 * The addition of pA' is handled later.
571		 */
572		new->cap_permitted.cap[i] =
573			(new->cap_bset.cap[i] & permitted) |
574			(new->cap_inheritable.cap[i] & inheritable);
575
576		if (permitted & ~new->cap_permitted.cap[i])
577			/* insufficient to execute correctly */
578			ret = -EPERM;
579	}
580
581	/*
582	 * For legacy apps, with no internal support for recognizing they
583	 * do not have enough capabilities, we return an error if they are
584	 * missing some "forced" (aka file-permitted) capabilities.
585	 */
586	return *effective ? ret : 0;
587}
588
589/*
590 * Extract the on-exec-apply capability sets for an executable file.
591 */
592int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps)
593{
594	struct inode *inode = d_backing_inode(dentry);
595	__u32 magic_etc;
596	unsigned tocopy, i;
597	int size;
598	struct vfs_ns_cap_data data, *nscaps = &data;
599	struct vfs_cap_data *caps = (struct vfs_cap_data *) &data;
600	kuid_t rootkuid;
601	struct user_namespace *fs_ns;
602
603	memset(cpu_caps, 0, sizeof(struct cpu_vfs_cap_data));
604
605	if (!inode)
606		return -ENODATA;
607
608	fs_ns = inode->i_sb->s_user_ns;
609	size = __vfs_getxattr((struct dentry *)dentry, inode,
610			      XATTR_NAME_CAPS, &data, XATTR_CAPS_SZ);
611	if (size == -ENODATA || size == -EOPNOTSUPP)
612		/* no data, that's ok */
613		return -ENODATA;
614
615	if (size < 0)
616		return size;
617
618	if (size < sizeof(magic_etc))
619		return -EINVAL;
620
621	cpu_caps->magic_etc = magic_etc = le32_to_cpu(caps->magic_etc);
622
623	rootkuid = make_kuid(fs_ns, 0);
624	switch (magic_etc & VFS_CAP_REVISION_MASK) {
625	case VFS_CAP_REVISION_1:
626		if (size != XATTR_CAPS_SZ_1)
627			return -EINVAL;
628		tocopy = VFS_CAP_U32_1;
629		break;
630	case VFS_CAP_REVISION_2:
631		if (size != XATTR_CAPS_SZ_2)
632			return -EINVAL;
633		tocopy = VFS_CAP_U32_2;
634		break;
635	case VFS_CAP_REVISION_3:
636		if (size != XATTR_CAPS_SZ_3)
637			return -EINVAL;
638		tocopy = VFS_CAP_U32_3;
639		rootkuid = make_kuid(fs_ns, le32_to_cpu(nscaps->rootid));
640		break;
641
642	default:
643		return -EINVAL;
644	}
645	/* Limit the caps to the mounter of the filesystem
646	 * or the more limited uid specified in the xattr.
647	 */
648	if (!rootid_owns_currentns(rootkuid))
649		return -ENODATA;
650
651	CAP_FOR_EACH_U32(i) {
652		if (i >= tocopy)
653			break;
654		cpu_caps->permitted.cap[i] = le32_to_cpu(caps->data[i].permitted);
655		cpu_caps->inheritable.cap[i] = le32_to_cpu(caps->data[i].inheritable);
656	}
657
658	cpu_caps->permitted.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
659	cpu_caps->inheritable.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
660
661	cpu_caps->rootid = rootkuid;
662
663	return 0;
664}
665
666/*
667 * Attempt to get the on-exec apply capability sets for an executable file from
668 * its xattrs and, if present, apply them to the proposed credentials being
669 * constructed by execve().
670 */
671static int get_file_caps(struct linux_binprm *bprm, struct file *file,
672			 bool *effective, bool *has_fcap)
673{
674	int rc = 0;
675	struct cpu_vfs_cap_data vcaps;
676
677	cap_clear(bprm->cred->cap_permitted);
678
679	if (!file_caps_enabled)
680		return 0;
681
682	if (!mnt_may_suid(file->f_path.mnt))
683		return 0;
684
685	/*
686	 * This check is redundant with mnt_may_suid() but is kept to make
687	 * explicit that capability bits are limited to s_user_ns and its
688	 * descendants.
689	 */
690	if (!current_in_userns(file->f_path.mnt->mnt_sb->s_user_ns))
691		return 0;
692
693	rc = get_vfs_caps_from_disk(file->f_path.dentry, &vcaps);
694	if (rc < 0) {
695		if (rc == -EINVAL)
696			printk(KERN_NOTICE "Invalid argument reading file caps for %s\n",
697					bprm->filename);
698		else if (rc == -ENODATA)
699			rc = 0;
700		goto out;
701	}
702
703	rc = bprm_caps_from_vfs_caps(&vcaps, bprm, effective, has_fcap);
704
705out:
706	if (rc)
707		cap_clear(bprm->cred->cap_permitted);
708
709	return rc;
710}
711
712static inline bool root_privileged(void) { return !issecure(SECURE_NOROOT); }
713
714static inline bool __is_real(kuid_t uid, struct cred *cred)
715{ return uid_eq(cred->uid, uid); }
716
717static inline bool __is_eff(kuid_t uid, struct cred *cred)
718{ return uid_eq(cred->euid, uid); }
719
720static inline bool __is_suid(kuid_t uid, struct cred *cred)
721{ return !__is_real(uid, cred) && __is_eff(uid, cred); }
722
723/*
724 * handle_privileged_root - Handle case of privileged root
725 * @bprm: The execution parameters, including the proposed creds
726 * @has_fcap: Are any file capabilities set?
727 * @effective: Do we have effective root privilege?
728 * @root_uid: This namespace' root UID WRT initial USER namespace
729 *
730 * Handle the case where root is privileged and hasn't been neutered by
731 * SECURE_NOROOT.  If file capabilities are set, they won't be combined with
732 * set UID root and nothing is changed.  If we are root, cap_permitted is
733 * updated.  If we have become set UID root, the effective bit is set.
734 */
735static void handle_privileged_root(struct linux_binprm *bprm, bool has_fcap,
736				   bool *effective, kuid_t root_uid)
737{
738	const struct cred *old = current_cred();
739	struct cred *new = bprm->cred;
740
741	if (!root_privileged())
742		return;
743	/*
744	 * If the legacy file capability is set, then don't set privs
745	 * for a setuid root binary run by a non-root user.  Do set it
746	 * for a root user just to cause least surprise to an admin.
747	 */
748	if (has_fcap && __is_suid(root_uid, new)) {
749		warn_setuid_and_fcaps_mixed(bprm->filename);
750		return;
751	}
752	/*
753	 * To support inheritance of root-permissions and suid-root
754	 * executables under compatibility mode, we override the
755	 * capability sets for the file.
756	 */
757	if (__is_eff(root_uid, new) || __is_real(root_uid, new)) {
758		/* pP' = (cap_bset & ~0) | (pI & ~0) */
759		new->cap_permitted = cap_combine(old->cap_bset,
760						 old->cap_inheritable);
761	}
762	/*
763	 * If only the real uid is 0, we do not set the effective bit.
764	 */
765	if (__is_eff(root_uid, new))
766		*effective = true;
767}
768
/*
 * Capability-set comparison helpers.  @field, and the @target/@source of
 * __cap_grew(), name a cap_* member of struct cred (e.g. permitted,
 * effective, ambient); the remaining arguments are struct cred pointers.
 *
 * __cap_gained(field, target, source): target's @field set holds a
 *	capability that source's @field set lacks.
 * __cap_grew(target, source, cred): within one @cred, the @target set
 *	holds a capability that the @source set lacks.
 * __cap_full(field, cred): @cred's @field set contains every capability.
 *
 * Pointer arguments and the expansions are parenthesized so that any
 * pointer expression (e.g. "&cred") can be passed and the result can be
 * embedded in a larger expression safely.
 */
#define __cap_gained(field, target, source) \
	(!cap_issubset((target)->cap_##field, (source)->cap_##field))
#define __cap_grew(target, source, cred) \
	(!cap_issubset((cred)->cap_##target, (cred)->cap_##source))
#define __cap_full(field, cred) \
	(cap_issubset(CAP_FULL_SET, (cred)->cap_##field))
775
776static inline bool __is_setuid(struct cred *new, const struct cred *old)
777{ return !uid_eq(new->euid, old->uid); }
778
779static inline bool __is_setgid(struct cred *new, const struct cred *old)
780{ return !gid_eq(new->egid, old->gid); }
781
782/*
783 * 1) Audit candidate if current->cap_effective is set
784 *
785 * We do not bother to audit if 3 things are true:
786 *   1) cap_effective has all caps
787 *   2) we became root *OR* are were already root
788 *   3) root is supposed to have all caps (SECURE_NOROOT)
789 * Since this is just a normal root execing a process.
790 *
791 * Number 1 above might fail if you don't have a full bset, but I think
792 * that is interesting information to audit.
793 *
794 * A number of other conditions require logging:
795 * 2) something prevented setuid root getting all caps
796 * 3) non-setuid root gets fcaps
797 * 4) non-setuid root gets ambient
798 */
799static inline bool nonroot_raised_pE(struct cred *new, const struct cred *old,
800				     kuid_t root, bool has_fcap)
801{
802	bool ret = false;
803
804	if ((__cap_grew(effective, ambient, new) &&
805	     !(__cap_full(effective, new) &&
806	       (__is_eff(root, new) || __is_real(root, new)) &&
807	       root_privileged())) ||
808	    (root_privileged() &&
809	     __is_suid(root, new) &&
810	     !__cap_full(effective, new)) ||
811	    (!__is_setuid(new, old) &&
812	     ((has_fcap &&
813	       __cap_gained(permitted, new, old)) ||
814	      __cap_gained(ambient, new, old))))
815
816		ret = true;
817
818	return ret;
819}
820
/**
 * cap_bprm_creds_from_file - Set up the proposed credentials for execve().
 * @bprm: The execution parameters, including the proposed creds
 * @file: The file to pull the credentials from
 *
 * Set up the proposed credentials for a new execution context being
 * constructed by execve().  The proposed creds in @bprm->cred is altered,
 * which won't take effect immediately.
 *
 * In order, this: loads file capabilities into the proposed permitted set,
 * applies the privileged-root override, flags personality bits for clearing
 * when permitted capabilities were gained, downgrades an unsafe exec,
 * settles the saved/fs ids, computes the ambient, permitted and effective
 * sets, audits where warranted, clears SECURE_KEEP_CAPS and finally marks
 * the exec as secure when privilege was elevated.
 *
 * Returns 0 if successful, -EPERM if the ambient invariant is violated, or
 * the -ve error from reading file capabilities or from audit logging.
 */
int cap_bprm_creds_from_file(struct linux_binprm *bprm, struct file *file)
{
	/* Process setpcap binaries and capabilities for uid 0 */
	const struct cred *old = current_cred();
	struct cred *new = bprm->cred;
	bool effective = false, has_fcap = false, is_setid;
	int ret;
	kuid_t root_uid;

	if (WARN_ON(!cap_ambient_invariant_ok(old)))
		return -EPERM;

	/* Sets new->cap_permitted, and possibly effective / has_fcap. */
	ret = get_file_caps(bprm, file, &effective, &has_fcap);
	if (ret < 0)
		return ret;

	root_uid = make_kuid(new->user_ns, 0);

	/* May override new->cap_permitted and set effective for root. */
	handle_privileged_root(bprm, has_fcap, &effective, root_uid);

	/* if we have fs caps, clear dangerous personality flags */
	if (__cap_gained(permitted, new, old))
		bprm->per_clear |= PER_CLEAR_ON_SETID;

	/* Don't let someone trace a set[ug]id/setpcap binary with the revised
	 * credentials unless they have the appropriate permit.
	 *
	 * In addition, if NO_NEW_PRIVS, then ensure we get no new privs.
	 */
	is_setid = __is_setuid(new, old) || __is_setgid(new, old);

	/*
	 * Downgrade when ids or permitted caps would change and either some
	 * unsafe condition other than ptrace is present or the tracer is not
	 * capable in the new user namespace.
	 */
	if ((is_setid || __cap_gained(permitted, new, old)) &&
	    ((bprm->unsafe & ~LSM_UNSAFE_PTRACE) ||
	     !ptracer_capable(current, new->user_ns))) {
		/* downgrade; they get no more than they had, and maybe less */
		if (!ns_capable(new->user_ns, CAP_SETUID) ||
		    (bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS)) {
			/* drop the id change: effective ids fall back to real */
			new->euid = new->uid;
			new->egid = new->gid;
		}
		new->cap_permitted = cap_intersect(new->cap_permitted,
						   old->cap_permitted);
	}

	/* Saved and filesystem ids follow the (possibly reverted) effective ids. */
	new->suid = new->fsuid = new->euid;
	new->sgid = new->fsgid = new->egid;

	/* File caps or setid cancels ambient. */
	if (has_fcap || is_setid)
		cap_clear(new->cap_ambient);

	/*
	 * Now that we've computed pA', update pP' to give:
	 *   pP' = (X & fP) | (pI & fI) | pA'
	 */
	new->cap_permitted = cap_combine(new->cap_permitted, new->cap_ambient);

	/*
	 * Set pE' = (fE ? pP' : pA').  Because pA' is zero if fE is set,
	 * this is the same as pE' = (fE ? pP' : 0) | pA'.
	 */
	if (effective)
		new->cap_effective = new->cap_permitted;
	else
		new->cap_effective = new->cap_ambient;

	if (WARN_ON(!cap_ambient_invariant_ok(new)))
		return -EPERM;

	/* Leave an audit record when the capability outcome is noteworthy. */
	if (nonroot_raised_pE(new, old, root_uid, has_fcap)) {
		ret = audit_log_bprm_fcaps(bprm, new, old);
		if (ret < 0)
			return ret;
	}

	/* SECURE_KEEP_CAPS does not survive exec. */
	new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS);

	if (WARN_ON(!cap_ambient_invariant_ok(new)))
		return -EPERM;

	/* Check for privilege-elevated exec. */
	if (is_setid ||
	    (!__is_real(root_uid, new) &&
	     (effective ||
	      __cap_grew(permitted, ambient, new))))
		bprm->secureexec = 1;

	return 0;
}
919
920/**
921 * cap_inode_setxattr - Determine whether an xattr may be altered
922 * @dentry: The inode/dentry being altered
923 * @name: The name of the xattr to be changed
924 * @value: The value that the xattr will be changed to
925 * @size: The size of value
926 * @flags: The replacement flag
927 *
928 * Determine whether an xattr may be altered or set on an inode, returning 0 if
929 * permission is granted, -ve if denied.
930 *
931 * This is used to make sure security xattrs don't get updated or set by those
932 * who aren't privileged to do so.
933 */
934int cap_inode_setxattr(struct dentry *dentry, const char *name,
935		       const void *value, size_t size, int flags)
936{
937	struct user_namespace *user_ns = dentry->d_sb->s_user_ns;
938
939	/* Ignore non-security xattrs */
940	if (strncmp(name, XATTR_SECURITY_PREFIX,
941			XATTR_SECURITY_PREFIX_LEN) != 0)
942		return 0;
943
944	/*
945	 * For XATTR_NAME_CAPS the check will be done in
946	 * cap_convert_nscap(), called by setxattr()
947	 */
948	if (strcmp(name, XATTR_NAME_CAPS) == 0)
949		return 0;
950
951	if (!ns_capable(user_ns, CAP_SYS_ADMIN))
952		return -EPERM;
953	return 0;
954}
955
956/**
957 * cap_inode_removexattr - Determine whether an xattr may be removed
958 * @dentry: The inode/dentry being altered
959 * @name: The name of the xattr to be changed
960 *
961 * Determine whether an xattr may be removed from an inode, returning 0 if
962 * permission is granted, -ve if denied.
963 *
964 * This is used to make sure security xattrs don't get removed by those who
965 * aren't privileged to remove them.
966 */
967int cap_inode_removexattr(struct dentry *dentry, const char *name)
968{
969	struct user_namespace *user_ns = dentry->d_sb->s_user_ns;
970
971	/* Ignore non-security xattrs */
972	if (strncmp(name, XATTR_SECURITY_PREFIX,
973			XATTR_SECURITY_PREFIX_LEN) != 0)
974		return 0;
975
976	if (strcmp(name, XATTR_NAME_CAPS) == 0) {
977		/* security.capability gets namespaced */
978		struct inode *inode = d_backing_inode(dentry);
979		if (!inode)
980			return -EINVAL;
981		if (!capable_wrt_inode_uidgid(inode, CAP_SETFCAP))
982			return -EPERM;
983		return 0;
984	}
985
986	if (!ns_capable(user_ns, CAP_SYS_ADMIN))
987		return -EPERM;
988	return 0;
989}
990
991/*
992 * cap_emulate_setxuid() fixes the effective / permitted capabilities of
993 * a process after a call to setuid, setreuid, or setresuid.
994 *
995 *  1) When set*uiding _from_ one of {r,e,s}uid == 0 _to_ all of
996 *  {r,e,s}uid != 0, the permitted and effective capabilities are
997 *  cleared.
998 *
999 *  2) When set*uiding _from_ euid == 0 _to_ euid != 0, the effective
1000 *  capabilities of the process are cleared.
1001 *
1002 *  3) When set*uiding _from_ euid != 0 _to_ euid == 0, the effective
1003 *  capabilities are set to the permitted capabilities.
1004 *
1005 *  fsuid is handled elsewhere. fsuid == 0 and {r,e,s}uid!= 0 should
1006 *  never happen.
1007 *
1008 *  -astor
1009 *
1010 * cevans - New behaviour, Oct '99
1011 * A process may, via prctl(), elect to keep its capabilities when it
1012 * calls setuid() and switches away from uid==0. Both permitted and
1013 * effective sets will be retained.
1014 * Without this change, it was impossible for a daemon to drop only some
1015 * of its privilege. The call to setuid(!=0) would drop all privileges!
1016 * Keeping uid 0 is not an option because uid 0 owns too many vital
1017 * files..
1018 * Thanks to Olaf Kirch and Peter Benie for spotting this.
1019 */
1020static inline void cap_emulate_setxuid(struct cred *new, const struct cred *old)
1021{
1022	kuid_t root_uid = make_kuid(old->user_ns, 0);
1023
1024	if ((uid_eq(old->uid, root_uid) ||
1025	     uid_eq(old->euid, root_uid) ||
1026	     uid_eq(old->suid, root_uid)) &&
1027	    (!uid_eq(new->uid, root_uid) &&
1028	     !uid_eq(new->euid, root_uid) &&
1029	     !uid_eq(new->suid, root_uid))) {
1030		if (!issecure(SECURE_KEEP_CAPS)) {
1031			cap_clear(new->cap_permitted);
1032			cap_clear(new->cap_effective);
1033		}
1034
1035		/*
1036		 * Pre-ambient programs expect setresuid to nonroot followed
1037		 * by exec to drop capabilities.  We should make sure that
1038		 * this remains the case.
1039		 */
1040		cap_clear(new->cap_ambient);
1041	}
1042	if (uid_eq(old->euid, root_uid) && !uid_eq(new->euid, root_uid))
1043		cap_clear(new->cap_effective);
1044	if (!uid_eq(old->euid, root_uid) && uid_eq(new->euid, root_uid))
1045		new->cap_effective = new->cap_permitted;
1046}
1047
1048/**
1049 * cap_task_fix_setuid - Fix up the results of setuid() call
1050 * @new: The proposed credentials
1051 * @old: The current task's current credentials
1052 * @flags: Indications of what has changed
1053 *
1054 * Fix up the results of setuid() call before the credential changes are
1055 * actually applied, returning 0 to grant the changes, -ve to deny them.
1056 */
1057int cap_task_fix_setuid(struct cred *new, const struct cred *old, int flags)
1058{
1059	switch (flags) {
1060	case LSM_SETID_RE:
1061	case LSM_SETID_ID:
1062	case LSM_SETID_RES:
1063		/* juggle the capabilities to follow [RES]UID changes unless
1064		 * otherwise suppressed */
1065		if (!issecure(SECURE_NO_SETUID_FIXUP))
1066			cap_emulate_setxuid(new, old);
1067		break;
1068
1069	case LSM_SETID_FS:
1070		/* juggle the capabilties to follow FSUID changes, unless
1071		 * otherwise suppressed
1072		 *
1073		 * FIXME - is fsuser used for all CAP_FS_MASK capabilities?
1074		 *          if not, we might be a bit too harsh here.
1075		 */
1076		if (!issecure(SECURE_NO_SETUID_FIXUP)) {
1077			kuid_t root_uid = make_kuid(old->user_ns, 0);
1078			if (uid_eq(old->fsuid, root_uid) && !uid_eq(new->fsuid, root_uid))
1079				new->cap_effective =
1080					cap_drop_fs_set(new->cap_effective);
1081
1082			if (!uid_eq(old->fsuid, root_uid) && uid_eq(new->fsuid, root_uid))
1083				new->cap_effective =
1084					cap_raise_fs_set(new->cap_effective,
1085							 new->cap_permitted);
1086		}
1087		break;
1088
1089	default:
1090		return -EINVAL;
1091	}
1092
1093	return 0;
1094}
1095
1096/*
1097 * Rationale: code calling task_setscheduler, task_setioprio, and
1098 * task_setnice, assumes that
1099 *   . if capable(cap_sys_nice), then those actions should be allowed
1100 *   . if not capable(cap_sys_nice), but acting on your own processes,
1101 *   	then those actions should be allowed
1102 * This is insufficient now since you can call code without suid, but
1103 * yet with increased caps.
1104 * So we check for increased caps on the target process.
1105 */
1106static int cap_safe_nice(struct task_struct *p)
1107{
1108	int is_subset, ret = 0;
1109
1110	rcu_read_lock();
1111	is_subset = cap_issubset(__task_cred(p)->cap_permitted,
1112				 current_cred()->cap_permitted);
1113	if (!is_subset && !ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE))
1114		ret = -EPERM;
1115	rcu_read_unlock();
1116
1117	return ret;
1118}
1119
/**
 * cap_task_setscheduler - Determine if scheduler policy change is permitted
 * @p: The task to affect
 *
 * Decide whether the current task may change the scheduler policy of @p.
 * The decision is delegated to cap_safe_nice(); returns 0 if permission is
 * granted, -ve if denied.
 */
int cap_task_setscheduler(struct task_struct *p)
{
	return cap_safe_nice(p);
}
1131
/**
 * cap_task_setioprio - Determine if I/O priority change is permitted
 * @p: The task to affect
 * @ioprio: The I/O priority to set (not examined by this check)
 *
 * Determine if the requested I/O priority change is permitted for the
 * specified task, returning 0 if permission is granted, -ve if denied.
 * The decision is delegated to cap_safe_nice() and depends only on @p.
 */
int cap_task_setioprio(struct task_struct *p, int ioprio)
{
	return cap_safe_nice(p);
}
1144
/**
 * cap_task_setnice - Determine if task priority change is permitted
 * @p: The task to affect
 * @nice: The nice value to set (not examined by this check)
 *
 * Determine if the requested task priority change is permitted for the
 * specified task, returning 0 if permission is granted, -ve if denied.
 * The decision is delegated to cap_safe_nice() and depends only on @p.
 */
int cap_task_setnice(struct task_struct *p, int nice)
{
	return cap_safe_nice(p);
}
1157
1158/*
1159 * Implement PR_CAPBSET_DROP.  Attempt to remove the specified capability from
1160 * the current task's bounding set.  Returns 0 on success, -ve on error.
1161 */
1162static int cap_prctl_drop(unsigned long cap)
1163{
1164	struct cred *new;
1165
1166	if (!ns_capable(current_user_ns(), CAP_SETPCAP))
1167		return -EPERM;
1168	if (!cap_valid(cap))
1169		return -EINVAL;
1170
1171	new = prepare_creds();
1172	if (!new)
1173		return -ENOMEM;
1174	cap_lower(new->cap_bset, cap);
1175	return commit_creds(new);
1176}
1177
1178/**
1179 * cap_task_prctl - Implement process control functions for this security module
1180 * @option: The process control function requested
1181 * @arg2, @arg3, @arg4, @arg5: The argument data for this function
1182 *
1183 * Allow process control functions (sys_prctl()) to alter capabilities; may
1184 * also deny access to other functions not otherwise implemented here.
1185 *
1186 * Returns 0 or +ve on success, -ENOSYS if this function is not implemented
1187 * here, other -ve on error.  If -ENOSYS is returned, sys_prctl() and other LSM
1188 * modules will consider performing the function.
1189 */
1190int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3,
1191		   unsigned long arg4, unsigned long arg5)
1192{
1193	const struct cred *old = current_cred();
1194	struct cred *new;
1195
1196	switch (option) {
1197	case PR_CAPBSET_READ:
1198		if (!cap_valid(arg2))
1199			return -EINVAL;
1200		return !!cap_raised(old->cap_bset, arg2);
1201
1202	case PR_CAPBSET_DROP:
1203		return cap_prctl_drop(arg2);
1204
1205	/*
1206	 * The next four prctl's remain to assist with transitioning a
1207	 * system from legacy UID=0 based privilege (when filesystem
1208	 * capabilities are not in use) to a system using filesystem
1209	 * capabilities only - as the POSIX.1e draft intended.
1210	 *
1211	 * Note:
1212	 *
1213	 *  PR_SET_SECUREBITS =
1214	 *      issecure_mask(SECURE_KEEP_CAPS_LOCKED)
1215	 *    | issecure_mask(SECURE_NOROOT)
1216	 *    | issecure_mask(SECURE_NOROOT_LOCKED)
1217	 *    | issecure_mask(SECURE_NO_SETUID_FIXUP)
1218	 *    | issecure_mask(SECURE_NO_SETUID_FIXUP_LOCKED)
1219	 *
1220	 * will ensure that the current process and all of its
1221	 * children will be locked into a pure
1222	 * capability-based-privilege environment.
1223	 */
1224	case PR_SET_SECUREBITS:
1225		if ((((old->securebits & SECURE_ALL_LOCKS) >> 1)
1226		     & (old->securebits ^ arg2))			/*[1]*/
1227		    || ((old->securebits & SECURE_ALL_LOCKS & ~arg2))	/*[2]*/
1228		    || (arg2 & ~(SECURE_ALL_LOCKS | SECURE_ALL_BITS))	/*[3]*/
1229		    || (cap_capable(current_cred(),
1230				    current_cred()->user_ns,
1231				    CAP_SETPCAP,
1232				    CAP_OPT_NONE) != 0)			/*[4]*/
1233			/*
1234			 * [1] no changing of bits that are locked
1235			 * [2] no unlocking of locks
1236			 * [3] no setting of unsupported bits
1237			 * [4] doing anything requires privilege (go read about
1238			 *     the "sendmail capabilities bug")
1239			 */
1240		    )
1241			/* cannot change a locked bit */
1242			return -EPERM;
1243
1244		new = prepare_creds();
1245		if (!new)
1246			return -ENOMEM;
1247		new->securebits = arg2;
1248		return commit_creds(new);
1249
1250	case PR_GET_SECUREBITS:
1251		return old->securebits;
1252
1253	case PR_GET_KEEPCAPS:
1254		return !!issecure(SECURE_KEEP_CAPS);
1255
1256	case PR_SET_KEEPCAPS:
1257		if (arg2 > 1) /* Note, we rely on arg2 being unsigned here */
1258			return -EINVAL;
1259		if (issecure(SECURE_KEEP_CAPS_LOCKED))
1260			return -EPERM;
1261
1262		new = prepare_creds();
1263		if (!new)
1264			return -ENOMEM;
1265		if (arg2)
1266			new->securebits |= issecure_mask(SECURE_KEEP_CAPS);
1267		else
1268			new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS);
1269		return commit_creds(new);
1270
1271	case PR_CAP_AMBIENT:
1272		if (arg2 == PR_CAP_AMBIENT_CLEAR_ALL) {
1273			if (arg3 | arg4 | arg5)
1274				return -EINVAL;
1275
1276			new = prepare_creds();
1277			if (!new)
1278				return -ENOMEM;
1279			cap_clear(new->cap_ambient);
1280			return commit_creds(new);
1281		}
1282
1283		if (((!cap_valid(arg3)) | arg4 | arg5))
1284			return -EINVAL;
1285
1286		if (arg2 == PR_CAP_AMBIENT_IS_SET) {
1287			return !!cap_raised(current_cred()->cap_ambient, arg3);
1288		} else if (arg2 != PR_CAP_AMBIENT_RAISE &&
1289			   arg2 != PR_CAP_AMBIENT_LOWER) {
1290			return -EINVAL;
1291		} else {
1292			if (arg2 == PR_CAP_AMBIENT_RAISE &&
1293			    (!cap_raised(current_cred()->cap_permitted, arg3) ||
1294			     !cap_raised(current_cred()->cap_inheritable,
1295					 arg3) ||
1296			     issecure(SECURE_NO_CAP_AMBIENT_RAISE)))
1297				return -EPERM;
1298
1299			new = prepare_creds();
1300			if (!new)
1301				return -ENOMEM;
1302			if (arg2 == PR_CAP_AMBIENT_RAISE)
1303				cap_raise(new->cap_ambient, arg3);
1304			else
1305				cap_lower(new->cap_ambient, arg3);
1306			return commit_creds(new);
1307		}
1308
1309	default:
1310		/* No functionality available - continue with default */
1311		return -ENOSYS;
1312	}
1313}
1314
1315/**
1316 * cap_vm_enough_memory - Determine whether a new virtual mapping is permitted
1317 * @mm: The VM space in which the new mapping is to be made
1318 * @pages: The size of the mapping
1319 *
1320 * Determine whether the allocation of a new virtual mapping by the current
1321 * task is permitted, returning 1 if permission is granted, 0 if not.
1322 */
1323int cap_vm_enough_memory(struct mm_struct *mm, long pages)
1324{
1325	int cap_sys_admin = 0;
1326
1327	if (cap_capable(current_cred(), &init_user_ns,
1328				CAP_SYS_ADMIN, CAP_OPT_NOAUDIT) == 0)
1329		cap_sys_admin = 1;
1330
1331	return cap_sys_admin;
1332}
1333
1334/*
1335 * cap_mmap_addr - check if able to map given addr
1336 * @addr: address attempting to be mapped
1337 *
1338 * If the process is attempting to map memory below dac_mmap_min_addr they need
1339 * CAP_SYS_RAWIO.  The other parameters to this function are unused by the
1340 * capability security module.  Returns 0 if this mapping should be allowed
1341 * -EPERM if not.
1342 */
1343int cap_mmap_addr(unsigned long addr)
1344{
1345	int ret = 0;
1346
1347	if (addr < dac_mmap_min_addr) {
1348		ret = cap_capable(current_cred(), &init_user_ns, CAP_SYS_RAWIO,
1349				  CAP_OPT_NONE);
1350		/* set PF_SUPERPRIV if it turns out we allow the low mmap */
1351		if (ret == 0)
1352			current->flags |= PF_SUPERPRIV;
1353	}
1354	return ret;
1355}
1356
/*
 * cap_mmap_file - mmap_file hook of the capability module
 *
 * The capability module places no restriction on file mappings: none of the
 * arguments is examined and 0 is always returned.
 */
int cap_mmap_file(struct file *file, unsigned long reqprot,
		  unsigned long prot, unsigned long flags)
{
	(void)file;
	(void)reqprot;
	(void)prot;
	(void)flags;

	return 0;
}
1362
1363#ifdef CONFIG_SECURITY
1364
1365static struct security_hook_list capability_hooks[] __lsm_ro_after_init = {
1366	LSM_HOOK_INIT(capable, cap_capable),
1367	LSM_HOOK_INIT(settime, cap_settime),
1368	LSM_HOOK_INIT(ptrace_access_check, cap_ptrace_access_check),
1369	LSM_HOOK_INIT(ptrace_traceme, cap_ptrace_traceme),
1370	LSM_HOOK_INIT(capget, cap_capget),
1371	LSM_HOOK_INIT(capset, cap_capset),
1372	LSM_HOOK_INIT(bprm_creds_from_file, cap_bprm_creds_from_file),
1373	LSM_HOOK_INIT(inode_need_killpriv, cap_inode_need_killpriv),
1374	LSM_HOOK_INIT(inode_killpriv, cap_inode_killpriv),
1375	LSM_HOOK_INIT(inode_getsecurity, cap_inode_getsecurity),
1376	LSM_HOOK_INIT(mmap_addr, cap_mmap_addr),
1377	LSM_HOOK_INIT(mmap_file, cap_mmap_file),
1378	LSM_HOOK_INIT(task_fix_setuid, cap_task_fix_setuid),
1379	LSM_HOOK_INIT(task_prctl, cap_task_prctl),
1380	LSM_HOOK_INIT(task_setscheduler, cap_task_setscheduler),
1381	LSM_HOOK_INIT(task_setioprio, cap_task_setioprio),
1382	LSM_HOOK_INIT(task_setnice, cap_task_setnice),
1383	LSM_HOOK_INIT(vm_enough_memory, cap_vm_enough_memory),
1384};
1385
1386static int __init capability_init(void)
1387{
1388	security_add_hooks(capability_hooks, ARRAY_SIZE(capability_hooks),
1389				"capability");
1390	return 0;
1391}
1392
1393DEFINE_LSM(capability) = {
1394	.name = "capability",
1395	.order = LSM_ORDER_FIRST,
1396	.init = capability_init,
1397};
1398
1399#endif /* CONFIG_SECURITY */
1400