Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0-only
2 : /*
3 : * linux/fs/namespace.c
4 : *
5 : * (C) Copyright Al Viro 2000, 2001
6 : *
7 : * Based on code from fs/super.c, copyright Linus Torvalds and others.
8 : * Heavily rewritten.
9 : */
10 :
11 : #include <linux/syscalls.h>
12 : #include <linux/export.h>
13 : #include <linux/capability.h>
14 : #include <linux/mnt_namespace.h>
15 : #include <linux/user_namespace.h>
16 : #include <linux/namei.h>
17 : #include <linux/security.h>
18 : #include <linux/cred.h>
19 : #include <linux/idr.h>
20 : #include <linux/init.h> /* init_rootfs */
21 : #include <linux/fs_struct.h> /* get_fs_root et al. */
22 : #include <linux/fsnotify.h> /* fsnotify_vfsmount_delete */
23 : #include <linux/file.h>
24 : #include <linux/uaccess.h>
25 : #include <linux/proc_ns.h>
26 : #include <linux/magic.h>
27 : #include <linux/memblock.h>
28 : #include <linux/proc_fs.h>
29 : #include <linux/task_work.h>
30 : #include <linux/sched/task.h>
31 : #include <uapi/linux/mount.h>
32 : #include <linux/fs_context.h>
33 : #include <linux/shmem_fs.h>
34 : #include <linux/mnt_idmapping.h>
35 :
36 : #include "pnode.h"
37 : #include "internal.h"
38 :
39 : /* Maximum number of mounts in a mount namespace */
40 : static unsigned int sysctl_mount_max __read_mostly = 100000;
41 :
42 : static unsigned int m_hash_mask __read_mostly;
43 : static unsigned int m_hash_shift __read_mostly;
44 : static unsigned int mp_hash_mask __read_mostly;
45 : static unsigned int mp_hash_shift __read_mostly;
46 :
47 : static __initdata unsigned long mhash_entries;
48 0 : static int __init set_mhash_entries(char *str)
49 : {
50 0 : if (!str)
51 : return 0;
52 0 : mhash_entries = simple_strtoul(str, &str, 0);
53 0 : return 1;
54 : }
55 : __setup("mhash_entries=", set_mhash_entries);
56 :
57 : static __initdata unsigned long mphash_entries;
58 0 : static int __init set_mphash_entries(char *str)
59 : {
60 0 : if (!str)
61 : return 0;
62 0 : mphash_entries = simple_strtoul(str, &str, 0);
63 0 : return 1;
64 : }
65 : __setup("mphash_entries=", set_mphash_entries);
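
/*
 * Example (illustrative, not part of namespace.c): both hash table sizes
 * can be overridden from the kernel command line at boot, e.g.
 *
 *     mhash_entries=8192 mphash_entries=8192
 *
 * The __setup() handlers above parse the values, which are then used to
 * size mount_hashtable and mountpoint_hashtable during init.
 */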
66 :
67 : static u64 event;
68 : static DEFINE_IDA(mnt_id_ida);
69 : static DEFINE_IDA(mnt_group_ida);
70 :
71 : static struct hlist_head *mount_hashtable __read_mostly;
72 : static struct hlist_head *mountpoint_hashtable __read_mostly;
73 : static struct kmem_cache *mnt_cache __read_mostly;
74 : static DECLARE_RWSEM(namespace_sem);
75 : static HLIST_HEAD(unmounted); /* protected by namespace_sem */
76 : static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
77 :
78 : struct mount_kattr {
79 : unsigned int attr_set;
80 : unsigned int attr_clr;
81 : unsigned int propagation;
82 : unsigned int lookup_flags;
83 : bool recurse;
84 : struct user_namespace *mnt_userns;
85 : struct mnt_idmap *mnt_idmap;
86 : };
87 :
88 : /* /sys/fs */
89 : struct kobject *fs_kobj;
90 : EXPORT_SYMBOL_GPL(fs_kobj);
91 :
92 : /*
93 : * vfsmount lock may be taken for read to prevent changes to the
94 : * vfsmount hash, i.e. during mountpoint lookups or walking back
95 : * up the tree.
96 : *
97 : * It should be taken for write in all cases where the vfsmount
98 : * tree or hash is modified or when a vfsmount structure is modified.
99 : */
100 : __cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);
101 :
102 10199492 : static inline void lock_mount_hash(void)
103 : {
104 10199492 : write_seqlock(&mount_lock);
105 10199782 : }
106 :
107 10199782 : static inline void unlock_mount_hash(void)
108 : {
109 10199782 : write_sequnlock(&mount_lock);
110 10199461 : }
111 :
112 3561300107 : static inline struct hlist_head *m_hash(struct vfsmount *mnt, struct dentry *dentry)
113 : {
114 3561300107 : unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES);
115 3561300107 : tmp += ((unsigned long)dentry / L1_CACHE_BYTES);
116 3561300107 : tmp = tmp + (tmp >> m_hash_shift);
117 3561300107 : return &mount_hashtable[tmp & m_hash_mask];
118 : }
119 :
120 511072 : static inline struct hlist_head *mp_hash(struct dentry *dentry)
121 : {
122 511072 : unsigned long tmp = ((unsigned long)dentry / L1_CACHE_BYTES);
123 511072 : tmp = tmp + (tmp >> mp_hash_shift);
124 511072 : return &mountpoint_hashtable[tmp & mp_hash_mask];
125 : }
126 :
127 3216886 : static int mnt_alloc_id(struct mount *mnt)
128 : {
129 3216886 : int res = ida_alloc(&mnt_id_ida, GFP_KERNEL);
130 :
131 3216925 : if (res < 0)
132 : return res;
133 3216925 : mnt->mnt_id = res;
134 3216925 : return 0;
135 : }
136 :
137 : static void mnt_free_id(struct mount *mnt)
138 : {
139 3216913 : ida_free(&mnt_id_ida, mnt->mnt_id);
140 0 : }
141 :
142 : /*
143 : * Allocate a new peer group ID
144 : */
145 1383758 : static int mnt_alloc_group_id(struct mount *mnt)
146 : {
147 1383758 : int res = ida_alloc_min(&mnt_group_ida, 1, GFP_KERNEL);
148 :
149 1383758 : if (res < 0)
150 : return res;
151 1383758 : mnt->mnt_group_id = res;
152 1383758 : return 0;
153 : }
154 :
155 : /*
156 : * Release a peer group ID
157 : */
158 1383758 : void mnt_release_group_id(struct mount *mnt)
159 : {
160 1383758 : ida_free(&mnt_group_ida, mnt->mnt_group_id);
161 1383758 : mnt->mnt_group_id = 0;
162 0 : }
163 :
164 : /*
165 : * vfsmount lock must be held for read
166 : */
167 : static inline void mnt_add_count(struct mount *mnt, int n)
168 : {
169 : #ifdef CONFIG_SMP
170 46621121106 : this_cpu_add(mnt->mnt_pcp->mnt_count, n);
171 : #else
172 : preempt_disable();
173 : mnt->mnt_count += n;
174 : preempt_enable();
175 : #endif
176 3616706379 : }
177 :
178 : /*
179 : * vfsmount lock must be held for write
180 : */
181 7223207 : int mnt_get_count(struct mount *mnt)
182 : {
183 : #ifdef CONFIG_SMP
184 7223207 : int count = 0;
185 7223207 : int cpu;
186 :
187 36116035 : for_each_possible_cpu(cpu) {
188 28892828 : count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_count;
189 : }
190 :
191 7223207 : return count;
192 : #else
193 : return mnt->mnt_count;
194 : #endif
195 : }
196 :
197 3216661 : static struct mount *alloc_vfsmnt(const char *name)
198 : {
199 3216661 : struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
200 3216894 : if (mnt) {
201 3216894 : int err;
202 :
203 3216894 : err = mnt_alloc_id(mnt);
204 3216925 : if (err)
205 0 : goto out_free_cache;
206 :
207 3216925 : if (name) {
208 3216925 : mnt->mnt_devname = kstrdup_const(name,
209 : GFP_KERNEL_ACCOUNT);
210 3216860 : if (!mnt->mnt_devname)
211 0 : goto out_free_id;
212 : }
213 :
214 : #ifdef CONFIG_SMP
215 3216860 : mnt->mnt_pcp = alloc_percpu(struct mnt_pcp);
216 3216926 : if (!mnt->mnt_pcp)
217 0 : goto out_free_devname;
218 :
219 3216926 : this_cpu_add(mnt->mnt_pcp->mnt_count, 1);
220 : #else
221 : mnt->mnt_count = 1;
222 : mnt->mnt_writers = 0;
223 : #endif
224 :
225 3216925 : INIT_HLIST_NODE(&mnt->mnt_hash);
226 3216925 : INIT_LIST_HEAD(&mnt->mnt_child);
227 3216925 : INIT_LIST_HEAD(&mnt->mnt_mounts);
228 3216925 : INIT_LIST_HEAD(&mnt->mnt_list);
229 3216925 : INIT_LIST_HEAD(&mnt->mnt_expire);
230 3216925 : INIT_LIST_HEAD(&mnt->mnt_share);
231 3216925 : INIT_LIST_HEAD(&mnt->mnt_slave_list);
232 3216925 : INIT_LIST_HEAD(&mnt->mnt_slave);
233 3216925 : INIT_HLIST_NODE(&mnt->mnt_mp_list);
234 3216925 : INIT_LIST_HEAD(&mnt->mnt_umounting);
235 3216925 : INIT_HLIST_HEAD(&mnt->mnt_stuck_children);
236 3216925 : mnt->mnt.mnt_idmap = &nop_mnt_idmap;
237 : }
238 : return mnt;
239 :
240 : #ifdef CONFIG_SMP
241 : out_free_devname:
242 0 : kfree_const(mnt->mnt_devname);
243 : #endif
244 0 : out_free_id:
245 0 : mnt_free_id(mnt);
246 0 : out_free_cache:
247 0 : kmem_cache_free(mnt_cache, mnt);
248 0 : return NULL;
249 : }
250 :
251 : /*
252 : * Most r/o checks on a fs are for operations that take
253 : * discrete amounts of time, like a write() or unlink().
254 : * We must keep track of when those operations start
255 : * (for permission checks) and when they end, so that
256 : * we can determine when writes are able to occur to
257 : * a filesystem.
258 : */
259 : /*
260 : * __mnt_is_readonly: check whether a mount is read-only
261 : * @mnt: the mount to check for its write status
262 : *
263 : * This shouldn't be used directly outside of the VFS.
264 : * It does not guarantee that the filesystem will stay
265 : * r/w, just that it is r/w right *now*. This cannot and
266 : * should not be used in place of IS_RDONLY(inode).
267 : * mnt_want/drop_write() will _keep_ the filesystem
268 : * r/w.
269 : */
270 677033976 : bool __mnt_is_readonly(struct vfsmount *mnt)
271 : {
272 2910828524 : return (mnt->mnt_flags & MNT_READONLY) || sb_rdonly(mnt->mnt_sb);
273 : }
274 : EXPORT_SYMBOL_GPL(__mnt_is_readonly);
275 :
276 : static inline void mnt_inc_writers(struct mount *mnt)
277 : {
278 : #ifdef CONFIG_SMP
279 3664186073 : this_cpu_inc(mnt->mnt_pcp->mnt_writers);
280 : #else
281 : mnt->mnt_writers++;
282 : #endif
283 : }
284 :
285 : static inline void mnt_dec_writers(struct mount *mnt)
286 : {
287 : #ifdef CONFIG_SMP
288 3662922026 : this_cpu_dec(mnt->mnt_pcp->mnt_writers);
289 : #else
290 : mnt->mnt_writers--;
291 : #endif
292 : }
293 :
294 3226127 : static unsigned int mnt_get_writers(struct mount *mnt)
295 : {
296 : #ifdef CONFIG_SMP
297 3226127 : unsigned int count = 0;
298 3226127 : int cpu;
299 :
300 16130438 : for_each_possible_cpu(cpu) {
301 12904333 : count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_writers;
302 : }
303 :
304 3226115 : return count;
305 : #else
306 : return mnt->mnt_writers;
307 : #endif
308 : }
309 :
310 1833228178 : static int mnt_is_readonly(struct vfsmount *mnt)
311 : {
312 1833228178 : if (READ_ONCE(mnt->mnt_sb->s_readonly_remount))
313 : return 1;
314 : /*
315 : * The barrier pairs with the barrier in sb_start_ro_state_change()
316 : * making sure if we don't see s_readonly_remount set yet, we also will
317 : * not see any superblock / mount flag changes done by remount.
318 : * It also pairs with the barrier in sb_end_ro_state_change()
319 : * assuring that if we see s_readonly_remount already cleared, we will
320 : * see the values of superblock / mount flags updated by remount.
321 : */
322 1833226213 : smp_rmb();
323 3666485285 : return __mnt_is_readonly(mnt);
324 : }
325 :
326 : /*
327 : * Most r/o & frozen checks on a fs are for operations that take discrete
328 : * amounts of time, like a write() or unlink(). We must keep track of when
329 : * those operations start (for permission checks) and when they end, so that we
330 : * can determine when writes are able to occur to a filesystem.
331 : */
332 : /**
333 : * __mnt_want_write - get write access to a mount without freeze protection
334 : * @m: the mount on which to take a write
335 : *
336 : * This tells the low-level filesystem that a write is about to be performed to
337 : * it, and makes sure that writes are allowed (the mount is read-write) before
338 : * returning success. This operation does not protect against the filesystem being
339 : * frozen. When the write operation is finished, __mnt_drop_write() must be
340 : * called. This is effectively a refcount.
341 : */
342 1832464006 : int __mnt_want_write(struct vfsmount *m)
343 : {
344 1832464006 : struct mount *mnt = real_mount(m);
345 1832464006 : int ret = 0;
346 :
347 1832464006 : preempt_disable();
348 1831915140 : mnt_inc_writers(mnt);
349 : /*
350 : * The store from mnt_inc_writers() must be visible before we enter the
351 : * MNT_WRITE_HOLD loop below, so that the slowpath can see our
352 : * incremented count after it has set MNT_WRITE_HOLD.
353 : */
354 1832270933 : smp_mb();
355 1832270933 : might_lock(&mount_lock.lock);
356 1832270933 : while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) {
357 1246 : if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
358 1246 : cpu_relax();
359 : } else {
360 : /*
361 : * This prevents priority inversion, if the task
362 : * setting MNT_WRITE_HOLD got preempted on a remote
363 : * CPU, and it prevents livelock if the task setting
364 : * MNT_WRITE_HOLD has a lower priority and is bound to
365 : * the same CPU as the task that is spinning here.
366 : */
367 : preempt_enable();
368 : lock_mount_hash();
369 : unlock_mount_hash();
370 1834232893 : preempt_disable();
371 : }
372 : }
373 : /*
374 : * The barrier pairs with the barrier sb_start_ro_state_change() making
375 : * sure that if we see MNT_WRITE_HOLD cleared, we will also see
376 : * s_readonly_remount set (or even SB_RDONLY / MNT_READONLY flags) in
377 : * mnt_is_readonly() and bail in case we are racing with remount
378 : * read-only.
379 : */
380 1834245885 : smp_rmb();
381 1834201783 : if (mnt_is_readonly(m)) {
382 231535 : mnt_dec_writers(mnt);
383 231535 : ret = -EROFS;
384 : }
385 1832594572 : preempt_enable();
386 :
387 1833464525 : return ret;
388 : }
389 :
390 : /**
391 : * mnt_want_write - get write access to a mount
392 : * @m: the mount on which to take a write
393 : *
394 : * This tells the low-level filesystem that a write is about to be performed to
395 : * it, and makes sure that writes are allowed (mount is read-write, filesystem
396 : * is not frozen) before returning success. When the write operation is
397 : * finished, mnt_drop_write() must be called. This is effectively a refcount.
398 : */
399 1189170865 : int mnt_want_write(struct vfsmount *m)
400 : {
401 1189170865 : int ret;
402 :
403 1189170865 : sb_start_write(m->mnt_sb);
404 1188979457 : ret = __mnt_want_write(m);
405 1190129869 : if (ret)
406 136748 : sb_end_write(m->mnt_sb);
407 1190129868 : return ret;
408 : }
409 : EXPORT_SYMBOL_GPL(mnt_want_write);
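
/*
 * Example (illustrative sketch, not part of namespace.c): a typical write
 * path brackets its modification with mnt_want_write()/mnt_drop_write().
 * example_modify() and do_modify() are hypothetical stand-ins.
 */
static int example_modify(struct vfsmount *m)
{
	int err = mnt_want_write(m);	/* freeze protection + r/w check */

	if (err)
		return err;		/* e.g. -EROFS on a read-only mount */
	err = do_modify(m);		/* the actual write */
	mnt_drop_write(m);		/* must pair with mnt_want_write() */
	return err;
}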
410 :
411 : /**
412 : * __mnt_want_write_file - get write access to a file's mount
413 : * @file: the file whose mount to take a write on
414 : *
415 : * This is like __mnt_want_write, but if the file is already open for writing it
416 : * skips incrementing mnt_writers (since the open file already has a reference)
417 : * and instead only does the check for emergency r/o remounts. This must be
418 : * paired with __mnt_drop_write_file.
419 : */
420 495116348 : int __mnt_want_write_file(struct file *file)
421 : {
422 495116348 : if (file->f_mode & FMODE_WRITER) {
423 : /*
424 : * Superblock may have become readonly while there are still
425 : * writable fd's, e.g. due to a fs error with errors=remount-ro
426 : */
427 800427760 : if (__mnt_is_readonly(file->f_path.mnt))
428 : return -EROFS;
429 400209149 : return 0;
430 : }
431 94907199 : return __mnt_want_write(file->f_path.mnt);
432 : }
433 :
434 : /**
435 : * mnt_want_write_file - get write access to a file's mount
436 : * @file: the file whose mount to take a write on
437 : *
438 : * This is like mnt_want_write, but if the file is already open for writing it
439 : * skips incrementing mnt_writers (since the open file already has a reference)
440 : * and instead only does the freeze protection and the check for emergency r/o
441 : * remounts. This must be paired with mnt_drop_write_file.
442 : */
443 420807666 : int mnt_want_write_file(struct file *file)
444 : {
445 420807666 : int ret;
446 :
447 420807666 : sb_start_write(file_inode(file)->i_sb);
448 420898374 : ret = __mnt_want_write_file(file);
449 420813844 : if (ret)
450 396 : sb_end_write(file_inode(file)->i_sb);
451 420813844 : return ret;
452 : }
453 : EXPORT_SYMBOL_GPL(mnt_want_write_file);
454 :
455 : /**
456 : * __mnt_drop_write - give up write access to a mount
457 : * @mnt: the mount on which to give up write access
458 : *
459 : * Tells the low-level filesystem that we are done
460 : * performing writes to it. Must be matched with
461 : * __mnt_want_write() call above.
462 : */
463 1831420169 : void __mnt_drop_write(struct vfsmount *mnt)
464 : {
465 1831420169 : preempt_disable();
466 1830966226 : mnt_dec_writers(real_mount(mnt));
467 1831724265 : preempt_enable();
468 1831765151 : }
469 :
470 : /**
471 : * mnt_drop_write - give up write access to a mount
472 : * @mnt: the mount on which to give up write access
473 : *
474 : * Tells the low-level filesystem that we are done performing writes to it and
475 : * also allows filesystem to be frozen again. Must be matched with
476 : * mnt_want_write() call above.
477 : */
478 1189027092 : void mnt_drop_write(struct vfsmount *mnt)
479 : {
480 1189027092 : __mnt_drop_write(mnt);
481 1188970095 : sb_end_write(mnt->mnt_sb);
482 1188893184 : }
483 : EXPORT_SYMBOL_GPL(mnt_drop_write);
484 :
485 494802033 : void __mnt_drop_write_file(struct file *file)
486 : {
487 494802033 : if (!(file->f_mode & FMODE_WRITER))
488 94560583 : __mnt_drop_write(file->f_path.mnt);
489 494827033 : }
490 :
491 420557163 : void mnt_drop_write_file(struct file *file)
492 : {
493 420557163 : __mnt_drop_write_file(file);
494 420503978 : sb_end_write(file_inode(file)->i_sb);
495 420710188 : }
496 : EXPORT_SYMBOL(mnt_drop_write_file);
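
/*
 * Example (illustrative sketch, not part of namespace.c): the _file
 * variants follow the same pairing, keyed off an open struct file so the
 * FMODE_WRITER fast path can be taken. do_update() is hypothetical.
 */
static int example_update(struct file *file)
{
	int err = mnt_want_write_file(file);

	if (err)
		return err;
	err = do_update(file);		/* the actual write */
	mnt_drop_write_file(file);	/* pairs with mnt_want_write_file() */
	return err;
}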
497 :
498 : /**
499 : * mnt_hold_writers - prevent write access to the given mount
500 : * @mnt: mnt to prevent write access to
501 : *
502 : * Prevents write access to @mnt if there are no active writers for @mnt.
503 : * This function needs to be called and return successfully before changing
504 : * properties of @mnt that need to remain stable for callers with write access
505 : * to @mnt.
506 : *
507 : * After this function has been called successfully, callers must pair it with
508 : * a call to mnt_unhold_writers() in order to stop preventing write access to
509 : * @mnt.
510 : *
511 : * Context: This function expects lock_mount_hash() to be held serializing
512 : * setting MNT_WRITE_HOLD.
513 : * Return: On success 0 is returned.
514 : * On error, -EBUSY is returned.
515 : */
516 9242 : static inline int mnt_hold_writers(struct mount *mnt)
517 : {
518 9242 : mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
519 : /*
520 : * After storing MNT_WRITE_HOLD, we'll read the counters. This store
521 : * should be visible before we do.
522 : */
523 9242 : smp_mb();
524 :
525 : /*
526 : * With writers on hold, if this value is zero, then there are
527 : * definitely no active writers (although held writers may subsequently
528 : * increment the count, they'll have to wait, and decrement it after
529 : * seeing MNT_READONLY).
530 : *
531 : * It is OK to have counter incremented on one CPU and decremented on
532 : * another: the sum will add up correctly. The danger would be when we
533 : * sum up each counter, if we read a counter before it is incremented,
534 : * but then read another CPU's count which it has been subsequently
535 : * decremented from -- we would see more decrements than we should.
536 : * MNT_WRITE_HOLD protects against this scenario, because
537 : * mnt_want_write first increments count, then smp_mb, then spins on
538 : * MNT_WRITE_HOLD, so it can't be decremented by another CPU while
539 : * we're counting up here.
540 : */
541 9242 : if (mnt_get_writers(mnt) > 0)
542 360 : return -EBUSY;
543 :
544 : return 0;
545 : }
546 :
547 : /**
548 : * mnt_unhold_writers - stop preventing write access to the given mount
549 : * @mnt: mnt to stop preventing write access to
550 : *
551 : * Stop preventing write access to @mnt allowing callers to gain write access
552 : * to @mnt again.
553 : *
554 : * This function can only be called after a successful call to
555 : * mnt_hold_writers().
556 : *
557 : * Context: This function expects lock_mount_hash() to be held.
558 : */
559 : static inline void mnt_unhold_writers(struct mount *mnt)
560 : {
561 : /*
562 : * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
563 : * that become unheld will see MNT_READONLY.
564 : */
565 6412 : smp_wmb();
566 3206 : mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
567 1254 : }
568 :
569 1952 : static int mnt_make_readonly(struct mount *mnt)
570 : {
571 1952 : int ret;
572 :
573 1952 : ret = mnt_hold_writers(mnt);
574 1952 : if (!ret)
575 1952 : mnt->mnt.mnt_flags |= MNT_READONLY;
576 1952 : mnt_unhold_writers(mnt);
577 1952 : return ret;
578 : }
579 :
580 7433 : int sb_prepare_remount_readonly(struct super_block *sb)
581 : {
582 7433 : struct mount *mnt;
583 7433 : int err = 0;
584 :
585 : /* Racy optimization. Recheck the counter under MNT_WRITE_HOLD */
586 7433 : if (atomic_long_read(&sb->s_remove_count))
587 : return -EBUSY;
588 :
589 876 : lock_mount_hash();
590 6552 : list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
591 6036 : if (!(mnt->mnt.mnt_flags & MNT_READONLY)) {
592 6036 : err = mnt_hold_writers(mnt);
593 6036 : if (err)
594 : break;
595 : }
596 : }
597 876 : if (!err && atomic_long_read(&sb->s_remove_count))
598 : err = -EBUSY;
599 :
600 876 : if (!err)
601 516 : sb_start_ro_state_change(sb);
602 10512 : list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
603 9636 : if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD)
604 6036 : mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
605 : }
606 876 : unlock_mount_hash();
607 :
608 876 : return err;
609 : }
610 :
611 3215230 : static void free_vfsmnt(struct mount *mnt)
612 : {
613 3215230 : mnt_idmap_put(mnt_idmap(&mnt->mnt));
614 3214738 : kfree_const(mnt->mnt_devname);
615 : #ifdef CONFIG_SMP
616 3214630 : free_percpu(mnt->mnt_pcp);
617 : #endif
618 3216920 : kmem_cache_free(mnt_cache, mnt);
619 3216453 : }
620 :
621 3215274 : static void delayed_free_vfsmnt(struct rcu_head *head)
622 : {
623 3215274 : free_vfsmnt(container_of(head, struct mount, mnt_rcu));
624 3216449 : }
625 :
626 : /* call under rcu_read_lock */
627 8936330190 : int __legitimize_mnt(struct vfsmount *bastard, unsigned seq)
628 : {
629 8936330190 : struct mount *mnt;
630 8936330190 : if (read_seqretry(&mount_lock, seq))
631 : return 1;
632 8935070960 : if (bastard == NULL)
633 : return 0;
634 8934718697 : mnt = real_mount(bastard);
635 8934718697 : mnt_add_count(mnt, 1);
636 8937936128 : smp_mb(); // see mntput_no_expire()
637 8948673978 : if (likely(!read_seqretry(&mount_lock, seq)))
638 : return 0;
639 4050 : if (bastard->mnt_flags & MNT_SYNC_UMOUNT) {
640 0 : mnt_add_count(mnt, -1);
641 0 : return 1;
642 : }
643 4050 : lock_mount_hash();
644 4075 : if (unlikely(bastard->mnt_flags & MNT_DOOMED)) {
645 0 : mnt_add_count(mnt, -1);
646 0 : unlock_mount_hash();
647 0 : return 1;
648 : }
649 4075 : unlock_mount_hash();
650 : /* caller will mntput() */
651 4075 : return -1;
652 : }
653 :
654 : /* call under rcu_read_lock */
655 60846795 : static bool legitimize_mnt(struct vfsmount *bastard, unsigned seq)
656 : {
657 60846795 : int res = __legitimize_mnt(bastard, seq);
658 60901848 : if (likely(!res))
659 : return true;
660 3171 : if (unlikely(res < 0)) {
661 1160 : rcu_read_unlock();
662 1160 : mntput(bastard);
663 1156 : rcu_read_lock();
664 : }
665 : return false;
666 : }
667 :
668 : /**
669 : * __lookup_mnt - find first child mount
670 : * @mnt: parent mount
671 : * @dentry: mountpoint
672 : *
673 : * If @mnt has a child mount @c mounted at @dentry, find and return it.
674 : *
675 : * Note that the child mount @c need not be unique. There are cases
676 : * where shadow mounts are created. For example, during mount
677 : * propagation when a source mount @mnt whose root got overmounted by a
678 : * mount @o after path lookup but before @namespace_sem could be
679 : * acquired gets copied and propagated. So @mnt gets copied including
680 : * @o. When @mnt is propagated to a destination mount @d that already
681 : * has another mount @n mounted at the same mountpoint then the source
682 : * mount @mnt will be tucked beneath @n, i.e., @n will be mounted on
683 : * @mnt and @mnt mounted on @d. Now both @n and @o are mounted at @mnt
684 : * on @dentry.
685 : *
686 : * Return: The first child of @mnt mounted at @dentry, or NULL.
687 : */
688 3559848682 : struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
689 : {
690 3559848682 : struct hlist_head *head = m_hash(mnt, dentry);
691 3561502607 : struct mount *p;
692 :
693 7260253262 : hlist_for_each_entry_rcu(p, head, mnt_hash)
694 2829393054 : if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry)
695 2692145006 : return p;
696 : return NULL;
697 : }
698 :
699 : /*
700 : * lookup_mnt - Return the first child mount mounted at path
701 : *
702 : * "First" means first mounted chronologically. If you create the
703 : * following mounts:
704 : *
705 : * mount /dev/sda1 /mnt
706 : * mount /dev/sda2 /mnt
707 : * mount /dev/sda3 /mnt
708 : *
709 : * Then lookup_mnt() on the base /mnt dentry in the root mount will
710 : * return successively the root dentry and vfsmount of /dev/sda1, then
711 : * /dev/sda2, then /dev/sda3, then NULL.
712 : *
713 : * lookup_mnt takes a reference to the found vfsmount.
714 : */
715 60853701 : struct vfsmount *lookup_mnt(const struct path *path)
716 : {
717 60853701 : struct mount *child_mnt;
718 60853701 : struct vfsmount *m;
719 60853701 : unsigned seq;
720 :
721 60853701 : rcu_read_lock();
722 60855502 : do {
723 60855502 : seq = read_seqbegin(&mount_lock);
724 60858844 : child_mnt = __lookup_mnt(path->mnt, path->dentry);
725 60844385 : m = child_mnt ? &child_mnt->mnt : NULL;
726 60844385 : } while (!legitimize_mnt(m, seq));
727 60896454 : rcu_read_unlock();
728 60893330 : return m;
729 : }
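
/*
 * Example (illustrative sketch, not part of namespace.c): lookup_mnt()
 * returns a referenced vfsmount (or NULL), so any non-NULL result must
 * eventually be dropped with mntput().
 */
static bool example_is_mounted_here(const struct path *path)
{
	struct vfsmount *m = lookup_mnt(path);	/* takes a reference */

	if (!m)
		return false;
	/* ... inspect m ... */
	mntput(m);				/* drop the reference */
	return true;
}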
730 :
731 : static inline void lock_ns_list(struct mnt_namespace *ns)
732 : {
733 908799123 : spin_lock(&ns->ns_lock);
734 : }
735 :
736 : static inline void unlock_ns_list(struct mnt_namespace *ns)
737 : {
738 910232718 : spin_unlock(&ns->ns_lock);
739 : }
740 :
741 : static inline bool mnt_is_cursor(struct mount *mnt)
742 : {
743 818852413 : return mnt->mnt.mnt_flags & MNT_CURSOR;
744 : }
745 :
746 : /*
747 : * __is_local_mountpoint - Test to see if dentry is a mountpoint in the
748 : * current mount namespace.
749 : *
750 : * The common case is dentries are not mountpoints at all and that
751 : * test is handled inline. For the slow case when we are actually
752 : * dealing with a mountpoint of some kind, walk through all of the
753 : * mounts in the current mount namespace and test to see if the dentry
754 : * is a mountpoint.
755 : *
756 : * The mount_hashtable is not usable in this context because we
757 : * need to identify all mounts that may be in the current mount
758 : * namespace not just a mount that happens to have some specified
759 : * parent mount.
760 : */
761 24 : bool __is_local_mountpoint(struct dentry *dentry)
762 : {
763 24 : struct mnt_namespace *ns = current->nsproxy->mnt_ns;
764 24 : struct mount *mnt;
765 24 : bool is_covered = false;
766 :
767 24 : down_read(&namespace_sem);
768 24 : lock_ns_list(ns);
769 2264 : list_for_each_entry(mnt, &ns->list, mnt_list) {
770 2264 : if (mnt_is_cursor(mnt))
771 0 : continue;
772 2264 : is_covered = (mnt->mnt_mountpoint == dentry);
773 2264 : if (is_covered)
774 : break;
775 : }
776 24 : unlock_ns_list(ns);
777 24 : up_read(&namespace_sem);
778 :
779 24 : return is_covered;
780 : }
781 :
782 132456 : static struct mountpoint *lookup_mountpoint(struct dentry *dentry)
783 : {
784 132456 : struct hlist_head *chain = mp_hash(dentry);
785 132456 : struct mountpoint *mp;
786 :
787 264912 : hlist_for_each_entry(mp, chain, m_hash) {
788 132456 : if (mp->m_dentry == dentry) {
789 132456 : mp->m_count++;
790 132456 : return mp;
791 : }
792 : }
793 : return NULL;
794 : }
795 :
796 511072 : static struct mountpoint *get_mountpoint(struct dentry *dentry)
797 : {
798 511072 : struct mountpoint *mp, *new = NULL;
799 511072 : int ret;
800 :
801 511072 : if (d_mountpoint(dentry)) {
802 : /* might be worth a WARN_ON() */
803 132456 : if (d_unlinked(dentry))
804 : return ERR_PTR(-ENOENT);
805 132456 : mountpoint:
806 132456 : read_seqlock_excl(&mount_lock);
807 132456 : mp = lookup_mountpoint(dentry);
808 132456 : read_sequnlock_excl(&mount_lock);
809 132456 : if (mp)
810 132456 : goto done;
811 : }
812 :
813 0 : if (!new)
814 378616 : new = kmalloc(sizeof(struct mountpoint), GFP_KERNEL);
815 378616 : if (!new)
816 : return ERR_PTR(-ENOMEM);
817 :
818 :
819 : /* Exactly one process may set d_mounted */
820 378616 : ret = d_set_mounted(dentry);
821 :
822 : /* Someone else set d_mounted? */
823 378616 : if (ret == -EBUSY)
824 0 : goto mountpoint;
825 :
826 : /* The dentry is not available as a mountpoint? */
827 378616 : mp = ERR_PTR(ret);
828 378616 : if (ret)
829 0 : goto done;
830 :
831 : /* Add the new mountpoint to the hash table */
832 378616 : read_seqlock_excl(&mount_lock);
833 378616 : new->m_dentry = dget(dentry);
834 378616 : new->m_count = 1;
835 378616 : hlist_add_head(&new->m_hash, mp_hash(dentry));
836 378616 : INIT_HLIST_HEAD(&new->m_list);
837 378616 : read_sequnlock_excl(&mount_lock);
838 :
839 378616 : mp = new;
840 378616 : new = NULL;
841 511072 : done:
842 511072 : kfree(new);
843 511072 : return mp;
844 : }
845 :
846 : /*
847 : * vfsmount lock must be held. Additionally, the caller is responsible
848 : * for serializing calls for a given disposal list.
849 : */
850 3565428 : static void __put_mountpoint(struct mountpoint *mp, struct list_head *list)
851 : {
852 3565428 : if (!--mp->m_count) {
853 378616 : struct dentry *dentry = mp->m_dentry;
854 378616 : BUG_ON(!hlist_empty(&mp->m_list));
855 378616 : spin_lock(&dentry->d_lock);
856 378616 : dentry->d_flags &= ~DCACHE_MOUNTED;
857 378616 : spin_unlock(&dentry->d_lock);
858 378616 : dput_to_list(dentry, list);
859 378616 : hlist_del(&mp->m_hash);
860 378616 : kfree(mp);
861 : }
862 3565428 : }
863 :
864 : /* called with namespace_lock and vfsmount lock */
865 : static void put_mountpoint(struct mountpoint *mp)
866 : {
867 2928017 : __put_mountpoint(mp, &ex_mountpoints);
868 318 : }
869 :
870 : static inline int check_mnt(struct mount *mnt)
871 : {
872 64831993 : return mnt->mnt_ns == current->nsproxy->mnt_ns;
873 : }
874 :
875 : /*
876 : * vfsmount lock must be held for write
877 : */
878 2913206 : static void touch_mnt_namespace(struct mnt_namespace *ns)
879 : {
880 2913206 : if (ns) {
881 2913206 : ns->event = ++event;
882 2913206 : wake_up_interruptible(&ns->poll);
883 : }
884 2913206 : }
885 :
886 : /*
887 : * vfsmount lock must be held for write
888 : */
889 3059877 : static void __touch_mnt_namespace(struct mnt_namespace *ns)
890 : {
891 3059877 : if (ns && ns->event != event) {
892 1643103 : ns->event = event;
893 1643103 : wake_up_interruptible(&ns->poll);
894 : }
895 3059877 : }
896 :
897 : /*
898 : * vfsmount lock must be held for write
899 : */
900 3054356 : static struct mountpoint *unhash_mnt(struct mount *mnt)
901 : {
902 3054356 : struct mountpoint *mp;
903 3054356 : mnt->mnt_parent = mnt;
904 3054356 : mnt->mnt_mountpoint = mnt->mnt.mnt_root;
905 3054356 : list_del_init(&mnt->mnt_child);
906 3054356 : hlist_del_init_rcu(&mnt->mnt_hash);
907 3054356 : hlist_del_init(&mnt->mnt_mp_list);
908 3054356 : mp = mnt->mnt_mp;
909 3054356 : mnt->mnt_mp = NULL;
910 3054356 : return mp;
911 : }
912 :
913 : /*
914 : * vfsmount lock must be held for write
915 : */
916 2928017 : static void umount_mnt(struct mount *mnt)
917 : {
918 2928017 : put_mountpoint(unhash_mnt(mnt));
919 2928017 : }
920 :
921 : /*
922 : * vfsmount lock must be held for write
923 : */
924 3054356 : void mnt_set_mountpoint(struct mount *mnt,
925 : struct mountpoint *mp,
926 : struct mount *child_mnt)
927 : {
928 3054356 : mp->m_count++;
929 3054356 : mnt_add_count(mnt, 1); /* essentially, that's mntget */
930 3054356 : child_mnt->mnt_mountpoint = mp->m_dentry;
931 3054356 : child_mnt->mnt_parent = mnt;
932 3054356 : child_mnt->mnt_mp = mp;
933 3054356 : hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list);
934 3054356 : }
935 :
936 : /**
937 : * mnt_set_mountpoint_beneath - mount a mount beneath another one
938 : *
939 : * @new_parent: the source mount
940 : * @top_mnt: the mount beneath which @new_parent is mounted
941 : * @new_mp: the new mountpoint of @top_mnt on @new_parent
942 : *
943 : * Remove @top_mnt from its current mountpoint @top_mnt->mnt_mp and
944 : * parent @top_mnt->mnt_parent and mount it on top of @new_parent at
945 : * @new_mp. And mount @new_parent on the old parent and old
946 : * mountpoint of @top_mnt.
947 : *
948 : * Context: This function expects namespace_lock() and lock_mount_hash()
949 : * to have been acquired in that order.
950 : */
951 0 : static void mnt_set_mountpoint_beneath(struct mount *new_parent,
952 : struct mount *top_mnt,
953 : struct mountpoint *new_mp)
954 : {
955 0 : struct mount *old_top_parent = top_mnt->mnt_parent;
956 0 : struct mountpoint *old_top_mp = top_mnt->mnt_mp;
957 :
958 0 : mnt_set_mountpoint(old_top_parent, old_top_mp, new_parent);
959 0 : mnt_change_mountpoint(new_parent, new_mp, top_mnt);
960 0 : }
961 :
962 :
963 3054356 : static void __attach_mnt(struct mount *mnt, struct mount *parent)
964 : {
965 3054356 : hlist_add_head_rcu(&mnt->mnt_hash,
966 : m_hash(&parent->mnt, mnt->mnt_mountpoint));
967 3054356 : list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
968 3054356 : }
969 :
970 : /**
971 : * attach_mnt - mount a mount, attach to @mount_hashtable and parent's
972 : * list of child mounts
973 : * @parent: the parent
974 : * @mnt: the new mount
975 : * @mp: the new mountpoint
976 : * @beneath: whether to mount @mnt beneath or on top of @parent
977 : *
978 : * If @beneath is false, mount @mnt at @mp on @parent. Then attach @mnt
979 : * to @parent's child mount list and to @mount_hashtable.
980 : *
981 : * If @beneath is true, remove @mnt from its current parent and
982 : * mountpoint and mount it on @mp on @parent, and mount @parent on the
983 : * old parent and old mountpoint of @mnt. Finally, attach @parent to
984 : * @mnt_hashtable and @parent->mnt_parent->mnt_mounts.
985 : *
986 : * Note, when __attach_mnt() is called @mnt->mnt_parent already points
987 : * to the correct parent.
988 : *
989 : * Context: This function expects namespace_lock() and lock_mount_hash()
990 : * to have been acquired in that order.
991 : */
992 156573 : static void attach_mnt(struct mount *mnt, struct mount *parent,
993 : struct mountpoint *mp, bool beneath)
994 : {
995 156573 : if (beneath)
996 0 : mnt_set_mountpoint_beneath(mnt, parent, mp);
997 : else
998 156573 : mnt_set_mountpoint(parent, mp, mnt);
999 : /*
1000 : * Note, @mnt->mnt_parent has to be used. If @mnt was mounted
1001 : * beneath @parent then @mnt will need to be attached to
1002 : * @parent's old parent, not @parent. IOW, @mnt->mnt_parent
1003 : * isn't the same mount as @parent.
1004 : */
1005 156573 : __attach_mnt(mnt, mnt->mnt_parent);
1006 156573 : }
1007 :
1008 0 : void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt)
1009 : {
1010 0 : struct mountpoint *old_mp = mnt->mnt_mp;
1011 0 : struct mount *old_parent = mnt->mnt_parent;
1012 :
1013 0 : list_del_init(&mnt->mnt_child);
1014 0 : hlist_del_init(&mnt->mnt_mp_list);
1015 0 : hlist_del_init_rcu(&mnt->mnt_hash);
1016 :
1017 0 : attach_mnt(mnt, parent, mp, false);
1018 :
1019 0 : put_mountpoint(old_mp);
1020 0 : mnt_add_count(old_parent, -1);
1021 0 : }
1022 :
1023 : /*
1024 : * vfsmount lock must be held for write
1025 : */
1026 2897783 : static void commit_tree(struct mount *mnt)
1027 : {
1028 2897783 : struct mount *parent = mnt->mnt_parent;
1029 2897783 : struct mount *m;
1030 2897783 : LIST_HEAD(head);
1031 2897783 : struct mnt_namespace *n = parent->mnt_ns;
1032 :
1033 2897783 : BUG_ON(parent == mnt);
1034 :
1035 2897783 : list_add_tail(&head, &mnt->mnt_list);
1036 5801663 : list_for_each_entry(m, &head, mnt_list)
1037 2903880 : m->mnt_ns = n;
1038 :
1039 2897783 : list_splice(&head, n->list.prev);
1040 :
1041 2897783 : n->mounts += n->pending_mounts;
1042 2897783 : n->pending_mounts = 0;
1043 :
1044 2897783 : __attach_mnt(mnt, parent);
1045 2897783 : touch_mnt_namespace(n);
1046 2897783 : }
1047 :
1048 : static struct mount *next_mnt(struct mount *p, struct mount *root)
1049 : {
1050 4849674 : struct list_head *next = p->mnt_mounts.next;
1051 289288 : if (next == &p->mnt_mounts) {
1052 4849674 : while (1) {
1053 4849674 : if (p == root)
1054 : return NULL;
1055 783741 : next = p->mnt_child.next;
1056 783741 : if (next != &p->mnt_parent->mnt_mounts)
1057 : break;
1058 : p = p->mnt_parent;
1059 : }
1060 : }
1061 783741 : return list_entry(next, struct mount, mnt_child);
1062 : }
1063 :
1064 : static struct mount *skip_mnt_tree(struct mount *p)
1065 : {
1066 0 : struct list_head *prev = p->mnt_mounts.prev;
1067 0 : while (prev != &p->mnt_mounts) {
1068 0 : p = list_entry(prev, struct mount, mnt_child);
1069 0 : prev = p->mnt_mounts.prev;
1070 : }
1071 0 : return p;
1072 : }
1073 :
1074 : /**
1075 : * vfs_create_mount - Create a mount for a configured superblock
1076 : * @fc: The configuration context with the superblock attached
1077 : *
1078 : * Create a mount to an already configured superblock. If necessary, the
1079 : * caller should invoke vfs_get_tree() before calling this.
1080 : *
1081 : * Note that this does not attach the mount to anything.
1082 : */
1083 125779 : struct vfsmount *vfs_create_mount(struct fs_context *fc)
1084 : {
1085 125779 : struct mount *mnt;
1086 :
1087 125779 : if (!fc->root)
1088 : return ERR_PTR(-EINVAL);
1089 :
1090 125779 : mnt = alloc_vfsmnt(fc->source ?: "none");
1091 125839 : if (!mnt)
1092 : return ERR_PTR(-ENOMEM);
1093 :
1094 125839 : if (fc->sb_flags & SB_KERNMOUNT)
1095 0 : mnt->mnt.mnt_flags = MNT_INTERNAL;
1096 :
1097 125839 : atomic_inc(&fc->root->d_sb->s_active);
1098 125839 : mnt->mnt.mnt_sb = fc->root->d_sb;
1099 125839 : mnt->mnt.mnt_root = dget(fc->root);
1100 125839 : mnt->mnt_mountpoint = mnt->mnt.mnt_root;
1101 125839 : mnt->mnt_parent = mnt;
1102 :
1103 125839 : lock_mount_hash();
1104 125839 : list_add_tail(&mnt->mnt_instance, &mnt->mnt.mnt_sb->s_mounts);
1105 125839 : unlock_mount_hash();
1106 125839 : return &mnt->mnt;
1107 : }
1108 : EXPORT_SYMBOL(vfs_create_mount);
1109 :
1110 3478 : struct vfsmount *fc_mount(struct fs_context *fc)
1111 : {
1112 3478 : int err = vfs_get_tree(fc);
1113 3478 : if (!err) {
1114 3449 : up_write(&fc->root->d_sb->s_umount);
1115 3449 : return vfs_create_mount(fc);
1116 : }
1117 29 : return ERR_PTR(err);
1118 : }
1119 : EXPORT_SYMBOL(fc_mount);
1120 :
1121 3478 : struct vfsmount *vfs_kern_mount(struct file_system_type *type,
1122 : int flags, const char *name,
1123 : void *data)
1124 : {
1125 3478 : struct fs_context *fc;
1126 3478 : struct vfsmount *mnt;
1127 3478 : int ret = 0;
1128 :
1129 3478 : if (!type)
1130 : return ERR_PTR(-EINVAL);
1131 :
1132 3478 : fc = fs_context_for_mount(type, flags);
1133 3478 : if (IS_ERR(fc))
1134 : return ERR_CAST(fc);
1135 :
1136 3478 : if (name)
1137 6956 : ret = vfs_parse_fs_string(fc, "source",
1138 : name, strlen(name));
1139 3478 : if (!ret)
1140 3478 : ret = parse_monolithic_mount_data(fc, data);
1141 3478 : if (!ret)
1142 3478 : mnt = fc_mount(fc);
1143 : else
1144 0 : mnt = ERR_PTR(ret);
1145 :
1146 3478 : put_fs_context(fc);
1147 3478 : return mnt;
1148 : }
1149 : EXPORT_SYMBOL_GPL(vfs_kern_mount);
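
/*
 * Example (illustrative sketch, not part of namespace.c): an in-kernel
 * user can mount a filesystem type directly; example_fs_type stands in
 * for an already-registered struct file_system_type.
 */
static struct vfsmount *example_mount_internal(void)
{
	struct vfsmount *mnt;

	mnt = vfs_kern_mount(&example_fs_type, 0, "none", NULL);
	if (IS_ERR(mnt))
		return mnt;	/* propagate the ERR_PTR() */
	/* ... use mnt, release it later with mntput() ... */
	return mnt;
}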
1150 :
1151 : struct vfsmount *
1152 11 : vfs_submount(const struct dentry *mountpoint, struct file_system_type *type,
1153 : const char *name, void *data)
1154 : {
1155 : /* Until it is worked out how to pass the user namespace
1156 : * through from the parent mount to the submount, don't support
1157 : * unprivileged mounts with submounts.
1158 : */
1159 11 : if (mountpoint->d_sb->s_user_ns != &init_user_ns)
1160 : return ERR_PTR(-EPERM);
1161 :
1162 11 : return vfs_kern_mount(type, SB_SUBMOUNT, name, data);
1163 : }
1164 : EXPORT_SYMBOL_GPL(vfs_submount);
1165 :
1166 3091005 : static struct mount *clone_mnt(struct mount *old, struct dentry *root,
1167 : int flag)
1168 : {
1169 3091005 : struct super_block *sb = old->mnt.mnt_sb;
1170 3091005 : struct mount *mnt;
1171 3091005 : int err;
1172 :
1173 3091005 : mnt = alloc_vfsmnt(old->mnt_devname);
1174 3091086 : if (!mnt)
1175 : return ERR_PTR(-ENOMEM);
1176 :
1177 3091086 : if (flag & (CL_SLAVE | CL_PRIVATE | CL_SHARED_TO_SLAVE))
1178 2795656 : mnt->mnt_group_id = 0; /* not a peer of original */
1179 : else
1180 295430 : mnt->mnt_group_id = old->mnt_group_id;
1181 :
1182 3091086 : if ((flag & CL_MAKE_SHARED) && !mnt->mnt_group_id) {
1183 1255504 : err = mnt_alloc_group_id(mnt);
1184 1255504 : if (err)
1185 0 : goto out_free;
1186 : }
1187 :
1188 3091086 : mnt->mnt.mnt_flags = old->mnt.mnt_flags;
1189 3091086 : mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL);
1190 :
1191 3091086 : atomic_inc(&sb->s_active);
1192 3091087 : mnt->mnt.mnt_idmap = mnt_idmap_get(mnt_idmap(&old->mnt));
1193 :
1194 3091087 : mnt->mnt.mnt_sb = sb;
1195 3091087 : mnt->mnt.mnt_root = dget(root);
1196 3091087 : mnt->mnt_mountpoint = mnt->mnt.mnt_root;
1197 3091087 : mnt->mnt_parent = mnt;
1198 3091087 : lock_mount_hash();
1199 3091087 : list_add_tail(&mnt->mnt_instance, &sb->s_mounts);
1200 3091087 : unlock_mount_hash();
1201 :
1202 3091087 : if ((flag & CL_SLAVE) ||
1203 579832 : ((flag & CL_SHARED_TO_SLAVE) && IS_MNT_SHARED(old))) {
1204 2555977 : list_add(&mnt->mnt_slave, &old->mnt_slave_list);
1205 2555977 : mnt->mnt_master = old;
1206 2555977 : CLEAR_MNT_SHARED(mnt);
1207 535110 : } else if (!(flag & CL_PRIVATE)) {
1208 378107 : if ((flag & CL_MAKE_SHARED) || IS_MNT_SHARED(old))
1209 287145 : list_add(&mnt->mnt_share, &old->mnt_share);
1210 378107 : if (IS_MNT_SLAVE(old))
1211 7114 : list_add(&mnt->mnt_slave, &old->mnt_slave);
1212 378107 : mnt->mnt_master = old->mnt_master;
1213 : } else {
1214 157003 : CLEAR_MNT_SHARED(mnt);
1215 : }
1216 3091087 : if (flag & CL_MAKE_SHARED)
1217 1386947 : set_mnt_shared(mnt);
1218 :
1219 : /* stick the duplicate mount on the same expiry list
1220 : * as the original if that was on one */
1221 3091087 : if (flag & CL_EXPIRE) {
1222 151801 : if (!list_empty(&old->mnt_expire))
1223 0 : list_add(&mnt->mnt_expire, &old->mnt_expire);
1224 : }
1225 :
1226 : return mnt;
1227 :
1228 : out_free:
1229 0 : mnt_free_id(mnt);
1230 0 : free_vfsmnt(mnt);
1231 0 : return ERR_PTR(err);
1232 : }
1233 :
1234 3216901 : static void cleanup_mnt(struct mount *mnt)
1235 : {
1236 3216901 : struct hlist_node *p;
1237 3216901 : struct mount *m;
1238 : /*
1239 : * The warning here probably indicates that somebody messed
1240 : * up a mnt_want/drop_write() pair. If this happens, the
1241 : * filesystem was probably unable to make r/w->r/o transitions.
1242 : * The locking used to deal with mnt_count decrement provides barriers,
1243 : * so mnt_get_writers() below is safe.
1244 : */
1245 3216901 : WARN_ON(mnt_get_writers(mnt));
1246 3216879 : if (unlikely(mnt->mnt_pins.first))
1247 0 : mnt_pin_kill(mnt);
1248 6559780 : hlist_for_each_entry_safe(m, p, &mnt->mnt_stuck_children, mnt_umount) {
1249 126021 : hlist_del(&m->mnt_umount);
1250 126021 : mntput(&m->mnt);
1251 : }
1252 3216880 : fsnotify_vfsmount_delete(&mnt->mnt);
1253 3216905 : dput(mnt->mnt.mnt_root);
1254 3216908 : deactivate_super(mnt->mnt.mnt_sb);
1255 3216913 : mnt_free_id(mnt);
1256 3216926 : call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt);
1257 3216918 : }
1258 :
1259 3216877 : static void __cleanup_mnt(struct rcu_head *head)
1260 : {
1261 3216877 : cleanup_mnt(container_of(head, struct mount, mnt_rcu));
1262 3216891 : }
1263 :
1264 : static LLIST_HEAD(delayed_mntput_list);
1265 0 : static void delayed_mntput(struct work_struct *unused)
1266 : {
1267 0 : struct llist_node *node = llist_del_all(&delayed_mntput_list);
1268 0 : struct mount *m, *t;
1269 :
1270 0 : llist_for_each_entry_safe(m, t, node, mnt_llist)
1271 0 : cleanup_mnt(m);
1272 0 : }
1273 : static DECLARE_DELAYED_WORK(delayed_mntput_work, delayed_mntput);
1274 :
1275 12561265845 : static void mntput_no_expire(struct mount *mnt)
1276 : {
1277 12561265845 : LIST_HEAD(list);
1278 12561265845 : int count;
1279 :
1280 12561265845 : rcu_read_lock();
1281 12559544240 : if (likely(READ_ONCE(mnt->mnt_ns))) {
1282 : /*
1283 : * Since we don't do lock_mount_hash() here,
1284 : * ->mnt_ns can change under us. However, if it's
1285 : * non-NULL, then there's a reference that won't
1286 : * be dropped until after an RCU delay done after
1287 : * turning ->mnt_ns NULL. So if we observe it
1288 : * non-NULL under rcu_read_lock(), the reference
1289 : * we are dropping is not the final one.
1290 : */
1291 12553650365 : mnt_add_count(mnt, -1);
1292 12554815849 : rcu_read_unlock();
1293 25117181224 : return;
1294 : }
1295 5893875 : lock_mount_hash();
1296 : /*
1297 : * make sure that if __legitimize_mnt() has not seen us grab
1298 : * mount_lock, we'll see their refcount increment here.
1299 : */
1300 5894042 : smp_mb();
1301 5894042 : mnt_add_count(mnt, -1);
1302 5894042 : count = mnt_get_count(mnt);
1303 5894042 : if (count != 0) {
1304 2677116 : WARN_ON(count < 0);
1305 2677116 : rcu_read_unlock();
1306 2677116 : unlock_mount_hash();
1307 2677116 : return;
1308 : }
1309 3216926 : if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) {
1310 0 : rcu_read_unlock();
1311 0 : unlock_mount_hash();
1312 0 : return;
1313 : }
1314 3216926 : mnt->mnt.mnt_flags |= MNT_DOOMED;
1315 3216926 : rcu_read_unlock();
1316 :
1317 3216926 : list_del(&mnt->mnt_instance);
1318 :
1319 3216926 : if (unlikely(!list_empty(&mnt->mnt_mounts))) {
1320 12072 : struct mount *p, *tmp;
1321 138093 : list_for_each_entry_safe(p, tmp, &mnt->mnt_mounts, mnt_child) {
1322 126021 : __put_mountpoint(unhash_mnt(p), &list);
1323 126021 : hlist_add_head(&p->mnt_umount, &mnt->mnt_stuck_children);
1324 : }
1325 : }
1326 3216926 : unlock_mount_hash();
1327 3216926 : shrink_dentry_list(&list);
1328 :
1329 3216926 : if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) {
1330 3216900 : struct task_struct *task = current;
1331 3216900 : if (likely(!(task->flags & PF_KTHREAD))) {
1332 3216900 : init_task_work(&mnt->mnt_rcu, __cleanup_mnt);
1333 3216900 : if (!task_work_add(task, &mnt->mnt_rcu, TWA_RESUME))
1334 : return;
1335 : }
1336 0 : if (llist_add(&mnt->mnt_llist, &delayed_mntput_list))
1337 0 : schedule_delayed_work(&delayed_mntput_work, 1);
1338 0 : return;
1339 : }
1340 26 : cleanup_mnt(mnt);
1341 : }
1342 :
1343 19546900710 : void mntput(struct vfsmount *mnt)
1344 : {
1345 19546900710 : if (mnt) {
1346 12564152289 : struct mount *m = real_mount(mnt);
1347 : /* avoid cacheline pingpong, hope gcc doesn't get "smart" */
1348 12564152289 : if (unlikely(m->mnt_expiry_mark))
1349 0 : m->mnt_expiry_mark = 0;
1350 12564152289 : mntput_no_expire(m);
1351 : }
1352 19541234108 : }
1353 : EXPORT_SYMBOL(mntput);
1354 :
1355 3615860616 : struct vfsmount *mntget(struct vfsmount *mnt)
1356 : {
1357 3615860616 : if (mnt)
1358 3615995195 : mnt_add_count(real_mount(mnt), 1);
1359 3616706379 : return mnt;
1360 : }
1361 : EXPORT_SYMBOL(mntget);
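
/*
 * Example (illustrative sketch, not part of namespace.c): a vfsmount
 * pointer stashed beyond the caller's own reference must be pinned with
 * mntget() and later released with mntput(). struct example_ctx is a
 * hypothetical container.
 */
struct example_ctx {
	struct vfsmount *mnt;
};

static void example_stash(struct example_ctx *ctx, struct vfsmount *m)
{
	ctx->mnt = mntget(m);	/* pin while stashed */
}

static void example_unstash(struct example_ctx *ctx)
{
	mntput(ctx->mnt);	/* balances the mntget() above */
	ctx->mnt = NULL;
}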
1362 :
1363 : /*
1364 : * Make a mount point inaccessible to new lookups.
1365 : * Because there may still be current users, the caller MUST WAIT
1366 : * for an RCU grace period before destroying the mount point.
1367 : */
1368 0 : void mnt_make_shortterm(struct vfsmount *mnt)
1369 : {
1370 0 : if (mnt)
1371 156618 : real_mount(mnt)->mnt_ns = NULL;
1372 0 : }
1373 :
1374 : /**
1375 : * path_is_mountpoint() - Check if path is a mount in the current namespace.
1376 : * @path: path to check
1377 : *
1378 : * d_mountpoint() can only be used reliably to establish if a dentry is
1379 : * not mounted in any namespace and that common case is handled inline.
1380 : * d_mountpoint() isn't aware of the possibility there may be multiple
1381 : * mounts using a given dentry in a different namespace. This function
1382 : * checks if the passed in path is a mountpoint rather than the dentry
1383 : * alone.
1384 : */
1385 0 : bool path_is_mountpoint(const struct path *path)
1386 : {
1387 0 : unsigned seq;
1388 0 : bool res;
1389 :
1390 0 : if (!d_mountpoint(path->dentry))
1391 : return false;
1392 :
1393 0 : rcu_read_lock();
1394 0 : do {
1395 0 : seq = read_seqbegin(&mount_lock);
1396 0 : res = __path_is_mountpoint(path);
1397 0 : } while (read_seqretry(&mount_lock, seq));
1398 0 : rcu_read_unlock();
1399 :
1400 0 : return res;
1401 : }
1402 : EXPORT_SYMBOL(path_is_mountpoint);
1403 :
1404 26 : struct vfsmount *mnt_clone_internal(const struct path *path)
1405 : {
1406 26 : struct mount *p;
1407 26 : p = clone_mnt(real_mount(path->mnt), path->dentry, CL_PRIVATE);
1408 26 : if (IS_ERR(p))
1409 : return ERR_CAST(p);
1410 26 : p->mnt.mnt_flags |= MNT_INTERNAL;
1411 26 : return &p->mnt;
1412 : }
1413 :
1414 : #ifdef CONFIG_PROC_FS
1415 818027421 : static struct mount *mnt_list_next(struct mnt_namespace *ns,
1416 : struct list_head *p)
1417 : {
1418 818027421 : struct mount *mnt, *ret = NULL;
1419 :
1420 818027421 : lock_ns_list(ns);
1421 826927503 : list_for_each_continue(p, &ns->list) {
1422 818850149 : mnt = list_entry(p, typeof(*mnt), mnt_list);
1423 818850149 : if (!mnt_is_cursor(mnt)) {
1424 : ret = mnt;
1425 : break;
1426 : }
1427 : }
1428 819394009 : unlock_ns_list(ns);
1429 :
1430 819318281 : return ret;
1431 : }
1432 :
1433 : /* iterator; we want it to have access to namespace_sem, thus here... */
1434 82717633 : static void *m_start(struct seq_file *m, loff_t *pos)
1435 : {
1436 82717633 : struct proc_mounts *p = m->private;
1437 82717633 : struct list_head *prev;
1438 :
1439 82717633 : down_read(&namespace_sem);
1440 82719858 : if (!*pos) {
1441 8066262 : prev = &p->ns->list;
1442 : } else {
1443 74653596 : prev = &p->cursor.mnt_list;
1444 :
1445 : /* Read after we'd reached the end? */
1446 74653596 : if (list_empty(prev))
1447 : return NULL;
1448 : }
1449 :
1450 74087172 : return mnt_list_next(p->ns, prev);
1451 : }
1452 :
1453 744007168 : static void *m_next(struct seq_file *m, void *v, loff_t *pos)
1454 : {
1455 744007168 : struct proc_mounts *p = m->private;
1456 744007168 : struct mount *mnt = v;
1457 :
1458 744007168 : ++*pos;
1459 744007168 : return mnt_list_next(p->ns, &mnt->mnt_list);
1460 : }
1461 :
1462 82716307 : static void m_stop(struct seq_file *m, void *v)
1463 : {
1464 82716307 : struct proc_mounts *p = m->private;
1465 82716307 : struct mount *mnt = v;
1466 :
1467 82716307 : lock_ns_list(p->ns);
1468 82766370 : if (mnt)
1469 66045171 : list_move_tail(&p->cursor.mnt_list, &mnt->mnt_list);
1470 : else
1471 16721199 : list_del_init(&p->cursor.mnt_list);
1472 82766369 : unlock_ns_list(p->ns);
1473 82763158 : up_read(&namespace_sem);
1474 82762691 : }
1475 :
1476 745185793 : static int m_show(struct seq_file *m, void *v)
1477 : {
1478 745185793 : struct proc_mounts *p = m->private;
1479 745185793 : struct mount *r = v;
1480 745185793 : return p->show(m, &r->mnt);
1481 : }
1482 :
1483 : const struct seq_operations mounts_op = {
1484 : .start = m_start,
1485 : .next = m_next,
1486 : .stop = m_stop,
1487 : .show = m_show,
1488 : };
1489 :
1490 8041405 : void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor)
1491 : {
1492 8041405 : down_read(&namespace_sem);
1493 8055371 : lock_ns_list(ns);
1494 8072316 : list_del(&cursor->mnt_list);
1495 8072316 : unlock_ns_list(ns);
1496 8071310 : up_read(&namespace_sem);
1497 8070175 : }
1498 : #endif /* CONFIG_PROC_FS */
1499 :
1500 : /**
1501 : * may_umount_tree - check if a mount tree is busy
1502 : * @m: root of mount tree
1503 : *
1504 : * This is called to check if a tree of mounts has any
1505 : * open files, pwds, chroots or sub mounts that are
1506 : * busy.
1507 : */
1508 0 : int may_umount_tree(struct vfsmount *m)
1509 : {
1510 0 : struct mount *mnt = real_mount(m);
1511 0 : int actual_refs = 0;
1512 0 : int minimum_refs = 0;
1513 0 : struct mount *p;
1514 0 : BUG_ON(!m);
1515 :
1516 : /* write lock needed for mnt_get_count */
1517 0 : lock_mount_hash();
1518 0 : for (p = mnt; p; p = next_mnt(p, mnt)) {
1519 0 : actual_refs += mnt_get_count(p);
1520 0 : minimum_refs += 2;
1521 : }
1522 0 : unlock_mount_hash();
1523 :
1524 0 : if (actual_refs > minimum_refs)
1525 0 : return 0;
1526 :
1527 : return 1;
1528 : }
1529 :
1530 : EXPORT_SYMBOL(may_umount_tree);
1531 :
1532 : /**
1533 : * may_umount - check if a mount point is busy
1534 : * @mnt: root of mount
1535 : *
1536 : * This is called to check if a mount point has any
1537 : * open files, pwds, chroots or sub mounts. If the
1538 : * mount has sub mounts this will return busy
1539 : * regardless of whether the sub mounts are busy.
1540 : *
1541 : * Doesn't take quota and stuff into account. IOW, in some cases it will
1542 : * give false negatives. The main reason why it's here is that we need
1543 : * a non-destructive way to look for easily umountable filesystems.
1544 : */
1545 0 : int may_umount(struct vfsmount *mnt)
1546 : {
1547 0 : int ret = 1;
1548 0 : down_read(&namespace_sem);
1549 0 : lock_mount_hash();
1550 0 : if (propagate_mount_busy(real_mount(mnt), 2))
1551 0 : ret = 0;
1552 0 : unlock_mount_hash();
1553 0 : up_read(&namespace_sem);
1554 0 : return ret;
1555 : }
1556 :
1557 : EXPORT_SYMBOL(may_umount);
1558 :
1559 782182 : static void namespace_unlock(void)
1560 : {
1561 782182 : struct hlist_head head;
1562 782182 : struct hlist_node *p;
1563 782182 : struct mount *m;
1564 782182 : LIST_HEAD(list);
1565 :
1566 782182 : hlist_move_list(&unmounted, &head);
1567 782182 : list_splice_init(&ex_mountpoints, &list);
1568 :
1569 782182 : up_write(&namespace_sem);
1570 :
1571 782182 : shrink_dentry_list(&list);
1572 :
1573 782182 : if (likely(hlist_empty(&head)))
1574 521811 : return;
1575 :
1576 260371 : synchronize_rcu_expedited();
1577 :
1578 3454594 : hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
1579 2933852 : hlist_del(&m->mnt_umount);
1580 2933852 : mntput(&m->mnt);
1581 : }
1582 : }
1583 :
1584 : static inline void namespace_lock(void)
1585 : {
1586 782070 : down_write(&namespace_sem);
1587 : }
1588 :
1589 : enum umount_tree_flags {
1590 : UMOUNT_SYNC = 1,
1591 : UMOUNT_PROPAGATE = 2,
1592 : UMOUNT_CONNECTED = 4,
1593 : };
1594 :
1595 3059877 : static bool disconnect_mount(struct mount *mnt, enum umount_tree_flags how)
1596 : {
1597 : /* Leaving mounts connected is only valid for lazy umounts */
1598 3059877 : if (how & UMOUNT_SYNC)
1599 : return true;
1600 :
1601 : /* A mount without a parent has nothing to be connected to */
1602 1730723 : if (!mnt_has_parent(mnt))
1603 : return true;
1604 :
1605 : /* Because the reference counting rules change when mounts are
1606 : * unmounted and connected, umounted mounts may not be
1607 : * connected to mounted mounts.
1608 : */
1609 1724884 : if (!(mnt->mnt_parent->mnt.mnt_flags & MNT_UMOUNT))
1610 : return true;
1611 :
1612 : /* Has it been requested that the mount remain connected? */
1613 156800 : if (how & UMOUNT_CONNECTED)
1614 : return false;
1615 :
1616 : /* Is the mount locked such that it needs to remain connected? */
1617 156800 : if (IS_MNT_LOCKED(mnt))
1618 126021 : return false;
1619 :
1620 : /* By default disconnect the mount */
1621 : return true;
1622 : }
1623 :
1624 : /*
1625 : * mount_lock must be held
1626 : * namespace_sem must be held for write
1627 : */
1628 260371 : static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
1629 : {
1630 260371 : LIST_HEAD(tmp_list);
1631 260371 : struct mount *p;
1632 :
1633 260371 : if (how & UMOUNT_PROPAGATE)
1634 254532 : propagate_mount_unlock(mnt);
1635 :
1636 : /* Gather the mounts to umount */
1637 1094713 : for (p = mnt; p; p = next_mnt(p, mnt)) {
1638 417171 : p->mnt.mnt_flags |= MNT_UMOUNT;
1639 417171 : list_move(&p->mnt_list, &tmp_list);
1640 : }
1641 :
1642 : /* Hide the mounts from mnt_mounts */
1643 677542 : list_for_each_entry(p, &tmp_list, mnt_list) {
1644 417171 : list_del_init(&p->mnt_child);
1645 : }
1646 :
1648 : /* Add propagated mounts to the tmp_list */
1648 260371 : if (how & UMOUNT_PROPAGATE)
1649 254532 : propagate_umount(&tmp_list);
1650 :
1651 3320248 : while (!list_empty(&tmp_list)) {
1652 3059877 : struct mnt_namespace *ns;
1653 3059877 : bool disconnect;
1654 3059877 : p = list_first_entry(&tmp_list, struct mount, mnt_list);
1655 3059877 : list_del_init(&p->mnt_expire);
1656 3059877 : list_del_init(&p->mnt_list);
1657 3059877 : ns = p->mnt_ns;
1658 3059877 : if (ns) {
1659 3059877 : ns->mounts--;
1660 3059877 : __touch_mnt_namespace(ns);
1661 : }
1662 3059877 : p->mnt_ns = NULL;
1663 3059877 : if (how & UMOUNT_SYNC)
1664 1329154 : p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;
1665 :
1666 3059877 : disconnect = disconnect_mount(p, how);
1667 3059877 : if (mnt_has_parent(p)) {
1668 3054038 : mnt_add_count(p->mnt_parent, -1);
1669 3054038 : if (!disconnect) {
1670 : /* Don't forget about p */
1671 126021 : list_add_tail(&p->mnt_child, &p->mnt_parent->mnt_mounts);
1672 : } else {
1673 2928017 : umount_mnt(p);
1674 : }
1675 : }
1676 3059877 : change_mnt_propagation(p, MS_PRIVATE);
1677 3059877 : if (disconnect)
1678 2933856 : hlist_add_head(&p->mnt_umount, &unmounted);
1679 : }
1680 260371 : }
1681 :
1682 : static void shrink_submounts(struct mount *mnt);
1683 :
1684 0 : static int do_umount_root(struct super_block *sb)
1685 : {
1686 0 : int ret = 0;
1687 :
1688 0 : down_write(&sb->s_umount);
1689 0 : if (!sb_rdonly(sb)) {
1690 0 : struct fs_context *fc;
1691 :
1692 0 : fc = fs_context_for_reconfigure(sb->s_root, SB_RDONLY,
1693 : SB_RDONLY);
1694 0 : if (IS_ERR(fc)) {
1695 0 : ret = PTR_ERR(fc);
1696 : } else {
1697 0 : ret = parse_monolithic_mount_data(fc, NULL);
1698 0 : if (!ret)
1699 0 : ret = reconfigure_super(fc);
1700 0 : put_fs_context(fc);
1701 : }
1702 : }
1703 0 : up_write(&sb->s_umount);
1704 0 : return ret;
1705 : }
1706 :
1707 254455 : static int do_umount(struct mount *mnt, int flags)
1708 : {
1709 254455 : struct super_block *sb = mnt->mnt.mnt_sb;
1710 254455 : int retval;
1711 :
1712 254455 : retval = security_sb_umount(&mnt->mnt, flags);
1713 254455 : if (retval)
1714 : return retval;
1715 :
1716 : /*
1717 : * Allow userspace to request a mountpoint be expired rather than
1718 : * unmounting unconditionally. Unmount only happens if:
1719 : * (1) the mark is already set (the mark is cleared by mntput())
1720 : * (2) the usage count == 1 [parent vfsmount] + 1 [sys_umount]
1721 : */
1722 254455 : if (flags & MNT_EXPIRE) {
1723 0 : if (&mnt->mnt == current->fs->root.mnt ||
1724 0 : flags & (MNT_FORCE | MNT_DETACH))
1725 : return -EINVAL;
1726 :
1727 : /*
1728 : * probably don't strictly need the lock here if we examined
1729 : * all race cases, but it's a slowpath.
1730 : */
1731 0 : lock_mount_hash();
1732 0 : if (mnt_get_count(mnt) != 2) {
1733 0 : unlock_mount_hash();
1734 0 : return -EBUSY;
1735 : }
1736 0 : unlock_mount_hash();
1737 :
1738 0 : if (!xchg(&mnt->mnt_expiry_mark, 1))
1739 : return -EAGAIN;
1740 : }
1741 :
1742 : /*
1743 : * If we may have to abort operations to get out of this
1744 : * mount, and they will themselves hold resources we must
1745 : * allow the fs to do things. In the Unix tradition of
1746 : * 'Gee, that's tricky, let's do it in userspace' the umount_begin
1747 : * might fail to complete on the first run through as other tasks
1748 : * must return, and the like. That's for the mount program to worry
1749 : * about for the moment.
1750 : */
1751 :
1752 254455 : if (flags & MNT_FORCE && sb->s_op->umount_begin) {
1753 0 : sb->s_op->umount_begin(sb);
1754 : }
1755 :
1756 : /*
1757 : * No sense in grabbing the lock for this test, but the test itself looks
1758 : * somewhat bogus. Suggestions for better replacement?
1759 : * Ho-hum... In principle, we might treat that as umount + switch
1760 : * to rootfs. GC would eventually take care of the old vfsmount.
1761 : * Actually it makes sense, especially if rootfs would contain a
1762 : * /reboot - static binary that would close all descriptors and
1763 : * call reboot(9). Then init(8) could umount root and exec /reboot.
1764 : */
1765 254455 : if (&mnt->mnt == current->fs->root.mnt && !(flags & MNT_DETACH)) {
1766 : /*
1767 : * Special case for "unmounting" root ...
1768 : * we just try to remount it readonly.
1769 : */
1770 0 : if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN))
1771 : return -EPERM;
1772 0 : return do_umount_root(sb);
1773 : }
1774 :
1775 254455 : namespace_lock();
1776 254564 : lock_mount_hash();
1777 :
1778 : /* Recheck MNT_LOCKED with the locks held */
1779 254564 : retval = -EINVAL;
1780 254564 : if (mnt->mnt.mnt_flags & MNT_LOCKED)
1781 0 : goto out;
1782 :
1783 254564 : event++;
1784 254564 : if (flags & MNT_DETACH) {
1785 130784 : if (!list_empty(&mnt->mnt_list))
1786 130784 : umount_tree(mnt, UMOUNT_PROPAGATE);
1787 : retval = 0;
1788 : } else {
1789 123780 : shrink_submounts(mnt);
1790 123780 : retval = -EBUSY;
1791 123780 : if (!propagate_mount_busy(mnt, 2)) {
1792 123748 : if (!list_empty(&mnt->mnt_list))
1793 123748 : umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
1794 : retval = 0;
1795 : }
1796 : }
1797 32 : out:
1798 254564 : unlock_mount_hash();
1799 254564 : namespace_unlock();
1800 254564 : return retval;
1801 : }
1802 :
1803 : /*
1804 : * __detach_mounts - lazily unmount all mounts on the specified dentry
1805 : *
1806 : * During unlink, rmdir, and d_drop it is possible to lose the path
1807 : * to an existing mountpoint, and wind up leaking the mount.
1808 : * detach_mounts allows lazily unmounting those mounts instead of
1809 : * leaking them.
1810 : *
1811 : * The caller may hold dentry->d_inode->i_mutex.
1812 : */
1813 0 : void __detach_mounts(struct dentry *dentry)
1814 : {
1815 0 : struct mountpoint *mp;
1816 0 : struct mount *mnt;
1817 :
1818 0 : namespace_lock();
1819 0 : lock_mount_hash();
1820 0 : mp = lookup_mountpoint(dentry);
1821 0 : if (!mp)
1822 0 : goto out_unlock;
1823 :
1824 0 : event++;
1825 0 : while (!hlist_empty(&mp->m_list)) {
1826 0 : mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list);
1827 0 : if (mnt->mnt.mnt_flags & MNT_UMOUNT) {
1828 0 : umount_mnt(mnt);
1829 0 : hlist_add_head(&mnt->mnt_umount, &unmounted);
1830 : }
1831 0 : else umount_tree(mnt, UMOUNT_CONNECTED);
1832 : }
1833 0 : put_mountpoint(mp);
1834 0 : out_unlock:
1835 0 : unlock_mount_hash();
1836 0 : namespace_unlock();
1837 0 : }
1838 :
1839 : /*
1840 : * Is the caller allowed to modify their mount namespace?
1841 : */
1842 683383 : bool may_mount(void)
1843 : {
1844 683383 : return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN);
1845 : }
1846 :
1847 : /**
1848 : * path_mounted - check whether path is mounted
1849 : * @path: path to check
1850 : *
1851 : * Determine whether @path refers to the root of a mount.
1852 : *
1853 : * Return: true if @path is the root of a mount, false if not.
1854 : */
1855 : static inline bool path_mounted(const struct path *path)
1856 : {
1857 428502 : return path->mnt->mnt_root == path->dentry;
1858 : }
1859 :
1860 0 : static void warn_mandlock(void)
1861 : {
1862 0 : pr_warn_once("=======================================================\n"
1863 : "WARNING: The mand mount option has been deprecated and\n"
1864 : " and is ignored by this kernel. Remove the mand\n"
1865 : " option from the mount to silence this warning.\n"
1866 : "=======================================================\n");
1867 0 : }
1868 :
1869 269904 : static int can_umount(const struct path *path, int flags)
1870 : {
1871 269904 : struct mount *mnt = real_mount(path->mnt);
1872 :
1873 269904 : if (!may_mount())
1874 : return -EPERM;
1875 269940 : if (!path_mounted(path))
1876 : return -EINVAL;
1877 254452 : if (!check_mnt(mnt))
1878 : return -EINVAL;
1879 254452 : if (mnt->mnt.mnt_flags & MNT_LOCKED) /* Check optimistically */
1880 : return -EINVAL;
1881 254452 : if (flags & MNT_FORCE && !capable(CAP_SYS_ADMIN))
1882 0 : return -EPERM;
1883 : return 0;
1884 : }
1885 :
1886 : // caller is responsible for flags being sane
1887 269958 : int path_umount(struct path *path, int flags)
1888 : {
1889 269958 : struct mount *mnt = real_mount(path->mnt);
1890 269958 : int ret;
1891 :
1892 269958 : ret = can_umount(path, flags);
1893 269932 : if (!ret)
1894 254443 : ret = do_umount(mnt, flags);
1895 :
1896 : /* we mustn't call path_put() as that would clear mnt_expiry_mark */
1897 270053 : dput(path->dentry);
1898 270051 : mntput_no_expire(mnt);
1899 270052 : return ret;
1900 : }
1901 :
1902 418953 : static int ksys_umount(char __user *name, int flags)
1903 : {
1904 418953 : int lookup_flags = LOOKUP_MOUNTPOINT;
1905 418953 : struct path path;
1906 418953 : int ret;
1907 :
1908 : // basic validity checks done first
1909 418953 : if (flags & ~(MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW))
1910 : return -EINVAL;
1911 :
1912 418953 : if (!(flags & UMOUNT_NOFOLLOW))
1913 269745 : lookup_flags |= LOOKUP_FOLLOW;
1914 418953 : ret = user_path_at(AT_FDCWD, name, lookup_flags, &path);
1915 418878 : if (ret)
1916 : return ret;
1917 269981 : return path_umount(&path, flags);
1918 : }
1919 :
1920 837831 : SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
1921 : {
1922 418925 : return ksys_umount(name, flags);
1923 : }
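
/*
 * Illustrative userspace sketch (not part of namespace.c): driving the
 * umount(2) flags handled above. MNT_DETACH requests a lazy unmount;
 * MNT_EXPIRE implements the two-call protocol described in do_umount():
 * the first call only marks the mount and fails with EAGAIN, and a repeat
 * call succeeds if nothing used the mount in between. The paths are
 * made up.
 */
#include <errno.h>
#include <stdio.h>
#include <sys/mount.h>

static int expire_mount(const char *target)
{
	/* First call sets mnt_expiry_mark; EAGAIN is the expected "not yet". */
	if (umount2(target, MNT_EXPIRE) == 0)
		return 0;			/* already marked and unused */
	if (errno != EAGAIN)
		return -1;
	/* Second call succeeds only if the mount stayed untouched. */
	return umount2(target, MNT_EXPIRE);
}

int main(void)
{
	if (expire_mount("/mnt/scratch") == -1)
		perror("umount2(MNT_EXPIRE)");	/* e.g. EBUSY: mount in use */
	/* Lazy unmount of another mount: detach now, free once unused. */
	if (umount2("/mnt/other", MNT_DETACH) == -1)
		perror("umount2(MNT_DETACH)");
	return 0;
}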
1924 :
1925 : #ifdef __ARCH_WANT_SYS_OLDUMOUNT
1926 :
1927 : /*
1928 : * The 2.0 compatible umount. No flags.
1929 : */
1930 0 : SYSCALL_DEFINE1(oldumount, char __user *, name)
1931 : {
1932 0 : return ksys_umount(name, 0);
1933 : }
1934 :
1935 : #endif
1936 :
1937 : static bool is_mnt_ns_file(struct dentry *dentry)
1938 : {
1939 : /* Is this a proxy for a mount namespace? */
1940 2644341 : return dentry->d_op == &ns_dentry_operations &&
1941 0 : dentry->d_fsdata == &mntns_operations;
1942 : }
1943 :
1944 : static struct mnt_namespace *to_mnt_ns(struct ns_common *ns)
1945 : {
1946 : return container_of(ns, struct mnt_namespace, ns);
1947 : }
1948 :
1949 0 : struct ns_common *from_mnt_ns(struct mnt_namespace *mnt)
1950 : {
1951 0 : return &mnt->ns;
1952 : }
1953 :
1954 139818 : static bool mnt_ns_loop(struct dentry *dentry)
1955 : {
1956 : /* Could bind mounting the mount namespace inode cause a
1957 : * mount namespace loop?
1958 : */
1959 139818 : struct mnt_namespace *mnt_ns;
1960 279636 : if (!is_mnt_ns_file(dentry))
1961 : return false;
1962 :
1963 0 : mnt_ns = to_mnt_ns(get_proc_ns(dentry->d_inode));
1964 0 : return current->nsproxy->mnt_ns->seq >= mnt_ns->seq;
1965 : }
1966 :
1967 2645006 : struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
1968 : int flag)
1969 : {
1970 2645006 : struct mount *res, *p, *q, *r, *parent;
1971 :
1972 2645006 : if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(mnt))
1973 : return ERR_PTR(-EINVAL);
1974 :
1975 5289347 : if (!(flag & CL_COPY_MNT_NS_FILE) && is_mnt_ns_file(dentry))
1976 : return ERR_PTR(-EINVAL);
1977 :
1978 2645006 : res = q = clone_mnt(mnt, dentry, flag);
1979 2645006 : if (IS_ERR(q))
1980 : return q;
1981 :
1982 2645006 : q->mnt_mountpoint = mnt->mnt_mountpoint;
1983 :
1984 2645006 : p = mnt;
1985 2653290 : list_for_each_entry(r, &mnt->mnt_mounts, mnt_child) {
1986 8284 : struct mount *s;
1987 8284 : if (!is_subdir(r->mnt_mountpoint, dentry))
1988 2910 : continue;
1989 :
1990 317884 : for (s = r; s; s = next_mnt(s, r)) {
1991 156255 : if (!(flag & CL_COPY_UNBINDABLE) &&
1992 6097 : IS_MNT_UNBINDABLE(s)) {
1993 0 : if (s->mnt.mnt_flags & MNT_LOCKED) {
1994 : /* Both unbindable and locked. */
1995 0 : q = ERR_PTR(-EPERM);
1996 0 : goto out;
1997 : } else {
1998 0 : s = skip_mnt_tree(s);
1999 0 : continue;
2000 : }
2001 : }
2002 306413 : if (!(flag & CL_COPY_MNT_NS_FILE) &&
2003 150158 : is_mnt_ns_file(s->mnt.mnt_root)) {
2004 0 : s = skip_mnt_tree(s);
2005 0 : continue;
2006 : }
2007 309104 : while (p != s->mnt_parent) {
2008 152849 : p = p->mnt_parent;
2009 152849 : q = q->mnt_parent;
2010 : }
2011 156255 : p = s;
2012 156255 : parent = q;
2013 156255 : q = clone_mnt(p, p->mnt.mnt_root, flag);
2014 156255 : if (IS_ERR(q))
2015 0 : goto out;
2016 156255 : lock_mount_hash();
2017 156255 : list_add_tail(&q->mnt_list, &res->mnt_list);
2018 156255 : attach_mnt(q, parent, p->mnt_mp, false);
2019 156255 : unlock_mount_hash();
2020 : }
2021 : }
2022 : return res;
2023 0 : out:
2024 0 : if (res) {
2025 0 : lock_mount_hash();
2026 0 : umount_tree(res, UMOUNT_SYNC);
2027 0 : unlock_mount_hash();
2028 : }
2029 : return q;
2030 : }
2031 :
2032 : /* Caller should check returned pointer for errors */
2033 :
2034 0 : struct vfsmount *collect_mounts(const struct path *path)
2035 : {
2036 0 : struct mount *tree;
2037 0 : namespace_lock();
2038 0 : if (!check_mnt(real_mount(path->mnt)))
2039 : tree = ERR_PTR(-EINVAL);
2040 : else
2041 0 : tree = copy_tree(real_mount(path->mnt), path->dentry,
2042 : CL_COPY_ALL | CL_PRIVATE);
2043 0 : namespace_unlock();
2044 0 : if (IS_ERR(tree))
2045 : return ERR_CAST(tree);
2046 0 : return &tree->mnt;
2047 : }
2048 :
2049 : static void free_mnt_ns(struct mnt_namespace *);
2050 : static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *, bool);
2051 :
2052 131293 : void dissolve_on_fput(struct vfsmount *mnt)
2053 : {
2054 131293 : struct mnt_namespace *ns;
2055 131293 : namespace_lock();
2056 131293 : lock_mount_hash();
2057 131293 : ns = real_mount(mnt)->mnt_ns;
2058 131293 : if (ns) {
2059 131280 : if (is_anon_ns(ns))
2060 759 : umount_tree(real_mount(mnt), UMOUNT_CONNECTED);
2061 : else
2062 : ns = NULL;
2063 : }
2064 131293 : unlock_mount_hash();
2065 131293 : namespace_unlock();
2066 131293 : if (ns)
2067 759 : free_mnt_ns(ns);
2068 131293 : }
2069 :
2070 5080 : void drop_collected_mounts(struct vfsmount *mnt)
2071 : {
2072 5080 : namespace_lock();
2073 5080 : lock_mount_hash();
2074 5080 : umount_tree(real_mount(mnt), 0);
2075 5080 : unlock_mount_hash();
2076 5080 : namespace_unlock();
2077 5080 : }
2078 :
2079 289708 : static bool has_locked_children(struct mount *mnt, struct dentry *dentry)
2080 : {
2081 289708 : struct mount *child;
2082 :
2083 612253 : list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
2084 322522 : if (!is_subdir(child->mnt_mountpoint, dentry))
2085 322545 : continue;
2086 :
2087 0 : if (child->mnt.mnt_flags & MNT_LOCKED)
2088 : return true;
2089 : }
2090 : return false;
2091 : }
2092 :
2093 : /**
2094 : * clone_private_mount - create a private clone of a path
2095 : * @path: path to clone
2096 : *
2097 : * This creates a new vfsmount, which will be the clone of @path. The new mount
2098 : * will not be attached anywhere in the namespace and will be private (i.e.
2099 : * changes to the originating mount won't be propagated into it).
2100 : *
2101 : * Release with kern_unmount(); the clone is acquired as a longterm mount.
2102 : */
2103 156947 : struct vfsmount *clone_private_mount(const struct path *path)
2104 : {
2105 156947 : struct mount *old_mnt = real_mount(path->mnt);
2106 156947 : struct mount *new_mnt;
2107 :
2108 156947 : down_read(&namespace_sem);
2109 156904 : if (IS_MNT_UNBINDABLE(old_mnt))
2110 0 : goto invalid;
2111 :
2112 156904 : if (!check_mnt(old_mnt))
2113 0 : goto invalid;
2114 :
2115 156904 : if (has_locked_children(old_mnt, path->dentry))
2116 0 : goto invalid;
2117 :
2118 156893 : new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE);
2119 156973 : up_read(&namespace_sem);
2120 :
2121 156977 : if (IS_ERR(new_mnt))
2122 : return ERR_CAST(new_mnt);
2123 :
2124 : /* Longterm mount to be removed by kern_unmount*() */
2125 156977 : new_mnt->mnt_ns = MNT_NS_INTERNAL;
2126 :
2127 156977 : return &new_mnt->mnt;
2128 :
2129 0 : invalid:
2130 0 : up_read(&namespace_sem);
2131 0 : return ERR_PTR(-EINVAL);
2132 : }
2133 : EXPORT_SYMBOL_GPL(clone_private_mount);
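
/*
 * Illustrative in-kernel sketch (hypothetical module code, not part of
 * namespace.c): a filesystem that wants a private, propagation-free view
 * of a path - roughly how overlayfs treats its layers - could use
 * clone_private_mount() like this. The example_* names are made up.
 */
static struct vfsmount *example_grab_layer(const struct path *layer)
{
	struct vfsmount *mnt = clone_private_mount(layer);

	if (IS_ERR(mnt))
		return mnt;
	/* ... private lookups below mnt->mnt_root go here ... */
	return mnt;
}

static void example_put_layer(struct vfsmount *mnt)
{
	/* Longterm mounts are released with kern_unmount(), not mntput(). */
	kern_unmount(mnt);
}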
2134 :
2135 0 : int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg,
2136 : struct vfsmount *root)
2137 : {
2138 0 : struct mount *mnt;
2139 0 : int res = f(root, arg);
2140 0 : if (res)
2141 : return res;
2142 0 : list_for_each_entry(mnt, &real_mount(root)->mnt_list, mnt_list) {
2143 0 : res = f(&mnt->mnt, arg);
2144 0 : if (res)
2145 0 : return res;
2146 : }
2147 : return 0;
2148 : }
2149 :
2150 1378 : static void lock_mnt_tree(struct mount *mnt)
2151 : {
2152 1378 : struct mount *p;
2153 :
2154 128777 : for (p = mnt; p; p = next_mnt(p, mnt)) {
2155 127399 : int flags = p->mnt.mnt_flags;
2156 : /* Don't allow unprivileged users to change mount flags */
2157 127399 : flags |= MNT_LOCK_ATIME;
2158 :
2159 127399 : if (flags & MNT_READONLY)
2160 72865 : flags |= MNT_LOCK_READONLY;
2161 :
2162 127399 : if (flags & MNT_NODEV)
2163 25040 : flags |= MNT_LOCK_NODEV;
2164 :
2165 127399 : if (flags & MNT_NOSUID)
2166 27796 : flags |= MNT_LOCK_NOSUID;
2167 :
2168 127399 : if (flags & MNT_NOEXEC)
2169 23055 : flags |= MNT_LOCK_NOEXEC;
2170 : /* Don't allow unprivileged users to reveal what is under a mount */
2171 127399 : if (list_empty(&p->mnt_expire))
2172 127399 : flags |= MNT_LOCKED;
2173 127399 : p->mnt.mnt_flags = flags;
2174 : }
2175 1378 : }
2176 :
2177 0 : static void cleanup_group_ids(struct mount *mnt, struct mount *end)
2178 : {
2179 0 : struct mount *p;
2180 :
2181 0 : for (p = mnt; p != end; p = next_mnt(p, mnt)) {
2182 0 : if (p->mnt_group_id && !IS_MNT_SHARED(p))
2183 0 : mnt_release_group_id(p);
2184 : }
2185 0 : }
2186 :
2187 252872 : static int invent_group_ids(struct mount *mnt, bool recurse)
2188 : {
2189 252872 : struct mount *p;
2190 :
2191 770807 : for (p = mnt; p; p = recurse ? next_mnt(p, mnt) : NULL) {
2192 259442 : if (!p->mnt_group_id && !IS_MNT_SHARED(p)) {
2193 128254 : int err = mnt_alloc_group_id(p);
2194 128254 : if (err) {
2195 0 : cleanup_group_ids(mnt, p);
2196 0 : return err;
2197 : }
2198 : }
2199 : }
2200 :
2201 : return 0;
2202 : }
2203 :
2204 2897783 : int count_mounts(struct mnt_namespace *ns, struct mount *mnt)
2205 : {
2206 2897783 : unsigned int max = READ_ONCE(sysctl_mount_max);
2207 2897783 : unsigned int mounts = 0;
2208 2897783 : struct mount *p;
2209 :
2210 2897783 : if (ns->mounts >= max)
2211 : return -ENOSPC;
2212 2897783 : max -= ns->mounts;
2213 2897783 : if (ns->pending_mounts >= max)
2214 : return -ENOSPC;
2215 2897783 : max -= ns->pending_mounts;
2216 :
2217 5801663 : for (p = mnt; p; p = next_mnt(p, mnt))
2218 2903880 : mounts++;
2219 :
2220 2897783 : if (mounts > max)
2221 : return -ENOSPC;
2222 :
2223 2897783 : ns->pending_mounts += mounts;
2224 2897783 : return 0;
2225 : }
2226 :
2227 : enum mnt_tree_flags_t {
2228 : MNT_TREE_MOVE = BIT(0),
2229 : MNT_TREE_BENEATH = BIT(1),
2230 : };
2231 :
2232 : /**
2233 : * attach_recursive_mnt - attach a source mount tree
2234 : * @source_mnt: mount tree to be attached
2235 : * @top_mnt: mount that @source_mnt will be mounted on or mounted beneath
2236 : * @dest_mp: the mountpoint @source_mnt will be mounted at
2237 : * @flags: modify how @source_mnt is supposed to be attached
2238 : *
2239 : * NOTE: the table below explains the semantics when a source mount
2240 : * of a given type is attached to a destination mount of a given type.
2241 : * ---------------------------------------------------------------------------
2242 : * | BIND MOUNT OPERATION |
2243 : * |**************************************************************************
2244 : * | source-->| shared | private | slave | unbindable |
2245 : * | dest | | | | |
2246 : * | | | | | | |
2247 : * | v | | | | |
2248 : * |**************************************************************************
2249 : * | shared | shared (++) | shared (+) | shared(+++)| invalid |
2250 : * | | | | | |
2251 : * |non-shared| shared (+) | private | slave (*) | invalid |
2252 : * ***************************************************************************
2253 : * A bind operation clones the source mount and mounts the clone on the
2254 : * destination mount.
2255 : *
2256 : * (++) the cloned mount is propagated to all the mounts in the propagation
2257 : * tree of the destination mount and the cloned mount is added to
2258 : * the peer group of the source mount.
2259 : * (+) the cloned mount is created under the destination mount and is marked
2260 : * as shared. The cloned mount is added to the peer group of the source
2261 : * mount.
2262 : * (+++) the mount is propagated to all the mounts in the propagation tree
2263 : * of the destination mount and the cloned mount is made slave
2264 : * of the same master as that of the source mount. The cloned mount
2265 : * is marked as 'shared and slave'.
2266 : * (*) the cloned mount is made a slave of the same master as that of the
2267 : * source mount.
2268 : *
2269 : * ---------------------------------------------------------------------------
2270 : * | MOVE MOUNT OPERATION |
2271 : * |**************************************************************************
2272 : * | source-->| shared | private | slave | unbindable |
2273 : * | dest | | | | |
2274 : * | | | | | | |
2275 : * | v | | | | |
2276 : * |**************************************************************************
2277 : * | shared | shared (+) | shared (+) | shared(+++) | invalid |
2278 : * | | | | | |
2279 : * |non-shared| shared (+*) | private | slave (*) | unbindable |
2280 : * ***************************************************************************
2281 : *
2282 : * (+) the mount is moved to the destination. And is then propagated to
2283 : * all the mounts in the propagation tree of the destination mount.
2284 : * (+*) the mount is moved to the destination.
2285 : * (+++) the mount is moved to the destination and is then propagated to
2286 : * all the mounts belonging to the destination mount's propagation tree.
2287 : * the mount is marked as 'shared and slave'.
2288 : * (*) the mount continues to be a slave at the new location.
2289 : *
2290 : * if the source mount is a tree, the operations explained above are
2291 : * applied to each mount in the tree.
2292 : * Must be called without spinlocks held, since this function can sleep
2293 : * in allocations.
2294 : *
2295 : * Context: The function expects namespace_lock() to be held.
2296 : * Return: If @source_mnt was successfully attached 0 is returned.
2297 : * Otherwise a negative error code is returned.
2298 : */
2299 255403 : static int attach_recursive_mnt(struct mount *source_mnt,
2300 : struct mount *top_mnt,
2301 : struct mountpoint *dest_mp,
2302 : enum mnt_tree_flags_t flags)
2303 : {
2304 255403 : struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
2305 255403 : HLIST_HEAD(tree_list);
2306 255403 : struct mnt_namespace *ns = top_mnt->mnt_ns;
2307 255403 : struct mountpoint *smp;
2308 255403 : struct mount *child, *dest_mnt, *p;
2309 255403 : struct hlist_node *n;
2310 255403 : int err = 0;
2311 255403 : bool moving = flags & MNT_TREE_MOVE, beneath = flags & MNT_TREE_BENEATH;
2312 :
2313 : /*
2314 : * Preallocate a mountpoint in case the new mounts need to be
2315 : * mounted beneath mounts on the same mountpoint.
2316 : */
2317 255403 : smp = get_mountpoint(source_mnt->mnt.mnt_root);
2318 255403 : if (IS_ERR(smp))
2319 0 : return PTR_ERR(smp);
2320 :
2321 : /* Is there space to add these mounts to the mount namespace? */
2322 255403 : if (!moving) {
2323 255085 : err = count_mounts(ns, source_mnt);
2324 255085 : if (err)
2325 0 : goto out;
2326 : }
2327 :
2328 255403 : if (beneath)
2329 0 : dest_mnt = top_mnt->mnt_parent;
2330 : else
2331 : dest_mnt = top_mnt;
2332 :
2333 255403 : if (IS_MNT_SHARED(dest_mnt)) {
2334 251847 : err = invent_group_ids(source_mnt, true);
2335 251847 : if (err)
2336 0 : goto out;
2337 251847 : err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list);
2338 : }
2339 255403 : lock_mount_hash();
2340 255403 : if (err)
2341 0 : goto out_cleanup_ids;
2342 :
2343 255403 : if (IS_MNT_SHARED(dest_mnt)) {
2344 503694 : for (p = source_mnt; p; p = next_mnt(p, source_mnt))
2345 251847 : set_mnt_shared(p);
2346 : }
2347 :
2348 255403 : if (moving) {
2349 318 : if (beneath)
2350 0 : dest_mp = smp;
2351 318 : unhash_mnt(source_mnt);
2352 318 : attach_mnt(source_mnt, top_mnt, dest_mp, beneath);
2353 318 : touch_mnt_namespace(source_mnt->mnt_ns);
2354 : } else {
2355 255085 : if (source_mnt->mnt_ns) {
2356 : /* move from anon - the caller will destroy */
2357 130534 : list_del_init(&source_mnt->mnt_ns->list);
2358 : }
2359 255085 : if (beneath)
2360 0 : mnt_set_mountpoint_beneath(source_mnt, top_mnt, smp);
2361 : else
2362 255085 : mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
2363 255085 : commit_tree(source_mnt);
2364 : }
2365 :
2366 3153504 : hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) {
2367 2642698 : struct mount *q;
2368 2642698 : hlist_del_init(&child->mnt_hash);
2369 2642698 : q = __lookup_mnt(&child->mnt_parent->mnt,
2370 : child->mnt_mountpoint);
2371 2642698 : if (q)
2372 0 : mnt_change_mountpoint(child, smp, q);
2373 : /* Notice when we are propagating across user namespaces */
2374 2642698 : if (child->mnt_parent->mnt_ns->user_ns != user_ns)
2375 0 : lock_mnt_tree(child);
2376 2642698 : child->mnt.mnt_flags &= ~MNT_LOCKED;
2377 2642698 : commit_tree(child);
2378 : }
2379 255403 : put_mountpoint(smp);
2380 255403 : unlock_mount_hash();
2381 :
2382 255403 : return 0;
2383 :
2384 : out_cleanup_ids:
2385 0 : while (!hlist_empty(&tree_list)) {
2386 0 : child = hlist_entry(tree_list.first, struct mount, mnt_hash);
2387 0 : child->mnt_parent->mnt_ns->pending_mounts = 0;
2388 0 : umount_tree(child, UMOUNT_SYNC);
2389 : }
2390 0 : unlock_mount_hash();
2391 0 : cleanup_group_ids(source_mnt, NULL);
2392 0 : out:
2393 0 : ns->pending_mounts = 0;
2394 :
2395 0 : read_seqlock_excl(&mount_lock);
2396 0 : put_mountpoint(smp);
2397 0 : read_sequnlock_excl(&mount_lock);
2398 :
2399 0 : return err;
2400 : }
2401 :
2402 : /**
2403 : * do_lock_mount - lock mount and mountpoint
2404 : * @path: target path
2405 : * @beneath: whether the intention is to mount beneath @path
2406 : *
2407 : * Follow the mount stack on @path until the top mount @mnt is found. If
2408 : * the initial @path->{mnt,dentry} is a mountpoint, look up the first
2409 : * mount stacked on top of it. Then simply follow @{mnt,mnt->mnt_root}
2410 : * until nothing is stacked on top of it anymore.
2411 : *
2412 : * Acquire the inode_lock() on the top mount's ->mnt_root to protect
2413 : * against concurrent removal of the new mountpoint from another mount
2414 : * namespace.
2415 : *
2416 : * If @beneath is requested, the inode_lock() on @mnt's mountpoint
2417 : * @mp on @mnt->mnt_parent must be acquired instead. This protects against a
2418 : * concurrent unlink of @mp->mnt_dentry from another mount namespace
2419 : * where @mnt doesn't have a child mount mounted on @mp. A concurrent
2420 : * removal of @mnt->mnt_root doesn't matter as nothing will be mounted
2421 : * on top of it for @beneath.
2422 : *
2423 : * In addition, @beneath needs to make sure that @mnt hasn't been
2424 : * unmounted or moved from its current mountpoint in between dropping
2425 : * @mount_lock and acquiring @namespace_sem. For the !@beneath case @mnt
2426 : * being unmounted would be detected later by e.g., calling
2427 : * check_mnt(mnt) in the function it's called from. For the @beneath
2428 : * case however, it's useful to detect it directly in do_lock_mount().
2429 : * If @mnt hasn't been unmounted then @mnt->mnt_mountpoint still points
2430 : * to @mnt->mnt_mp->m_dentry. But if @mnt has been unmounted it will
2431 : * point to @mnt->mnt_root and @mnt->mnt_mp will be NULL.
2432 : *
2433 : * Return: Either the target mountpoint on the top mount or the top
2434 : * mount's mountpoint.
2435 : */
2436 255651 : static struct mountpoint *do_lock_mount(struct path *path, bool beneath)
2437 : {
2438 255651 : struct vfsmount *mnt = path->mnt;
2439 255651 : struct dentry *dentry;
2440 255651 : struct mountpoint *mp = ERR_PTR(-ENOENT);
2441 :
2442 255651 : for (;;) {
2443 255651 : struct mount *m;
2444 :
2445 255651 : if (beneath) {
2446 0 : m = real_mount(mnt);
2447 0 : read_seqlock_excl(&mount_lock);
2448 0 : dentry = dget(m->mnt_mountpoint);
2449 0 : read_sequnlock_excl(&mount_lock);
2450 : } else {
2451 255651 : dentry = path->dentry;
2452 : }
2453 :
2454 255651 : inode_lock(dentry->d_inode);
2455 255655 : if (unlikely(cant_mount(dentry))) {
2456 0 : inode_unlock(dentry->d_inode);
2457 0 : goto out;
2458 : }
2459 :
2460 255655 : namespace_lock();
2461 :
2462 255658 : if (beneath && (!is_mounted(mnt) || m->mnt_mountpoint != dentry)) {
2463 0 : namespace_unlock();
2464 0 : inode_unlock(dentry->d_inode);
2465 0 : goto out;
2466 : }
2467 :
2468 255658 : mnt = lookup_mnt(path);
2469 255658 : if (likely(!mnt))
2470 : break;
2471 :
2472 0 : namespace_unlock();
2473 0 : inode_unlock(dentry->d_inode);
2474 0 : if (beneath)
2475 0 : dput(dentry);
2476 0 : path_put(path);
2477 0 : path->mnt = mnt;
2478 0 : path->dentry = dget(mnt->mnt_root);
2479 : }
2480 :
2481 255658 : mp = get_mountpoint(dentry);
2482 255658 : if (IS_ERR(mp)) {
2483 0 : namespace_unlock();
2484 0 : inode_unlock(dentry->d_inode);
2485 : }
2486 :
2487 255658 : out:
2488 255658 : if (beneath)
2489 0 : dput(dentry);
2490 :
2491 255658 : return mp;
2492 : }
2493 :
2494 : static inline struct mountpoint *lock_mount(struct path *path)
2495 : {
2496 124753 : return do_lock_mount(path, false);
2497 : }
2498 :
2499 255669 : static void unlock_mount(struct mountpoint *where)
2500 : {
2501 255669 : struct dentry *dentry = where->m_dentry;
2502 :
2503 255669 : read_seqlock_excl(&mount_lock);
2504 255669 : put_mountpoint(where);
2505 255669 : read_sequnlock_excl(&mount_lock);
2506 :
2507 255669 : namespace_unlock();
2508 255669 : inode_unlock(dentry->d_inode);
2509 255669 : }
2510 :
2511 124551 : static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp)
2512 : {
2513 124551 : if (mnt->mnt.mnt_sb->s_flags & SB_NOUSER)
2514 : return -EINVAL;
2515 :
2516 249102 : if (d_is_dir(mp->m_dentry) !=
2517 124551 : d_is_dir(mnt->mnt.mnt_root))
2518 : return -ENOTDIR;
2519 :
2520 124551 : return attach_recursive_mnt(mnt, p, mp, 0);
2521 : }
2522 :
2523 : /*
2524 : * Sanity check the flags to change_mnt_propagation.
2525 : */
2526 :
2527 2640 : static int flags_to_propagation_type(int ms_flags)
2528 : {
2529 2640 : int type = ms_flags & ~(MS_REC | MS_SILENT);
2530 :
2531 : /* Fail if any non-propagation flags are set */
2532 2640 : if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
2533 : return 0;
2534 : /* Only one propagation flag should be set */
2535 5280 : if (!is_power_of_2(type))
2536 0 : return 0;
2537 : return type;
2538 : }
2539 :
2540 : /*
2541 : * recursively change the type of the mountpoint.
2542 : */
2543 2640 : static int do_change_type(struct path *path, int ms_flags)
2544 : {
2545 2640 : struct mount *m;
2546 2640 : struct mount *mnt = real_mount(path->mnt);
2547 2640 : int recurse = ms_flags & MS_REC;
2548 2640 : int type;
2549 2640 : int err = 0;
2550 :
2551 2640 : if (!path_mounted(path))
2552 : return -EINVAL;
2553 :
2554 2640 : type = flags_to_propagation_type(ms_flags);
2555 2640 : if (!type)
2556 : return -EINVAL;
2557 :
2558 2640 : namespace_lock();
2559 2640 : if (type == MS_SHARED) {
2560 1025 : err = invent_group_ids(mnt, recurse);
2561 1025 : if (err)
2562 0 : goto out_unlock;
2563 : }
2564 :
2565 2640 : lock_mount_hash();
2566 69169 : for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL))
2567 33094 : change_mnt_propagation(m, type);
2568 2640 : unlock_mount_hash();
2569 :
2570 2640 : out_unlock:
2571 2640 : namespace_unlock();
2572 2640 : return err;
2573 : }
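
/*
 * Illustrative userspace sketch: do_change_type() is reached via mount(2)
 * carrying only a propagation flag - source, fstype and data are ignored.
 * This is what `mount --make-rprivate /` and friends boil down to.
 */
#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* Recursively stop propagation into/out of the tree under "/". */
	if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL) == -1)
		perror("MS_REC|MS_PRIVATE");
	/* Make a single mount a propagation source for future binds. */
	if (mount(NULL, "/mnt", NULL, MS_SHARED, NULL) == -1)
		perror("MS_SHARED");
	return 0;
}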
2574 :
2575 133657 : static struct mount *__do_loopback(struct path *old_path, int recurse)
2576 : {
2577 133657 : struct mount *mnt = ERR_PTR(-EINVAL), *old = real_mount(old_path->mnt);
2578 :
2579 133657 : if (IS_MNT_UNBINDABLE(old))
2580 : return mnt;
2581 :
2582 133488 : if (!check_mnt(old) && old_path->dentry->d_op != &ns_dentry_operations)
2583 : return mnt;
2584 :
2585 133488 : if (!recurse && has_locked_children(old, old_path->dentry))
2586 : return mnt;
2587 :
2588 133488 : if (recurse)
2589 665 : mnt = copy_tree(old, old_path->dentry, CL_COPY_MNT_NS_FILE);
2590 : else
2591 132823 : mnt = clone_mnt(old, old_path->dentry, 0);
2592 :
2593 133488 : if (!IS_ERR(mnt))
2594 133488 : mnt->mnt.mnt_flags &= ~MNT_LOCKED;
2595 :
2596 : return mnt;
2597 : }
2598 :
2599 : /*
2600 : * do loopback mount.
2601 : */
2602 2364 : static int do_loopback(struct path *path, const char *old_name,
2603 : int recurse)
2604 : {
2605 2364 : struct path old_path;
2606 2364 : struct mount *mnt = NULL, *parent;
2607 2364 : struct mountpoint *mp;
2608 2364 : int err;
2609 2364 : if (!old_name || !*old_name)
2610 : return -EINVAL;
2611 2364 : err = kern_path(old_name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &old_path);
2612 2364 : if (err)
2613 : return err;
2614 :
2615 2364 : err = -EINVAL;
2616 2364 : if (mnt_ns_loop(old_path.dentry))
2617 0 : goto out;
2618 :
2619 2364 : mp = lock_mount(path);
2620 2364 : if (IS_ERR(mp)) {
2621 0 : err = PTR_ERR(mp);
2622 0 : goto out;
2623 : }
2624 :
2625 2364 : parent = real_mount(path->mnt);
2626 2364 : if (!check_mnt(parent))
2627 0 : goto out2;
2628 :
2629 2364 : mnt = __do_loopback(&old_path, recurse);
2630 2364 : if (IS_ERR(mnt)) {
2631 169 : err = PTR_ERR(mnt);
2632 169 : goto out2;
2633 : }
2634 :
2635 2195 : err = graft_tree(mnt, parent, mp);
2636 2195 : if (err) {
2637 0 : lock_mount_hash();
2638 0 : umount_tree(mnt, UMOUNT_SYNC);
2639 0 : unlock_mount_hash();
2640 : }
2641 2195 : out2:
2642 2364 : unlock_mount(mp);
2643 2364 : out:
2644 2364 : path_put(&old_path);
2645 2364 : return err;
2646 : }
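
/*
 * Illustrative userspace sketch: do_loopback() is the MS_BIND path of
 * mount(2). Without MS_REC only the named mount is cloned; with MS_REC
 * the whole tree beneath the source comes along. Paths are made up.
 */
#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	if (mount("/srv/data", "/mnt/data", NULL, MS_BIND, NULL) == -1)
		perror("MS_BIND");
	if (mount("/srv", "/mnt/srv", NULL, MS_BIND | MS_REC, NULL) == -1)
		perror("MS_BIND|MS_REC");
	return 0;
}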
2647 :
2648 131293 : static struct file *open_detached_copy(struct path *path, bool recursive)
2649 : {
2650 131293 : struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
2651 131293 : struct mnt_namespace *ns = alloc_mnt_ns(user_ns, true);
2652 131293 : struct mount *mnt, *p;
2653 131293 : struct file *file;
2654 :
2655 131293 : if (IS_ERR(ns))
2656 : return ERR_CAST(ns);
2657 :
2658 131293 : namespace_lock();
2659 131293 : mnt = __do_loopback(path, recursive);
2660 131293 : if (IS_ERR(mnt)) {
2661 0 : namespace_unlock();
2662 0 : free_mnt_ns(ns);
2663 0 : return ERR_CAST(mnt);
2664 : }
2665 :
2666 131293 : lock_mount_hash();
2667 393879 : for (p = mnt; p; p = next_mnt(p, mnt)) {
2668 131293 : p->mnt_ns = ns;
2669 131293 : ns->mounts++;
2670 : }
2671 131293 : ns->root = mnt;
2672 131293 : list_add_tail(&ns->list, &mnt->mnt_list);
2673 131293 : mntget(&mnt->mnt);
2674 131293 : unlock_mount_hash();
2675 131293 : namespace_unlock();
2676 :
2677 131293 : mntput(path->mnt);
2678 131293 : path->mnt = &mnt->mnt;
2679 131293 : file = dentry_open(path, O_PATH, current_cred());
2680 131293 : if (IS_ERR(file))
2681 0 : dissolve_on_fput(path->mnt);
2682 : else
2683 131293 : file->f_mode |= FMODE_NEED_UNMOUNT;
2684 : return file;
2685 : }
2686 :
2687 262586 : SYSCALL_DEFINE3(open_tree, int, dfd, const char __user *, filename, unsigned, flags)
2688 : {
2689 131293 : struct file *file;
2690 131293 : struct path path;
2691 131293 : int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW;
2692 131293 : bool detached = flags & OPEN_TREE_CLONE;
2693 131293 : int error;
2694 131293 : int fd;
2695 :
2696 131293 : BUILD_BUG_ON(OPEN_TREE_CLOEXEC != O_CLOEXEC);
2697 :
2698 131293 : if (flags & ~(AT_EMPTY_PATH | AT_NO_AUTOMOUNT | AT_RECURSIVE |
2699 : AT_SYMLINK_NOFOLLOW | OPEN_TREE_CLONE |
2700 : OPEN_TREE_CLOEXEC))
2701 : return -EINVAL;
2702 :
2703 131293 : if ((flags & (AT_RECURSIVE | OPEN_TREE_CLONE)) == AT_RECURSIVE)
2704 : return -EINVAL;
2705 :
2706 131293 : if (flags & AT_NO_AUTOMOUNT)
2707 811 : lookup_flags &= ~LOOKUP_AUTOMOUNT;
2708 131293 : if (flags & AT_SYMLINK_NOFOLLOW)
2709 811 : lookup_flags &= ~LOOKUP_FOLLOW;
2710 131293 : if (flags & AT_EMPTY_PATH)
2711 131188 : lookup_flags |= LOOKUP_EMPTY;
2712 :
2713 131293 : if (detached && !may_mount())
2714 : return -EPERM;
2715 :
2716 131293 : fd = get_unused_fd_flags(flags & O_CLOEXEC);
2717 131293 : if (fd < 0)
2718 0 : return fd;
2719 :
2720 131293 : error = user_path_at(dfd, filename, lookup_flags, &path);
2721 131293 : if (unlikely(error)) {
2722 0 : file = ERR_PTR(error);
2723 : } else {
2724 131293 : if (detached)
2725 131293 : file = open_detached_copy(&path, flags & AT_RECURSIVE);
2726 : else
2727 0 : file = dentry_open(&path, O_PATH, current_cred());
2728 131293 : path_put(&path);
2729 : }
2730 131293 : if (IS_ERR(file)) {
2731 0 : put_unused_fd(fd);
2732 0 : return PTR_ERR(file);
2733 : }
2734 131293 : fd_install(fd, file);
2735 131293 : return fd;
2736 : }
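
/*
 * Illustrative userspace sketch: open_tree(OPEN_TREE_CLONE) yields the
 * detached copy built by open_detached_copy() above; attaching it with
 * move_mount() gives an atomic bind-mount of a whole tree. Raw syscall
 * numbers are used since glibc wrappers may be missing; paths are made up.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/mount.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef AT_RECURSIVE
#define AT_RECURSIVE 0x8000		/* from <linux/fcntl.h> */
#endif

int main(void)
{
	int fd = syscall(SYS_open_tree, AT_FDCWD, "/srv/data",
			 OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC | AT_RECURSIVE);
	if (fd < 0) {
		perror("open_tree");
		return 1;
	}
	/* Until attached, the copy dissolves on the last close(fd). */
	if (syscall(SYS_move_mount, fd, "", AT_FDCWD, "/mnt/data",
		    MOVE_MOUNT_F_EMPTY_PATH) < 0)
		perror("move_mount");
	close(fd);
	return 0;
}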
2737 :
2738 : /*
2739 : * Don't allow locked mount flags to be cleared.
2740 : *
2741 : * No locks need to be held here while testing the various MNT_LOCK
2742 : * flags because those flags can never be cleared once they are set.
2743 : */
2744 22230 : static bool can_change_locked_flags(struct mount *mnt, unsigned int mnt_flags)
2745 : {
2746 22230 : unsigned int fl = mnt->mnt.mnt_flags;
2747 :
2748 22230 : if ((fl & MNT_LOCK_READONLY) &&
2749 0 : !(mnt_flags & MNT_READONLY))
2750 : return false;
2751 :
2752 22230 : if ((fl & MNT_LOCK_NODEV) &&
2753 0 : !(mnt_flags & MNT_NODEV))
2754 : return false;
2755 :
2756 22230 : if ((fl & MNT_LOCK_NOSUID) &&
2757 0 : !(mnt_flags & MNT_NOSUID))
2758 : return false;
2759 :
2760 22230 : if ((fl & MNT_LOCK_NOEXEC) &&
2761 0 : !(mnt_flags & MNT_NOEXEC))
2762 : return false;
2763 :
2764 22230 : if ((fl & MNT_LOCK_ATIME) &&
2765 0 : ((fl & MNT_ATIME_MASK) != (mnt_flags & MNT_ATIME_MASK)))
2766 0 : return false;
2767 :
2768 : return true;
2769 : }
2770 :
2771 12119 : static int change_mount_ro_state(struct mount *mnt, unsigned int mnt_flags)
2772 : {
2773 12119 : bool readonly_request = (mnt_flags & MNT_READONLY);
2774 :
2775 15313 : if (readonly_request == __mnt_is_readonly(&mnt->mnt))
2776 : return 0;
2777 :
2778 1952 : if (readonly_request)
2779 1952 : return mnt_make_readonly(mnt);
2780 :
2781 0 : mnt->mnt.mnt_flags &= ~MNT_READONLY;
2782 0 : return 0;
2783 : }
2784 :
2785 : static void set_mount_attributes(struct mount *mnt, unsigned int mnt_flags)
2786 : {
2787 13851 : mnt_flags |= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK;
2788 13851 : mnt->mnt.mnt_flags = mnt_flags;
2789 13851 : touch_mnt_namespace(mnt->mnt_ns);
2790 12119 : }
2791 :
2792 143366 : static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount *mnt)
2793 : {
2794 143366 : struct super_block *sb = mnt->mnt_sb;
2795 :
2796 275316 : if (!__mnt_is_readonly(mnt) &&
2797 129647 : (!(sb->s_iflags & SB_I_TS_EXPIRY_WARNED)) &&
2798 129296 : (ktime_get_real_seconds() + TIME_UPTIME_SEC_MAX > sb->s_time_max)) {
2799 551 : char *buf = (char *)__get_free_page(GFP_KERNEL);
2800 551 : char *mntpath = buf ? d_path(mountpoint, buf, PAGE_SIZE) : ERR_PTR(-ENOMEM);
2801 :
2802 1102 : pr_warn("%s filesystem being %s at %s supports timestamps until %ptTd (0x%llx)\n",
2803 : sb->s_type->name,
2804 : is_mounted(mnt) ? "remounted" : "mounted",
2805 : mntpath, &sb->s_time_max,
2806 : (unsigned long long)sb->s_time_max);
2807 :
2808 551 : free_page((unsigned long)buf);
2809 551 : sb->s_iflags |= SB_I_TS_EXPIRY_WARNED;
2810 : }
2811 143366 : }
2812 :
2813 : /*
2814 : * Handle reconfiguration of the mountpoint only without alteration of the
2815 : * superblock it refers to. This is triggered by specifying MS_REMOUNT|MS_BIND
2816 : * to mount(2).
2817 : */
2818 12119 : static int do_reconfigure_mnt(struct path *path, unsigned int mnt_flags)
2819 : {
2820 12119 : struct super_block *sb = path->mnt->mnt_sb;
2821 12119 : struct mount *mnt = real_mount(path->mnt);
2822 12119 : int ret;
2823 :
2824 12119 : if (!check_mnt(mnt))
2825 : return -EINVAL;
2826 :
2827 12119 : if (!path_mounted(path))
2828 : return -EINVAL;
2829 :
2830 12119 : if (!can_change_locked_flags(mnt, mnt_flags))
2831 : return -EPERM;
2832 :
2833 : /*
2834 : * We're only checking whether the superblock is read-only not
2835 : * changing it, so only take down_read(&sb->s_umount).
2836 : */
2837 12119 : down_read(&sb->s_umount);
2838 12119 : lock_mount_hash();
2839 12119 : ret = change_mount_ro_state(mnt, mnt_flags);
2840 12119 : if (ret == 0)
2841 12119 : set_mount_attributes(mnt, mnt_flags);
2842 12119 : unlock_mount_hash();
2843 12119 : up_read(&sb->s_umount);
2844 :
2845 12119 : mnt_warn_timestamp_expiry(path, &mnt->mnt);
2846 :
2847 12119 : return ret;
2848 : }
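
/*
 * Illustrative userspace sketch: the MS_REMOUNT|MS_BIND path above only
 * touches per-mountpoint flags, so one bind mount of a filesystem can be
 * made read-only while other mounts of the same superblock stay writable.
 * The path is made up.
 */
#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	if (mount(NULL, "/mnt/ro-view", NULL,
		  MS_REMOUNT | MS_BIND | MS_RDONLY, NULL) == -1)
		perror("remount bind read-only");
	return 0;
}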
2849 :
2850 : /*
2851 : * change filesystem flags. dir should be the physical root of the filesystem.
2852 : * If you've mounted a non-root directory somewhere and want to do remount
2853 : * on it - tough luck.
2854 : */
2855 8857 : static int do_remount(struct path *path, int ms_flags, int sb_flags,
2856 : int mnt_flags, void *data)
2857 : {
2858 8857 : int err;
2859 8857 : struct super_block *sb = path->mnt->mnt_sb;
2860 8857 : struct mount *mnt = real_mount(path->mnt);
2861 8857 : struct fs_context *fc;
2862 :
2863 8857 : if (!check_mnt(mnt))
2864 : return -EINVAL;
2865 :
2866 8857 : if (!path_mounted(path))
2867 : return -EINVAL;
2868 :
2869 8857 : if (!can_change_locked_flags(mnt, mnt_flags))
2870 : return -EPERM;
2871 :
2872 8857 : fc = fs_context_for_reconfigure(path->dentry, sb_flags, MS_RMT_MASK);
2873 8857 : if (IS_ERR(fc))
2874 0 : return PTR_ERR(fc);
2875 :
2876 8857 : fc->oldapi = true;
2877 8857 : err = parse_monolithic_mount_data(fc, data);
2878 8857 : if (!err) {
2879 8806 : down_write(&sb->s_umount);
2880 8806 : err = -EPERM;
2881 8806 : if (ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) {
2882 8806 : err = reconfigure_super(fc);
2883 8806 : if (!err) {
2884 1732 : lock_mount_hash();
2885 1732 : set_mount_attributes(mnt, mnt_flags);
2886 1732 : unlock_mount_hash();
2887 : }
2888 : }
2889 8806 : up_write(&sb->s_umount);
2890 : }
2891 :
2892 8857 : mnt_warn_timestamp_expiry(path, &mnt->mnt);
2893 :
2894 8857 : put_fs_context(fc);
2895 8857 : return err;
2896 : }
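
/*
 * Illustrative userspace sketch: without MS_BIND, MS_REMOUNT lands in
 * do_remount() and reconfigures the superblock itself, here flipping the
 * whole filesystem read-only. As the comment above warns, this must be
 * done on the mount root.
 */
#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	if (mount(NULL, "/mnt", NULL, MS_REMOUNT | MS_RDONLY, NULL) == -1)
		perror("MS_REMOUNT|MS_RDONLY");
	return 0;
}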
2897 :
2898 130534 : static inline int tree_contains_unbindable(struct mount *mnt)
2899 : {
2900 130534 : struct mount *p;
2901 391576 : for (p = mnt; p; p = next_mnt(p, mnt)) {
2902 130534 : if (IS_MNT_UNBINDABLE(p))
2903 : return 1;
2904 : }
2905 : return 0;
2906 : }
2907 :
2908 : /*
2909 : * Check that there aren't references to earlier/same mount namespaces in the
2910 : * specified subtree. Such references can act as pins for mount namespaces
2911 : * that aren't checked by the mount-cycle checking code, thereby allowing
2912 : * cycles to be made.
2913 : */
2914 130852 : static bool check_for_nsfs_mounts(struct mount *subtree)
2915 : {
2916 130852 : struct mount *p;
2917 130852 : bool ret = false;
2918 :
2919 130852 : lock_mount_hash();
2920 536612 : for (p = subtree; p; p = next_mnt(p, subtree))
2921 137454 : if (mnt_ns_loop(p->mnt.mnt_root))
2922 0 : goto out;
2923 :
2924 : ret = true;
2925 130852 : out:
2926 130852 : unlock_mount_hash();
2927 130852 : return ret;
2928 : }
2929 :
2930 0 : static int do_set_group(struct path *from_path, struct path *to_path)
2931 : {
2932 0 : struct mount *from, *to;
2933 0 : int err;
2934 :
2935 0 : from = real_mount(from_path->mnt);
2936 0 : to = real_mount(to_path->mnt);
2937 :
2938 0 : namespace_lock();
2939 :
2940 0 : err = -EINVAL;
2941 : /* To and From must be mounted */
2942 0 : if (!is_mounted(&from->mnt))
2943 0 : goto out;
2944 0 : if (!is_mounted(&to->mnt))
2945 0 : goto out;
2946 :
2947 0 : err = -EPERM;
2948 : /* We should be allowed to modify mount namespaces of both mounts */
2949 0 : if (!ns_capable(from->mnt_ns->user_ns, CAP_SYS_ADMIN))
2950 0 : goto out;
2951 0 : if (!ns_capable(to->mnt_ns->user_ns, CAP_SYS_ADMIN))
2952 0 : goto out;
2953 :
2954 0 : err = -EINVAL;
2955 : /* To and From paths should be mount roots */
2956 0 : if (!path_mounted(from_path))
2957 0 : goto out;
2958 0 : if (!path_mounted(to_path))
2959 0 : goto out;
2960 :
2961 : /* Setting sharing groups is only allowed across same superblock */
2962 0 : if (from->mnt.mnt_sb != to->mnt.mnt_sb)
2963 0 : goto out;
2964 :
2965 : /* From mount root should be wider than To mount root */
2966 0 : if (!is_subdir(to->mnt.mnt_root, from->mnt.mnt_root))
2967 0 : goto out;
2968 :
2969 : /* From mount should not have locked children in place of To's root */
2970 0 : if (has_locked_children(from, to->mnt.mnt_root))
2971 0 : goto out;
2972 :
2973 : /* Setting sharing groups is only allowed on private mounts */
2974 0 : if (IS_MNT_SHARED(to) || IS_MNT_SLAVE(to))
2975 0 : goto out;
2976 :
2977 : /* From should not be private */
2978 0 : if (!IS_MNT_SHARED(from) && !IS_MNT_SLAVE(from))
2979 0 : goto out;
2980 :
2981 0 : if (IS_MNT_SLAVE(from)) {
2982 0 : struct mount *m = from->mnt_master;
2983 :
2984 0 : list_add(&to->mnt_slave, &m->mnt_slave_list);
2985 0 : to->mnt_master = m;
2986 : }
2987 :
2988 0 : if (IS_MNT_SHARED(from)) {
2989 0 : to->mnt_group_id = from->mnt_group_id;
2990 0 : list_add(&to->mnt_share, &from->mnt_share);
2991 0 : lock_mount_hash();
2992 0 : set_mnt_shared(to);
2993 0 : unlock_mount_hash();
2994 : }
2995 :
2996 : err = 0;
2997 0 : out:
2998 0 : namespace_unlock();
2999 0 : return err;
3000 : }
3001 :
3002 : /**
3003 : * path_overmounted - check if path is overmounted
3004 : * @path: path to check
3005 : *
3006 : * Check if path is overmounted, i.e., if there's a mount on top of
3007 : * @path->mnt with @path->dentry as mountpoint.
3008 : *
3009 : * Context: This function expects namespace_lock() to be held.
3010 : * Return: If path is overmounted true is returned, false if not.
3011 : */
3012 11 : static inline bool path_overmounted(const struct path *path)
3013 : {
3014 11 : rcu_read_lock();
3015 11 : if (unlikely(__lookup_mnt(path->mnt, path->dentry))) {
3016 0 : rcu_read_unlock();
3017 0 : return true;
3018 : }
3019 11 : rcu_read_unlock();
3020 11 : return false;
3021 : }
3022 :
3023 : /**
3024 : * can_move_mount_beneath - check that we can mount beneath the top mount
3025 : * @from: mount to mount beneath
3026 : * @to: mount under which to mount
3027 : *
3028 : * - Make sure that @to->dentry is actually the root of a mount under
3029 : * which we can mount another mount.
3030 : * - Make sure that nothing can be mounted beneath the caller's current
3031 : * root or the rootfs of the namespace.
3032 : * - Make sure that the caller can unmount the topmost mount ensuring
3033 : * that the caller could reveal the underlying mountpoint.
3034 : * - Ensure that nothing has been mounted on top of @from before we
3035 : * grabbed @namespace_sem to avoid creating pointless shadow mounts.
3036 : * - Prevent mounting beneath a mount if the propagation relationship
3037 : * between the source mount, parent mount, and top mount would lead to
3038 : * nonsensical mount trees.
3039 : *
3040 : * Context: This function expects namespace_lock() to be held.
3041 : * Return: On success 0, and on error a negative error code is returned.
3042 : */
3043 0 : static int can_move_mount_beneath(const struct path *from,
3044 : const struct path *to,
3045 : const struct mountpoint *mp)
3046 : {
3047 0 : struct mount *mnt_from = real_mount(from->mnt),
3048 0 : *mnt_to = real_mount(to->mnt),
3049 0 : *parent_mnt_to = mnt_to->mnt_parent;
3050 :
3051 0 : if (!mnt_has_parent(mnt_to))
3052 : return -EINVAL;
3053 :
3054 0 : if (!path_mounted(to))
3055 : return -EINVAL;
3056 :
3057 0 : if (IS_MNT_LOCKED(mnt_to))
3058 : return -EINVAL;
3059 :
3060 : /* Avoid creating shadow mounts during mount propagation. */
3061 0 : if (path_overmounted(from))
3062 : return -EINVAL;
3063 :
3064 : /*
3065 : * Mounting beneath the rootfs only makes sense when the
3066 : * semantics of pivot_root(".", ".") are used.
3067 : */
3068 0 : if (&mnt_to->mnt == current->fs->root.mnt)
3069 : return -EINVAL;
3070 0 : if (parent_mnt_to == current->nsproxy->mnt_ns->root)
3071 : return -EINVAL;
3072 :
3073 0 : for (struct mount *p = mnt_from; mnt_has_parent(p); p = p->mnt_parent)
3074 0 : if (p == mnt_to)
3075 : return -EINVAL;
3076 :
3077 : /*
3078 : * If the parent mount propagates to the child mount this would
3079 : * mean mounting @mnt_from on @mnt_to->mnt_parent and then
3080 : * propagating a copy @c of @mnt_from on top of @mnt_to. This
3081 : * defeats the whole purpose of mounting beneath another mount.
3082 : */
3083 0 : if (propagation_would_overmount(parent_mnt_to, mnt_to, mp))
3084 : return -EINVAL;
3085 :
3086 : /*
3087 : * If @mnt_to->mnt_parent propagates to @mnt_from this would
3088 : * mean propagating a copy @c of @mnt_from on top of @mnt_from.
3089 : * Afterwards @mnt_from would be mounted on top of
3090 : * @mnt_to->mnt_parent and @mnt_to would be unmounted from
3091 : * @mnt->mnt_parent and remounted on @mnt_from. But since @c is
3092 : * already mounted on @mnt_from, @mnt_to would ultimately be
3093 : * remounted on top of @c. Afterwards, @mnt_from would be
3094 : * covered by a copy @c of @mnt_from and @c would be covered by
3095 : * @mnt_from itself. This defeats the whole purpose of mounting
3096 : * @mnt_from beneath @mnt_to.
3097 : */
3098 0 : if (propagation_would_overmount(parent_mnt_to, mnt_from, mp))
3099 0 : return -EINVAL;
3100 :
3101 : return 0;
3102 : }
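
/*
 * Illustrative userspace sketch: the checks above guard move_mount(2)
 * with MOVE_MOUNT_BENEATH (Linux 6.5+), which tucks a mount underneath
 * the current top mount so that unmounting the top later reveals the new
 * one - a building block for atomic updates. Paths are made up.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/mount.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef MOVE_MOUNT_BENEATH
#define MOVE_MOUNT_BENEATH 0x00000200	/* from <linux/mount.h>, 6.5+ */
#endif

int main(void)
{
	if (syscall(SYS_move_mount, AT_FDCWD, "/srv/new-tree",
		    AT_FDCWD, "/mnt/app", MOVE_MOUNT_BENEATH) < 0)
		perror("move_mount(MOVE_MOUNT_BENEATH)");
	return 0;
}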
3103 :
3104 130904 : static int do_move_mount(struct path *old_path, struct path *new_path,
3105 : bool beneath)
3106 : {
3107 130904 : struct mnt_namespace *ns;
3108 130904 : struct mount *p;
3109 130904 : struct mount *old;
3110 130904 : struct mount *parent;
3111 130904 : struct mountpoint *mp, *old_mp;
3112 130904 : int err;
3113 130904 : bool attached;
3114 130904 : enum mnt_tree_flags_t flags = 0;
3115 :
3116 130904 : mp = do_lock_mount(new_path, beneath);
3117 130904 : if (IS_ERR(mp))
3118 0 : return PTR_ERR(mp);
3119 :
3120 130904 : old = real_mount(old_path->mnt);
3121 130904 : p = real_mount(new_path->mnt);
3122 130904 : parent = old->mnt_parent;
3123 130904 : attached = mnt_has_parent(old);
3124 130904 : if (attached)
3125 370 : flags |= MNT_TREE_MOVE;
3126 130904 : old_mp = old->mnt_mp;
3127 130904 : ns = old->mnt_ns;
3128 :
3129 130904 : err = -EINVAL;
3130 : /* The mountpoint must be in our namespace. */
3131 130904 : if (!check_mnt(p))
3132 0 : goto out;
3133 :
3134 : /* The thing moved must be mounted... */
3135 261808 : if (!is_mounted(&old->mnt))
3136 0 : goto out;
3137 :
3138 : /* ... and either ours or the root of anon namespace */
3139 130904 : if (!(attached ? check_mnt(old) : is_anon_ns(ns)))
3140 0 : goto out;
3141 :
3142 130904 : if (old->mnt.mnt_flags & MNT_LOCKED)
3143 0 : goto out;
3144 :
3145 130904 : if (!path_mounted(old_path))
3146 0 : goto out;
3147 :
3148 261808 : if (d_is_dir(new_path->dentry) !=
3149 : d_is_dir(old_path->dentry))
3150 0 : goto out;
3151 : /*
3152 : * Don't move a mount residing in a shared parent.
3153 : */
3154 130904 : if (attached && IS_MNT_SHARED(parent))
3155 39 : goto out;
3156 :
3157 130865 : if (beneath) {
3158 0 : err = can_move_mount_beneath(old_path, new_path, mp);
3159 0 : if (err)
3160 0 : goto out;
3161 :
3162 0 : err = -EINVAL;
3163 0 : p = p->mnt_parent;
3164 0 : flags |= MNT_TREE_BENEATH;
3165 : }
3166 :
3167 : /*
3168 : * Don't move a mount tree containing unbindable mounts to a destination
3169 : * mount which is shared.
3170 : */
3171 130865 : if (IS_MNT_SHARED(p) && tree_contains_unbindable(old))
3172 13 : goto out;
3173 130852 : err = -ELOOP;
3174 130852 : if (!check_for_nsfs_mounts(old))
3175 0 : goto out;
3176 392995 : for (; mnt_has_parent(p); p = p->mnt_parent)
3177 262143 : if (p == old)
3178 0 : goto out;
3179 :
3180 130852 : err = attach_recursive_mnt(old, real_mount(new_path->mnt), mp, flags);
3181 130852 : if (err)
3182 0 : goto out;
3183 :
3184 : /* if the mount is moved, it should no longer be expired
3185 : * automatically */
3186 130852 : list_del_init(&old->mnt_expire);
3187 130852 : if (attached)
3188 318 : put_mountpoint(old_mp);
3189 130534 : out:
3190 130904 : unlock_mount(mp);
3191 130904 : if (!err) {
3192 130852 : if (attached)
3193 318 : mntput_no_expire(parent);
3194 : else
3195 130534 : free_mnt_ns(ns);
3196 : }
3197 : return err;
3198 : }
3199 :
3200 370 : static int do_move_mount_old(struct path *path, const char *old_name)
3201 : {
3202 370 : struct path old_path;
3203 370 : int err;
3204 :
3205 370 : if (!old_name || !*old_name)
3206 : return -EINVAL;
3207 :
3208 370 : err = kern_path(old_name, LOOKUP_FOLLOW, &old_path);
3209 370 : if (err)
3210 : return err;
3211 :
3212 370 : err = do_move_mount(&old_path, path, false);
3213 370 : path_put(&old_path);
3214 370 : return err;
3215 : }
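
/*
 * Illustrative userspace sketch: do_move_mount_old() is the classic
 * mount(2) MS_MOVE operation - atomically relocating an attached mount.
 * It fails with EINVAL if the mount's current parent is shared, per the
 * check in do_move_mount(). Paths are made up.
 */
#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	if (mount("/mnt/old-place", "/mnt/new-place", NULL, MS_MOVE, NULL) == -1)
		perror("MS_MOVE");
	return 0;
}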
3216 :
3217 : /*
3218 : * add a mount into a namespace's mount tree
3219 : */
3220 122401 : static int do_add_mount(struct mount *newmnt, struct mountpoint *mp,
3221 : const struct path *path, int mnt_flags)
3222 : {
3223 122401 : struct mount *parent = real_mount(path->mnt);
3224 :
3225 122401 : mnt_flags &= ~MNT_INTERNAL_FLAGS;
3226 :
3227 122401 : if (unlikely(!check_mnt(parent))) {
3228 : /* that's acceptable only for automounts done in private ns */
3229 0 : if (!(mnt_flags & MNT_SHRINKABLE))
3230 : return -EINVAL;
3231 : /* ... and for those we'd better have mountpoint still alive */
3232 0 : if (!parent->mnt_ns)
3233 : return -EINVAL;
3234 : }
3235 :
3236 : /* Refuse the same filesystem on the same mount point */
3237 122401 : if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb && path_mounted(path))
3238 : return -EBUSY;
3239 :
3240 122356 : if (d_is_symlink(newmnt->mnt.mnt_root))
3241 : return -EINVAL;
3242 :
3243 122356 : newmnt->mnt.mnt_flags = mnt_flags;
3244 122356 : return graft_tree(newmnt, parent, mp);
3245 : }
3246 :
3247 : static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags);
3248 :
3249 : /*
3250 : * Create a new mount using a superblock configuration and request it
3251 : * be added to the namespace tree.
3252 : */
3253 122318 : static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint,
3254 : unsigned int mnt_flags)
3255 : {
3256 122318 : struct vfsmount *mnt;
3257 122318 : struct mountpoint *mp;
3258 122318 : struct super_block *sb = fc->root->d_sb;
3259 122318 : int error;
3260 :
3261 122318 : error = security_sb_kern_mount(sb);
3262 122318 : if (!error && mount_too_revealing(sb, &mnt_flags))
3263 0 : error = -EPERM;
3264 :
3265 122270 : if (unlikely(error)) {
3266 0 : fc_drop_locked(fc);
3267 0 : return error;
3268 : }
3269 :
3270 122270 : up_write(&sb->s_umount);
3271 :
3272 122321 : mnt = vfs_create_mount(fc);
3273 122390 : if (IS_ERR(mnt))
3274 0 : return PTR_ERR(mnt);
3275 :
3276 122390 : mnt_warn_timestamp_expiry(mountpoint, mnt);
3277 :
3278 122389 : mp = lock_mount(mountpoint);
3279 122390 : if (IS_ERR(mp)) {
3280 0 : mntput(mnt);
3281 0 : return PTR_ERR(mp);
3282 : }
3283 122390 : error = do_add_mount(real_mount(mnt), mp, mountpoint, mnt_flags);
3284 122390 : unlock_mount(mp);
3285 122390 : if (error < 0)
3286 45 : mntput(mnt);
3287 : return error;
3288 : }
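
/*
 * Illustrative userspace sketch: the new mount API arrives here too -
 * fsopen()/fsconfig() build the fs_context, fsmount() creates the mount,
 * move_mount() attaches it. Raw syscalls; "ext4", "/dev/sda1" and "/mnt"
 * are made-up choices, and error handling is trimmed.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/mount.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	int fsfd, mfd;

	fsfd = syscall(SYS_fsopen, "ext4", FSOPEN_CLOEXEC);
	if (fsfd < 0) {
		perror("fsopen");
		return 1;
	}
	syscall(SYS_fsconfig, fsfd, FSCONFIG_SET_STRING, "source", "/dev/sda1", 0);
	if (syscall(SYS_fsconfig, fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) < 0) {
		perror("fsconfig");
		return 1;
	}
	mfd = syscall(SYS_fsmount, fsfd, FSMOUNT_CLOEXEC, 0);
	if (mfd < 0) {
		perror("fsmount");
		return 1;
	}
	if (syscall(SYS_move_mount, mfd, "", AT_FDCWD, "/mnt",
		    MOVE_MOUNT_F_EMPTY_PATH) < 0)
		perror("move_mount");
	close(mfd);
	close(fsfd);
	return 0;
}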
3289 :
3290 : /*
3291 : * create a new mount for userspace and request it to be added into the
3292 : * namespace's tree
3293 : */
3294 123953 : static int do_new_mount(struct path *path, const char *fstype, int sb_flags,
3295 : int mnt_flags, const char *name, void *data)
3296 : {
3297 123953 : struct file_system_type *type;
3298 123953 : struct fs_context *fc;
3299 123953 : const char *subtype = NULL;
3300 123953 : int err = 0;
3301 :
3302 123953 : if (!fstype)
3303 : return -EINVAL;
3304 :
3305 123953 : type = get_fs_type(fstype);
3306 124021 : if (!type)
3307 : return -ENODEV;
3308 :
3309 124021 : if (type->fs_flags & FS_HAS_SUBTYPE) {
3310 0 : subtype = strchr(fstype, '.');
3311 0 : if (subtype) {
3312 0 : subtype++;
3313 0 : if (!*subtype) {
3314 0 : put_filesystem(type);
3315 0 : return -EINVAL;
3316 : }
3317 : }
3318 : }
3319 :
3320 124021 : fc = fs_context_for_mount(type, sb_flags);
3321 124007 : put_filesystem(type);
3322 123993 : if (IS_ERR(fc))
3323 0 : return PTR_ERR(fc);
3324 :
3325 123993 : if (subtype)
3326 0 : err = vfs_parse_fs_string(fc, "subtype",
3327 : subtype, strlen(subtype));
3328 123980 : if (!err && name)
3329 247960 : err = vfs_parse_fs_string(fc, "source", name, strlen(name));
3330 123935 : if (!err)
3331 123937 : err = parse_monolithic_mount_data(fc, data);
3332 123977 : if (!err && !mount_capable(fc))
3333 : err = -EPERM;
3334 123936 : if (!err)
3335 123858 : err = vfs_get_tree(fc);
3336 124011 : if (!err)
3337 122349 : err = do_new_mount_fc(fc, path, mnt_flags);
3338 :
3339 124052 : put_fs_context(fc);
3340 124052 : return err;
3341 : }
3342 :
3343 11 : int finish_automount(struct vfsmount *m, const struct path *path)
3344 : {
3345 11 : struct dentry *dentry = path->dentry;
3346 11 : struct mountpoint *mp;
3347 11 : struct mount *mnt;
3348 11 : int err;
3349 :
3350 11 : if (!m)
3351 : return 0;
3352 11 : if (IS_ERR(m))
3353 0 : return PTR_ERR(m);
3354 :
3355 11 : mnt = real_mount(m);
3356 : /* The new mount record should have at least 2 refs to prevent it from being
3357 : * expired before we get a chance to add it
3358 : */
3359 11 : BUG_ON(mnt_get_count(mnt) < 2);
3360 :
3361 11 : if (m->mnt_sb == path->mnt->mnt_sb &&
3362 0 : m->mnt_root == dentry) {
3363 0 : err = -ELOOP;
3364 0 : goto discard;
3365 : }
3366 :
3367 : /*
3368 : * we don't want to use lock_mount() - in this case finding something
3369 : * that overmounts our mountpoint-to-be means "quietly drop what we've
3370 : * got", not "try to mount it on top".
3371 : */
3372 11 : inode_lock(dentry->d_inode);
3373 11 : namespace_lock();
3374 11 : if (unlikely(cant_mount(dentry))) {
3375 0 : err = -ENOENT;
3376 0 : goto discard_locked;
3377 : }
3378 11 : if (path_overmounted(path)) {
3379 0 : err = 0;
3380 0 : goto discard_locked;
3381 : }
3382 11 : mp = get_mountpoint(dentry);
3383 11 : if (IS_ERR(mp)) {
3384 0 : err = PTR_ERR(mp);
3385 0 : goto discard_locked;
3386 : }
3387 :
3388 11 : err = do_add_mount(mnt, mp, path, path->mnt->mnt_flags | MNT_SHRINKABLE);
3389 11 : unlock_mount(mp);
3390 11 : if (unlikely(err))
3391 0 : goto discard;
3392 11 : mntput(m);
3393 11 : return 0;
3394 :
3395 0 : discard_locked:
3396 0 : namespace_unlock();
3397 0 : inode_unlock(dentry->d_inode);
3398 0 : discard:
3399 : /* remove m from any expiration list it may be on */
3400 0 : if (!list_empty(&mnt->mnt_expire)) {
3401 0 : namespace_lock();
3402 0 : list_del_init(&mnt->mnt_expire);
3403 0 : namespace_unlock();
3404 : }
3405 0 : mntput(m);
3406 0 : mntput(m);
3407 0 : return err;
3408 : }
3409 :
3410 : /**
3411 : * mnt_set_expiry - Put a mount on an expiration list
3412 : * @mnt: The mount to list.
3413 : * @expiry_list: The list to add the mount to.
3414 : */
3415 0 : void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list)
3416 : {
3417 0 : namespace_lock();
3418 :
3419 0 : list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list);
3420 :
3421 0 : namespace_unlock();
3422 0 : }
3423 : EXPORT_SYMBOL(mnt_set_expiry);
3424 :
3425 : /*
3426 : * process a list of expirable mountpoints with the intent of discarding any
3427 : * mountpoints that aren't in use and haven't been touched since last we came
3428 : * here
3429 : */
3430 0 : void mark_mounts_for_expiry(struct list_head *mounts)
3431 : {
3432 0 : struct mount *mnt, *next;
3433 0 : LIST_HEAD(graveyard);
3434 :
3435 0 : if (list_empty(mounts))
3436 0 : return;
3437 :
3438 0 : namespace_lock();
3439 0 : lock_mount_hash();
3440 :
3441 : /* extract from the expiration list every vfsmount that matches the
3442 : * following criteria:
3443 : * - only referenced by its parent vfsmount
3444 : * - still marked for expiry (marked on the last call here; marks are
3445 : * cleared by mntput())
3446 : */
3447 0 : list_for_each_entry_safe(mnt, next, mounts, mnt_expire) {
3448 0 : if (!xchg(&mnt->mnt_expiry_mark, 1) ||
3449 0 : propagate_mount_busy(mnt, 1))
3450 0 : continue;
3451 0 : list_move(&mnt->mnt_expire, &graveyard);
3452 : }
3453 0 : while (!list_empty(&graveyard)) {
3454 0 : mnt = list_first_entry(&graveyard, struct mount, mnt_expire);
3455 0 : touch_mnt_namespace(mnt->mnt_ns);
3456 0 : umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
3457 : }
3458 0 : unlock_mount_hash();
3459 0 : namespace_unlock();
3460 : }
3461 :
3462 : EXPORT_SYMBOL_GPL(mark_mounts_for_expiry);
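 :
 : /*
 : * Illustrative sketch (not part of this file): a filesystem that hands
 : * out automounted submounts typically queues them with mnt_set_expiry()
 : * and reaps them from a periodic worker, scheduled when the first
 : * automount is created. The names below are hypothetical; AFS and NFS
 : * follow this pattern with their own lists and timeouts.
 : *
 : *	static LIST_HEAD(example_automount_list);
 : *	static void example_expiry_work(struct work_struct *work);
 : *	static DECLARE_DELAYED_WORK(example_expiry, example_expiry_work);
 : *
 : *	static void example_expiry_work(struct work_struct *work)
 : *	{
 : *		mark_mounts_for_expiry(&example_automount_list);
 : *		if (!list_empty(&example_automount_list))
 : *			schedule_delayed_work(&example_expiry, 10 * HZ);
 : *	}
 : *
 : * Each mount survives one pass here (the xchg() above sets the mark) and
 : * is reaped on the next pass if it is still unused and its mark was not
 : * cleared by mntput() in the meantime.
 : */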
3463 :
3464 : /*
3465 : * Ripoff of 'select_parent()'
3466 : *
3467 : * search the list of submounts for a given mountpoint, and move any
3468 : * shrinkable submounts to the 'graveyard' list.
3469 : */
3470 123780 : static int select_submounts(struct mount *parent, struct list_head *graveyard)
3471 : {
3472 123780 : struct mount *this_parent = parent;
3473 123780 : struct list_head *next;
3474 123780 : int found = 0;
3475 :
3476 123780 : repeat:
3477 123780 : next = this_parent->mnt_mounts.next;
3478 : resume:
3479 123860 : while (next != &this_parent->mnt_mounts) {
3480 80 : struct list_head *tmp = next;
3481 80 : struct mount *mnt = list_entry(tmp, struct mount, mnt_child);
3482 :
3483 80 : next = tmp->next;
3484 80 : if (!(mnt->mnt.mnt_flags & MNT_SHRINKABLE))
3485 80 : continue;
3486 : /*
3487 : 		 * Descend a level if the mnt_mounts list is non-empty.
3488 : */
3489 0 : if (!list_empty(&mnt->mnt_mounts)) {
3490 0 : this_parent = mnt;
3491 0 : goto repeat;
3492 : }
3493 :
3494 0 : if (!propagate_mount_busy(mnt, 1)) {
3495 0 : list_move_tail(&mnt->mnt_expire, graveyard);
3496 0 : found++;
3497 : }
3498 : }
3499 : /*
3500 : * All done at this level ... ascend and resume the search
3501 : */
3502 123780 : if (this_parent != parent) {
3503 0 : next = this_parent->mnt_child.next;
3504 0 : this_parent = this_parent->mnt_parent;
3505 0 : goto resume;
3506 : }
3507 123780 : return found;
3508 : }
3509 :
3510 : /*
3511 : * process a list of expirable mountpoints with the intent of discarding any
3512 : * submounts of a specific parent mountpoint
3513 : *
3514 : * mount_lock must be held for write
3515 : */
3516 123780 : static void shrink_submounts(struct mount *mnt)
3517 : {
3518 123780 : LIST_HEAD(graveyard);
3519 123780 : struct mount *m;
3520 :
3521 : /* extract submounts of 'mountpoint' from the expiration list */
3522 123780 : while (select_submounts(mnt, &graveyard)) {
3523 0 : while (!list_empty(&graveyard)) {
3524 0 : m = list_first_entry(&graveyard, struct mount,
3525 : mnt_expire);
3526 0 : touch_mnt_namespace(m->mnt_ns);
3527 0 : umount_tree(m, UMOUNT_PROPAGATE|UMOUNT_SYNC);
3528 : }
3529 : }
3530 123780 : }
3531 :
3532 150360 : static void *copy_mount_options(const void __user * data)
3533 : {
3534 150360 : char *copy;
3535 150360 : unsigned left, offset;
3536 :
3537 150360 : if (!data)
3538 : return NULL;
3539 :
3540 119472 : copy = kmalloc(PAGE_SIZE, GFP_KERNEL);
3541 119499 : if (!copy)
3542 : return ERR_PTR(-ENOMEM);
3543 :
3544 119499 : left = copy_from_user(copy, data, PAGE_SIZE);
3545 :
3546 : /*
3547 : 	 * Not all architectures report an exact uncopied count from
3548 : 	 * copy_from_user(). Resort to copying the remainder byte by byte.
3549 : */
3550 119497 : offset = PAGE_SIZE - left;
3551 119497 : while (left) {
3552 12 : char c;
3553 12 : if (get_user(c, (const char __user *)data + offset))
3554 : break;
3555 0 : copy[offset] = c;
3556 0 : left--;
3557 0 : offset++;
3558 : }
3559 :
3560 119487 : if (left == PAGE_SIZE) {
3561 0 : kfree(copy);
3562 0 : return ERR_PTR(-EFAULT);
3563 : }
3564 :
3565 : return copy;
3566 : }
3567 :
3568 : static char *copy_mount_string(const void __user *data)
3569 : {
3570 272491 : return data ? strndup_user(data, PATH_MAX) : NULL;
3571 : }
3572 :
3573 : /*
3574 : * Flags is a 32-bit value that allows up to 31 non-fs dependent flags to
3575 : * be given to the mount() call (ie: read-only, no-dev, no-suid etc).
3576 : *
3577 : * data is a (void *) that can point to any structure up to
3578 : * PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent
3579 : * information (or be NULL).
3580 : *
3581 : * Pre-0.97 versions of mount() didn't have a flags word.
3582 : * When the flags word was introduced its top half was required
3583 : * to have the magic value 0xC0ED, and this remained so until 2.4.0-test9.
3584 : * Therefore, if this magic number is present, it carries no information
3585 : * and must be discarded.
3586 : */
3587 150337 : int path_mount(const char *dev_name, struct path *path,
3588 : const char *type_page, unsigned long flags, void *data_page)
3589 : {
3590 150337 : unsigned int mnt_flags = 0, sb_flags;
3591 150337 : int ret;
3592 :
3593 : /* Discard magic */
3594 150337 : if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
3595 78 : flags &= ~MS_MGC_MSK;
3596 :
3597 : /* Basic sanity checks */
3598 150337 : if (data_page)
3599 119462 : ((char *)data_page)[PAGE_SIZE - 1] = 0;
3600 :
3601 150337 : if (flags & MS_NOUSER)
3602 : return -EINVAL;
3603 :
3604 150337 : ret = security_sb_mount(dev_name, path, type_page, flags, data_page);
3605 150337 : if (ret)
3606 : return ret;
3607 150337 : if (!may_mount())
3608 : return -EPERM;
3609 150329 : if (flags & SB_MANDLOCK)
3610 0 : warn_mandlock();
3611 :
3612 : 	/* Default to relatime unless overridden */
3613 150329 : if (!(flags & MS_NOATIME))
3614 150228 : mnt_flags |= MNT_RELATIME;
3615 :
3616 : /* Separate the per-mountpoint flags */
3617 150329 : if (flags & MS_NOSUID)
3618 9213 : mnt_flags |= MNT_NOSUID;
3619 150329 : if (flags & MS_NODEV)
3620 5147 : mnt_flags |= MNT_NODEV;
3621 150329 : if (flags & MS_NOEXEC)
3622 4401 : mnt_flags |= MNT_NOEXEC;
3623 150329 : if (flags & MS_NOATIME)
3624 67 : mnt_flags |= MNT_NOATIME;
3625 150329 : if (flags & MS_NODIRATIME)
3626 17 : mnt_flags |= MNT_NODIRATIME;
3627 150329 : if (flags & MS_STRICTATIME)
3628 119 : mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME);
3629 150329 : if (flags & MS_RDONLY)
3630 20651 : mnt_flags |= MNT_READONLY;
3631 150329 : if (flags & MS_NOSYMFOLLOW)
3632 0 : mnt_flags |= MNT_NOSYMFOLLOW;
3633 :
3634 : /* The default atime for remount is preservation */
3635 150329 : if ((flags & MS_REMOUNT) &&
3636 : ((flags & (MS_NOATIME | MS_NODIRATIME | MS_RELATIME |
3637 : MS_STRICTATIME)) == 0)) {
3638 20959 : mnt_flags &= ~MNT_ATIME_MASK;
3639 20959 : mnt_flags |= path->mnt->mnt_flags & MNT_ATIME_MASK;
3640 : }
3641 :
3642 150329 : sb_flags = flags & (SB_RDONLY |
3643 : SB_SYNCHRONOUS |
3644 : SB_MANDLOCK |
3645 : SB_DIRSYNC |
3646 : SB_SILENT |
3647 : SB_POSIXACL |
3648 : SB_LAZYTIME |
3649 : SB_I_VERSION);
3650 :
3651 150329 : if ((flags & (MS_REMOUNT | MS_BIND)) == (MS_REMOUNT | MS_BIND))
3652 12119 : return do_reconfigure_mnt(path, mnt_flags);
3653 138210 : if (flags & MS_REMOUNT)
3654 8857 : return do_remount(path, flags, sb_flags, mnt_flags, data_page);
3655 129353 : if (flags & MS_BIND)
3656 2364 : return do_loopback(path, dev_name, flags & MS_REC);
3657 126989 : if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
3658 2640 : return do_change_type(path, flags);
3659 124349 : if (flags & MS_MOVE)
3660 370 : return do_move_mount_old(path, dev_name);
3661 :
3662 123979 : return do_new_mount(path, type_page, sb_flags, mnt_flags, dev_name,
3663 : data_page);
3664 : }
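 :
 : /*
 : * Illustrative example (userspace, not part of this file): a legacy
 : * caller that still passes the pre-2.4.0-test9 magic has it stripped by
 : * the MS_MGC_MSK check at the top of path_mount() above, so these two
 : * calls are equivalent (MS_MGC_VAL is 0xC0ED0000; device and path are
 : * hypothetical):
 : *
 : *	mount("/dev/sda1", "/mnt", "ext4", MS_RDONLY, NULL);
 : *	mount("/dev/sda1", "/mnt", "ext4", MS_MGC_VAL | MS_RDONLY, NULL);
 : */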
3665 :
3666 150358 : long do_mount(const char *dev_name, const char __user *dir_name,
3667 : const char *type_page, unsigned long flags, void *data_page)
3668 : {
3669 150358 : struct path path;
3670 150358 : int ret;
3671 :
3672 150358 : ret = user_path_at(AT_FDCWD, dir_name, LOOKUP_FOLLOW, &path);
3673 150365 : if (ret)
3674 13 : return ret;
3675 150352 : ret = path_mount(dev_name, &path, type_page, flags, data_page);
3676 150382 : path_put(&path);
3677 150382 : return ret;
3678 : }
3679 :
3680 136373 : static struct ucounts *inc_mnt_namespaces(struct user_namespace *ns)
3681 : {
3682 136373 : return inc_ucount(ns, current_euid(), UCOUNT_MNT_NAMESPACES);
3683 : }
3684 :
3685 : static void dec_mnt_namespaces(struct ucounts *ucounts)
3686 : {
3687 136373 : dec_ucount(ucounts, UCOUNT_MNT_NAMESPACES);
3688 : }
3689 :
3690 136373 : static void free_mnt_ns(struct mnt_namespace *ns)
3691 : {
3692 136373 : if (!is_anon_ns(ns))
3693 1643 : ns_free_inum(&ns->ns);
3694 136373 : dec_mnt_namespaces(ns->ucounts);
3695 136373 : put_user_ns(ns->user_ns);
3696 136373 : kfree(ns);
3697 136373 : }
3698 :
3699 : /*
3700 : * Assign a sequence number so we can detect when we attempt to bind
3701 : * mount a reference to an older mount namespace into the current
3702 : * mount namespace, preventing reference counting loops. Even at an
3703 : * absurd 10GHz allocation rate a 64bit counter takes over 58 years to
3704 : * wrap, and real rates are far lower, so ignore the possibility.
3705 : */
3706 : static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1);
3707 :
3708 136373 : static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool anon)
3709 : {
3710 136373 : struct mnt_namespace *new_ns;
3711 136373 : struct ucounts *ucounts;
3712 136373 : int ret;
3713 :
3714 136373 : ucounts = inc_mnt_namespaces(user_ns);
3715 136373 : if (!ucounts)
3716 : return ERR_PTR(-ENOSPC);
3717 :
3718 136373 : new_ns = kzalloc(sizeof(struct mnt_namespace), GFP_KERNEL_ACCOUNT);
3719 136373 : if (!new_ns) {
3720 0 : dec_mnt_namespaces(ucounts);
3721 0 : return ERR_PTR(-ENOMEM);
3722 : }
3723 136373 : if (!anon) {
3724 1643 : ret = ns_alloc_inum(&new_ns->ns);
3725 1643 : if (ret) {
3726 0 : kfree(new_ns);
3727 0 : dec_mnt_namespaces(ucounts);
3728 0 : return ERR_PTR(ret);
3729 : }
3730 : }
3731 136373 : new_ns->ns.ops = &mntns_operations;
3732 136373 : if (!anon)
3733 1643 : new_ns->seq = atomic64_add_return(1, &mnt_ns_seq);
3734 136373 : refcount_set(&new_ns->ns.count, 1);
3735 136373 : INIT_LIST_HEAD(&new_ns->list);
3736 136373 : init_waitqueue_head(&new_ns->poll);
3737 136373 : spin_lock_init(&new_ns->ns_lock);
3738 136373 : new_ns->user_ns = get_user_ns(user_ns);
3739 136373 : new_ns->ucounts = ucounts;
3740 136373 : return new_ns;
3741 : }
3742 :
3743 : __latent_entropy
3744 2457 : struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
3745 : struct user_namespace *user_ns, struct fs_struct *new_fs)
3746 : {
3747 2457 : struct mnt_namespace *new_ns;
3748 2457 : struct vfsmount *rootmnt = NULL, *pwdmnt = NULL;
3749 2457 : struct mount *p, *q;
3750 2457 : struct mount *old;
3751 2457 : struct mount *new;
3752 2457 : int copy_flags;
3753 :
3754 2457 : BUG_ON(!ns);
3755 :
3756 2457 : if (likely(!(flags & CLONE_NEWNS))) {
3757 814 : get_mnt_ns(ns);
3758 814 : return ns;
3759 : }
3760 :
3761 1643 : old = ns->root;
3762 :
3763 1643 : new_ns = alloc_mnt_ns(user_ns, false);
3764 1643 : if (IS_ERR(new_ns))
3765 : return new_ns;
3766 :
3767 1643 : namespace_lock();
3768 : /* First pass: copy the tree topology */
3769 1643 : copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE;
3770 1643 : if (user_ns != ns->user_ns)
3771 1378 : copy_flags |= CL_SHARED_TO_SLAVE;
3772 1643 : new = copy_tree(old, old->mnt.mnt_root, copy_flags);
3773 1643 : if (IS_ERR(new)) {
3774 0 : namespace_unlock();
3775 0 : free_mnt_ns(new_ns);
3776 0 : return ERR_CAST(new);
3777 : }
3778 1643 : if (user_ns != ns->user_ns) {
3779 1378 : lock_mount_hash();
3780 1378 : lock_mnt_tree(new);
3781 1378 : unlock_mount_hash();
3782 : }
3783 1643 : new_ns->root = new;
3784 1643 : list_add_tail(&new_ns->list, &new->mnt_list);
3785 :
3786 : /*
3787 : * Second pass: switch the tsk->fs->* elements and mark new vfsmounts
3788 : * as belonging to new namespace. We have already acquired a private
3789 : * fs_struct, so tsk->fs->lock is not needed.
3790 : */
3791 1643 : p = old;
3792 1643 : q = new;
3793 151801 : while (p) {
3794 151801 : q->mnt_ns = new_ns;
3795 151801 : new_ns->mounts++;
3796 151801 : if (new_fs) {
3797 151801 : if (&p->mnt == new_fs->root.mnt) {
3798 1643 : new_fs->root.mnt = mntget(&q->mnt);
3799 1643 : rootmnt = &p->mnt;
3800 : }
3801 151801 : if (&p->mnt == new_fs->pwd.mnt) {
3802 1643 : new_fs->pwd.mnt = mntget(&q->mnt);
3803 1643 : pwdmnt = &p->mnt;
3804 : }
3805 : }
3806 151801 : p = next_mnt(p, old);
3807 151801 : q = next_mnt(q, new);
3808 151801 : if (!q)
3809 : break;
3810 : // an mntns binding we'd skipped?
3811 150158 : while (p->mnt.mnt_root != q->mnt.mnt_root)
3812 0 : p = next_mnt(skip_mnt_tree(p), old);
3813 : }
3814 1643 : namespace_unlock();
3815 :
3816 1643 : if (rootmnt)
3817 1643 : mntput(rootmnt);
3818 1643 : if (pwdmnt)
3819 1643 : mntput(pwdmnt);
3820 :
3821 : return new_ns;
3822 : }
3823 :
3824 3437 : struct dentry *mount_subtree(struct vfsmount *m, const char *name)
3825 : {
3826 3437 : struct mount *mnt = real_mount(m);
3827 3437 : struct mnt_namespace *ns;
3828 3437 : struct super_block *s;
3829 3437 : struct path path;
3830 3437 : int err;
3831 :
3832 3437 : ns = alloc_mnt_ns(&init_user_ns, true);
3833 3437 : if (IS_ERR(ns)) {
3834 0 : mntput(m);
3835 0 : return ERR_CAST(ns);
3836 : }
3837 3437 : mnt->mnt_ns = ns;
3838 3437 : ns->root = mnt;
3839 3437 : ns->mounts++;
3840 3437 : list_add(&mnt->mnt_list, &ns->list);
3841 :
3842 3437 : err = vfs_path_lookup(m->mnt_root, m,
3843 : name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path);
3844 :
3845 3437 : put_mnt_ns(ns);
3846 :
3847 3437 : if (err)
3848 1 : return ERR_PTR(err);
3849 :
3850 : /* trade a vfsmount reference for active sb one */
3851 3436 : s = path.mnt->mnt_sb;
3852 3436 : atomic_inc(&s->s_active);
3853 3436 : mntput(path.mnt);
3854 : /* lock the sucker */
3855 3436 : down_write(&s->s_umount);
3856 : /* ... and return the root of (sub)tree on it */
3857 3436 : return path.dentry;
3858 : }
3859 : EXPORT_SYMBOL(mount_subtree);
3860 :
3861 300669 : SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
3862 : char __user *, type, unsigned long, flags, void __user *, data)
3863 : {
3864 150312 : int ret;
3865 150312 : char *kernel_type;
3866 150312 : char *kernel_dev;
3867 150312 : void *options;
3868 :
3869 150312 : kernel_type = copy_mount_string(type);
3870 150371 : ret = PTR_ERR(kernel_type);
3871 150371 : if (IS_ERR(kernel_type))
3872 0 : goto out_type;
3873 :
3874 150371 : kernel_dev = copy_mount_string(dev_name);
3875 150354 : ret = PTR_ERR(kernel_dev);
3876 150354 : if (IS_ERR(kernel_dev))
3877 0 : goto out_dev;
3878 :
3879 150354 : options = copy_mount_options(data);
3880 150333 : ret = PTR_ERR(options);
3881 150333 : if (IS_ERR(options))
3882 0 : goto out_data;
3883 :
3884 150333 : ret = do_mount(kernel_dev, dir_name, kernel_type, flags, options);
3885 :
3886 150395 : kfree(options);
3887 150395 : out_data:
3888 150395 : kfree(kernel_dev);
3889 150395 : out_dev:
3890 150395 : kfree(kernel_type);
3891 150395 : out_type:
3892 150395 : return ret;
3893 : }
3894 :
3895 : #define FSMOUNT_VALID_FLAGS \
3896 : (MOUNT_ATTR_RDONLY | MOUNT_ATTR_NOSUID | MOUNT_ATTR_NODEV | \
3897 : MOUNT_ATTR_NOEXEC | MOUNT_ATTR__ATIME | MOUNT_ATTR_NODIRATIME | \
3898 : MOUNT_ATTR_NOSYMFOLLOW)
3899 :
3900 : #define MOUNT_SETATTR_VALID_FLAGS (FSMOUNT_VALID_FLAGS | MOUNT_ATTR_IDMAP)
3901 :
3902 : #define MOUNT_SETATTR_PROPAGATION_FLAGS \
3903 : (MS_UNBINDABLE | MS_PRIVATE | MS_SLAVE | MS_SHARED)
3904 :
3905 2700 : static unsigned int attr_flags_to_mnt_flags(u64 attr_flags)
3906 : {
3907 2700 : unsigned int mnt_flags = 0;
3908 :
3909 2700 : if (attr_flags & MOUNT_ATTR_RDONLY)
3910 0 : mnt_flags |= MNT_READONLY;
3911 2700 : if (attr_flags & MOUNT_ATTR_NOSUID)
3912 0 : mnt_flags |= MNT_NOSUID;
3913 2700 : if (attr_flags & MOUNT_ATTR_NODEV)
3914 0 : mnt_flags |= MNT_NODEV;
3915 2700 : if (attr_flags & MOUNT_ATTR_NOEXEC)
3916 0 : mnt_flags |= MNT_NOEXEC;
3917 2700 : if (attr_flags & MOUNT_ATTR_NODIRATIME)
3918 0 : mnt_flags |= MNT_NODIRATIME;
3919 2700 : if (attr_flags & MOUNT_ATTR_NOSYMFOLLOW)
3920 0 : mnt_flags |= MNT_NOSYMFOLLOW;
3921 :
3922 2700 : return mnt_flags;
3923 : }
3924 :
3925 : /*
3926 : * Create a kernel mount representation for a new, prepared superblock
3927 : * (specified by fs_fd) and attach to an open_tree-like file descriptor.
3928 : */
3929 0 : SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags,
3930 : unsigned int, attr_flags)
3931 : {
3932 0 : struct mnt_namespace *ns;
3933 0 : struct fs_context *fc;
3934 0 : struct file *file;
3935 0 : struct path newmount;
3936 0 : struct mount *mnt;
3937 0 : struct fd f;
3938 0 : unsigned int mnt_flags = 0;
3939 0 : long ret;
3940 :
3941 0 : if (!may_mount())
3942 : return -EPERM;
3943 :
3944 0 : if ((flags & ~(FSMOUNT_CLOEXEC)) != 0)
3945 : return -EINVAL;
3946 :
3947 0 : if (attr_flags & ~FSMOUNT_VALID_FLAGS)
3948 : return -EINVAL;
3949 :
3950 0 : mnt_flags = attr_flags_to_mnt_flags(attr_flags);
3951 :
3952 0 : switch (attr_flags & MOUNT_ATTR__ATIME) {
3953 : case MOUNT_ATTR_STRICTATIME:
3954 : break;
3955 0 : case MOUNT_ATTR_NOATIME:
3956 0 : mnt_flags |= MNT_NOATIME;
3957 0 : break;
3958 0 : case MOUNT_ATTR_RELATIME:
3959 0 : mnt_flags |= MNT_RELATIME;
3960 0 : break;
3961 : default:
3962 : return -EINVAL;
3963 : }
3964 :
3965 0 : f = fdget(fs_fd);
3966 0 : if (!f.file)
3967 : return -EBADF;
3968 :
3969 0 : ret = -EINVAL;
3970 0 : if (f.file->f_op != &fscontext_fops)
3971 0 : goto err_fsfd;
3972 :
3973 0 : fc = f.file->private_data;
3974 :
3975 0 : ret = mutex_lock_interruptible(&fc->uapi_mutex);
3976 0 : if (ret < 0)
3977 0 : goto err_fsfd;
3978 :
3979 : /* There must be a valid superblock or we can't mount it */
3980 0 : ret = -EINVAL;
3981 0 : if (!fc->root)
3982 0 : goto err_unlock;
3983 :
3984 0 : ret = -EPERM;
3985 0 : if (mount_too_revealing(fc->root->d_sb, &mnt_flags)) {
3986 0 : pr_warn("VFS: Mount too revealing\n");
3987 0 : goto err_unlock;
3988 : }
3989 :
3990 0 : ret = -EBUSY;
3991 0 : if (fc->phase != FS_CONTEXT_AWAITING_MOUNT)
3992 0 : goto err_unlock;
3993 :
3994 0 : if (fc->sb_flags & SB_MANDLOCK)
3995 0 : warn_mandlock();
3996 :
3997 0 : newmount.mnt = vfs_create_mount(fc);
3998 0 : if (IS_ERR(newmount.mnt)) {
3999 0 : ret = PTR_ERR(newmount.mnt);
4000 0 : goto err_unlock;
4001 : }
4002 0 : newmount.dentry = dget(fc->root);
4003 0 : newmount.mnt->mnt_flags = mnt_flags;
4004 :
4005 : /* We've done the mount bit - now move the file context into more or
4006 : * less the same state as if we'd done an fspick(). We don't want to
4007 : * do any memory allocation or anything like that at this point as we
4008 : * don't want to have to handle any errors incurred.
4009 : */
4010 0 : vfs_clean_context(fc);
4011 :
4012 0 : ns = alloc_mnt_ns(current->nsproxy->mnt_ns->user_ns, true);
4013 0 : if (IS_ERR(ns)) {
4014 0 : ret = PTR_ERR(ns);
4015 0 : goto err_path;
4016 : }
4017 0 : mnt = real_mount(newmount.mnt);
4018 0 : mnt->mnt_ns = ns;
4019 0 : ns->root = mnt;
4020 0 : ns->mounts = 1;
4021 0 : list_add(&mnt->mnt_list, &ns->list);
4022 0 : mntget(newmount.mnt);
4023 :
4024 : /* Attach to an apparent O_PATH fd with a note that we need to unmount
4025 : 	 * it, not simply put it.
4026 : */
4027 0 : file = dentry_open(&newmount, O_PATH, fc->cred);
4028 0 : if (IS_ERR(file)) {
4029 0 : dissolve_on_fput(newmount.mnt);
4030 0 : ret = PTR_ERR(file);
4031 0 : goto err_path;
4032 : }
4033 0 : file->f_mode |= FMODE_NEED_UNMOUNT;
4034 :
4035 0 : ret = get_unused_fd_flags((flags & FSMOUNT_CLOEXEC) ? O_CLOEXEC : 0);
4036 0 : if (ret >= 0)
4037 0 : fd_install(ret, file);
4038 : else
4039 0 : fput(file);
4040 :
4041 0 : err_path:
4042 0 : path_put(&newmount);
4043 0 : err_unlock:
4044 0 : mutex_unlock(&fc->uapi_mutex);
4045 0 : err_fsfd:
4046 0 : fdput(f);
4047 0 : return ret;
4048 : }
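 :
 : /*
 : * Illustrative userspace sequence (not part of this file) for the new
 : * mount API that ends in fsmount(); error handling omitted and the
 : * source device hypothetical. Older libcs may need syscall(2) wrappers
 : * for these calls.
 : *
 : *	int fsfd = fsopen("ext4", FSOPEN_CLOEXEC);
 : *	fsconfig(fsfd, FSCONFIG_SET_STRING, "source", "/dev/sda1", 0);
 : *	fsconfig(fsfd, FSCONFIG_SET_FLAG, "ro", NULL, 0);
 : *	fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);
 : *	int mfd = fsmount(fsfd, FSMOUNT_CLOEXEC, MOUNT_ATTR_RDONLY);
 : *	move_mount(mfd, "", AT_FDCWD, "/mnt", MOVE_MOUNT_F_EMPTY_PATH);
 : */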
4049 :
4050 : /*
4051 : * Move a mount from one place to another. In combination with
4052 : * fsopen()/fsmount() this is used to install a new mount and in combination
4053 : * with open_tree(OPEN_TREE_CLONE [| AT_RECURSIVE]) it can be used to copy
4054 : * a mount subtree.
4055 : *
4056 : * Note the flags value is a combination of MOVE_MOUNT_* flags.
4057 : */
4058 261068 : SYSCALL_DEFINE5(move_mount,
4059 : int, from_dfd, const char __user *, from_pathname,
4060 : int, to_dfd, const char __user *, to_pathname,
4061 : unsigned int, flags)
4062 : {
4063 130534 : struct path from_path, to_path;
4064 130534 : unsigned int lflags;
4065 130534 : int ret = 0;
4066 :
4067 130534 : if (!may_mount())
4068 : return -EPERM;
4069 :
4070 130534 : if (flags & ~MOVE_MOUNT__MASK)
4071 : return -EINVAL;
4072 :
4073 130534 : if ((flags & (MOVE_MOUNT_BENEATH | MOVE_MOUNT_SET_GROUP)) ==
4074 : (MOVE_MOUNT_BENEATH | MOVE_MOUNT_SET_GROUP))
4075 : return -EINVAL;
4076 :
4077 : /* If someone gives a pathname, they aren't permitted to move
4078 : * from an fd that requires unmount as we can't get at the flag
4079 : * to clear it afterwards.
4080 : */
4081 130534 : lflags = 0;
4082 130534 : if (flags & MOVE_MOUNT_F_SYMLINKS) lflags |= LOOKUP_FOLLOW;
4083 130534 : if (flags & MOVE_MOUNT_F_AUTOMOUNTS) lflags |= LOOKUP_AUTOMOUNT;
4084 130534 : if (flags & MOVE_MOUNT_F_EMPTY_PATH) lflags |= LOOKUP_EMPTY;
4085 :
4086 130534 : ret = user_path_at(from_dfd, from_pathname, lflags, &from_path);
4087 130534 : if (ret < 0)
4088 0 : return ret;
4089 :
4090 130534 : lflags = 0;
4091 130534 : if (flags & MOVE_MOUNT_T_SYMLINKS) lflags |= LOOKUP_FOLLOW;
4092 130534 : if (flags & MOVE_MOUNT_T_AUTOMOUNTS) lflags |= LOOKUP_AUTOMOUNT;
4093 130534 : if (flags & MOVE_MOUNT_T_EMPTY_PATH) lflags |= LOOKUP_EMPTY;
4094 :
4095 130534 : ret = user_path_at(to_dfd, to_pathname, lflags, &to_path);
4096 130534 : if (ret < 0)
4097 0 : goto out_from;
4098 :
4099 130534 : ret = security_move_mount(&from_path, &to_path);
4100 130534 : if (ret < 0)
4101 : goto out_to;
4102 :
4103 130534 : if (flags & MOVE_MOUNT_SET_GROUP)
4104 0 : ret = do_set_group(&from_path, &to_path);
4105 : else
4106 130534 : ret = do_move_mount(&from_path, &to_path,
4107 130534 : (flags & MOVE_MOUNT_BENEATH));
4108 :
4109 130534 : out_to:
4110 130534 : path_put(&to_path);
4111 130534 : out_from:
4112 130534 : path_put(&from_path);
4113 130534 : return ret;
4114 : }
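 :
 : /*
 : * Illustrative userspace sketch (not part of this file): copying a
 : * mount subtree with open_tree(OPEN_TREE_CLONE) and attaching the
 : * clone elsewhere via move_mount(); error handling omitted and paths
 : * hypothetical:
 : *
 : *	int fd = open_tree(AT_FDCWD, "/src",
 : *			   OPEN_TREE_CLONE | AT_RECURSIVE | OPEN_TREE_CLOEXEC);
 : *	move_mount(fd, "", AT_FDCWD, "/dst", MOVE_MOUNT_F_EMPTY_PATH);
 : */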
4115 :
4116 : /*
4117 : * Return true if path is reachable from root
4118 : *
4119 : * namespace_sem or mount_lock is held
4120 : */
4121 5153 : bool is_path_reachable(struct mount *mnt, struct dentry *dentry,
4122 : const struct path *root)
4123 : {
4124 21082 : while (&mnt->mnt != root->mnt && mnt_has_parent(mnt)) {
4125 15929 : dentry = mnt->mnt_mountpoint;
4126 15929 : mnt = mnt->mnt_parent;
4127 : }
4128 5153 : return &mnt->mnt == root->mnt && is_subdir(dentry, root->dentry);
4129 : }
4130 :
4131 0 : bool path_is_under(const struct path *path1, const struct path *path2)
4132 : {
4133 0 : bool res;
4134 0 : read_seqlock_excl(&mount_lock);
4135 0 : res = is_path_reachable(real_mount(path1->mnt), path1->dentry, path2);
4136 0 : read_sequnlock_excl(&mount_lock);
4137 0 : return res;
4138 : }
4139 : EXPORT_SYMBOL(path_is_under);
4140 :
4141 : /*
4142 : * pivot_root Semantics:
4143 : * Moves the root file system of the current process to the directory put_old,
4144 : * makes new_root as the new root file system of the current process, and sets
4145 : * root/cwd of all processes which had them on the current root to new_root.
4146 : *
4147 : * Restrictions:
4148 : * The new_root and put_old must be directories, and must not be on the
4149 : * same file system as the current process root. The put_old must be
4150 : * underneath new_root, i.e. adding a non-zero number of /.. to the string
4151 : * pointed to by put_old must yield the same directory as new_root. No other
4152 : * file system may be mounted on put_old. After all, new_root is a mountpoint.
4153 : *
4154 : * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem.
4155 : * See Documentation/filesystems/ramfs-rootfs-initramfs.rst for alternatives
4156 : * in this situation.
4157 : *
4158 : * Notes:
4159 : * - we don't move root/cwd if they are not at the root (reason: if something
4160 : * cared enough to change them, it's probably wrong to force them elsewhere)
4161 : * - it's okay to pick a root that isn't the root of a file system, e.g.
4162 : * /nfs/my_root where /nfs is the mount point. It must be a mountpoint,
4163 : * though, so you may need to say mount --bind /nfs/my_root /nfs/my_root
4164 : * first.
4165 : */
4166 0 : SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
4167 : const char __user *, put_old)
4168 : {
4169 0 : struct path new, old, root;
4170 0 : struct mount *new_mnt, *root_mnt, *old_mnt, *root_parent, *ex_parent;
4171 0 : struct mountpoint *old_mp, *root_mp;
4172 0 : int error;
4173 :
4174 0 : if (!may_mount())
4175 : return -EPERM;
4176 :
4177 0 : error = user_path_at(AT_FDCWD, new_root,
4178 : LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &new);
4179 0 : if (error)
4180 0 : goto out0;
4181 :
4182 0 : error = user_path_at(AT_FDCWD, put_old,
4183 : LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &old);
4184 0 : if (error)
4185 0 : goto out1;
4186 :
4187 0 : error = security_sb_pivotroot(&old, &new);
4188 0 : if (error)
4189 : goto out2;
4190 :
4191 0 : get_fs_root(current->fs, &root);
4192 0 : old_mp = lock_mount(&old);
4193 0 : error = PTR_ERR(old_mp);
4194 0 : if (IS_ERR(old_mp))
4195 0 : goto out3;
4196 :
4197 0 : error = -EINVAL;
4198 0 : new_mnt = real_mount(new.mnt);
4199 0 : root_mnt = real_mount(root.mnt);
4200 0 : old_mnt = real_mount(old.mnt);
4201 0 : ex_parent = new_mnt->mnt_parent;
4202 0 : root_parent = root_mnt->mnt_parent;
4203 0 : if (IS_MNT_SHARED(old_mnt) ||
4204 0 : IS_MNT_SHARED(ex_parent) ||
4205 0 : IS_MNT_SHARED(root_parent))
4206 0 : goto out4;
4207 0 : if (!check_mnt(root_mnt) || !check_mnt(new_mnt))
4208 0 : goto out4;
4209 0 : if (new_mnt->mnt.mnt_flags & MNT_LOCKED)
4210 0 : goto out4;
4211 0 : error = -ENOENT;
4212 0 : if (d_unlinked(new.dentry))
4213 0 : goto out4;
4214 0 : error = -EBUSY;
4215 0 : if (new_mnt == root_mnt || old_mnt == root_mnt)
4216 0 : goto out4; /* loop, on the same file system */
4217 0 : error = -EINVAL;
4218 0 : if (!path_mounted(&root))
4219 0 : goto out4; /* not a mountpoint */
4220 0 : if (!mnt_has_parent(root_mnt))
4221 0 : goto out4; /* not attached */
4222 0 : if (!path_mounted(&new))
4223 0 : goto out4; /* not a mountpoint */
4224 0 : if (!mnt_has_parent(new_mnt))
4225 0 : goto out4; /* not attached */
4226 : /* make sure we can reach put_old from new_root */
4227 0 : if (!is_path_reachable(old_mnt, old.dentry, &new))
4228 0 : goto out4;
4229 : /* make certain new is below the root */
4230 0 : if (!is_path_reachable(new_mnt, new.dentry, &root))
4231 0 : goto out4;
4232 0 : lock_mount_hash();
4233 0 : umount_mnt(new_mnt);
4234 0 : root_mp = unhash_mnt(root_mnt); /* we'll need its mountpoint */
4235 0 : if (root_mnt->mnt.mnt_flags & MNT_LOCKED) {
4236 0 : new_mnt->mnt.mnt_flags |= MNT_LOCKED;
4237 0 : root_mnt->mnt.mnt_flags &= ~MNT_LOCKED;
4238 : }
4239 : /* mount old root on put_old */
4240 0 : attach_mnt(root_mnt, old_mnt, old_mp, false);
4241 : /* mount new_root on / */
4242 0 : attach_mnt(new_mnt, root_parent, root_mp, false);
4243 0 : mnt_add_count(root_parent, -1);
4244 0 : touch_mnt_namespace(current->nsproxy->mnt_ns);
4245 : /* A moved mount should not expire automatically */
4246 0 : list_del_init(&new_mnt->mnt_expire);
4247 0 : put_mountpoint(root_mp);
4248 0 : unlock_mount_hash();
4249 0 : chroot_fs_refs(&root, &new);
4250 0 : error = 0;
4251 0 : out4:
4252 0 : unlock_mount(old_mp);
4253 0 : if (!error)
4254 0 : mntput_no_expire(ex_parent);
4255 0 : out3:
4256 0 : path_put(&root);
4257 : out2:
4258 0 : path_put(&old);
4259 0 : out1:
4260 0 : path_put(&new);
4261 0 : out0:
4262 0 : return error;
4263 : }
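 :
 : /*
 : * Illustrative userspace sketch (not part of this file): the common
 : * pivot into a prepared root, typically done in a fresh mount
 : * namespace. pivot_root() has no glibc wrapper, so go through
 : * syscall(2); error handling omitted and the path is hypothetical.
 : * The bind mount makes new_root a mount point, as required above.
 : *
 : *	mount("/newroot", "/newroot", NULL, MS_BIND | MS_REC, NULL);
 : *	chdir("/newroot");
 : *	syscall(SYS_pivot_root, ".", ".");  // put_old stacked on new root
 : *	umount2(".", MNT_DETACH);           // detach the old root
 : */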
4264 :
4265 : static unsigned int recalc_flags(struct mount_kattr *kattr, struct mount *mnt)
4266 : {
4267 2508 : unsigned int flags = mnt->mnt.mnt_flags;
4268 :
4269 : /* flags to clear */
4270 2508 : flags &= ~kattr->attr_clr;
4271 : /* flags to raise */
4272 2508 : flags |= kattr->attr_set;
4273 :
4274 2508 : return flags;
4275 : }
4276 :
4277 1254 : static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt)
4278 : {
4279 1254 : struct vfsmount *m = &mnt->mnt;
4280 1254 : struct user_namespace *fs_userns = m->mnt_sb->s_user_ns;
4281 :
4282 1254 : if (!kattr->mnt_idmap)
4283 : return 0;
4284 :
4285 : /*
4286 : 	 * Creating an idmapped mount with the filesystem-wide idmapping
4287 : * doesn't make sense so block that. We don't allow mushy semantics.
4288 : */
4289 1254 : if (!check_fsmapping(kattr->mnt_idmap, m->mnt_sb))
4290 : return -EINVAL;
4291 :
4292 : /*
4293 : * Once a mount has been idmapped we don't allow it to change its
4294 : * mapping. It makes things simpler and callers can just create
4295 : * another bind-mount they can idmap if they want to.
4296 : */
4297 1254 : if (is_idmapped_mnt(m))
4298 : return -EPERM;
4299 :
4300 : /* The underlying filesystem doesn't support idmapped mounts yet. */
4301 1254 : if (!(m->mnt_sb->s_type->fs_flags & FS_ALLOW_IDMAP))
4302 : return -EINVAL;
4303 :
4304 : /* We're not controlling the superblock. */
4305 1254 : if (!ns_capable(fs_userns, CAP_SYS_ADMIN))
4306 : return -EPERM;
4307 :
4308 : /* Mount has already been visible in the filesystem hierarchy. */
4309 1254 : if (!is_anon_ns(mnt->mnt_ns))
4310 0 : return -EINVAL;
4311 :
4312 : return 0;
4313 : }
4314 :
4315 : /**
4316 : * mnt_allow_writers() - check whether the attribute change allows writers
4317 : * @kattr: the new mount attributes
4318 : * @mnt: the mount to which @kattr will be applied
4319 : *
4320 : * Check whether the new mount attributes in @kattr allow concurrent writers.
4321 : *
4322 : * Return: true if writers are allowed, false if they must be held off
4323 : */
4324 : static inline bool mnt_allow_writers(const struct mount_kattr *kattr,
4325 : const struct mount *mnt)
4326 : {
4327 1254 : return (!(kattr->attr_set & MNT_READONLY) ||
4328 0 : (mnt->mnt.mnt_flags & MNT_READONLY)) &&
4329 1254 : !kattr->mnt_idmap;
4330 : }
4331 :
4332 1254 : static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt)
4333 : {
4334 1254 : struct mount *m;
4335 1254 : int err;
4336 :
4337 2218 : for (m = mnt; m; m = next_mnt(m, mnt)) {
4338 1254 : if (!can_change_locked_flags(m, recalc_flags(kattr, m))) {
4339 : err = -EPERM;
4340 : break;
4341 : }
4342 :
4343 1254 : err = can_idmap_mount(kattr, m);
4344 1254 : if (err)
4345 : break;
4346 :
4347 2508 : if (!mnt_allow_writers(kattr, m)) {
4348 1254 : err = mnt_hold_writers(m);
4349 1254 : if (err)
4350 : break;
4351 : }
4352 :
4353 1254 : if (!kattr->recurse)
4354 : return 0;
4355 : }
4356 :
4357 482 : if (err) {
4358 : struct mount *p;
4359 :
4360 : /*
4361 : 		 * If we had to call mnt_hold_writers() MNT_WRITE_HOLD will
4362 : 		 * be set in the mount's flags. The loop unsets MNT_WRITE_HOLD
4363 : 		 * for all mounts and needs to take care to include the first mount.
4364 : */
4365 0 : for (p = mnt; p; p = next_mnt(p, mnt)) {
4366 : /* If we had to hold writers unblock them. */
4367 0 : if (p->mnt.mnt_flags & MNT_WRITE_HOLD)
4368 0 : mnt_unhold_writers(p);
4369 :
4370 : /*
4371 : * We're done once the first mount we changed got
4372 : * MNT_WRITE_HOLD unset.
4373 : */
4374 0 : if (p == m)
4375 : break;
4376 : }
4377 : }
4378 : return err;
4379 : }
4380 :
4381 1254 : static void do_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt)
4382 : {
4383 1254 : if (!kattr->mnt_idmap)
4384 : return;
4385 :
4386 : /*
4387 : * Pairs with smp_load_acquire() in mnt_idmap().
4388 : *
4389 : * Since we only allow a mount to change the idmapping once and
4390 : * verified this in can_idmap_mount() we know that the mount has
4391 : * @nop_mnt_idmap attached to it. So there's no need to drop any
4392 : * references.
4393 : */
4394 1254 : smp_store_release(&mnt->mnt.mnt_idmap, mnt_idmap_get(kattr->mnt_idmap));
4395 : }
4396 :
4397 1254 : static void mount_setattr_commit(struct mount_kattr *kattr, struct mount *mnt)
4398 : {
4399 1254 : struct mount *m;
4400 :
4401 2218 : for (m = mnt; m; m = next_mnt(m, mnt)) {
4402 1254 : unsigned int flags;
4403 :
4404 1254 : do_idmap_mount(kattr, m);
4405 1254 : flags = recalc_flags(kattr, m);
4406 1254 : WRITE_ONCE(m->mnt.mnt_flags, flags);
4407 :
4408 : /* If we had to hold writers unblock them. */
4409 1254 : if (m->mnt.mnt_flags & MNT_WRITE_HOLD)
4410 1254 : mnt_unhold_writers(m);
4411 :
4412 1254 : if (kattr->propagation)
4413 0 : change_mnt_propagation(m, kattr->propagation);
4414 1254 : if (!kattr->recurse)
4415 : break;
4416 : }
4417 1254 : touch_mnt_namespace(mnt->mnt_ns);
4418 1254 : }
4419 :
4420 1254 : static int do_mount_setattr(struct path *path, struct mount_kattr *kattr)
4421 : {
4422 1254 : struct mount *mnt = real_mount(path->mnt);
4423 1254 : int err = 0;
4424 :
4425 1254 : if (!path_mounted(path))
4426 : return -EINVAL;
4427 :
4428 1254 : if (kattr->mnt_userns) {
4429 1254 : struct mnt_idmap *mnt_idmap;
4430 :
4431 1254 : mnt_idmap = alloc_mnt_idmap(kattr->mnt_userns);
4432 1254 : if (IS_ERR(mnt_idmap))
4433 0 : return PTR_ERR(mnt_idmap);
4434 1254 : kattr->mnt_idmap = mnt_idmap;
4435 : }
4436 :
4437 1254 : if (kattr->propagation) {
4438 : /*
4439 : * Only take namespace_lock() if we're actually changing
4440 : * propagation.
4441 : */
4442 0 : namespace_lock();
4443 0 : if (kattr->propagation == MS_SHARED) {
4444 0 : err = invent_group_ids(mnt, kattr->recurse);
4445 0 : if (err) {
4446 0 : namespace_unlock();
4447 0 : return err;
4448 : }
4449 : }
4450 : }
4451 :
4452 1254 : err = -EINVAL;
4453 1254 : lock_mount_hash();
4454 :
4455 : /* Ensure that this isn't anything purely vfs internal. */
4456 2508 : if (!is_mounted(&mnt->mnt))
4457 0 : goto out;
4458 :
4459 : /*
4460 : * If this is an attached mount make sure it's located in the callers
4461 : * mount namespace. If it's not don't let the caller interact with it.
4462 : * If this is a detached mount make sure it has an anonymous mount
4463 : * namespace attached to it, i.e. we've created it via OPEN_TREE_CLONE.
4464 : */
4465 1254 : if (!(mnt_has_parent(mnt) ? check_mnt(mnt) : is_anon_ns(mnt->mnt_ns)))
4466 0 : goto out;
4467 :
4468 : /*
4469 : * First, we get the mount tree in a shape where we can change mount
4470 : * properties without failure. If we succeeded to do so we commit all
4471 : * changes and if we failed we clean up.
4472 : */
4473 1254 : err = mount_setattr_prepare(kattr, mnt);
4474 1254 : if (!err)
4475 1254 : mount_setattr_commit(kattr, mnt);
4476 :
4477 0 : out:
4478 1254 : unlock_mount_hash();
4479 :
4480 1254 : if (kattr->propagation) {
4481 0 : if (err)
4482 0 : cleanup_group_ids(mnt, NULL);
4483 0 : namespace_unlock();
4484 : }
4485 :
4486 : return err;
4487 : }
4488 :
4489 1350 : static int build_mount_idmapped(const struct mount_attr *attr, size_t usize,
4490 : struct mount_kattr *kattr, unsigned int flags)
4491 : {
4492 1350 : int err = 0;
4493 1350 : struct ns_common *ns;
4494 1350 : struct user_namespace *mnt_userns;
4495 1350 : struct fd f;
4496 :
4497 1350 : if (!((attr->attr_set | attr->attr_clr) & MOUNT_ATTR_IDMAP))
4498 : return 0;
4499 :
4500 : /*
4501 : * We currently do not support clearing an idmapped mount. If this ever
4502 : 	 * becomes a use-case we can revisit it, but for now let's keep it
4503 : 	 * simple and not allow it.
4504 : */
4505 1350 : if (attr->attr_clr & MOUNT_ATTR_IDMAP)
4506 : return -EINVAL;
4507 :
4508 1350 : if (attr->userns_fd > INT_MAX)
4509 : return -EINVAL;
4510 :
4511 1254 : f = fdget(attr->userns_fd);
4512 1254 : if (!f.file)
4513 : return -EBADF;
4514 :
4515 1254 : if (!proc_ns_file(f.file)) {
4516 0 : err = -EINVAL;
4517 0 : goto out_fput;
4518 : }
4519 :
4520 1254 : ns = get_proc_ns(file_inode(f.file));
4521 1254 : if (ns->ops->type != CLONE_NEWUSER) {
4522 0 : err = -EINVAL;
4523 0 : goto out_fput;
4524 : }
4525 :
4526 : /*
4527 : * The initial idmapping cannot be used to create an idmapped
4528 : * mount. We use the initial idmapping as an indicator of a mount
4529 : * that is not idmapped. It can simply be passed into helpers that
4530 : * are aware of idmapped mounts as a convenient shortcut. A user
4531 : * can just create a dedicated identity mapping to achieve the same
4532 : * result.
4533 : */
4534 1254 : mnt_userns = container_of(ns, struct user_namespace, ns);
4535 1254 : if (mnt_userns == &init_user_ns) {
4536 0 : err = -EPERM;
4537 0 : goto out_fput;
4538 : }
4539 :
4540 : /* We're not controlling the target namespace. */
4541 1254 : if (!ns_capable(mnt_userns, CAP_SYS_ADMIN)) {
4542 0 : err = -EPERM;
4543 0 : goto out_fput;
4544 : }
4545 :
4546 2508 : kattr->mnt_userns = get_user_ns(mnt_userns);
4547 :
4548 1254 : out_fput:
4549 1254 : fdput(f);
4550 1254 : return err;
4551 : }
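 :
 : /*
 : * Illustrative userspace sketch (not part of this file): creating an
 : * idmapped mount by passing a non-initial user namespace fd via
 : * mount_setattr(). The mount must still be detached (an anonymous
 : * namespace, per can_idmap_mount() above), hence OPEN_TREE_CLONE;
 : * error handling omitted and paths hypothetical:
 : *
 : *	struct mount_attr attr = {
 : *		.attr_set  = MOUNT_ATTR_IDMAP,
 : *		.userns_fd = userns_fd,  // e.g. open("/proc/<pid>/ns/user")
 : *	};
 : *	int fd = open_tree(AT_FDCWD, "/src",
 : *			   OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC);
 : *	mount_setattr(fd, "", AT_EMPTY_PATH, &attr, sizeof(attr));
 : *	move_mount(fd, "", AT_FDCWD, "/dst", MOVE_MOUNT_F_EMPTY_PATH);
 : */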
4552 :
4553 1350 : static int build_mount_kattr(const struct mount_attr *attr, size_t usize,
4554 : struct mount_kattr *kattr, unsigned int flags)
4555 : {
4556 1350 : unsigned int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW;
4557 :
4558 1350 : if (flags & AT_NO_AUTOMOUNT)
4559 0 : lookup_flags &= ~LOOKUP_AUTOMOUNT;
4560 1350 : if (flags & AT_SYMLINK_NOFOLLOW)
4561 0 : lookup_flags &= ~LOOKUP_FOLLOW;
4562 1350 : if (flags & AT_EMPTY_PATH)
4563 1350 : lookup_flags |= LOOKUP_EMPTY;
4564 :
4565 1350 : *kattr = (struct mount_kattr) {
4566 : .lookup_flags = lookup_flags,
4567 1350 : .recurse = !!(flags & AT_RECURSIVE),
4568 : };
4569 :
4570 1350 : if (attr->propagation & ~MOUNT_SETATTR_PROPAGATION_FLAGS)
4571 : return -EINVAL;
4572 1350 : if (hweight32(attr->propagation & MOUNT_SETATTR_PROPAGATION_FLAGS) > 1)
4573 : return -EINVAL;
4574 1350 : kattr->propagation = attr->propagation;
4575 :
4576 1350 : if ((attr->attr_set | attr->attr_clr) & ~MOUNT_SETATTR_VALID_FLAGS)
4577 : return -EINVAL;
4578 :
4579 1350 : kattr->attr_set = attr_flags_to_mnt_flags(attr->attr_set);
4580 1350 : kattr->attr_clr = attr_flags_to_mnt_flags(attr->attr_clr);
4581 :
4582 : /*
4583 : * Since the MOUNT_ATTR_<atime> values are an enum, not a bitmap,
4584 : * users wanting to transition to a different atime setting cannot
4585 : * simply specify the atime setting in @attr_set, but must also
4586 : * specify MOUNT_ATTR__ATIME in the @attr_clr field.
4587 : * So ensure that MOUNT_ATTR__ATIME can't be partially set in
4588 : * @attr_clr and that @attr_set can't have any atime bits set if
4589 : * MOUNT_ATTR__ATIME isn't set in @attr_clr.
4590 : */
4591 1350 : if (attr->attr_clr & MOUNT_ATTR__ATIME) {
4592 0 : if ((attr->attr_clr & MOUNT_ATTR__ATIME) != MOUNT_ATTR__ATIME)
4593 : return -EINVAL;
4594 :
4595 : /*
4596 : * Clear all previous time settings as they are mutually
4597 : * exclusive.
4598 : */
4599 0 : kattr->attr_clr |= MNT_RELATIME | MNT_NOATIME;
4600 0 : switch (attr->attr_set & MOUNT_ATTR__ATIME) {
4601 0 : case MOUNT_ATTR_RELATIME:
4602 0 : kattr->attr_set |= MNT_RELATIME;
4603 0 : break;
4604 0 : case MOUNT_ATTR_NOATIME:
4605 0 : kattr->attr_set |= MNT_NOATIME;
4606 0 : break;
4607 : case MOUNT_ATTR_STRICTATIME:
4608 : break;
4609 : default:
4610 : return -EINVAL;
4611 : }
4612 : } else {
4613 1350 : if (attr->attr_set & MOUNT_ATTR__ATIME)
4614 : return -EINVAL;
4615 : }
4616 :
4617 1350 : return build_mount_idmapped(attr, usize, kattr, flags);
4618 : }
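 :
 : /*
 : * Illustrative example (userspace, not part of this file): per the
 : * atime rules above, switching an existing mount to noatime must clear
 : * the whole atime enum in @attr_clr:
 : *
 : *	struct mount_attr attr = {
 : *		.attr_set = MOUNT_ATTR_NOATIME,
 : *		.attr_clr = MOUNT_ATTR__ATIME,	// mandatory for atime changes
 : *	};
 : */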
4619 :
4620 1254 : static void finish_mount_kattr(struct mount_kattr *kattr)
4621 : {
4622 1254 : put_user_ns(kattr->mnt_userns);
4623 1254 : kattr->mnt_userns = NULL;
4624 :
4625 1254 : if (kattr->mnt_idmap)
4626 1254 : mnt_idmap_put(kattr->mnt_idmap);
4627 1254 : }
4628 :
4629 2892 : SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path,
4630 : unsigned int, flags, struct mount_attr __user *, uattr,
4631 : size_t, usize)
4632 : {
4633 1446 : int err;
4634 1446 : struct path target;
4635 1446 : struct mount_attr attr;
4636 1446 : struct mount_kattr kattr;
4637 :
4638 1446 : BUILD_BUG_ON(sizeof(struct mount_attr) != MOUNT_ATTR_SIZE_VER0);
4639 :
4640 1446 : if (flags & ~(AT_EMPTY_PATH |
4641 : AT_RECURSIVE |
4642 : AT_SYMLINK_NOFOLLOW |
4643 : AT_NO_AUTOMOUNT))
4644 : return -EINVAL;
4645 :
4646 1446 : if (unlikely(usize > PAGE_SIZE))
4647 : return -E2BIG;
4648 1446 : if (unlikely(usize < MOUNT_ATTR_SIZE_VER0))
4649 : return -EINVAL;
4650 :
4651 1350 : if (!may_mount())
4652 : return -EPERM;
4653 :
4654 1350 : err = copy_struct_from_user(&attr, sizeof(attr), uattr, usize);
4655 0 : if (err)
4656 0 : return err;
4657 :
4658 : /* Don't bother walking through the mounts if this is a nop. */
4659 1350 : if (attr.attr_set == 0 &&
4660 0 : attr.attr_clr == 0 &&
4661 0 : attr.propagation == 0)
4662 : return 0;
4663 :
4664 1350 : err = build_mount_kattr(&attr, usize, &kattr, flags);
4665 1350 : if (err)
4666 96 : return err;
4667 :
4668 1254 : err = user_path_at(dfd, path, kattr.lookup_flags, &target);
4669 1254 : if (!err) {
4670 1254 : err = do_mount_setattr(&target, &kattr);
4671 1254 : path_put(&target);
4672 : }
4673 1254 : finish_mount_kattr(&kattr);
4674 1254 : return err;
4675 : }
4676 :
4677 0 : static void __init init_mount_tree(void)
4678 : {
4679 0 : struct vfsmount *mnt;
4680 0 : struct mount *m;
4681 0 : struct mnt_namespace *ns;
4682 0 : struct path root;
4683 :
4684 0 : mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", NULL);
4685 0 : if (IS_ERR(mnt))
4686 0 : panic("Can't create rootfs");
4687 :
4688 0 : ns = alloc_mnt_ns(&init_user_ns, false);
4689 0 : if (IS_ERR(ns))
4690 0 : panic("Can't allocate initial namespace");
4691 0 : m = real_mount(mnt);
4692 0 : m->mnt_ns = ns;
4693 0 : ns->root = m;
4694 0 : ns->mounts = 1;
4695 0 : list_add(&m->mnt_list, &ns->list);
4696 0 : init_task.nsproxy->mnt_ns = ns;
4697 0 : get_mnt_ns(ns);
4698 :
4699 0 : root.mnt = mnt;
4700 0 : root.dentry = mnt->mnt_root;
4701 0 : mnt->mnt_flags |= MNT_LOCKED;
4702 :
4703 0 : set_fs_pwd(current->fs, &root);
4704 0 : set_fs_root(current->fs, &root);
4705 0 : }
4706 :
4707 0 : void __init mnt_init(void)
4708 : {
4709 0 : int err;
4710 :
4711 0 : mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount),
4712 : 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL);
4713 :
4714 0 : mount_hashtable = alloc_large_system_hash("Mount-cache",
4715 : sizeof(struct hlist_head),
4716 : mhash_entries, 19,
4717 : HASH_ZERO,
4718 : &m_hash_shift, &m_hash_mask, 0, 0);
4719 0 : mountpoint_hashtable = alloc_large_system_hash("Mountpoint-cache",
4720 : sizeof(struct hlist_head),
4721 : mphash_entries, 19,
4722 : HASH_ZERO,
4723 : &mp_hash_shift, &mp_hash_mask, 0, 0);
4724 :
4725 0 : if (!mount_hashtable || !mountpoint_hashtable)
4726 0 : panic("Failed to allocate mount hash table\n");
4727 :
4728 0 : kernfs_init();
4729 :
4730 0 : err = sysfs_init();
4731 0 : if (err)
4732 0 : printk(KERN_WARNING "%s: sysfs_init error: %d\n",
4733 : __func__, err);
4734 0 : fs_kobj = kobject_create_and_add("fs", NULL);
4735 0 : if (!fs_kobj)
4736 0 : printk(KERN_WARNING "%s: kobj create error\n", __func__);
4737 0 : shmem_init();
4738 0 : init_rootfs();
4739 0 : init_mount_tree();
4740 0 : }
4741 :
4742 8054967 : void put_mnt_ns(struct mnt_namespace *ns)
4743 : {
4744 8054967 : if (!refcount_dec_and_test(&ns->ns.count))
4745 : return;
4746 5080 : drop_collected_mounts(&ns->root->mnt);
4747 5080 : free_mnt_ns(ns);
4748 : }
4749 :
4750 0 : struct vfsmount *kern_mount(struct file_system_type *type)
4751 : {
4752 0 : struct vfsmount *mnt;
4753 0 : mnt = vfs_kern_mount(type, SB_KERNMOUNT, type->name, NULL);
4754 0 : if (!IS_ERR(mnt)) {
4755 : /*
4756 : 		 * it is a longterm mount; don't release mnt until
4757 : 		 * kern_unmount(), just before the filesystem is unregistered
4758 : */
4759 0 : real_mount(mnt)->mnt_ns = MNT_NS_INTERNAL;
4760 : }
4761 0 : return mnt;
4762 : }
4763 : EXPORT_SYMBOL_GPL(kern_mount);
4764 :
4765 0 : void kern_unmount(struct vfsmount *mnt)
4766 : {
4767 : /* release long term mount so mount point can be released */
4768 0 : if (!IS_ERR(mnt)) {
4769 0 : mnt_make_shortterm(mnt);
4770 0 : synchronize_rcu(); /* yecchhh... */
4771 0 : mntput(mnt);
4772 : }
4773 0 : }
4774 : EXPORT_SYMBOL(kern_unmount);
4775 :
4776 52294 : void kern_unmount_array(struct vfsmount *mnt[], unsigned int num)
4777 : {
4778 52294 : unsigned int i;
4779 :
4780 208912 : for (i = 0; i < num; i++)
4781 156618 : mnt_make_shortterm(mnt[i]);
4782 52294 : synchronize_rcu_expedited();
4783 261609 : for (i = 0; i < num; i++)
4784 156977 : mntput(mnt[i]);
4785 52338 : }
4786 : EXPORT_SYMBOL(kern_unmount_array);
4787 :
4788 0 : bool our_mnt(struct vfsmount *mnt)
4789 : {
4790 0 : return check_mnt(real_mount(mnt));
4791 : }
4792 :
4793 1521 : bool current_chrooted(void)
4794 : {
4795 : /* Does the current process have a non-standard root */
4796 1521 : struct path ns_root;
4797 1521 : struct path fs_root;
4798 1521 : bool chrooted;
4799 :
4800 : /* Find the namespace root */
4801 1521 : ns_root.mnt = ¤t->nsproxy->mnt_ns->root->mnt;
4802 1521 : ns_root.dentry = ns_root.mnt->mnt_root;
4803 1521 : path_get(&ns_root);
4804 3042 : while (d_mountpoint(ns_root.dentry) && follow_down_one(&ns_root))
4805 : ;
4806 :
4807 1521 : get_fs_root(current->fs, &fs_root);
4808 :
4809 1521 : chrooted = !path_equal(&fs_root, &ns_root);
4810 :
4811 1521 : path_put(&fs_root);
4812 1521 : path_put(&ns_root);
4813 :
4814 1521 : return chrooted;
4815 : }
4816 :
4817 0 : static bool mnt_already_visible(struct mnt_namespace *ns,
4818 : const struct super_block *sb,
4819 : int *new_mnt_flags)
4820 : {
4821 0 : int new_flags = *new_mnt_flags;
4822 0 : struct mount *mnt;
4823 0 : bool visible = false;
4824 :
4825 0 : down_read(&namespace_sem);
4826 0 : lock_ns_list(ns);
4827 0 : list_for_each_entry(mnt, &ns->list, mnt_list) {
4828 0 : struct mount *child;
4829 0 : int mnt_flags;
4830 :
4831 0 : if (mnt_is_cursor(mnt))
4832 0 : continue;
4833 :
4834 0 : if (mnt->mnt.mnt_sb->s_type != sb->s_type)
4835 0 : continue;
4836 :
4837 : 		/* This mount is not fully visible if its root directory
4838 : * is not the root directory of the filesystem.
4839 : */
4840 0 : if (mnt->mnt.mnt_root != mnt->mnt.mnt_sb->s_root)
4841 0 : continue;
4842 :
4843 : /* A local view of the mount flags */
4844 0 : mnt_flags = mnt->mnt.mnt_flags;
4845 :
4846 : /* Don't miss readonly hidden in the superblock flags */
4847 0 : if (sb_rdonly(mnt->mnt.mnt_sb))
4848 0 : mnt_flags |= MNT_LOCK_READONLY;
4849 :
4850 : /* Verify the mount flags are equal to or more permissive
4851 : * than the proposed new mount.
4852 : */
4853 0 : if ((mnt_flags & MNT_LOCK_READONLY) &&
4854 0 : !(new_flags & MNT_READONLY))
4855 0 : continue;
4856 0 : if ((mnt_flags & MNT_LOCK_ATIME) &&
4857 0 : ((mnt_flags & MNT_ATIME_MASK) != (new_flags & MNT_ATIME_MASK)))
4858 0 : continue;
4859 :
4860 : /* This mount is not fully visible if there are any
4861 : * locked child mounts that cover anything except for
4862 : * empty directories.
4863 : */
4864 0 : list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
4865 0 : struct inode *inode = child->mnt_mountpoint->d_inode;
4866 : /* Only worry about locked mounts */
4867 0 : if (!(child->mnt.mnt_flags & MNT_LOCKED))
4868 0 : continue;
4869 : /* Is the directory permanetly empty? */
4870 0 : if (!is_empty_dir_inode(inode))
4871 0 : goto next;
4872 : }
4873 : /* Preserve the locked attributes */
4874 0 : *new_mnt_flags |= mnt_flags & (MNT_LOCK_READONLY | \
4875 : MNT_LOCK_ATIME);
4876 0 : visible = true;
4877 0 : goto found;
4878 0 : next: ;
4879 : }
4880 0 : found:
4881 0 : unlock_ns_list(ns);
4882 0 : up_read(&namespace_sem);
4883 0 : return visible;
4884 : }
4885 :
4886 122285 : static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags)
4887 : {
4888 122285 : const unsigned long required_iflags = SB_I_NOEXEC | SB_I_NODEV;
4889 122285 : struct mnt_namespace *ns = current->nsproxy->mnt_ns;
4890 122285 : unsigned long s_iflags;
4891 :
4892 122285 : if (ns->user_ns == &init_user_ns)
4893 : return false;
4894 :
4895 : /* Can this filesystem be too revealing? */
4896 13 : s_iflags = sb->s_iflags;
4897 13 : if (!(s_iflags & SB_I_USERNS_VISIBLE))
4898 : return false;
4899 :
4900 0 : if ((s_iflags & required_iflags) != required_iflags) {
4901 0 : WARN_ONCE(1, "Expected s_iflags to contain 0x%lx\n",
4902 : required_iflags);
4903 0 : return true;
4904 : }
4905 :
4906 0 : return !mnt_already_visible(ns, sb, new_mnt_flags);
4907 : }
4908 :
4909 64832785 : bool mnt_may_suid(struct vfsmount *mnt)
4910 : {
4911 : /*
4912 : * Foreign mounts (accessed via fchdir or through /proc
4913 : * symlinks) are always treated as if they are nosuid. This
4914 : * prevents namespaces from trusting potentially unsafe
4915 : * suid/sgid bits, file caps, or security labels that originate
4916 : * in other namespaces.
4917 : */
4918 129665901 : return !(mnt->mnt_flags & MNT_NOSUID) && check_mnt(real_mount(mnt)) &&
4919 64832800 : current_in_userns(mnt->mnt_sb->s_user_ns);
4920 : }
4921 :
4922 0 : static struct ns_common *mntns_get(struct task_struct *task)
4923 : {
4924 0 : struct ns_common *ns = NULL;
4925 0 : struct nsproxy *nsproxy;
4926 :
4927 0 : task_lock(task);
4928 0 : nsproxy = task->nsproxy;
4929 0 : if (nsproxy) {
4930 0 : ns = &nsproxy->mnt_ns->ns;
4931 0 : get_mnt_ns(to_mnt_ns(ns));
4932 : }
4933 0 : task_unlock(task);
4934 :
4935 0 : return ns;
4936 : }
4937 :
4938 0 : static void mntns_put(struct ns_common *ns)
4939 : {
4940 0 : put_mnt_ns(to_mnt_ns(ns));
4941 0 : }
4942 :
4943 0 : static int mntns_install(struct nsset *nsset, struct ns_common *ns)
4944 : {
4945 0 : struct nsproxy *nsproxy = nsset->nsproxy;
4946 0 : struct fs_struct *fs = nsset->fs;
4947 0 : struct mnt_namespace *mnt_ns = to_mnt_ns(ns), *old_mnt_ns;
4948 0 : struct user_namespace *user_ns = nsset->cred->user_ns;
4949 0 : struct path root;
4950 0 : int err;
4951 :
4952 0 : if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) ||
4953 0 : !ns_capable(user_ns, CAP_SYS_CHROOT) ||
4954 0 : !ns_capable(user_ns, CAP_SYS_ADMIN))
4955 0 : return -EPERM;
4956 :
4957 0 : if (is_anon_ns(mnt_ns))
4958 : return -EINVAL;
4959 :
4960 0 : if (fs->users != 1)
4961 : return -EINVAL;
4962 :
4963 0 : get_mnt_ns(mnt_ns);
4964 0 : old_mnt_ns = nsproxy->mnt_ns;
4965 0 : nsproxy->mnt_ns = mnt_ns;
4966 :
4967 : /* Find the root */
4968 0 : err = vfs_path_lookup(mnt_ns->root->mnt.mnt_root, &mnt_ns->root->mnt,
4969 : "/", LOOKUP_DOWN, &root);
4970 0 : if (err) {
4971 : /* revert to old namespace */
4972 0 : nsproxy->mnt_ns = old_mnt_ns;
4973 0 : put_mnt_ns(mnt_ns);
4974 0 : return err;
4975 : }
4976 :
4977 0 : put_mnt_ns(old_mnt_ns);
4978 :
4979 : /* Update the pwd and root */
4980 0 : set_fs_pwd(fs, &root);
4981 0 : set_fs_root(fs, &root);
4982 :
4983 0 : path_put(&root);
4984 0 : return 0;
4985 : }
4986 :
4987 0 : static struct user_namespace *mntns_owner(struct ns_common *ns)
4988 : {
4989 0 : return to_mnt_ns(ns)->user_ns;
4990 : }
4991 :
4992 : const struct proc_ns_operations mntns_operations = {
4993 : .name = "mnt",
4994 : .type = CLONE_NEWNS,
4995 : .get = mntns_get,
4996 : .put = mntns_put,
4997 : .install = mntns_install,
4998 : .owner = mntns_owner,
4999 : };
5000 :
5001 : #ifdef CONFIG_SYSCTL
5002 : static struct ctl_table fs_namespace_sysctls[] = {
5003 : {
5004 : .procname = "mount-max",
5005 : .data = &sysctl_mount_max,
5006 : .maxlen = sizeof(unsigned int),
5007 : .mode = 0644,
5008 : .proc_handler = proc_dointvec_minmax,
5009 : .extra1 = SYSCTL_ONE,
5010 : },
5011 : { }
5012 : };
5013 :
5014 0 : static int __init init_fs_namespace_sysctls(void)
5015 : {
5016 0 : register_sysctl_init("fs", fs_namespace_sysctls);
5017 0 : return 0;
5018 : }
5019 : fs_initcall(init_fs_namespace_sysctls);
5020 :
5021 : #endif /* CONFIG_SYSCTL */