TOMOYO Linux Cross Reference
Linux/fs/namespace.c

  1 /*
  2  *  linux/fs/namespace.c
  3  *
  4  * (C) Copyright Al Viro 2000, 2001
  5  *      Released under GPL v2.
  6  *
  7  * Based on code from fs/super.c, copyright Linus Torvalds and others.
  8  * Heavily rewritten.
  9  */
 10 
 11 #include <linux/syscalls.h>
 12 #include <linux/export.h>
 13 #include <linux/capability.h>
 14 #include <linux/mnt_namespace.h>
 15 #include <linux/user_namespace.h>
 16 #include <linux/namei.h>
 17 #include <linux/security.h>
 18 #include <linux/idr.h>
 19 #include <linux/acct.h>         /* acct_auto_close_mnt */
 20 #include <linux/init.h>         /* init_rootfs */
 21 #include <linux/fs_struct.h>    /* get_fs_root et al. */
 22 #include <linux/fsnotify.h>     /* fsnotify_vfsmount_delete */
 23 #include <linux/uaccess.h>
 24 #include <linux/proc_ns.h>
 25 #include <linux/magic.h>
 26 #include <linux/bootmem.h>
 27 #include "pnode.h"
 28 #include "internal.h"
 29 
 30 /* Maximum number of mounts in a mount namespace */
 31 unsigned int sysctl_mount_max __read_mostly = 100000;
 32 
 33 static unsigned int m_hash_mask __read_mostly;
 34 static unsigned int m_hash_shift __read_mostly;
 35 static unsigned int mp_hash_mask __read_mostly;
 36 static unsigned int mp_hash_shift __read_mostly;
 37 
 38 static __initdata unsigned long mhash_entries;
 39 static int __init set_mhash_entries(char *str)
 40 {
 41         if (!str)
 42                 return 0;
 43         mhash_entries = simple_strtoul(str, &str, 0);
 44         return 1;
 45 }
 46 __setup("mhash_entries=", set_mhash_entries);
 47 
 48 static __initdata unsigned long mphash_entries;
 49 static int __init set_mphash_entries(char *str)
 50 {
 51         if (!str)
 52                 return 0;
 53         mphash_entries = simple_strtoul(str, &str, 0);
 54         return 1;
 55 }
 56 __setup("mphash_entries=", set_mphash_entries);
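/*
 * Example (illustrative note, not part of the original namespace.c):
 * mhash_entries= and mphash_entries= are kernel command-line parameters
 * parsed by the two __setup() handlers above.  Booting with e.g.
 *
 *	linux ... mhash_entries=131072 mphash_entries=32768
 *
 * overrides the automatic sizing of the mount and mountpoint hash
 * tables that is performed later in this file (in mnt_init()).
 */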
 57 
 58 static u64 event;
 59 static DEFINE_IDA(mnt_id_ida);
 60 static DEFINE_IDA(mnt_group_ida);
 61 static DEFINE_SPINLOCK(mnt_id_lock);
 62 static int mnt_id_start = 0;
 63 static int mnt_group_start = 1;
 64 
 65 static struct hlist_head *mount_hashtable __read_mostly;
 66 static struct hlist_head *mountpoint_hashtable __read_mostly;
 67 static struct kmem_cache *mnt_cache __read_mostly;
 68 static DECLARE_RWSEM(namespace_sem);
 69 
 70 /* /sys/fs */
 71 struct kobject *fs_kobj;
 72 EXPORT_SYMBOL_GPL(fs_kobj);
 73 
 74 /*
 75  * vfsmount lock may be taken for read to prevent changes to the
 76  * vfsmount hash, i.e. during mountpoint lookups or walking back
 77  * up the tree.
 78  *
 79  * It should be taken for write in all cases where the vfsmount
 80  * tree or hash is modified or when a vfsmount structure is modified.
 81  */
 82 __cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);
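/*
 * Illustrative sketch (not from the original file): readers of the mount
 * hash/tree use the usual seqlock retry loop against mount_lock, while
 * writers take it exclusively (via lock_mount_hash() in this file):
 *
 *	unsigned seq;
 *	do {
 *		seq = read_seqbegin(&mount_lock);
 *		... read the mount hash or walk up the tree ...
 *	} while (read_seqretry(&mount_lock, seq));
 *
 * lookup_mnt() below is the canonical user of this pattern.
 */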
 83 
 84 static inline struct hlist_head *m_hash(struct vfsmount *mnt, struct dentry *dentry)
 85 {
 86         unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES);
 87         tmp += ((unsigned long)dentry / L1_CACHE_BYTES);
 88         tmp = tmp + (tmp >> m_hash_shift);
 89         return &mount_hashtable[tmp & m_hash_mask];
 90 }
 91 
 92 static inline struct hlist_head *mp_hash(struct dentry *dentry)
 93 {
 94         unsigned long tmp = ((unsigned long)dentry / L1_CACHE_BYTES);
 95         tmp = tmp + (tmp >> mp_hash_shift);
 96         return &mountpoint_hashtable[tmp & mp_hash_mask];
 97 }
 98 
 99 /*
100  * allocation is serialized by namespace_sem, but we need the spinlock to
101  * serialize with freeing.
102  */
103 static int mnt_alloc_id(struct mount *mnt)
104 {
105         int res;
106 
107 retry:
108         ida_pre_get(&mnt_id_ida, GFP_KERNEL);
109         spin_lock(&mnt_id_lock);
110         res = ida_get_new_above(&mnt_id_ida, mnt_id_start, &mnt->mnt_id);
111         if (!res)
112                 mnt_id_start = mnt->mnt_id + 1;
113         spin_unlock(&mnt_id_lock);
114         if (res == -EAGAIN)
115                 goto retry;
116 
117         return res;
118 }
119 
120 static void mnt_free_id(struct mount *mnt)
121 {
122         int id = mnt->mnt_id;
123         spin_lock(&mnt_id_lock);
124         ida_remove(&mnt_id_ida, id);
125         if (mnt_id_start > id)
126                 mnt_id_start = id;
127         spin_unlock(&mnt_id_lock);
128 }
129 
130 /*
131  * Allocate a new peer group ID
132  *
133  * mnt_group_ida is protected by namespace_sem
134  */
135 static int mnt_alloc_group_id(struct mount *mnt)
136 {
137         int res;
138 
139         if (!ida_pre_get(&mnt_group_ida, GFP_KERNEL))
140                 return -ENOMEM;
141 
142         res = ida_get_new_above(&mnt_group_ida,
143                                 mnt_group_start,
144                                 &mnt->mnt_group_id);
145         if (!res)
146                 mnt_group_start = mnt->mnt_group_id + 1;
147 
148         return res;
149 }
150 
151 /*
152  * Release a peer group ID
153  */
154 void mnt_release_group_id(struct mount *mnt)
155 {
156         int id = mnt->mnt_group_id;
157         ida_remove(&mnt_group_ida, id);
158         if (mnt_group_start > id)
159                 mnt_group_start = id;
160         mnt->mnt_group_id = 0;
161 }
162 
163 /*
164  * vfsmount lock must be held for read
165  */
166 static inline void mnt_add_count(struct mount *mnt, int n)
167 {
168 #ifdef CONFIG_SMP
169         this_cpu_add(mnt->mnt_pcp->mnt_count, n);
170 #else
171         preempt_disable();
172         mnt->mnt_count += n;
173         preempt_enable();
174 #endif
175 }
176 
177 /*
178  * vfsmount lock must be held for write
179  */
180 unsigned int mnt_get_count(struct mount *mnt)
181 {
182 #ifdef CONFIG_SMP
183         unsigned int count = 0;
184         int cpu;
185 
186         for_each_possible_cpu(cpu) {
187                 count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_count;
188         }
189 
190         return count;
191 #else
192         return mnt->mnt_count;
193 #endif
194 }
195 
196 static struct mount *alloc_vfsmnt(const char *name)
197 {
198         struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
199         if (mnt) {
200                 int err;
201 
202                 err = mnt_alloc_id(mnt);
203                 if (err)
204                         goto out_free_cache;
205 
206                 if (name) {
207                         mnt->mnt_devname = kstrdup(name, GFP_KERNEL);
208                         if (!mnt->mnt_devname)
209                                 goto out_free_id;
210                 }
211 
212 #ifdef CONFIG_SMP
213                 mnt->mnt_pcp = alloc_percpu(struct mnt_pcp);
214                 if (!mnt->mnt_pcp)
215                         goto out_free_devname;
216 
217                 this_cpu_add(mnt->mnt_pcp->mnt_count, 1);
218 #else
219                 mnt->mnt_count = 1;
220                 mnt->mnt_writers = 0;
221 #endif
222 
223                 INIT_HLIST_NODE(&mnt->mnt_hash);
224                 INIT_LIST_HEAD(&mnt->mnt_child);
225                 INIT_LIST_HEAD(&mnt->mnt_mounts);
226                 INIT_LIST_HEAD(&mnt->mnt_list);
227                 INIT_LIST_HEAD(&mnt->mnt_expire);
228                 INIT_LIST_HEAD(&mnt->mnt_share);
229                 INIT_LIST_HEAD(&mnt->mnt_slave_list);
230                 INIT_LIST_HEAD(&mnt->mnt_slave);
231 #ifdef CONFIG_FSNOTIFY
232                 INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks);
233 #endif
234         }
235         return mnt;
236 
237 #ifdef CONFIG_SMP
238 out_free_devname:
239         kfree(mnt->mnt_devname);
240 #endif
241 out_free_id:
242         mnt_free_id(mnt);
243 out_free_cache:
244         kmem_cache_free(mnt_cache, mnt);
245         return NULL;
246 }
247 
248 /*
249  * Most r/o checks on a fs are for operations that take
250  * discrete amounts of time, like a write() or unlink().
251  * We must keep track of when those operations start
252  * (for permission checks) and when they end, so that
253  * we can determine when writes are able to occur to
254  * a filesystem.
255  */
256 /*
257  * __mnt_is_readonly: check whether a mount is read-only
258  * @mnt: the mount to check for its write status
259  *
 260  * This shouldn't be used directly outside of the VFS.
 261  * It does not guarantee that the filesystem will stay
 262  * r/w, just that it is right *now*.  This cannot and
263  * should not be used in place of IS_RDONLY(inode).
264  * mnt_want/drop_write() will _keep_ the filesystem
265  * r/w.
266  */
267 int __mnt_is_readonly(struct vfsmount *mnt)
268 {
269         if (mnt->mnt_flags & MNT_READONLY)
270                 return 1;
271         if (mnt->mnt_sb->s_flags & MS_RDONLY)
272                 return 1;
273         return 0;
274 }
275 EXPORT_SYMBOL_GPL(__mnt_is_readonly);
276 
277 static inline void mnt_inc_writers(struct mount *mnt)
278 {
279 #ifdef CONFIG_SMP
280         this_cpu_inc(mnt->mnt_pcp->mnt_writers);
281 #else
282         mnt->mnt_writers++;
283 #endif
284 }
285 
286 static inline void mnt_dec_writers(struct mount *mnt)
287 {
288 #ifdef CONFIG_SMP
289         this_cpu_dec(mnt->mnt_pcp->mnt_writers);
290 #else
291         mnt->mnt_writers--;
292 #endif
293 }
294 
295 static unsigned int mnt_get_writers(struct mount *mnt)
296 {
297 #ifdef CONFIG_SMP
298         unsigned int count = 0;
299         int cpu;
300 
301         for_each_possible_cpu(cpu) {
302                 count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_writers;
303         }
304 
305         return count;
306 #else
307         return mnt->mnt_writers;
308 #endif
309 }
310 
311 static int mnt_is_readonly(struct vfsmount *mnt)
312 {
313         if (mnt->mnt_sb->s_readonly_remount)
314                 return 1;
315         /* Order wrt setting s_flags/s_readonly_remount in do_remount() */
316         smp_rmb();
317         return __mnt_is_readonly(mnt);
318 }
319 
320 /*
321  * Most r/o & frozen checks on a fs are for operations that take discrete
322  * amounts of time, like a write() or unlink().  We must keep track of when
323  * those operations start (for permission checks) and when they end, so that we
324  * can determine when writes are able to occur to a filesystem.
325  */
326 /**
327  * __mnt_want_write - get write access to a mount without freeze protection
328  * @m: the mount on which to take a write
329  *
330  * This tells the low-level filesystem that a write is about to be performed to
 331  * it, and makes sure that writes are allowed (the mount is read-write) before
 332  * returning success. This operation does not protect against the filesystem being
333  * frozen. When the write operation is finished, __mnt_drop_write() must be
334  * called. This is effectively a refcount.
335  */
336 int __mnt_want_write(struct vfsmount *m)
337 {
338         struct mount *mnt = real_mount(m);
339         int ret = 0;
340 
341         preempt_disable();
342         mnt_inc_writers(mnt);
343         /*
 344          * The store done in mnt_inc_writers() must be visible before we pass the
345          * MNT_WRITE_HOLD loop below, so that the slowpath can see our
346          * incremented count after it has set MNT_WRITE_HOLD.
347          */
348         smp_mb();
349         while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD)
350                 cpu_relax();
351         /*
352          * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
353          * be set to match its requirements. So we must not load that until
354          * MNT_WRITE_HOLD is cleared.
355          */
356         smp_rmb();
357         if (mnt_is_readonly(m)) {
358                 mnt_dec_writers(mnt);
359                 ret = -EROFS;
360         }
361         preempt_enable();
362 
363         return ret;
364 }
365 
366 /**
367  * mnt_want_write - get write access to a mount
368  * @m: the mount on which to take a write
369  *
370  * This tells the low-level filesystem that a write is about to be performed to
371  * it, and makes sure that writes are allowed (mount is read-write, filesystem
372  * is not frozen) before returning success.  When the write operation is
373  * finished, mnt_drop_write() must be called.  This is effectively a refcount.
374  */
375 int mnt_want_write(struct vfsmount *m)
376 {
377         int ret;
378 
379         sb_start_write(m->mnt_sb);
380         ret = __mnt_want_write(m);
381         if (ret)
382                 sb_end_write(m->mnt_sb);
383         return ret;
384 }
385 EXPORT_SYMBOL_GPL(mnt_want_write);
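/*
 * Usage sketch (illustrative, not from the original file): callers
 * bracket a modification with mnt_want_write()/mnt_drop_write() on the
 * vfsmount they are about to write through:
 *
 *	err = mnt_want_write(path->mnt);
 *	if (err)
 *		return err;
 *	... perform the modification (create, unlink, setattr, ...) ...
 *	mnt_drop_write(path->mnt);
 *
 * The __mnt_want_write()/__mnt_drop_write() pair above is the same
 * thing minus the freeze protection of sb_start_write()/sb_end_write().
 */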
386 
387 /**
388  * mnt_clone_write - get write access to a mount
389  * @mnt: the mount on which to take a write
390  *
391  * This is effectively like mnt_want_write, except
392  * it must only be used to take an extra write reference
393  * on a mountpoint that we already know has a write reference
394  * on it. This allows some optimisation.
395  *
 396  * When finished, mnt_drop_write() must be called as usual to
397  * drop the reference.
398  */
399 int mnt_clone_write(struct vfsmount *mnt)
400 {
401         /* superblock may be r/o */
402         if (__mnt_is_readonly(mnt))
403                 return -EROFS;
404         preempt_disable();
405         mnt_inc_writers(real_mount(mnt));
406         preempt_enable();
407         return 0;
408 }
409 EXPORT_SYMBOL_GPL(mnt_clone_write);
410 
411 /**
412  * __mnt_want_write_file - get write access to a file's mount
 413  * @file: the file whose mount we want to take a write on
414  *
415  * This is like __mnt_want_write, but it takes a file and can
416  * do some optimisations if the file is open for write already
417  */
418 int __mnt_want_write_file(struct file *file)
419 {
420         if (!(file->f_mode & FMODE_WRITER))
421                 return __mnt_want_write(file->f_path.mnt);
422         else
423                 return mnt_clone_write(file->f_path.mnt);
424 }
425 
426 /**
427  * mnt_want_write_file - get write access to a file's mount
 428  * @file: the file whose mount we want to take a write on
429  *
430  * This is like mnt_want_write, but it takes a file and can
431  * do some optimisations if the file is open for write already
432  */
433 int mnt_want_write_file(struct file *file)
434 {
435         int ret;
436 
437         sb_start_write(file->f_path.mnt->mnt_sb);
438         ret = __mnt_want_write_file(file);
439         if (ret)
440                 sb_end_write(file->f_path.mnt->mnt_sb);
441         return ret;
442 }
443 EXPORT_SYMBOL_GPL(mnt_want_write_file);
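/*
 * Usage sketch (illustrative, not from the original file): code that
 * writes through an already-open file, such as an ioctl handler, uses
 * the *_file variants so the cheaper mnt_clone_write() path can be
 * taken when the file was opened for write:
 *
 *	err = mnt_want_write_file(file);
 *	if (err)
 *		return err;
 *	... modify the filesystem ...
 *	mnt_drop_write_file(file);
 */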
444 
445 /**
446  * __mnt_drop_write - give up write access to a mount
447  * @mnt: the mount on which to give up write access
448  *
449  * Tells the low-level filesystem that we are done
450  * performing writes to it.  Must be matched with
451  * __mnt_want_write() call above.
452  */
453 void __mnt_drop_write(struct vfsmount *mnt)
454 {
455         preempt_disable();
456         mnt_dec_writers(real_mount(mnt));
457         preempt_enable();
458 }
459 
460 /**
461  * mnt_drop_write - give up write access to a mount
462  * @mnt: the mount on which to give up write access
463  *
464  * Tells the low-level filesystem that we are done performing writes to it and
465  * also allows filesystem to be frozen again.  Must be matched with
466  * mnt_want_write() call above.
467  */
468 void mnt_drop_write(struct vfsmount *mnt)
469 {
470         __mnt_drop_write(mnt);
471         sb_end_write(mnt->mnt_sb);
472 }
473 EXPORT_SYMBOL_GPL(mnt_drop_write);
474 
475 void __mnt_drop_write_file(struct file *file)
476 {
477         __mnt_drop_write(file->f_path.mnt);
478 }
479 
480 void mnt_drop_write_file(struct file *file)
481 {
482         mnt_drop_write(file->f_path.mnt);
483 }
484 EXPORT_SYMBOL(mnt_drop_write_file);
485 
486 static int mnt_make_readonly(struct mount *mnt)
487 {
488         int ret = 0;
489 
490         lock_mount_hash();
491         mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
492         /*
493          * After storing MNT_WRITE_HOLD, we'll read the counters. This store
494          * should be visible before we do.
495          */
496         smp_mb();
497 
498         /*
499          * With writers on hold, if this value is zero, then there are
500          * definitely no active writers (although held writers may subsequently
501          * increment the count, they'll have to wait, and decrement it after
502          * seeing MNT_READONLY).
503          *
504          * It is OK to have counter incremented on one CPU and decremented on
505          * another: the sum will add up correctly. The danger would be when we
506          * sum up each counter, if we read a counter before it is incremented,
507          * but then read another CPU's count which it has been subsequently
508          * decremented from -- we would see more decrements than we should.
509          * MNT_WRITE_HOLD protects against this scenario, because
510          * mnt_want_write first increments count, then smp_mb, then spins on
511          * MNT_WRITE_HOLD, so it can't be decremented by another CPU while
512          * we're counting up here.
513          */
514         if (mnt_get_writers(mnt) > 0)
515                 ret = -EBUSY;
516         else
517                 mnt->mnt.mnt_flags |= MNT_READONLY;
518         /*
519          * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
520          * that become unheld will see MNT_READONLY.
521          */
522         smp_wmb();
523         mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
524         unlock_mount_hash();
525         return ret;
526 }
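/*
 * Illustrative timeline (editorial sketch, not from the original file)
 * of the MNT_WRITE_HOLD handshake between mnt_make_readonly() above and
 * __mnt_want_write():
 *
 *	remount-r/o CPU                     writer CPU
 *	---------------                     ----------
 *	set MNT_WRITE_HOLD                  mnt_inc_writers()
 *	smp_mb()                            smp_mb()
 *	sum the mnt_writers counters        spin while MNT_WRITE_HOLD is set
 *	if zero, set MNT_READONLY
 *	smp_wmb()
 *	clear MNT_WRITE_HOLD                smp_rmb()
 *	                                    mnt_is_readonly()? -> -EROFS
 *
 * Either the writer's increment is observed (and the remount fails with
 * -EBUSY), or the writer observes MNT_READONLY once the hold is dropped.
 */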
527 
528 static void __mnt_unmake_readonly(struct mount *mnt)
529 {
530         lock_mount_hash();
531         mnt->mnt.mnt_flags &= ~MNT_READONLY;
532         unlock_mount_hash();
533 }
534 
535 int sb_prepare_remount_readonly(struct super_block *sb)
536 {
537         struct mount *mnt;
538         int err = 0;
539 
540         /* Racy optimization.  Recheck the counter under MNT_WRITE_HOLD */
541         if (atomic_long_read(&sb->s_remove_count))
542                 return -EBUSY;
543 
544         lock_mount_hash();
545         list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
546                 if (!(mnt->mnt.mnt_flags & MNT_READONLY)) {
547                         mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
548                         smp_mb();
549                         if (mnt_get_writers(mnt) > 0) {
550                                 err = -EBUSY;
551                                 break;
552                         }
553                 }
554         }
555         if (!err && atomic_long_read(&sb->s_remove_count))
556                 err = -EBUSY;
557 
558         if (!err) {
559                 sb->s_readonly_remount = 1;
560                 smp_wmb();
561         }
562         list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
563                 if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD)
564                         mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
565         }
566         unlock_mount_hash();
567 
568         return err;
569 }
570 
571 static void free_vfsmnt(struct mount *mnt)
572 {
573         kfree(mnt->mnt_devname);
574 #ifdef CONFIG_SMP
575         free_percpu(mnt->mnt_pcp);
576 #endif
577         kmem_cache_free(mnt_cache, mnt);
578 }
579 
580 static void delayed_free_vfsmnt(struct rcu_head *head)
581 {
582         free_vfsmnt(container_of(head, struct mount, mnt_rcu));
583 }
584 
585 /* call under rcu_read_lock */
586 bool legitimize_mnt(struct vfsmount *bastard, unsigned seq)
587 {
588         struct mount *mnt;
589         if (read_seqretry(&mount_lock, seq))
590                 return false;
591         if (bastard == NULL)
592                 return true;
593         mnt = real_mount(bastard);
594         mnt_add_count(mnt, 1);
595         smp_mb();                       // see mntput_no_expire()
596         if (likely(!read_seqretry(&mount_lock, seq)))
597                 return true;
598         if (bastard->mnt_flags & MNT_SYNC_UMOUNT) {
599                 mnt_add_count(mnt, -1);
600                 return false;
601         }
602         lock_mount_hash();
603         if (unlikely(bastard->mnt_flags & MNT_DOOMED)) {
604                 mnt_add_count(mnt, -1);
605                 unlock_mount_hash();
606                 return false;
607         }
608         unlock_mount_hash();
609         rcu_read_unlock();
610         mntput(bastard);
611         rcu_read_lock();
612         return false;
613 }
614 
615 /*
616  * find the first mount at @dentry on vfsmount @mnt.
617  * call under rcu_read_lock()
618  */
619 struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
620 {
621         struct hlist_head *head = m_hash(mnt, dentry);
622         struct mount *p;
623 
624         hlist_for_each_entry_rcu(p, head, mnt_hash)
625                 if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry)
626                         return p;
627         return NULL;
628 }
629 
630 /*
631  * find the last mount at @dentry on vfsmount @mnt.
632  * mount_lock must be held.
633  */
634 struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry)
635 {
636         struct mount *p, *res;
637         res = p = __lookup_mnt(mnt, dentry);
638         if (!p)
639                 goto out;
640         hlist_for_each_entry_continue(p, mnt_hash) {
641                 if (&p->mnt_parent->mnt != mnt || p->mnt_mountpoint != dentry)
642                         break;
643                 res = p;
644         }
645 out:
646         return res;
647 }
648 
649 /*
650  * lookup_mnt - Return the first child mount mounted at path
651  *
652  * "First" means first mounted chronologically.  If you create the
653  * following mounts:
654  *
655  * mount /dev/sda1 /mnt
656  * mount /dev/sda2 /mnt
657  * mount /dev/sda3 /mnt
658  *
659  * Then lookup_mnt() on the base /mnt dentry in the root mount will
660  * return successively the root dentry and vfsmount of /dev/sda1, then
661  * /dev/sda2, then /dev/sda3, then NULL.
662  *
663  * lookup_mnt takes a reference to the found vfsmount.
664  */
665 struct vfsmount *lookup_mnt(struct path *path)
666 {
667         struct mount *child_mnt;
668         struct vfsmount *m;
669         unsigned seq;
670 
671         rcu_read_lock();
672         do {
673                 seq = read_seqbegin(&mount_lock);
674                 child_mnt = __lookup_mnt(path->mnt, path->dentry);
675                 m = child_mnt ? &child_mnt->mnt : NULL;
676         } while (!legitimize_mnt(m, seq));
677         rcu_read_unlock();
678         return m;
679 }
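/*
 * Usage sketch (illustrative, not from the original file): following the
 * whole stack of mounts at a path, the way the VFS "follow down" helpers
 * do it.  lookup_mnt() hands back the child with a reference held:
 *
 *	struct vfsmount *m;
 *
 *	while ((m = lookup_mnt(&path)) != NULL) {
 *		struct dentry *root = dget(m->mnt_root);
 *		dput(path.dentry);
 *		mntput(path.mnt);
 *		path.mnt = m;
 *		path.dentry = root;
 *	}
 */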
680 
681 static struct mountpoint *new_mountpoint(struct dentry *dentry)
682 {
683         struct hlist_head *chain = mp_hash(dentry);
684         struct mountpoint *mp;
685         int ret;
686 
687         hlist_for_each_entry(mp, chain, m_hash) {
688                 if (mp->m_dentry == dentry) {
689                         /* might be worth a WARN_ON() */
690                         if (d_unlinked(dentry))
691                                 return ERR_PTR(-ENOENT);
692                         mp->m_count++;
693                         return mp;
694                 }
695         }
696 
697         mp = kmalloc(sizeof(struct mountpoint), GFP_KERNEL);
698         if (!mp)
699                 return ERR_PTR(-ENOMEM);
700 
701         ret = d_set_mounted(dentry);
702         if (ret) {
703                 kfree(mp);
704                 return ERR_PTR(ret);
705         }
706 
707         mp->m_dentry = dentry;
708         mp->m_count = 1;
709         hlist_add_head(&mp->m_hash, chain);
710         return mp;
711 }
712 
713 static void put_mountpoint(struct mountpoint *mp)
714 {
715         if (!--mp->m_count) {
716                 struct dentry *dentry = mp->m_dentry;
717                 spin_lock(&dentry->d_lock);
718                 dentry->d_flags &= ~DCACHE_MOUNTED;
719                 spin_unlock(&dentry->d_lock);
720                 hlist_del(&mp->m_hash);
721                 kfree(mp);
722         }
723 }
724 
725 static inline int check_mnt(struct mount *mnt)
726 {
727         return mnt->mnt_ns == current->nsproxy->mnt_ns;
728 }
729 
730 /*
731  * vfsmount lock must be held for write
732  */
733 static void touch_mnt_namespace(struct mnt_namespace *ns)
734 {
735         if (ns) {
736                 ns->event = ++event;
737                 wake_up_interruptible(&ns->poll);
738         }
739 }
740 
741 /*
742  * vfsmount lock must be held for write
743  */
744 static void __touch_mnt_namespace(struct mnt_namespace *ns)
745 {
746         if (ns && ns->event != event) {
747                 ns->event = event;
748                 wake_up_interruptible(&ns->poll);
749         }
750 }
751 
752 /*
753  * vfsmount lock must be held for write
754  */
755 static void detach_mnt(struct mount *mnt, struct path *old_path)
756 {
757         old_path->dentry = mnt->mnt_mountpoint;
758         old_path->mnt = &mnt->mnt_parent->mnt;
759         mnt->mnt_parent = mnt;
760         mnt->mnt_mountpoint = mnt->mnt.mnt_root;
761         list_del_init(&mnt->mnt_child);
762         hlist_del_init_rcu(&mnt->mnt_hash);
763         put_mountpoint(mnt->mnt_mp);
764         mnt->mnt_mp = NULL;
765 }
766 
767 /*
768  * vfsmount lock must be held for write
769  */
770 void mnt_set_mountpoint(struct mount *mnt,
771                         struct mountpoint *mp,
772                         struct mount *child_mnt)
773 {
774         mp->m_count++;
775         mnt_add_count(mnt, 1);  /* essentially, that's mntget */
776         child_mnt->mnt_mountpoint = dget(mp->m_dentry);
777         child_mnt->mnt_parent = mnt;
778         child_mnt->mnt_mp = mp;
779 }
780 
781 /*
782  * vfsmount lock must be held for write
783  */
784 static void attach_mnt(struct mount *mnt,
785                         struct mount *parent,
786                         struct mountpoint *mp)
787 {
788         mnt_set_mountpoint(parent, mp, mnt);
789         hlist_add_head_rcu(&mnt->mnt_hash, m_hash(&parent->mnt, mp->m_dentry));
790         list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
791 }
792 
793 static void attach_shadowed(struct mount *mnt,
794                         struct mount *parent,
795                         struct mount *shadows)
796 {
797         if (shadows) {
798                 hlist_add_after_rcu(&shadows->mnt_hash, &mnt->mnt_hash);
799                 list_add(&mnt->mnt_child, &shadows->mnt_child);
800         } else {
801                 hlist_add_head_rcu(&mnt->mnt_hash,
802                                 m_hash(&parent->mnt, mnt->mnt_mountpoint));
803                 list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
804         }
805 }
806 
807 /*
808  * vfsmount lock must be held for write
809  */
810 static void commit_tree(struct mount *mnt, struct mount *shadows)
811 {
812         struct mount *parent = mnt->mnt_parent;
813         struct mount *m;
814         LIST_HEAD(head);
815         struct mnt_namespace *n = parent->mnt_ns;
816 
817         BUG_ON(parent == mnt);
818 
819         list_add_tail(&head, &mnt->mnt_list);
820         list_for_each_entry(m, &head, mnt_list)
821                 m->mnt_ns = n;
822 
823         list_splice(&head, n->list.prev);
824 
825         n->mounts += n->pending_mounts;
826         n->pending_mounts = 0;
827 
828         attach_shadowed(mnt, parent, shadows);
829         touch_mnt_namespace(n);
830 }
831 
832 static struct mount *next_mnt(struct mount *p, struct mount *root)
833 {
834         struct list_head *next = p->mnt_mounts.next;
835         if (next == &p->mnt_mounts) {
836                 while (1) {
837                         if (p == root)
838                                 return NULL;
839                         next = p->mnt_child.next;
840                         if (next != &p->mnt_parent->mnt_mounts)
841                                 break;
842                         p = p->mnt_parent;
843                 }
844         }
845         return list_entry(next, struct mount, mnt_child);
846 }
847 
848 static struct mount *skip_mnt_tree(struct mount *p)
849 {
850         struct list_head *prev = p->mnt_mounts.prev;
851         while (prev != &p->mnt_mounts) {
852                 p = list_entry(prev, struct mount, mnt_child);
853                 prev = p->mnt_mounts.prev;
854         }
855         return p;
856 }
857 
858 struct vfsmount *
859 vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data)
860 {
861         struct mount *mnt;
862         struct dentry *root;
863 
864         if (!type)
865                 return ERR_PTR(-ENODEV);
866 
867         mnt = alloc_vfsmnt(name);
868         if (!mnt)
869                 return ERR_PTR(-ENOMEM);
870 
871         if (flags & MS_KERNMOUNT)
872                 mnt->mnt.mnt_flags = MNT_INTERNAL;
873 
874         root = mount_fs(type, flags, name, data);
875         if (IS_ERR(root)) {
876                 mnt_free_id(mnt);
877                 free_vfsmnt(mnt);
878                 return ERR_CAST(root);
879         }
880 
881         mnt->mnt.mnt_root = root;
882         mnt->mnt.mnt_sb = root->d_sb;
883         mnt->mnt_mountpoint = mnt->mnt.mnt_root;
884         mnt->mnt_parent = mnt;
885         lock_mount_hash();
886         list_add_tail(&mnt->mnt_instance, &root->d_sb->s_mounts);
887         unlock_mount_hash();
888         return &mnt->mnt;
889 }
890 EXPORT_SYMBOL_GPL(vfs_kern_mount);
891 
892 static struct mount *clone_mnt(struct mount *old, struct dentry *root,
893                                         int flag)
894 {
895         struct super_block *sb = old->mnt.mnt_sb;
896         struct mount *mnt;
897         int err;
898 
899         mnt = alloc_vfsmnt(old->mnt_devname);
900         if (!mnt)
901                 return ERR_PTR(-ENOMEM);
902 
903         if (flag & (CL_SLAVE | CL_PRIVATE | CL_SHARED_TO_SLAVE))
904                 mnt->mnt_group_id = 0; /* not a peer of original */
905         else
906                 mnt->mnt_group_id = old->mnt_group_id;
907 
908         if ((flag & CL_MAKE_SHARED) && !mnt->mnt_group_id) {
909                 err = mnt_alloc_group_id(mnt);
910                 if (err)
911                         goto out_free;
912         }
913 
914         mnt->mnt.mnt_flags = old->mnt.mnt_flags;
915         mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL);
916         /* Don't allow unprivileged users to change mount flags */
917         if (flag & CL_UNPRIVILEGED) {
918                 mnt->mnt.mnt_flags |= MNT_LOCK_ATIME;
919 
920                 if (mnt->mnt.mnt_flags & MNT_READONLY)
921                         mnt->mnt.mnt_flags |= MNT_LOCK_READONLY;
922 
923                 if (mnt->mnt.mnt_flags & MNT_NODEV)
924                         mnt->mnt.mnt_flags |= MNT_LOCK_NODEV;
925 
926                 if (mnt->mnt.mnt_flags & MNT_NOSUID)
927                         mnt->mnt.mnt_flags |= MNT_LOCK_NOSUID;
928 
929                 if (mnt->mnt.mnt_flags & MNT_NOEXEC)
930                         mnt->mnt.mnt_flags |= MNT_LOCK_NOEXEC;
931         }
932 
933         /* Don't allow unprivileged users to reveal what is under a mount */
934         if ((flag & CL_UNPRIVILEGED) && list_empty(&old->mnt_expire))
935                 mnt->mnt.mnt_flags |= MNT_LOCKED;
936 
937         atomic_inc(&sb->s_active);
938         mnt->mnt.mnt_sb = sb;
939         mnt->mnt.mnt_root = dget(root);
940         mnt->mnt_mountpoint = mnt->mnt.mnt_root;
941         mnt->mnt_parent = mnt;
942         lock_mount_hash();
943         list_add_tail(&mnt->mnt_instance, &sb->s_mounts);
944         unlock_mount_hash();
945 
946         if ((flag & CL_SLAVE) ||
947             ((flag & CL_SHARED_TO_SLAVE) && IS_MNT_SHARED(old))) {
948                 list_add(&mnt->mnt_slave, &old->mnt_slave_list);
949                 mnt->mnt_master = old;
950                 CLEAR_MNT_SHARED(mnt);
951         } else if (!(flag & CL_PRIVATE)) {
952                 if ((flag & CL_MAKE_SHARED) || IS_MNT_SHARED(old))
953                         list_add(&mnt->mnt_share, &old->mnt_share);
954                 if (IS_MNT_SLAVE(old))
955                         list_add(&mnt->mnt_slave, &old->mnt_slave);
956                 mnt->mnt_master = old->mnt_master;
957         }
958         if (flag & CL_MAKE_SHARED)
959                 set_mnt_shared(mnt);
960 
961         /* stick the duplicate mount on the same expiry list
962          * as the original if that was on one */
963         if (flag & CL_EXPIRE) {
964                 if (!list_empty(&old->mnt_expire))
965                         list_add(&mnt->mnt_expire, &old->mnt_expire);
966         }
967 
968         return mnt;
969 
970  out_free:
971         mnt_free_id(mnt);
972         free_vfsmnt(mnt);
973         return ERR_PTR(err);
974 }
975 
976 static void mntput_no_expire(struct mount *mnt)
977 {
978 put_again:
979         rcu_read_lock();
980         if (likely(ACCESS_ONCE(mnt->mnt_ns))) {
981                 /*
982                  * Since we don't do lock_mount_hash() here,
983                  * ->mnt_ns can change under us.  However, if it's
984                  * non-NULL, then there's a reference that won't
985                  * be dropped until after an RCU delay done after
986                  * turning ->mnt_ns NULL.  So if we observe it
987                  * non-NULL under rcu_read_lock(), the reference
988                  * we are dropping is not the final one.
989                  */
990                 mnt_add_count(mnt, -1);
991                 rcu_read_unlock();
992                 return;
993         }
994         lock_mount_hash();
995         /*
996          * make sure that if legitimize_mnt() has not seen us grab
997          * mount_lock, we'll see their refcount increment here.
998          */
999         smp_mb();
1000         mnt_add_count(mnt, -1);
1001         if (mnt_get_count(mnt)) {
1002                 rcu_read_unlock();
1003                 unlock_mount_hash();
1004                 return;
1005         }
1006         if (unlikely(mnt->mnt_pinned)) {
1007                 mnt_add_count(mnt, mnt->mnt_pinned + 1);
1008                 mnt->mnt_pinned = 0;
1009                 rcu_read_unlock();
1010                 unlock_mount_hash();
1011                 acct_auto_close_mnt(&mnt->mnt);
1012                 goto put_again;
1013         }
1014         if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) {
1015                 rcu_read_unlock();
1016                 unlock_mount_hash();
1017                 return;
1018         }
1019         mnt->mnt.mnt_flags |= MNT_DOOMED;
1020         rcu_read_unlock();
1021 
1022         list_del(&mnt->mnt_instance);
1023         unlock_mount_hash();
1024 
1025         /*
1026          * This probably indicates that somebody messed
1027          * up a mnt_want/drop_write() pair.  If this
1028          * happens, the filesystem was probably unable
1029          * to make r/w->r/o transitions.
1030          */
1031         /*
1032          * The locking used to deal with mnt_count decrement provides barriers,
1033          * so mnt_get_writers() below is safe.
1034          */
1035         WARN_ON(mnt_get_writers(mnt));
1036         fsnotify_vfsmount_delete(&mnt->mnt);
1037         dput(mnt->mnt.mnt_root);
1038         deactivate_super(mnt->mnt.mnt_sb);
1039         mnt_free_id(mnt);
1040         call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt);
1041 }
1042 
1043 void mntput(struct vfsmount *mnt)
1044 {
1045         if (mnt) {
1046                 struct mount *m = real_mount(mnt);
1047                 /* avoid cacheline pingpong, hope gcc doesn't get "smart" */
1048                 if (unlikely(m->mnt_expiry_mark))
1049                         m->mnt_expiry_mark = 0;
1050                 mntput_no_expire(m);
1051         }
1052 }
1053 EXPORT_SYMBOL(mntput);
1054 
1055 struct vfsmount *mntget(struct vfsmount *mnt)
1056 {
1057         if (mnt)
1058                 mnt_add_count(real_mount(mnt), 1);
1059         return mnt;
1060 }
1061 EXPORT_SYMBOL(mntget);
1062 
1063 void mnt_pin(struct vfsmount *mnt)
1064 {
1065         lock_mount_hash();
1066         real_mount(mnt)->mnt_pinned++;
1067         unlock_mount_hash();
1068 }
1069 EXPORT_SYMBOL(mnt_pin);
1070 
1071 void mnt_unpin(struct vfsmount *m)
1072 {
1073         struct mount *mnt = real_mount(m);
1074         lock_mount_hash();
1075         if (mnt->mnt_pinned) {
1076                 mnt_add_count(mnt, 1);
1077                 mnt->mnt_pinned--;
1078         }
1079         unlock_mount_hash();
1080 }
1081 EXPORT_SYMBOL(mnt_unpin);
1082 
1083 static inline void mangle(struct seq_file *m, const char *s)
1084 {
1085         seq_escape(m, s, " \t\n\\");
1086 }
1087 
1088 /*
1089  * Simple .show_options callback for filesystems which don't want to
1090  * implement more complex mount option showing.
1091  *
1092  * See also save_mount_options().
1093  */
1094 int generic_show_options(struct seq_file *m, struct dentry *root)
1095 {
1096         const char *options;
1097 
1098         rcu_read_lock();
1099         options = rcu_dereference(root->d_sb->s_options);
1100 
1101         if (options != NULL && options[0]) {
1102                 seq_putc(m, ',');
1103                 mangle(m, options);
1104         }
1105         rcu_read_unlock();
1106 
1107         return 0;
1108 }
1109 EXPORT_SYMBOL(generic_show_options);
1110 
1111 /*
 1112  * If the filesystem uses generic_show_options(), this function should be
 1113  * called from the fill_super() callback.
 1114  *
 1115  * The .remount_fs callback usually needs to be handled in a special
 1116  * way, to make sure that previous options are not overwritten if the
 1117  * remount fails.
 1118  *
 1119  * Also note that if the filesystem's .remount_fs function doesn't
1120  * reset all options to their default value, but changes only newly
1121  * given options, then the displayed options will not reflect reality
1122  * any more.
1123  */
1124 void save_mount_options(struct super_block *sb, char *options)
1125 {
1126         BUG_ON(sb->s_options);
1127         rcu_assign_pointer(sb->s_options, kstrdup(options, GFP_KERNEL));
1128 }
1129 EXPORT_SYMBOL(save_mount_options);
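/*
 * Usage sketch (illustrative; "foofs" is a made-up name): a filesystem
 * that relies on generic_show_options() saves its option string in
 * fill_super() and wires the generic helper into its super_operations:
 *
 *	static int foofs_fill_super(struct super_block *sb, void *data,
 *				    int silent)
 *	{
 *		save_mount_options(sb, data);
 *		...
 *	}
 *
 *	static const struct super_operations foofs_ops = {
 *		...
 *		.show_options	= generic_show_options,
 *	};
 */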
1130 
1131 void replace_mount_options(struct super_block *sb, char *options)
1132 {
1133         char *old = sb->s_options;
1134         rcu_assign_pointer(sb->s_options, options);
1135         if (old) {
1136                 synchronize_rcu();
1137                 kfree(old);
1138         }
1139 }
1140 EXPORT_SYMBOL(replace_mount_options);
1141 
1142 #ifdef CONFIG_PROC_FS
1143 /* iterator; we want it to have access to namespace_sem, thus here... */
1144 static void *m_start(struct seq_file *m, loff_t *pos)
1145 {
1146         struct proc_mounts *p = proc_mounts(m);
1147 
1148         down_read(&namespace_sem);
1149         if (p->cached_event == p->ns->event) {
1150                 void *v = p->cached_mount;
1151                 if (*pos == p->cached_index)
1152                         return v;
1153                 if (*pos == p->cached_index + 1) {
1154                         v = seq_list_next(v, &p->ns->list, &p->cached_index);
1155                         return p->cached_mount = v;
1156                 }
1157         }
1158 
1159         p->cached_event = p->ns->event;
1160         p->cached_mount = seq_list_start(&p->ns->list, *pos);
1161         p->cached_index = *pos;
1162         return p->cached_mount;
1163 }
1164 
1165 static void *m_next(struct seq_file *m, void *v, loff_t *pos)
1166 {
1167         struct proc_mounts *p = proc_mounts(m);
1168 
1169         p->cached_mount = seq_list_next(v, &p->ns->list, pos);
1170         p->cached_index = *pos;
1171         return p->cached_mount;
1172 }
1173 
1174 static void m_stop(struct seq_file *m, void *v)
1175 {
1176         up_read(&namespace_sem);
1177 }
1178 
1179 static int m_show(struct seq_file *m, void *v)
1180 {
1181         struct proc_mounts *p = proc_mounts(m);
1182         struct mount *r = list_entry(v, struct mount, mnt_list);
1183         return p->show(m, &r->mnt);
1184 }
1185 
1186 const struct seq_operations mounts_op = {
1187         .start  = m_start,
1188         .next   = m_next,
1189         .stop   = m_stop,
1190         .show   = m_show,
1191 };
1192 #endif  /* CONFIG_PROC_FS */
1193 
1194 /**
1195  * may_umount_tree - check if a mount tree is busy
1196  * @mnt: root of mount tree
1197  *
1198  * This is called to check if a tree of mounts has any
1199  * open files, pwds, chroots or sub mounts that are
1200  * busy.
1201  */
1202 int may_umount_tree(struct vfsmount *m)
1203 {
1204         struct mount *mnt = real_mount(m);
1205         int actual_refs = 0;
1206         int minimum_refs = 0;
1207         struct mount *p;
1208         BUG_ON(!m);
1209 
1210         /* write lock needed for mnt_get_count */
1211         lock_mount_hash();
1212         for (p = mnt; p; p = next_mnt(p, mnt)) {
1213                 actual_refs += mnt_get_count(p);
1214                 minimum_refs += 2;
1215         }
1216         unlock_mount_hash();
1217 
1218         if (actual_refs > minimum_refs)
1219                 return 0;
1220 
1221         return 1;
1222 }
1223 
1224 EXPORT_SYMBOL(may_umount_tree);
1225 
1226 /**
1227  * may_umount - check if a mount point is busy
1228  * @mnt: root of mount
1229  *
1230  * This is called to check if a mount point has any
1231  * open files, pwds, chroots or sub mounts. If the
1232  * mount has sub mounts this will return busy
1233  * regardless of whether the sub mounts are busy.
1234  *
1235  * Doesn't take quota and stuff into account. IOW, in some cases it will
1236  * give false negatives. The main reason why it's here is that we need
1237  * a non-destructive way to look for easily umountable filesystems.
1238  */
1239 int may_umount(struct vfsmount *mnt)
1240 {
1241         int ret = 1;
1242         down_read(&namespace_sem);
1243         lock_mount_hash();
1244         if (propagate_mount_busy(real_mount(mnt), 2))
1245                 ret = 0;
1246         unlock_mount_hash();
1247         up_read(&namespace_sem);
1248         return ret;
1249 }
1250 
1251 EXPORT_SYMBOL(may_umount);
1252 
1253 static HLIST_HEAD(unmounted);   /* protected by namespace_sem */
1254 
1255 static void namespace_unlock(void)
1256 {
1257         struct mount *mnt;
1258         struct hlist_head head = unmounted;
1259 
1260         if (likely(hlist_empty(&head))) {
1261                 up_write(&namespace_sem);
1262                 return;
1263         }
1264 
1265         head.first->pprev = &head.first;
1266         INIT_HLIST_HEAD(&unmounted);
1267 
1268         /* undo decrements we'd done in umount_tree() */
1269         hlist_for_each_entry(mnt, &head, mnt_hash)
1270                 if (mnt->mnt_ex_mountpoint.mnt)
1271                         mntget(mnt->mnt_ex_mountpoint.mnt);
1272 
1273         up_write(&namespace_sem);
1274 
1275         synchronize_rcu();
1276 
1277         while (!hlist_empty(&head)) {
1278                 mnt = hlist_entry(head.first, struct mount, mnt_hash);
1279                 hlist_del_init(&mnt->mnt_hash);
1280                 if (mnt->mnt_ex_mountpoint.mnt)
1281                         path_put(&mnt->mnt_ex_mountpoint);
1282                 mntput(&mnt->mnt);
1283         }
1284 }
1285 
1286 static inline void namespace_lock(void)
1287 {
1288         down_write(&namespace_sem);
1289 }
1290 
1291 enum umount_tree_flags {
1292         UMOUNT_SYNC = 1,
1293         UMOUNT_PROPAGATE = 2,
1294 };
1295 /*
1296  * mount_lock must be held
1297  * namespace_sem must be held for write
1298  */
1299 static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
1300 {
1301         HLIST_HEAD(tmp_list);
1302         struct mount *p;
1303         struct mount *last = NULL;
1304 
1305         for (p = mnt; p; p = next_mnt(p, mnt)) {
1306                 hlist_del_init_rcu(&p->mnt_hash);
1307                 hlist_add_head(&p->mnt_hash, &tmp_list);
1308         }
1309 
1310         hlist_for_each_entry(p, &tmp_list, mnt_hash)
1311                 list_del_init(&p->mnt_child);
1312 
1313         if (how & UMOUNT_PROPAGATE)
1314                 propagate_umount(&tmp_list);
1315 
1316         hlist_for_each_entry(p, &tmp_list, mnt_hash) {
1317                 struct mnt_namespace *ns;
1318                 list_del_init(&p->mnt_expire);
1319                 list_del_init(&p->mnt_list);
1320                 ns = p->mnt_ns;
1321                 if (ns) {
1322                         ns->mounts--;
1323                         __touch_mnt_namespace(ns);
1324                 }
1325                 p->mnt_ns = NULL;
1326                 if (how & UMOUNT_SYNC)
1327                         p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;
1328                 if (mnt_has_parent(p)) {
1329                         put_mountpoint(p->mnt_mp);
1330                         mnt_add_count(p->mnt_parent, -1);
1331                         /* move the reference to mountpoint into ->mnt_ex_mountpoint */
1332                         p->mnt_ex_mountpoint.dentry = p->mnt_mountpoint;
1333                         p->mnt_ex_mountpoint.mnt = &p->mnt_parent->mnt;
1334                         p->mnt_mountpoint = p->mnt.mnt_root;
1335                         p->mnt_parent = p;
1336                         p->mnt_mp = NULL;
1337                 }
1338                 change_mnt_propagation(p, MS_PRIVATE);
1339                 last = p;
1340         }
1341         if (last) {
1342                 last->mnt_hash.next = unmounted.first;
1343                 if (unmounted.first)
1344                         unmounted.first->pprev = &last->mnt_hash.next;
1345                 unmounted.first = tmp_list.first;
1346                 unmounted.first->pprev = &unmounted.first;
1347         }
1348 }
1349 
1350 static void shrink_submounts(struct mount *mnt);
1351 
1352 static int do_umount(struct mount *mnt, int flags)
1353 {
1354         struct super_block *sb = mnt->mnt.mnt_sb;
1355         int retval;
1356 
1357         retval = security_sb_umount(&mnt->mnt, flags);
1358         if (retval)
1359                 return retval;
1360 
1361         /*
1362          * Allow userspace to request a mountpoint be expired rather than
1363          * unmounting unconditionally. Unmount only happens if:
1364          *  (1) the mark is already set (the mark is cleared by mntput())
1365          *  (2) the usage count == 1 [parent vfsmount] + 1 [sys_umount]
1366          */
1367         if (flags & MNT_EXPIRE) {
1368                 if (&mnt->mnt == current->fs->root.mnt ||
1369                     flags & (MNT_FORCE | MNT_DETACH))
1370                         return -EINVAL;
1371 
1372                 /*
1373                  * probably don't strictly need the lock here if we examined
1374                  * all race cases, but it's a slowpath.
1375                  */
1376                 lock_mount_hash();
1377                 if (mnt_get_count(mnt) != 2) {
1378                         unlock_mount_hash();
1379                         return -EBUSY;
1380                 }
1381                 unlock_mount_hash();
1382 
1383                 if (!xchg(&mnt->mnt_expiry_mark, 1))
1384                         return -EAGAIN;
1385         }
1386 
1387         /*
1388          * If we may have to abort operations to get out of this
1389          * mount, and they will themselves hold resources we must
 1390          * allow the fs to do things. In the Unix tradition of
 1391          * 'Gee that's tricky, let's do it in userspace' the umount_begin
 1392          * might fail to complete on the first run through as other tasks
 1393          * must return, and the like. That's for the mount program to worry
 1394          * about for the moment.
1395          */
1396 
1397         if (flags & MNT_FORCE && sb->s_op->umount_begin) {
1398                 sb->s_op->umount_begin(sb);
1399         }
1400 
1401         /*
 1402          * No sense in grabbing the lock for this test, but the test itself looks
 1403          * somewhat bogus. Suggestions for a better replacement?
 1404          * Ho-hum... In principle, we might treat that as umount + switch
 1405          * to rootfs. GC would eventually take care of the old vfsmount.
 1406          * Actually it makes sense, especially if rootfs would contain a
 1407          * /reboot - static binary that would close all descriptors and
 1408          * call reboot(2). Then init(8) could umount root and exec /reboot.
 1409          */
1410         if (&mnt->mnt == current->fs->root.mnt && !(flags & MNT_DETACH)) {
1411                 /*
1412                  * Special case for "unmounting" root ...
1413                  * we just try to remount it readonly.
1414                  */
1415                 if (!capable(CAP_SYS_ADMIN))
1416                         return -EPERM;
1417                 down_write(&sb->s_umount);
1418                 if (!(sb->s_flags & MS_RDONLY))
1419                         retval = do_remount_sb(sb, MS_RDONLY, NULL, 0);
1420                 up_write(&sb->s_umount);
1421                 return retval;
1422         }
1423 
1424         namespace_lock();
1425         lock_mount_hash();
1426 
1427         /* Recheck MNT_LOCKED with the locks held */
1428         retval = -EINVAL;
1429         if (mnt->mnt.mnt_flags & MNT_LOCKED)
1430                 goto out;
1431 
1432         event++;
1433         if (flags & MNT_DETACH) {
1434                 if (!list_empty(&mnt->mnt_list))
1435                         umount_tree(mnt, UMOUNT_PROPAGATE);
1436                 retval = 0;
1437         } else {
1438                 shrink_submounts(mnt);
1439                 retval = -EBUSY;
1440                 if (!propagate_mount_busy(mnt, 2)) {
1441                         if (!list_empty(&mnt->mnt_list))
1442                                 umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
1443                         retval = 0;
1444                 }
1445         }
1446 out:
1447         unlock_mount_hash();
1448         namespace_unlock();
1449         return retval;
1450 }
1451 
1452 /* 
1453  * Is the caller allowed to modify his namespace?
1454  */
1455 static inline bool may_mount(void)
1456 {
1457         return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN);
1458 }
1459 
1460 /*
1461  * Now umount can handle mount points as well as block devices.
1462  * This is important for filesystems which use unnamed block devices.
1463  *
1464  * We now support a flag for forced unmount like the other 'big iron'
1465  * unixes. Our API is identical to OSF/1 to avoid making a mess of AMD
1466  */
1467 
1468 SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
1469 {
1470         struct path path;
1471         struct mount *mnt;
1472         int retval;
1473         int lookup_flags = 0;
1474 
1475         if (flags & ~(MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW))
1476                 return -EINVAL;
1477 
1478         if (!may_mount())
1479                 return -EPERM;
1480 
1481         if (!(flags & UMOUNT_NOFOLLOW))
1482                 lookup_flags |= LOOKUP_FOLLOW;
1483 
1484         retval = user_path_mountpoint_at(AT_FDCWD, name, lookup_flags, &path);
1485         if (retval)
1486                 goto out;
1487         mnt = real_mount(path.mnt);
1488         retval = -EINVAL;
1489         if (path.dentry != path.mnt->mnt_root)
1490                 goto dput_and_out;
1491         if (!check_mnt(mnt))
1492                 goto dput_and_out;
1493         if (mnt->mnt.mnt_flags & MNT_LOCKED) /* Check optimistically */
1494                 goto dput_and_out;
1495         retval = -EPERM;
1496         if (flags & MNT_FORCE && !capable(CAP_SYS_ADMIN))
1497                 goto dput_and_out;
1498 
1499         retval = do_umount(mnt, flags);
1500 dput_and_out:
1501         /* we mustn't call path_put() as that would clear mnt_expiry_mark */
1502         dput(path.dentry);
1503         mntput_no_expire(mnt);
1504 out:
1505         return retval;
1506 }
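/*
 * Usage sketch (illustrative, not from the original file): from userspace
 * this syscall is reached via umount(2)/umount2(2); a lazy detach maps to
 * the MNT_DETACH branch of do_umount() above:
 *
 *	#include <stdio.h>
 *	#include <sys/mount.h>
 *
 *	if (umount2("/mnt/usb", MNT_DETACH) == -1)
 *		perror("umount2");
 */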
1507 
1508 #ifdef __ARCH_WANT_SYS_OLDUMOUNT
1509 
1510 /*
1511  *      The 2.0 compatible umount. No flags.
1512  */
1513 SYSCALL_DEFINE1(oldumount, char __user *, name)
1514 {
1515         return sys_umount(name, 0);
1516 }
1517 
1518 #endif
1519 
1520 static bool is_mnt_ns_file(struct dentry *dentry)
1521 {
1522         /* Is this a proxy for a mount namespace? */
1523         struct inode *inode = dentry->d_inode;
1524         struct proc_ns *ei;
1525 
1526         if (!proc_ns_inode(inode))
1527                 return false;
1528 
1529         ei = get_proc_ns(inode);
1530         if (ei->ns_ops != &mntns_operations)
1531                 return false;
1532 
1533         return true;
1534 }
1535 
1536 static bool mnt_ns_loop(struct dentry *dentry)
1537 {
1538         /* Could bind mounting the mount namespace inode cause a
1539          * mount namespace loop?
1540          */
1541         struct mnt_namespace *mnt_ns;
1542         if (!is_mnt_ns_file(dentry))
1543                 return false;
1544 
1545         mnt_ns = get_proc_ns(dentry->d_inode)->ns;
1546         return current->nsproxy->mnt_ns->seq >= mnt_ns->seq;
1547 }
1548 
1549 struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
1550                                         int flag)
1551 {
1552         struct mount *res, *p, *q, *r, *parent;
1553 
1554         if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(mnt))
1555                 return ERR_PTR(-EINVAL);
1556 
1557         if (!(flag & CL_COPY_MNT_NS_FILE) && is_mnt_ns_file(dentry))
1558                 return ERR_PTR(-EINVAL);
1559 
1560         res = q = clone_mnt(mnt, dentry, flag);
1561         if (IS_ERR(q))
1562                 return q;
1563 
1564         q->mnt.mnt_flags &= ~MNT_LOCKED;
1565         q->mnt_mountpoint = mnt->mnt_mountpoint;
1566 
1567         p = mnt;
1568         list_for_each_entry(r, &mnt->mnt_mounts, mnt_child) {
1569                 struct mount *s;
1570                 if (!is_subdir(r->mnt_mountpoint, dentry))
1571                         continue;
1572 
1573                 for (s = r; s; s = next_mnt(s, r)) {
1574                         struct mount *t = NULL;
1575                         if (!(flag & CL_COPY_UNBINDABLE) &&
1576                             IS_MNT_UNBINDABLE(s)) {
1577                                 if (s->mnt.mnt_flags & MNT_LOCKED) {
1578                                         /* Both unbindable and locked. */
1579                                         q = ERR_PTR(-EPERM);
1580                                         goto out;
1581                                 } else {
1582                                         s = skip_mnt_tree(s);
1583                                         continue;
1584                                 }
1585                         }
1586                         if (!(flag & CL_COPY_MNT_NS_FILE) &&
1587                             is_mnt_ns_file(s->mnt.mnt_root)) {
1588                                 s = skip_mnt_tree(s);
1589                                 continue;
1590                         }
1591                         while (p != s->mnt_parent) {
1592                                 p = p->mnt_parent;
1593                                 q = q->mnt_parent;
1594                         }
1595                         p = s;
1596                         parent = q;
1597                         q = clone_mnt(p, p->mnt.mnt_root, flag);
1598                         if (IS_ERR(q))
1599                                 goto out;
1600                         lock_mount_hash();
1601                         list_add_tail(&q->mnt_list, &res->mnt_list);
1602                         mnt_set_mountpoint(parent, p->mnt_mp, q);
1603                         if (!list_empty(&parent->mnt_mounts)) {
1604                                 t = list_last_entry(&parent->mnt_mounts,
1605                                         struct mount, mnt_child);
1606                                 if (t->mnt_mp != p->mnt_mp)
1607                                         t = NULL;
1608                         }
1609                         attach_shadowed(q, parent, t);
1610                         unlock_mount_hash();
1611                 }
1612         }
1613         return res;
1614 out:
1615         if (res) {
1616                 lock_mount_hash();
1617                 umount_tree(res, UMOUNT_SYNC);
1618                 unlock_mount_hash();
1619         }
1620         return q;
1621 }
1622 
1623 /* Caller should check returned pointer for errors */
1624 
1625 struct vfsmount *collect_mounts(struct path *path)
1626 {
1627         struct mount *tree;
1628         namespace_lock();
1629         if (!check_mnt(real_mount(path->mnt)))
1630                 tree = ERR_PTR(-EINVAL);
1631         else
1632                 tree = copy_tree(real_mount(path->mnt), path->dentry,
1633                                  CL_COPY_ALL | CL_PRIVATE);
1634         namespace_unlock();
1635         if (IS_ERR(tree))
1636                 return ERR_CAST(tree);
1637         return &tree->mnt;
1638 }
1639 
1640 void drop_collected_mounts(struct vfsmount *mnt)
1641 {
1642         namespace_lock();
1643         lock_mount_hash();
1644         umount_tree(real_mount(mnt), 0);
1645         unlock_mount_hash();
1646         namespace_unlock();
1647 }
1648 
1649 int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg,
1650                    struct vfsmount *root)
1651 {
1652         struct mount *mnt;
1653         int res = f(root, arg);
1654         if (res)
1655                 return res;
1656         list_for_each_entry(mnt, &real_mount(root)->mnt_list, mnt_list) {
1657                 res = f(&mnt->mnt, arg);
1658                 if (res)
1659                         return res;
1660         }
1661         return 0;
1662 }
1663 
1664 static void cleanup_group_ids(struct mount *mnt, struct mount *end)
1665 {
1666         struct mount *p;
1667 
1668         for (p = mnt; p != end; p = next_mnt(p, mnt)) {
1669                 if (p->mnt_group_id && !IS_MNT_SHARED(p))
1670                         mnt_release_group_id(p);
1671         }
1672 }
1673 
1674 static int invent_group_ids(struct mount *mnt, bool recurse)
1675 {
1676         struct mount *p;
1677 
1678         for (p = mnt; p; p = recurse ? next_mnt(p, mnt) : NULL) {
1679                 if (!p->mnt_group_id && !IS_MNT_SHARED(p)) {
1680                         int err = mnt_alloc_group_id(p);
1681                         if (err) {
1682                                 cleanup_group_ids(mnt, p);
1683                                 return err;
1684                         }
1685                 }
1686         }
1687 
1688         return 0;
1689 }
1690 
1691 int count_mounts(struct mnt_namespace *ns, struct mount *mnt)
1692 {
1693         unsigned int max = ACCESS_ONCE(sysctl_mount_max);
1694         unsigned int mounts = 0, old, pending, sum;
1695         struct mount *p;
1696 
1697         for (p = mnt; p; p = next_mnt(p, mnt))
1698                 mounts++;
1699 
1700         old = ns->mounts;
1701         pending = ns->pending_mounts;
1702         sum = old + pending;
1703         if ((old > sum) ||
1704             (pending > sum) ||
1705             (max < sum) ||
1706             (mounts > (max - sum)))
1707                 return -ENOSPC;
1708 
1709         ns->pending_mounts = pending + mounts;
1710         return 0;
1711 }
1712 
1713 /*
1714  *  @source_mnt : mount tree to be attached
1715  *  @nd         : place where the mount tree @source_mnt is attached
1716  *  @parent_nd  : if non-null, detach the source_mnt from its parent and
1717  *                 store the parent mount and mountpoint dentry.
1718  *                 (done when source_mnt is moved)
1719  *
1720  *  NOTE: the table below explains the semantics when a source mount
1721  *  of a given type is attached to a destination mount of a given type.
1722  * ---------------------------------------------------------------------------
1723  * |         BIND MOUNT OPERATION                                            |
1724  * |**************************************************************************
1725  * | source-->| shared        |       private  |       slave    | unbindable |
1726  * | dest     |               |                |                |            |
1727  * |   |      |               |                |                |            |
1728  * |   v      |               |                |                |            |
1729  * |**************************************************************************
1730  * |  shared  | shared (++)   |     shared (+) |     shared(+++)|  invalid   |
1731  * |          |               |                |                |            |
1732  * |non-shared| shared (+)    |      private   |      slave (*) |  invalid   |
1733  * ***************************************************************************
1734  * A bind operation clones the source mount and mounts the clone on the
1735  * destination mount.
1736  *
1737  * (++)  the cloned mount is propagated to all the mounts in the propagation
1738  *       tree of the destination mount and the cloned mount is added to
1739  *       the peer group of the source mount.
1740  * (+)   the cloned mount is created under the destination mount and is marked
1741  *       as shared. The cloned mount is added to the peer group of the source
1742  *       mount.
1743  * (+++) the mount is propagated to all the mounts in the propagation tree
1744  *       of the destination mount and the cloned mount is made slave
1745  *       of the same master as that of the source mount. The cloned mount
1746  *       is marked as 'shared and slave'.
1747  * (*)   the cloned mount is made a slave of the same master as that of the
1748  *       source mount.
1749  *
1750  * ---------------------------------------------------------------------------
1751  * |                    MOVE MOUNT OPERATION                                 |
1752  * |**************************************************************************
1753  * | source-->| shared        |       private  |       slave    | unbindable |
1754  * | dest     |               |                |                |            |
1755  * |   |      |               |                |                |            |
1756  * |   v      |               |                |                |            |
1757  * |**************************************************************************
1758  * |  shared  | shared (+)    |     shared (+) |    shared(+++) |  invalid   |
1759  * |          |               |                |                |            |
1760  * |non-shared| shared (+*)   |      private   |    slave (*)   | unbindable |
1761  * ***************************************************************************
1762  *
1763  * (+)  the mount is moved to the destination. And is then propagated to
1764  *      all the mounts in the propagation tree of the destination mount.
1765  * (+*)  the mount is moved to the destination.
1766  * (+++)  the mount is moved to the destination and is then propagated to
1767  *      all the mounts belonging to the destination mount's propagation tree.
1768  *      the mount is marked as 'shared and slave'.
1769  * (*)  the mount continues to be a slave at the new location.
1770  *
1771  * If the source mount is a tree, the operations explained above are
1772  * applied to each mount in the tree.
1773  * Must be called without spinlocks held, since this function can sleep
1774  * in allocations.
1775  */
1776 static int attach_recursive_mnt(struct mount *source_mnt,
1777                         struct mount *dest_mnt,
1778                         struct mountpoint *dest_mp,
1779                         struct path *parent_path)
1780 {
1781         HLIST_HEAD(tree_list);
1782         struct mnt_namespace *ns = dest_mnt->mnt_ns;
1783         struct mount *child, *p;
1784         struct hlist_node *n;
1785         int err;
1786 
1787         /* Is there space to add these mounts to the mount namespace? */
1788         if (!parent_path) {
1789                 err = count_mounts(ns, source_mnt);
1790                 if (err)
1791                         goto out;
1792         }
1793 
1794         if (IS_MNT_SHARED(dest_mnt)) {
1795                 err = invent_group_ids(source_mnt, true);
1796                 if (err)
1797                         goto out;
1798                 err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list);
1799                 lock_mount_hash();
1800                 if (err)
1801                         goto out_cleanup_ids;
1802                 for (p = source_mnt; p; p = next_mnt(p, source_mnt))
1803                         set_mnt_shared(p);
1804         } else {
1805                 lock_mount_hash();
1806         }
1807         if (parent_path) {
1808                 detach_mnt(source_mnt, parent_path);
1809                 attach_mnt(source_mnt, dest_mnt, dest_mp);
1810                 touch_mnt_namespace(source_mnt->mnt_ns);
1811         } else {
1812                 mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
1813                 commit_tree(source_mnt, NULL);
1814         }
1815 
1816         hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) {
1817                 struct mount *q;
1818                 hlist_del_init(&child->mnt_hash);
1819                 q = __lookup_mnt_last(&child->mnt_parent->mnt,
1820                                       child->mnt_mountpoint);
1821                 commit_tree(child, q);
1822         }
1823         unlock_mount_hash();
1824 
1825         return 0;
1826 
1827  out_cleanup_ids:
1828         while (!hlist_empty(&tree_list)) {
1829                 child = hlist_entry(tree_list.first, struct mount, mnt_hash);
1830                 child->mnt_parent->mnt_ns->pending_mounts = 0;
1831                 umount_tree(child, UMOUNT_SYNC);
1832         }
1833         unlock_mount_hash();
1834         cleanup_group_ids(source_mnt, NULL);
1835  out:
1836         ns->pending_mounts = 0;
1837         return err;
1838 }
1839 
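As a rough userspace illustration of the bind-mount table above (the directory names are assumptions, not part of the kernel code): a shared mount gains peers through binds, and later mounts under it propagate to those peers.

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
        /* Assumed to exist: /mnt/a (already a mount point), /mnt/b, /srv/data,
         * and a directory /mnt/a/sub inside the /mnt/a mount. */

        /* Shared destination: make the /mnt/a mount shared. */
        if (mount("none", "/mnt/a", NULL, MS_SHARED, NULL))
                perror("make-shared");
        /* Bind a shared source onto a non-shared destination: the clone joins
         * /mnt/a's peer group (the "shared (+)" cell). */
        if (mount("/mnt/a", "/mnt/b", NULL, MS_BIND, NULL))
                perror("bind");
        /* A later mount under /mnt/a now propagates to /mnt/b/sub as well. */
        if (mount("/srv/data", "/mnt/a/sub", NULL, MS_BIND, NULL))
                perror("bind under /mnt/a");
        return 0;
}
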
1840 static struct mountpoint *lock_mount(struct path *path)
1841 {
1842         struct vfsmount *mnt;
1843         struct dentry *dentry = path->dentry;
1844 retry:
1845         mutex_lock(&dentry->d_inode->i_mutex);
1846         if (unlikely(cant_mount(dentry))) {
1847                 mutex_unlock(&dentry->d_inode->i_mutex);
1848                 return ERR_PTR(-ENOENT);
1849         }
1850         namespace_lock();
1851         mnt = lookup_mnt(path);
1852         if (likely(!mnt)) {
1853                 struct mountpoint *mp = new_mountpoint(dentry);
1854                 if (IS_ERR(mp)) {
1855                         namespace_unlock();
1856                         mutex_unlock(&dentry->d_inode->i_mutex);
1857                         return mp;
1858                 }
1859                 return mp;
1860         }
1861         namespace_unlock();
1862         mutex_unlock(&path->dentry->d_inode->i_mutex);
1863         path_put(path);
1864         path->mnt = mnt;
1865         dentry = path->dentry = dget(mnt->mnt_root);
1866         goto retry;
1867 }
1868 
1869 static void unlock_mount(struct mountpoint *where)
1870 {
1871         struct dentry *dentry = where->m_dentry;
1872         put_mountpoint(where);
1873         namespace_unlock();
1874         mutex_unlock(&dentry->d_inode->i_mutex);
1875 }
1876 
1877 static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp)
1878 {
1879         if (mnt->mnt.mnt_sb->s_flags & MS_NOUSER)
1880                 return -EINVAL;
1881 
1882         if (S_ISDIR(mp->m_dentry->d_inode->i_mode) !=
1883               S_ISDIR(mnt->mnt.mnt_root->d_inode->i_mode))
1884                 return -ENOTDIR;
1885 
1886         return attach_recursive_mnt(mnt, p, mp, NULL);
1887 }
1888 
1889 /*
1890  * Sanity check the flags to change_mnt_propagation.
1891  */
1892 
1893 static int flags_to_propagation_type(int flags)
1894 {
1895         int type = flags & ~(MS_REC | MS_SILENT);
1896 
1897         /* Fail if any non-propagation flags are set */
1898         if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
1899                 return 0;
1900         /* Only one propagation flag should be set */
1901         if (!is_power_of_2(type))
1902                 return 0;
1903         return type;
1904 }
1905 
1906 /*
1907  * recursively change the type of the mountpoint.
1908  */
1909 static int do_change_type(struct path *path, int flag)
1910 {
1911         struct mount *m;
1912         struct mount *mnt = real_mount(path->mnt);
1913         int recurse = flag & MS_REC;
1914         int type;
1915         int err = 0;
1916 
1917         if (path->dentry != path->mnt->mnt_root)
1918                 return -EINVAL;
1919 
1920         type = flags_to_propagation_type(flag);
1921         if (!type)
1922                 return -EINVAL;
1923 
1924         namespace_lock();
1925         if (type == MS_SHARED) {
1926                 err = invent_group_ids(mnt, recurse);
1927                 if (err)
1928                         goto out_unlock;
1929         }
1930 
1931         lock_mount_hash();
1932         for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL))
1933                 change_mnt_propagation(m, type);
1934         unlock_mount_hash();
1935 
1936  out_unlock:
1937         namespace_unlock();
1938         return err;
1939 }
1940 
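A hedged sketch of the userspace side of do_change_type(): exactly one propagation flag may be set per call (which is what flags_to_propagation_type() enforces), optionally combined with MS_REC; the mount points used here are assumptions.

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
        if (mount("none", "/", NULL, MS_REC | MS_PRIVATE, NULL))  /* whole tree */
                perror("make-rprivate");
        if (mount("none", "/mnt", NULL, MS_UNBINDABLE, NULL))     /* single mount */
                perror("make-unbindable");
        return 0;
}
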
1941 static bool has_locked_children(struct mount *mnt, struct dentry *dentry)
1942 {
1943         struct mount *child;
1944         list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
1945                 if (!is_subdir(child->mnt_mountpoint, dentry))
1946                         continue;
1947 
1948                 if (child->mnt.mnt_flags & MNT_LOCKED)
1949                         return true;
1950         }
1951         return false;
1952 }
1953 
1954 /*
1955  * do loopback mount.
1956  */
1957 static int do_loopback(struct path *path, const char *old_name,
1958                                 int recurse)
1959 {
1960         struct path old_path;
1961         struct mount *mnt = NULL, *old, *parent;
1962         struct mountpoint *mp;
1963         int err;
1964         if (!old_name || !*old_name)
1965                 return -EINVAL;
1966         err = kern_path(old_name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &old_path);
1967         if (err)
1968                 return err;
1969 
1970         err = -EINVAL;
1971         if (mnt_ns_loop(old_path.dentry))
1972                 goto out; 
1973 
1974         mp = lock_mount(path);
1975         err = PTR_ERR(mp);
1976         if (IS_ERR(mp))
1977                 goto out;
1978 
1979         old = real_mount(old_path.mnt);
1980         parent = real_mount(path->mnt);
1981 
1982         err = -EINVAL;
1983         if (IS_MNT_UNBINDABLE(old))
1984                 goto out2;
1985 
1986         if (!check_mnt(parent) || !check_mnt(old))
1987                 goto out2;
1988 
1989         if (!recurse && has_locked_children(old, old_path.dentry))
1990                 goto out2;
1991 
1992         if (recurse)
1993                 mnt = copy_tree(old, old_path.dentry, CL_COPY_MNT_NS_FILE);
1994         else
1995                 mnt = clone_mnt(old, old_path.dentry, 0);
1996 
1997         if (IS_ERR(mnt)) {
1998                 err = PTR_ERR(mnt);
1999                 goto out2;
2000         }
2001 
2002         mnt->mnt.mnt_flags &= ~MNT_LOCKED;
2003 
2004         err = graft_tree(mnt, parent, mp);
2005         if (err) {
2006                 lock_mount_hash();
2007                 umount_tree(mnt, UMOUNT_SYNC);
2008                 unlock_mount_hash();
2009         }
2010 out2:
2011         unlock_mount(mp);
2012 out:
2013         path_put(&old_path);
2014         return err;
2015 }
2016 
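The recurse parameter above corresponds to MS_REC in userspace; a minimal sketch (paths are assumptions): without MS_REC only the top mount is cloned via clone_mnt(), with MS_REC the whole subtree is copied via copy_tree().

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
        /* Assumed: /srv/tree contains further mounts; /mnt/a and /mnt/b exist. */
        if (mount("/srv/tree", "/mnt/a", NULL, MS_BIND, NULL))          /* top only */
                perror("bind");
        if (mount("/srv/tree", "/mnt/b", NULL, MS_BIND | MS_REC, NULL)) /* whole tree */
                perror("rbind");
        return 0;
}
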
2017 static int change_mount_flags(struct vfsmount *mnt, int ms_flags)
2018 {
2019         int error = 0;
2020         int readonly_request = 0;
2021 
2022         if (ms_flags & MS_RDONLY)
2023                 readonly_request = 1;
2024         if (readonly_request == __mnt_is_readonly(mnt))
2025                 return 0;
2026 
2027         if (readonly_request)
2028                 error = mnt_make_readonly(real_mount(mnt));
2029         else
2030                 __mnt_unmake_readonly(real_mount(mnt));
2031         return error;
2032 }
2033 
2034 /*
2035  * change filesystem flags. dir should be a physical root of filesystem.
2036  * If you've mounted a non-root directory somewhere and want to do remount
2037  * on it - tough luck.
2038  */
2039 static int do_remount(struct path *path, int flags, int mnt_flags,
2040                       void *data)
2041 {
2042         int err;
2043         struct super_block *sb = path->mnt->mnt_sb;
2044         struct mount *mnt = real_mount(path->mnt);
2045 
2046         if (!check_mnt(mnt))
2047                 return -EINVAL;
2048 
2049         if (path->dentry != path->mnt->mnt_root)
2050                 return -EINVAL;
2051 
2052         /* Don't allow changing of locked mnt flags.
2053          *
2054          * No locks need to be held here while testing the various
2055          * MNT_LOCK flags because those flags can never be cleared
2056          * once they are set.
2057          */
2058         if ((mnt->mnt.mnt_flags & MNT_LOCK_READONLY) &&
2059             !(mnt_flags & MNT_READONLY)) {
2060                 return -EPERM;
2061         }
2062         if ((mnt->mnt.mnt_flags & MNT_LOCK_NODEV) &&
2063             !(mnt_flags & MNT_NODEV)) {
2064                 /* Was the nodev implicitly added in mount? */
2065                 if ((mnt->mnt_ns->user_ns != &init_user_ns) &&
2066                     !(sb->s_type->fs_flags & FS_USERNS_DEV_MOUNT)) {
2067                         mnt_flags |= MNT_NODEV;
2068                 } else {
2069                         return -EPERM;
2070                 }
2071         }
2072         if ((mnt->mnt.mnt_flags & MNT_LOCK_NOSUID) &&
2073             !(mnt_flags & MNT_NOSUID)) {
2074                 return -EPERM;
2075         }
2076         if ((mnt->mnt.mnt_flags & MNT_LOCK_NOEXEC) &&
2077             !(mnt_flags & MNT_NOEXEC)) {
2078                 return -EPERM;
2079         }
2080         if ((mnt->mnt.mnt_flags & MNT_LOCK_ATIME) &&
2081             ((mnt->mnt.mnt_flags & MNT_ATIME_MASK) != (mnt_flags & MNT_ATIME_MASK))) {
2082                 return -EPERM;
2083         }
2084 
2085         err = security_sb_remount(sb, data);
2086         if (err)
2087                 return err;
2088 
2089         down_write(&sb->s_umount);
2090         if (flags & MS_BIND)
2091                 err = change_mount_flags(path->mnt, flags);
2092         else if (!capable(CAP_SYS_ADMIN))
2093                 err = -EPERM;
2094         else
2095                 err = do_remount_sb(sb, flags, data, 0);
2096         if (!err) {
2097                 lock_mount_hash();
2098                 mnt_flags |= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK;
2099                 mnt->mnt.mnt_flags = mnt_flags;
2100                 touch_mnt_namespace(mnt->mnt_ns);
2101                 unlock_mount_hash();
2102         }
2103         up_write(&sb->s_umount);
2104         return err;
2105 }
2106 
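For reference, the two remount flavours dispatched above, sketched from userspace (the mount point is an assumption): a plain MS_REMOUNT goes through do_remount_sb() and affects the superblock, while MS_REMOUNT|MS_BIND only updates this mountpoint's flags via change_mount_flags().

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
        if (mount("none", "/mnt", NULL, MS_REMOUNT | MS_RDONLY, NULL))
                perror("remount ro (superblock)");
        if (mount("none", "/mnt", NULL, MS_REMOUNT | MS_BIND | MS_RDONLY, NULL))
                perror("remount ro (this mount only)");
        return 0;
}
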
2107 static inline int tree_contains_unbindable(struct mount *mnt)
2108 {
2109         struct mount *p;
2110         for (p = mnt; p; p = next_mnt(p, mnt)) {
2111                 if (IS_MNT_UNBINDABLE(p))
2112                         return 1;
2113         }
2114         return 0;
2115 }
2116 
2117 static int do_move_mount(struct path *path, const char *old_name)
2118 {
2119         struct path old_path, parent_path;
2120         struct mount *p;
2121         struct mount *old;
2122         struct mountpoint *mp;
2123         int err;
2124         if (!old_name || !*old_name)
2125                 return -EINVAL;
2126         err = kern_path(old_name, LOOKUP_FOLLOW, &old_path);
2127         if (err)
2128                 return err;
2129 
2130         mp = lock_mount(path);
2131         err = PTR_ERR(mp);
2132         if (IS_ERR(mp))
2133                 goto out;
2134 
2135         old = real_mount(old_path.mnt);
2136         p = real_mount(path->mnt);
2137 
2138         err = -EINVAL;
2139         if (!check_mnt(p) || !check_mnt(old))
2140                 goto out1;
2141 
2142         if (old->mnt.mnt_flags & MNT_LOCKED)
2143                 goto out1;
2144 
2145         err = -EINVAL;
2146         if (old_path.dentry != old_path.mnt->mnt_root)
2147                 goto out1;
2148 
2149         if (!mnt_has_parent(old))
2150                 goto out1;
2151 
2152         if (S_ISDIR(path->dentry->d_inode->i_mode) !=
2153               S_ISDIR(old_path.dentry->d_inode->i_mode))
2154                 goto out1;
2155         /*
2156          * Don't move a mount residing in a shared parent.
2157          */
2158         if (IS_MNT_SHARED(old->mnt_parent))
2159                 goto out1;
2160         /*
2161          * Don't move a mount tree containing unbindable mounts to a destination
2162          * mount which is shared.
2163          */
2164         if (IS_MNT_SHARED(p) && tree_contains_unbindable(old))
2165                 goto out1;
2166         err = -ELOOP;
2167         for (; mnt_has_parent(p); p = p->mnt_parent)
2168                 if (p == old)
2169                         goto out1;
2170 
2171         err = attach_recursive_mnt(old, real_mount(path->mnt), mp, &parent_path);
2172         if (err)
2173                 goto out1;
2174 
2175         /* if the mount is moved, it should no longer expire
2176          * automatically */
2177         list_del_init(&old->mnt_expire);
2178 out1:
2179         unlock_mount(mp);
2180 out:
2181         if (!err)
2182                 path_put(&parent_path);
2183         path_put(&old_path);
2184         return err;
2185 }
2186 
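A minimal userspace sketch of the MS_MOVE path handled above (both paths are assumptions; the source must be an existing mount whose parent is not shared):

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
        /* Assumed: something is already mounted on /mnt/old; /mnt/new exists. */
        if (mount("/mnt/old", "/mnt/new", NULL, MS_MOVE, NULL))
                perror("MS_MOVE");
        return 0;
}
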
2187 static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype)
2188 {
2189         int err;
2190         const char *subtype = strchr(fstype, '.');
2191         if (subtype) {
2192                 subtype++;
2193                 err = -EINVAL;
2194                 if (!subtype[0])
2195                         goto err;
2196         } else
2197                 subtype = "";
2198 
2199         mnt->mnt_sb->s_subtype = kstrdup(subtype, GFP_KERNEL);
2200         err = -ENOMEM;
2201         if (!mnt->mnt_sb->s_subtype)
2202                 goto err;
2203         return mnt;
2204 
2205  err:
2206         mntput(mnt);
2207         return ERR_PTR(err);
2208 }
2209 
2210 /*
2211  * add a mount into a namespace's mount tree
2212  */
2213 static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags)
2214 {
2215         struct mountpoint *mp;
2216         struct mount *parent;
2217         int err;
2218 
2219         mnt_flags &= ~MNT_INTERNAL_FLAGS;
2220 
2221         mp = lock_mount(path);
2222         if (IS_ERR(mp))
2223                 return PTR_ERR(mp);
2224 
2225         parent = real_mount(path->mnt);
2226         err = -EINVAL;
2227         if (unlikely(!check_mnt(parent))) {
2228                 /* that's acceptable only for automounts done in private ns */
2229                 if (!(mnt_flags & MNT_SHRINKABLE))
2230                         goto unlock;
2231                 /* ... and for those we'd better have mountpoint still alive */
2232                 if (!parent->mnt_ns)
2233                         goto unlock;
2234         }
2235 
2236         /* Refuse the same filesystem on the same mount point */
2237         err = -EBUSY;
2238         if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb &&
2239             path->mnt->mnt_root == path->dentry)
2240                 goto unlock;
2241 
2242         err = -EINVAL;
2243         if (S_ISLNK(newmnt->mnt.mnt_root->d_inode->i_mode))
2244                 goto unlock;
2245 
2246         newmnt->mnt.mnt_flags = mnt_flags;
2247         err = graft_tree(newmnt, parent, mp);
2248 
2249 unlock:
2250         unlock_mount(mp);
2251         return err;
2252 }
2253 
2254 static bool fs_fully_visible(struct file_system_type *fs_type, int *new_mnt_flags);
2255 
2256 /*
2257  * create a new mount for userspace and request it to be added into the
2258  * namespace's tree
2259  */
2260 static int do_new_mount(struct path *path, const char *fstype, int flags,
2261                         int mnt_flags, const char *name, void *data)
2262 {
2263         struct file_system_type *type;
2264         struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
2265         struct vfsmount *mnt;
2266         int err;
2267 
2268         if (!fstype)
2269                 return -EINVAL;
2270 
2271         type = get_fs_type(fstype);
2272         if (!type)
2273                 return -ENODEV;
2274 
2275         if (user_ns != &init_user_ns) {
2276                 if (!(type->fs_flags & FS_USERNS_MOUNT)) {
2277                         put_filesystem(type);
2278                         return -EPERM;
2279                 }
2280                 /* Only in special cases allow devices from mounts
2281                  * created outside the initial user namespace.
2282                  */
2283                 if (!(type->fs_flags & FS_USERNS_DEV_MOUNT)) {
2284                         flags |= MS_NODEV;
2285                         mnt_flags |= MNT_NODEV | MNT_LOCK_NODEV;
2286                 }
2287                 if (type->fs_flags & FS_USERNS_VISIBLE) {
2288                         if (!fs_fully_visible(type, &mnt_flags))
2289                                 return -EPERM;
2290                 }
2291         }
2292 
2293         mnt = vfs_kern_mount(type, flags, name, data);
2294         if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) &&
2295             !mnt->mnt_sb->s_subtype)
2296                 mnt = fs_set_subtype(mnt, fstype);
2297 
2298         put_filesystem(type);
2299         if (IS_ERR(mnt))
2300                 return PTR_ERR(mnt);
2301 
2302         err = do_add_mount(real_mount(mnt), path, mnt_flags);
2303         if (err)
2304                 mntput(mnt);
2305         return err;
2306 }
2307 
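And the fresh-mount case, sketched from userspace (mount point and options are assumptions): the fstype string selects the file_system_type, and the data string is passed through to the filesystem by vfs_kern_mount().

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
        if (mount("tmpfs", "/mnt", "tmpfs", MS_NOSUID | MS_NODEV, "size=16m,mode=0755"))
                perror("mount tmpfs");
        return 0;
}
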
2308 int finish_automount(struct vfsmount *m, struct path *path)
2309 {
2310         struct mount *mnt = real_mount(m);
2311         int err;
2312         /* The new mount record should have at least 2 refs to prevent it being
2313          * expired before we get a chance to add it
2314          */
2315         BUG_ON(mnt_get_count(mnt) < 2);
2316 
2317         if (m->mnt_sb == path->mnt->mnt_sb &&
2318             m->mnt_root == path->dentry) {
2319                 err = -ELOOP;
2320                 goto fail;
2321         }
2322 
2323         err = do_add_mount(mnt, path, path->mnt->mnt_flags | MNT_SHRINKABLE);
2324         if (!err)
2325                 return 0;
2326 fail:
2327         /* remove m from any expiration list it may be on */
2328         if (!list_empty(&mnt->mnt_expire)) {
2329                 namespace_lock();
2330                 list_del_init(&mnt->mnt_expire);
2331                 namespace_unlock();
2332         }
2333         mntput(m);
2334         mntput(m);
2335         return err;
2336 }
2337 
2338 /**
2339  * mnt_set_expiry - Put a mount on an expiration list
2340  * @mnt: The mount to list.
2341  * @expiry_list: The list to add the mount to.
2342  */
2343 void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list)
2344 {
2345         namespace_lock();
2346 
2347         list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list);
2348 
2349         namespace_unlock();
2350 }
2351 EXPORT_SYMBOL(mnt_set_expiry);
2352 
2353 /*
2354  * process a list of expirable mountpoints with the intent of discarding any
2355  * mountpoints that aren't in use and haven't been touched since last we came
2356  * here
2357  */
2358 void mark_mounts_for_expiry(struct list_head *mounts)
2359 {
2360         struct mount *mnt, *next;
2361         LIST_HEAD(graveyard);
2362 
2363         if (list_empty(mounts))
2364                 return;
2365 
2366         namespace_lock();
2367         lock_mount_hash();
2368 
2369         /* extract from the expiration list every vfsmount that matches the
2370          * following criteria:
2371          * - only referenced by its parent vfsmount
2372          * - still marked for expiry (marked on the last call here; marks are
2373          *   cleared by mntput())
2374          */
2375         list_for_each_entry_safe(mnt, next, mounts, mnt_expire) {
2376                 if (!xchg(&mnt->mnt_expiry_mark, 1) ||
2377                         propagate_mount_busy(mnt, 1))
2378                         continue;
2379                 list_move(&mnt->mnt_expire, &graveyard);
2380         }
2381         while (!list_empty(&graveyard)) {
2382                 mnt = list_first_entry(&graveyard, struct mount, mnt_expire);
2383                 touch_mnt_namespace(mnt->mnt_ns);
2384                 umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
2385         }
2386         unlock_mount_hash();
2387         namespace_unlock();
2388 }
2389 
2390 EXPORT_SYMBOL_GPL(mark_mounts_for_expiry);
2391 
2392 /*
2393  * Ripoff of 'select_parent()'
2394  *
2395  * search the list of submounts for a given mountpoint, and move any
2396  * shrinkable submounts to the 'graveyard' list.
2397  */
2398 static int select_submounts(struct mount *parent, struct list_head *graveyard)
2399 {
2400         struct mount *this_parent = parent;
2401         struct list_head *next;
2402         int found = 0;
2403 
2404 repeat:
2405         next = this_parent->mnt_mounts.next;
2406 resume:
2407         while (next != &this_parent->mnt_mounts) {
2408                 struct list_head *tmp = next;
2409                 struct mount *mnt = list_entry(tmp, struct mount, mnt_child);
2410 
2411                 next = tmp->next;
2412                 if (!(mnt->mnt.mnt_flags & MNT_SHRINKABLE))
2413                         continue;
2414                 /*
2415                  * Descend a level if the mnt_mounts list is non-empty.
2416                  */
2417                 if (!list_empty(&mnt->mnt_mounts)) {
2418                         this_parent = mnt;
2419                         goto repeat;
2420                 }
2421 
2422                 if (!propagate_mount_busy(mnt, 1)) {
2423                         list_move_tail(&mnt->mnt_expire, graveyard);
2424                         found++;
2425                 }
2426         }
2427         /*
2428          * All done at this level ... ascend and resume the search
2429          */
2430         if (this_parent != parent) {
2431                 next = this_parent->mnt_child.next;
2432                 this_parent = this_parent->mnt_parent;
2433                 goto resume;
2434         }
2435         return found;
2436 }
2437 
2438 /*
2439  * process a list of expirable mountpoints with the intent of discarding any
2440  * submounts of a specific parent mountpoint
2441  *
2442  * mount_lock must be held for write
2443  */
2444 static void shrink_submounts(struct mount *mnt)
2445 {
2446         LIST_HEAD(graveyard);
2447         struct mount *m;
2448 
2449         /* extract submounts of 'mountpoint' from the expiration list */
2450         while (select_submounts(mnt, &graveyard)) {
2451                 while (!list_empty(&graveyard)) {
2452                         m = list_first_entry(&graveyard, struct mount,
2453                                                 mnt_expire);
2454                         touch_mnt_namespace(m->mnt_ns);
2455                         umount_tree(m, UMOUNT_PROPAGATE|UMOUNT_SYNC);
2456                 }
2457         }
2458 }
2459 
2460 /*
2461  * Some copy_from_user() implementations do not return the exact number of
2462  * bytes remaining to copy on a fault.  But copy_mount_options() requires that.
2463  * Note that this function differs from copy_from_user() in that it will oops
2464  * on bad values of `to', rather than returning a short copy.
2465  */
2466 static long exact_copy_from_user(void *to, const void __user * from,
2467                                  unsigned long n)
2468 {
2469         char *t = to;
2470         const char __user *f = from;
2471         char c;
2472 
2473         if (!access_ok(VERIFY_READ, from, n))
2474                 return n;
2475 
2476         while (n) {
2477                 if (__get_user(c, f)) {
2478                         memset(t, 0, n);
2479                         break;
2480                 }
2481                 *t++ = c;
2482                 f++;
2483                 n--;
2484         }
2485         return n;
2486 }
2487 
2488 int copy_mount_options(const void __user * data, unsigned long *where)
2489 {
2490         int i;
2491         unsigned long page;
2492         unsigned long size;
2493 
2494         *where = 0;
2495         if (!data)
2496                 return 0;
2497 
2498         if (!(page = __get_free_page(GFP_KERNEL)))
2499                 return -ENOMEM;
2500 
2501         /* We only care that *some* data at the address the user
2502          * gave us is valid.  Just in case, we'll zero
2503          * the remainder of the page.
2504          */
2505         /* copy_from_user cannot cross TASK_SIZE ! */
2506         size = TASK_SIZE - (unsigned long)data;
2507         if (size > PAGE_SIZE)
2508                 size = PAGE_SIZE;
2509 
2510         i = size - exact_copy_from_user((void *)page, data, size);
2511         if (!i) {
2512                 free_page(page);
2513                 return -EFAULT;
2514         }
2515         if (i != PAGE_SIZE)
2516                 memset((char *)page + i, 0, PAGE_SIZE - i);
2517         *where = page;
2518         return 0;
2519 }
2520 
2521 char *copy_mount_string(const void __user *data)
2522 {
2523         return data ? strndup_user(data, PAGE_SIZE) : NULL;
2524 }
2525 
2526 /*
2527  * Flags is a 32-bit value that allows up to 31 non-fs dependent flags to
2528  * be given to the mount() call (ie: read-only, no-dev, no-suid etc).
2529  *
2530  * data is a (void *) that can point to any structure up to
2531  * PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent
2532  * information (or be NULL).
2533  *
2534  * Pre-0.97 versions of mount() didn't have a flags word.
2535  * When the flags word was introduced its top half was required
2536  * to have the magic value 0xC0ED, and this remained so until 2.4.0-test9.
2537  * Therefore, if this magic number is present, it carries no information
2538  * and must be discarded.
2539  */
2540 long do_mount(const char *dev_name, const char *dir_name,
2541                 const char *type_page, unsigned long flags, void *data_page)
2542 {
2543         struct path path;
2544         int retval = 0;
2545         int mnt_flags = 0;
2546 
2547         /* Discard magic */
2548         if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
2549                 flags &= ~MS_MGC_MSK;
2550 
2551         /* Basic sanity checks */
2552 
2553         if (!dir_name || !*dir_name || !memchr(dir_name, 0, PAGE_SIZE))
2554                 return -EINVAL;
2555 
2556         if (data_page)
2557                 ((char *)data_page)[PAGE_SIZE - 1] = 0;
2558 
2559         /* ... and get the mountpoint */
2560         retval = kern_path(dir_name, LOOKUP_FOLLOW, &path);
2561         if (retval)
2562                 return retval;
2563 
2564         retval = security_sb_mount(dev_name, &path,
2565                                    type_page, flags, data_page);
2566         if (!retval && !may_mount())
2567                 retval = -EPERM;
2568         if (retval)
2569                 goto dput_out;
2570 
2571         /* Default to relatime unless overridden */
2572         if (!(flags & MS_NOATIME))
2573                 mnt_flags |= MNT_RELATIME;
2574 
2575         /* Separate the per-mountpoint flags */
2576         if (flags & MS_NOSUID)
2577                 mnt_flags |= MNT_NOSUID;
2578         if (flags & MS_NODEV)
2579                 mnt_flags |= MNT_NODEV;
2580         if (flags & MS_NOEXEC)
2581                 mnt_flags |= MNT_NOEXEC;
2582         if (flags & MS_NOATIME)
2583                 mnt_flags |= MNT_NOATIME;
2584         if (flags & MS_NODIRATIME)
2585                 mnt_flags |= MNT_NODIRATIME;
2586         if (flags & MS_STRICTATIME)
2587                 mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME);
2588         if (flags & MS_RDONLY)
2589                 mnt_flags |= MNT_READONLY;
2590 
2591         /* The default atime for remount is preservation */
2592         if ((flags & MS_REMOUNT) &&
2593             ((flags & (MS_NOATIME | MS_NODIRATIME | MS_RELATIME |
2594                        MS_STRICTATIME)) == 0)) {
2595                 mnt_flags &= ~MNT_ATIME_MASK;
2596                 mnt_flags |= path.mnt->mnt_flags & MNT_ATIME_MASK;
2597         }
2598 
2599         flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN |
2600                    MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT |
2601                    MS_STRICTATIME);
2602 
2603         if (flags & MS_REMOUNT)
2604                 retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags,
2605                                     data_page);
2606         else if (flags & MS_BIND)
2607                 retval = do_loopback(&path, dev_name, flags & MS_REC);
2608         else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
2609                 retval = do_change_type(&path, flags);
2610         else if (flags & MS_MOVE)
2611                 retval = do_move_mount(&path, dev_name);
2612         else
2613                 retval = do_new_mount(&path, type_page, flags, mnt_flags,
2614                                       dev_name, data_page);
2615 dput_out:
2616         path_put(&path);
2617         return retval;
2618 }
2619 
2620 static void free_mnt_ns(struct mnt_namespace *ns)
2621 {
2622         proc_free_inum(ns->proc_inum);
2623         put_user_ns(ns->user_ns);
2624         kfree(ns);
2625 }
2626 
2627 /*
2628  * Assign a sequence number so we can detect when we attempt to bind
2629  * mount a reference to an older mount namespace into the current
2630  * mount namespace, preventing reference counting loops.  A 64bit
2631  * number incrementing at 10Ghz will take 12,427 years to wrap which
2632  * is effectively never, so we can ignore the possibility.
2633  */
2634 static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1);
2635 
2636 static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns)
2637 {
2638         struct mnt_namespace *new_ns;
2639         int ret;
2640 
2641         new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL);
2642         if (!new_ns)
2643                 return ERR_PTR(-ENOMEM);
2644         ret = proc_alloc_inum(&new_ns->proc_inum);
2645         if (ret) {
2646                 kfree(new_ns);
2647                 return ERR_PTR(ret);
2648         }
2649         new_ns->seq = atomic64_add_return(1, &mnt_ns_seq);
2650         atomic_set(&new_ns->count, 1);
2651         new_ns->root = NULL;
2652         INIT_LIST_HEAD(&new_ns->list);
2653         init_waitqueue_head(&new_ns->poll);
2654         new_ns->event = 0;
2655         new_ns->user_ns = get_user_ns(user_ns);
2656         new_ns->mounts = 0;
2657         new_ns->pending_mounts = 0;
2658         return new_ns;
2659 }
2660 
2661 struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
2662                 struct user_namespace *user_ns, struct fs_struct *new_fs)
2663 {
2664         struct mnt_namespace *new_ns;
2665         struct vfsmount *rootmnt = NULL, *pwdmnt = NULL;
2666         struct mount *p, *q;
2667         struct mount *old;
2668         struct mount *new;
2669         int copy_flags;
2670 
2671         BUG_ON(!ns);
2672 
2673         if (likely(!(flags & CLONE_NEWNS))) {
2674                 get_mnt_ns(ns);
2675                 return ns;
2676         }
2677 
2678         old = ns->root;
2679 
2680         new_ns = alloc_mnt_ns(user_ns);
2681         if (IS_ERR(new_ns))
2682                 return new_ns;
2683 
2684         namespace_lock();
2685         /* First pass: copy the tree topology */
2686         copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE;
2687         if (user_ns != ns->user_ns)
2688                 copy_flags |= CL_SHARED_TO_SLAVE | CL_UNPRIVILEGED;
2689         new = copy_tree(old, old->mnt.mnt_root, copy_flags);
2690         if (IS_ERR(new)) {
2691                 namespace_unlock();
2692                 free_mnt_ns(new_ns);
2693                 return ERR_CAST(new);
2694         }
2695         new_ns->root = new;
2696         list_add_tail(&new_ns->list, &new->mnt_list);
2697 
2698         /*
2699          * Second pass: switch the tsk->fs->* elements and mark new vfsmounts
2700          * as belonging to new namespace.  We have already acquired a private
2701          * fs_struct, so tsk->fs->lock is not needed.
2702          */
2703         p = old;
2704         q = new;
2705         while (p) {
2706                 q->mnt_ns = new_ns;
2707                 new_ns->mounts++;
2708                 if (new_fs) {
2709                         if (&p->mnt == new_fs->root.mnt) {
2710                                 new_fs->root.mnt = mntget(&q->mnt);
2711                                 rootmnt = &p->mnt;
2712                         }
2713                         if (&p->mnt == new_fs->pwd.mnt) {
2714                                 new_fs->pwd.mnt = mntget(&q->mnt);
2715                                 pwdmnt = &p->mnt;
2716                         }
2717                 }
2718                 p = next_mnt(p, old);
2719                 q = next_mnt(q, new);
2720                 if (!q)
2721                         break;
2722                 while (p->mnt.mnt_root != q->mnt.mnt_root)
2723                         p = next_mnt(p, old);
2724         }
2725         namespace_unlock();
2726 
2727         if (rootmnt)
2728                 mntput(rootmnt);
2729         if (pwdmnt)
2730                 mntput(pwdmnt);
2731 
2732         return new_ns;
2733 }
2734 
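copy_mnt_ns() above is what runs when a task unshares its mount namespace; a hedged userspace sketch (requires CAP_SYS_ADMIN): after unsharing, mounts are commonly made recursively private so later changes do not propagate back to the parent namespace.

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
        if (unshare(CLONE_NEWNS))
                perror("unshare(CLONE_NEWNS)");
        else if (mount("none", "/", NULL, MS_REC | MS_PRIVATE, NULL))
                perror("make-rprivate");
        return 0;
}
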
2735 /**
2736  * create_mnt_ns - creates a private namespace and adds a root filesystem
2737  * @mnt: pointer to the new root filesystem mountpoint
2738  */
2739 static struct mnt_namespace *create_mnt_ns(struct vfsmount *m)
2740 {
2741         struct mnt_namespace *new_ns = alloc_mnt_ns(&init_user_ns);
2742         if (!IS_ERR(new_ns)) {
2743                 struct mount *mnt = real_mount(m);
2744                 mnt->mnt_ns = new_ns;
2745                 new_ns->root = mnt;
2746                 new_ns->mounts++;
2747                 list_add(&mnt->mnt_list, &new_ns->list);
2748         } else {
2749                 mntput(m);
2750         }
2751         return new_ns;
2752 }
2753 
2754 struct dentry *mount_subtree(struct vfsmount *mnt, const char *name)
2755 {
2756         struct mnt_namespace *ns;
2757         struct super_block *s;
2758         struct path path;
2759         int err;
2760 
2761         ns = create_mnt_ns(mnt);
2762         if (IS_ERR(ns))
2763                 return ERR_CAST(ns);
2764 
2765         err = vfs_path_lookup(mnt->mnt_root, mnt,
2766                         name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path);
2767 
2768         put_mnt_ns(ns);
2769 
2770         if (err)
2771                 return ERR_PTR(err);
2772 
2773         /* trade a vfsmount reference for active sb one */
2774         s = path.mnt->mnt_sb;
2775         atomic_inc(&s->s_active);
2776         mntput(path.mnt);
2777         /* lock the sucker */
2778         down_write(&s->s_umount);
2779         /* ... and return the root of (sub)tree on it */
2780         return path.dentry;
2781 }
2782 EXPORT_SYMBOL(mount_subtree);
2783 
2784 SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
2785                 char __user *, type, unsigned long, flags, void __user *, data)
2786 {
2787         int ret;
2788         char *kernel_type;
2789         struct filename *kernel_dir;
2790         char *kernel_dev;
2791         unsigned long data_page;
2792 
2793         kernel_type = copy_mount_string(type);
2794         ret = PTR_ERR(kernel_type);
2795         if (IS_ERR(kernel_type))
2796                 goto out_type;
2797 
2798         kernel_dir = getname(dir_name);
2799         if (IS_ERR(kernel_dir)) {
2800                 ret = PTR_ERR(kernel_dir);
2801                 goto out_dir;
2802         }
2803 
2804         kernel_dev = copy_mount_string(dev_name);
2805         ret = PTR_ERR(kernel_dev);
2806         if (IS_ERR(kernel_dev))
2807                 goto out_dev;
2808 
2809         ret = copy_mount_options(data, &data_page);
2810         if (ret < 0)
2811                 goto out_data;
2812 
2813         ret = do_mount(kernel_dev, kernel_dir->name, kernel_type, flags,
2814                 (void *) data_page);
2815 
2816         free_page(data_page);
2817 out_data:
2818         kfree(kernel_dev);
2819 out_dev:
2820         putname(kernel_dir);
2821 out_dir:
2822         kfree(kernel_type);
2823 out_type:
2824         return ret;
2825 }
2826 
2827 /*
2828  * Return true if path is reachable from root
2829  *
2830  * namespace_sem or mount_lock is held
2831  */
2832 bool is_path_reachable(struct mount *mnt, struct dentry *dentry,
2833                          const struct path *root)
2834 {
2835         while (&mnt->mnt != root->mnt && mnt_has_parent(mnt)) {
2836                 dentry = mnt->mnt_mountpoint;
2837                 mnt = mnt->mnt_parent;
2838         }
2839         return &mnt->mnt == root->mnt && is_subdir(dentry, root->dentry);
2840 }
2841 
2842 int path_is_under(struct path *path1, struct path *path2)
2843 {
2844         int res;
2845         read_seqlock_excl(&mount_lock);
2846         res = is_path_reachable(real_mount(path1->mnt), path1->dentry, path2);
2847         read_sequnlock_excl(&mount_lock);
2848         return res;
2849 }
2850 EXPORT_SYMBOL(path_is_under);
2851 
2852 /*
2853  * pivot_root Semantics:
2854  * Moves the root file system of the current process to the directory put_old,
2855  * makes new_root as the new root file system of the current process, and sets
2856  * root/cwd of all processes which had them on the current root to new_root.
2857  *
2858  * Restrictions:
2859  * The new_root and put_old must be directories, and must not be on the
2860  * same file system as the current process root. The put_old must be
2861  * underneath new_root, i.e. adding a non-zero number of /.. to the string
2862  * pointed to by put_old must yield the same directory as new_root. No other
2863  * file system may be mounted on put_old. After all, new_root is a mountpoint.
2864  *
2865  * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem.
2866  * See Documentation/filesystems/ramfs-rootfs-initramfs.txt for alternatives
2867  * in this situation.
2868  *
2869  * Notes:
2870  *  - we don't move root/cwd if they are not at the root (reason: if something
2871  *    cared enough to change them, it's probably wrong to force them elsewhere)
2872  *  - it's okay to pick a root that isn't the root of a file system, e.g.
2873  *    /nfs/my_root where /nfs is the mount point. It must be a mountpoint,
2874  *    though, so you may need to say mount --bind /nfs/my_root /nfs/my_root
2875  *    first.
2876  */
2877 SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
2878                 const char __user *, put_old)
2879 {
2880         struct path new, old, parent_path, root_parent, root;
2881         struct mount *new_mnt, *root_mnt, *old_mnt;
2882         struct mountpoint *old_mp, *root_mp;
2883         int error;
2884 
2885         if (!may_mount())
2886                 return -EPERM;
2887 
2888         error = user_path_dir(new_root, &new);
2889         if (error)
2890                 goto out0;
2891 
2892         error = user_path_dir(put_old, &old);
2893         if (error)
2894                 goto out1;
2895 
2896         error = security_sb_pivotroot(&old, &new);
2897         if (error)
2898                 goto out2;
2899 
2900         get_fs_root(current->fs, &root);
2901         old_mp = lock_mount(&old);
2902         error = PTR_ERR(old_mp);
2903         if (IS_ERR(old_mp))
2904                 goto out3;
2905 
2906         error = -EINVAL;
2907         new_mnt = real_mount(new.mnt);
2908         root_mnt = real_mount(root.mnt);
2909         old_mnt = real_mount(old.mnt);
2910         if (IS_MNT_SHARED(old_mnt) ||
2911                 IS_MNT_SHARED(new_mnt->mnt_parent) ||
2912                 IS_MNT_SHARED(root_mnt->mnt_parent))
2913                 goto out4;
2914         if (!check_mnt(root_mnt) || !check_mnt(new_mnt))
2915                 goto out4;
2916         if (new_mnt->mnt.mnt_flags & MNT_LOCKED)
2917                 goto out4;
2918         error = -ENOENT;
2919         if (d_unlinked(new.dentry))
2920                 goto out4;
2921         error = -EBUSY;
2922         if (new_mnt == root_mnt || old_mnt == root_mnt)
2923                 goto out4; /* loop, on the same file system  */
2924         error = -EINVAL;
2925         if (root.mnt->mnt_root != root.dentry)
2926                 goto out4; /* not a mountpoint */
2927         if (!mnt_has_parent(root_mnt))
2928                 goto out4; /* not attached */
2929         root_mp = root_mnt->mnt_mp;
2930         if (new.mnt->mnt_root != new.dentry)
2931                 goto out4; /* not a mountpoint */
2932         if (!mnt_has_parent(new_mnt))
2933                 goto out4; /* not attached */
2934         /* make sure we can reach put_old from new_root */
2935         if (!is_path_reachable(old_mnt, old.dentry, &new))
2936                 goto out4;
2937         /* make certain new is below the root */
2938         if (!is_path_reachable(new_mnt, new.dentry, &root))
2939                 goto out4;
2940         lock_mount_hash();
2941         root_mp->m_count++; /* pin it so it won't go away */
2942         detach_mnt(new_mnt, &parent_path);
2943         detach_mnt(root_mnt, &root_parent);
2944         if (root_mnt->mnt.mnt_flags & MNT_LOCKED) {
2945                 new_mnt->mnt.mnt_flags |= MNT_LOCKED;
2946                 root_mnt->mnt.mnt_flags &= ~MNT_LOCKED;
2947         }
2948         /* mount old root on put_old */
2949         attach_mnt(root_mnt, old_mnt, old_mp);
2950         /* mount new_root on / */
2951         attach_mnt(new_mnt, real_mount(root_parent.mnt), root_mp);
2952         touch_mnt_namespace(current->nsproxy->mnt_ns);
2953         unlock_mount_hash();
2954         chroot_fs_refs(&root, &new);
2955         put_mountpoint(root_mp);
2956         error = 0;
2957 out4:
2958         unlock_mount(old_mp);
2959         if (!error) {
2960                 path_put(&root_parent);
2961                 path_put(&parent_path);
2962         }
2963 out3:
2964         path_put(&root);
2965 out2:
2966         path_put(&old);
2967 out1:
2968         path_put(&new);
2969 out0:
2970         return error;
2971 }
2972 
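There is no glibc wrapper for pivot_root, so userspace goes through syscall(2); a minimal sketch of the sequence the comment above describes (both paths are assumptions: new_root must itself be a mount point and put_old a directory underneath it):

#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

int main(void)
{
        if (syscall(SYS_pivot_root, "/newroot", "/newroot/oldroot"))
                perror("pivot_root");
        else if (chdir("/"))
                perror("chdir");
        return 0;
}
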
2973 static void __init init_mount_tree(void)
2974 {
2975         struct vfsmount *mnt;
2976         struct mnt_namespace *ns;
2977         struct path root;
2978         struct file_system_type *type;
2979 
2980         type = get_fs_type("rootfs");
2981         if (!type)
2982                 panic("Can't find rootfs type");
2983         mnt = vfs_kern_mount(type, 0, "rootfs", NULL);
2984         put_filesystem(type);
2985         if (IS_ERR(mnt))
2986                 panic("Can't create rootfs");
2987 
2988         ns = create_mnt_ns(mnt);
2989         if (IS_ERR(ns))
2990                 panic("Can't allocate initial namespace");
2991 
2992         init_task.nsproxy->mnt_ns = ns;
2993         get_mnt_ns(ns);
2994 
2995         root.mnt = mnt;
2996         root.dentry = mnt->mnt_root;
2997         mnt->mnt_flags |= MNT_LOCKED;
2998 
2999         set_fs_pwd(current->fs, &root);
3000         set_fs_root(current->fs, &root);
3001 }
3002 
3003 void __init mnt_init(void)
3004 {
3005         unsigned u;
3006         int err;
3007 
3008         mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount),
3009                         0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
3010 
3011         mount_hashtable = alloc_large_system_hash("Mount-cache",
3012                                 sizeof(struct hlist_head),
3013                                 mhash_entries, 19,
3014                                 0,
3015                                 &m_hash_shift, &m_hash_mask, 0, 0);
3016         mountpoint_hashtable = alloc_large_system_hash("Mountpoint-cache",
3017                                 sizeof(struct hlist_head),
3018                                 mphash_entries, 19,
3019                                 0,
3020                                 &mp_hash_shift, &mp_hash_mask, 0, 0);
3021 
3022         if (!mount_hashtable || !mountpoint_hashtable)
3023                 panic("Failed to allocate mount hash table\n");
3024 
3025         for (u = 0; u <= m_hash_mask; u++)
3026                 INIT_HLIST_HEAD(&mount_hashtable[u]);
3027         for (u = 0; u <= mp_hash_mask; u++)
3028                 INIT_HLIST_HEAD(&mountpoint_hashtable[u]);
3029 
3030         kernfs_init();
3031 
3032         err = sysfs_init();
3033         if (err)
3034                 printk(KERN_WARNING "%s: sysfs_init error: %d\n",
3035                         __func__, err);
3036         fs_kobj = kobject_create_and_add("fs", NULL);
3037         if (!fs_kobj)
3038                 printk(KERN_WARNING "%s: kobj create error\n", __func__);
3039         init_rootfs();
3040         init_mount_tree();
3041 }
3042 
3043 void put_mnt_ns(struct mnt_namespace *ns)
3044 {
3045         if (!atomic_dec_and_test(&ns->count))
3046                 return;
3047         drop_collected_mounts(&ns->root->mnt);
3048         free_mnt_ns(ns);
3049 }
3050 
3051 struct vfsmount *kern_mount_data(struct file_system_type *type, void *data)
3052 {
3053         struct vfsmount *mnt;
3054         mnt = vfs_kern_mount(type, MS_KERNMOUNT, type->name, data);
3055         if (!IS_ERR(mnt)) {
3056                 /*
3057                  * it is a long-term mount; don't release mnt until we
3058                  * unmount it, just before the filesystem is unregistered.
3059                  */
3060                 real_mount(mnt)->mnt_ns = MNT_NS_INTERNAL;
3061         }
3062         return mnt;
3063 }
3064 EXPORT_SYMBOL_GPL(kern_mount_data);
3065 
3066 void kern_unmount(struct vfsmount *mnt)
3067 {
3068         /* release long term mount so mount point can be released */
3069         if (!IS_ERR_OR_NULL(mnt)) {
3070                 real_mount(mnt)->mnt_ns = NULL;
3071                 synchronize_rcu();      /* yecchhh... */
3072                 mntput(mnt);
3073         }
3074 }
3075 EXPORT_SYMBOL(kern_unmount);
3076 
3077 bool our_mnt(struct vfsmount *mnt)
3078 {
3079         return check_mnt(real_mount(mnt));
3080 }
3081 
3082 bool current_chrooted(void)
3083 {
3084         /* Does the current process have a non-standard root? */
3085         struct path ns_root;
3086         struct path fs_root;
3087         bool chrooted;
3088 
3089         /* Find the namespace root */
3090         ns_root.mnt = &current->nsproxy->mnt_ns->root->mnt;
3091         ns_root.dentry = ns_root.mnt->mnt_root;
3092         path_get(&ns_root);
3093         while (d_mountpoint(ns_root.dentry) && follow_down_one(&ns_root))
3094                 ;
3095 
3096         get_fs_root(current->fs, &fs_root);
3097 
3098         chrooted = !path_equal(&fs_root, &ns_root);
3099 
3100         path_put(&fs_root);
3101         path_put(&ns_root);
3102 
3103         return chrooted;
3104 }
3105 
3106 static bool fs_fully_visible(struct file_system_type *type, int *new_mnt_flags)
3107 {
3108         struct mnt_namespace *ns = current->nsproxy->mnt_ns;
3109         int new_flags = *new_mnt_flags;
3110         struct mount *mnt;
3111         bool visible = false;
3112 
3113         if (unlikely(!ns))
3114                 return false;
3115 
3116         down_read(&namespace_sem);
3117         list_for_each_entry(mnt, &ns->list, mnt_list) {
3118                 struct mount *child;
3119                 if (mnt->mnt.mnt_sb->s_type != type)
3120                         continue;
3121 
3122                 /* This mount is not fully visible if its root directory
3123                  * is not the root directory of the filesystem.
3124                  */
3125                 if (mnt->mnt.mnt_root != mnt->mnt.mnt_sb->s_root)
3126                         continue;
3127 
3128                 /* Verify the mount flags are equal to or more permissive
3129                  * than the proposed new mount.
3130                  */
3131                 if ((mnt->mnt.mnt_flags & MNT_LOCK_READONLY) &&
3132                     !(new_flags & MNT_READONLY))
3133                         continue;
3134                 if ((mnt->mnt.mnt_flags & MNT_LOCK_NODEV) &&
3135                     !(new_flags & MNT_NODEV))
3136                         continue;
3137                 if ((mnt->mnt.mnt_flags & MNT_LOCK_ATIME) &&
3138                     ((mnt->mnt.mnt_flags & MNT_ATIME_MASK) != (new_flags & MNT_ATIME_MASK)))
3139                         continue;
3140 
3141                 /* This mount is not fully visible if there are any
3142                  * locked child mounts that cover anything except for
3143                  * empty directories.
3144                  */
3145                 list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
3146                         struct inode *inode = child->mnt_mountpoint->d_inode;
3147                         /* Only worry about locked mounts */
3148                         if (!(child->mnt.mnt_flags & MNT_LOCKED))
3149                                 continue;
3150                         /* Is the directory permanently empty? */
3151                         if (!is_empty_dir_inode(inode))
3152                                 goto next;
3153                 }
3154                 /* Preserve the locked attributes */
3155                 *new_mnt_flags |= mnt->mnt.mnt_flags & (MNT_LOCK_READONLY | \
3156                                                         MNT_LOCK_NODEV    | \
3157                                                         MNT_LOCK_ATIME);
3158                 visible = true;
3159                 goto found;
3160         next:   ;
3161         }
3162 found:
3163         up_read(&namespace_sem);
3164         return visible;
3165 }
3166 
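fs_fully_visible() asks whether an instance of the given filesystem type is already mounted in the caller's mount namespace with nothing hidden: its root must be the superblock root, no locked child mount may cover a non-empty directory, and the proposed mount may not be more permissive than what is already visible. This is the check that keeps an unprivileged mount of a filesystem such as proc or sysfs in a user namespace from revealing anything an administrator had over-mounted or locked down. The snippet below is a made-up distillation of the read-only rule only (example_check is not a kernel function); the authoritative logic is the loop above.

static int example_check(int existing_flags, int *new_flags)
{
        /* An existing mount locked read-only forces the new one read-only. */
        if ((existing_flags & MNT_LOCK_READONLY) && !(*new_flags & MNT_READONLY))
                return -EPERM;          /* would expose a writable view */

        /* Otherwise the locked attributes are carried over to the new mount. */
        *new_flags |= existing_flags & (MNT_LOCK_READONLY | MNT_LOCK_NODEV |
                                        MNT_LOCK_ATIME);
        return 0;
}
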
3167 static void *mntns_get(struct task_struct *task)
3168 {
3169         struct mnt_namespace *ns = NULL;
3170         struct nsproxy *nsproxy;
3171 
3172         rcu_read_lock();
3173         nsproxy = task_nsproxy(task);
3174         if (nsproxy) {
3175                 ns = nsproxy->mnt_ns;
3176                 get_mnt_ns(ns);
3177         }
3178         rcu_read_unlock();
3179 
3180         return ns;
3181 }
3182 
3183 static void mntns_put(void *ns)
3184 {
3185         put_mnt_ns(ns);
3186 }
3187 
3188 static int mntns_install(struct nsproxy *nsproxy, void *ns)
3189 {
3190         struct fs_struct *fs = current->fs;
3191         struct mnt_namespace *mnt_ns = ns;
3192         struct path root;
3193 
3194         if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) ||
3195             !ns_capable(current_user_ns(), CAP_SYS_CHROOT) ||
3196             !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
3197                 return -EPERM;
3198 
3199         if (fs->users != 1)
3200                 return -EINVAL;
3201 
3202         get_mnt_ns(mnt_ns);
3203         put_mnt_ns(nsproxy->mnt_ns);
3204         nsproxy->mnt_ns = mnt_ns;
3205 
3206         /* Find the root */
3207         root.mnt    = &mnt_ns->root->mnt;
3208         root.dentry = mnt_ns->root->mnt.mnt_root;
3209         path_get(&root);
3210         while (d_mountpoint(root.dentry) && follow_down_one(&root))
3211                 ;
3212 
3213         /* Update the pwd and root */
3214         set_fs_pwd(fs, &root);
3215         set_fs_root(fs, &root);
3216 
3217         path_put(&root);
3218         return 0;
3219 }
3220 
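mntns_install() is the hook that runs when a task enters an existing mount namespace via setns(2): it requires CAP_SYS_ADMIN over the target namespace plus CAP_SYS_CHROOT and CAP_SYS_ADMIN in the caller's user namespace, refuses if the fs_struct is shared, then switches namespaces and resets both root and cwd to the new namespace's root. A hedged userspace sketch follows; the path and error handling are illustrative only.

#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
        int fd;

        if (argc != 2) {
                fprintf(stderr, "usage: %s /proc/<pid>/ns/mnt\n", argv[0]);
                return 1;
        }

        fd = open(argv[1], O_RDONLY);   /* mount-namespace reference */
        if (fd < 0) {
                perror("open");
                return 1;
        }

        /* Ends up in mntns_install(); fails with EINVAL if our fs_struct is
         * shared (e.g. with CLONE_FS threads) and EPERM without the caps. */
        if (setns(fd, CLONE_NEWNS) < 0) {
                perror("setns");
                return 1;
        }
        close(fd);

        execlp("sh", "sh", (char *)NULL);
        perror("execlp");
        return 1;
}
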
3221 static unsigned int mntns_inum(void *ns)
3222 {
3223         struct mnt_namespace *mnt_ns = ns;
3224         return mnt_ns->proc_inum;
3225 }
3226 
3227 const struct proc_ns_operations mntns_operations = {
3228         .name           = "mnt",
3229         .type           = CLONE_NEWNS,
3230         .get            = mntns_get,
3231         .put            = mntns_put,
3232         .install        = mntns_install,
3233         .inum           = mntns_inum,
3234 };
3235 
