From 18445bc318e23ddd07adb4b38af2417f14d346cc Mon Sep 17 00:00:00 2001 From: j7b3y Date: Thu, 19 Dec 2024 22:24:42 +0900 Subject: [PATCH] add kernelsu next & susfs Change-Id: I0a74e960ff4fe953e5dc3441355bc1b99c779f17 --- KernelSU-Next | 1 + arch/arm64/configs/vendor/xiaomi_GKI.config | 4 + drivers/Kconfig | 1 + drivers/Makefile | 2 + drivers/kernelsu | 1 + fs/Makefile | 3 + fs/dcache.c | 12 + fs/dcache.c.orig | 3241 +++++++++++++++ fs/devpts/inode.c | 11 + fs/exec.c | 11 + fs/exec.c.orig | 2132 ++++++++++ fs/inode.c | 20 + fs/inode.c.orig | 2493 +++++++++++ fs/namei.c | 140 + fs/namespace.c | 146 + fs/namespace.c.orig | 4149 +++++++++++++++++++ fs/open.c | 12 + fs/overlayfs/inode.c | 9 + fs/overlayfs/readdir.c | 12 + fs/overlayfs/super.c | 12 + fs/proc/bootconfig.c | 11 + fs/proc/task_mmu.c | 13 + fs/proc_namespace.c | 18 + fs/readdir.c | 9 + fs/stat.c | 26 + fs/statfs.c | 22 + fs/sus_su.c | 140 + fs/susfs.c | 819 ++++ include/linux/sus_su.h | 9 + include/linux/susfs.h | 231 ++ kernel/kallsyms.c | 10 + kernel/sys.c | 11 + 32 files changed, 13731 insertions(+) create mode 160000 KernelSU-Next create mode 120000 drivers/kernelsu create mode 100644 fs/dcache.c.orig create mode 100644 fs/exec.c.orig create mode 100644 fs/inode.c.orig create mode 100644 fs/namespace.c.orig create mode 100644 fs/sus_su.c create mode 100644 fs/susfs.c create mode 100644 include/linux/sus_su.h create mode 100644 include/linux/susfs.h diff --git a/KernelSU-Next b/KernelSU-Next new file mode 160000 index 000000000000..8f71f686f467 --- /dev/null +++ b/KernelSU-Next @@ -0,0 +1 @@ +Subproject commit 8f71f686f4670734a6bf8287b8ab918352c42d79 diff --git a/arch/arm64/configs/vendor/xiaomi_GKI.config b/arch/arm64/configs/vendor/xiaomi_GKI.config index f6d684a9b270..d8d1801a5a8b 100644 --- a/arch/arm64/configs/vendor/xiaomi_GKI.config +++ b/arch/arm64/configs/vendor/xiaomi_GKI.config @@ -61,3 +61,7 @@ CONFIG_TOUCHSCREEN_XIAOMI_TOUCHFEATURE=m # USB CONFIG_USB_NET_AX88179_178A=m 
CONFIG_USB_NET_AX8817X=m + +# KSU Next +CONFIG_KSU=y +CONFIG_KSU_SUSFS=y \ No newline at end of file diff --git a/drivers/Kconfig b/drivers/Kconfig index dcecc9f6e33f..8cae523c94c2 100644 --- a/drivers/Kconfig +++ b/drivers/Kconfig @@ -235,4 +235,5 @@ source "drivers/interconnect/Kconfig" source "drivers/counter/Kconfig" source "drivers/most/Kconfig" +source "drivers/kernelsu/Kconfig" endmenu diff --git a/drivers/Makefile b/drivers/Makefile index 576228037718..4e218bf0333b 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -189,3 +189,5 @@ obj-$(CONFIG_GNSS) += gnss/ obj-$(CONFIG_INTERCONNECT) += interconnect/ obj-$(CONFIG_COUNTER) += counter/ obj-$(CONFIG_MOST) += most/ + +obj-$(CONFIG_KSU) += kernelsu/ diff --git a/drivers/kernelsu b/drivers/kernelsu new file mode 120000 index 000000000000..b32a3654a683 --- /dev/null +++ b/drivers/kernelsu @@ -0,0 +1 @@ +../KernelSU-Next/kernel \ No newline at end of file diff --git a/fs/Makefile b/fs/Makefile index c7851875b668..bb53b87308d6 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -18,6 +18,9 @@ obj-y := open.o read_write.o file_table.o super.o \ fs_types.o fs_context.o fs_parser.o fsopen.o init.o \ kernel_read_file.o remap_range.o +obj-$(CONFIG_KSU_SUSFS) += susfs.o +obj-$(CONFIG_KSU_SUSFS_SUS_SU) += sus_su.o + ifeq ($(CONFIG_BLOCK),y) obj-y += buffer.o block_dev.o direct-io.o mpage.o else diff --git a/fs/dcache.c b/fs/dcache.c index 4ff59d49ce48..a648354c974c 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2314,6 +2314,12 @@ struct dentry *__d_lookup_rcu(const struct dentry *parent, continue; if (dentry_cmp(dentry, str, hashlen_len(hashlen)) != 0) continue; + +#ifdef CONFIG_KSU_SUSFS_SUS_PATH + if (dentry->d_inode && unlikely(dentry->d_inode->i_state & 16777216) && likely(current_cred()->user->android_kabi_reserved2 & 16777216)) { + continue; + } +#endif } *seqp = seq; return dentry; @@ -2397,6 +2403,12 @@ struct dentry *__d_lookup(const struct dentry *parent, const struct qstr *name) if (dentry->d_name.hash != hash) 
continue; +#ifdef CONFIG_KSU_SUSFS_SUS_PATH + if (dentry->d_inode && unlikely(dentry->d_inode->i_state & 16777216) && likely(current_cred()->user->android_kabi_reserved2 & 16777216)) { + continue; + } +#endif + spin_lock(&dentry->d_lock); if (dentry->d_parent != parent) goto next; diff --git a/fs/dcache.c.orig b/fs/dcache.c.orig new file mode 100644 index 000000000000..4ff59d49ce48 --- /dev/null +++ b/fs/dcache.c.orig @@ -0,0 +1,3241 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * fs/dcache.c + * + * Complete reimplementation + * (C) 1997 Thomas Schoebel-Theuer, + * with heavy changes by Linus Torvalds + */ + +/* + * Notes on the allocation strategy: + * + * The dcache is a master of the icache - whenever a dcache entry + * exists, the inode will always exist. "iput()" is done either when + * the dcache entry is deleted or garbage collected. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" +#include "mount.h" + +/* + * Usage: + * dcache->d_inode->i_lock protects: + * - i_dentry, d_u.d_alias, d_inode of aliases + * dcache_hash_bucket lock protects: + * - the dcache hash table + * s_roots bl list spinlock protects: + * - the s_roots list (see __d_drop) + * dentry->d_sb->s_dentry_lru_lock protects: + * - the dcache lru lists and counters + * d_lock protects: + * - d_flags + * - d_name + * - d_lru + * - d_count + * - d_unhashed() + * - d_parent and d_subdirs + * - childrens' d_child and d_parent + * - d_u.d_alias, d_inode + * + * Ordering: + * dentry->d_inode->i_lock + * dentry->d_lock + * dentry->d_sb->s_dentry_lru_lock + * dcache_hash_bucket lock + * s_roots lock + * + * If there is an ancestor relationship: + * dentry->d_parent->...->d_parent->d_lock + * ... 
+ * dentry->d_parent->d_lock + * dentry->d_lock + * + * If no ancestor relationship: + * arbitrary, since it's serialized on rename_lock + */ +int sysctl_vfs_cache_pressure __read_mostly = 100; +EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure); + +__cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock); + +EXPORT_SYMBOL(rename_lock); + +static struct kmem_cache *dentry_cache __read_mostly; + +const struct qstr empty_name = QSTR_INIT("", 0); +EXPORT_SYMBOL(empty_name); +const struct qstr slash_name = QSTR_INIT("/", 1); +EXPORT_SYMBOL(slash_name); + +/* + * This is the single most critical data structure when it comes + * to the dcache: the hashtable for lookups. Somebody should try + * to make this good - I've just made it work. + * + * This hash-function tries to avoid losing too many bits of hash + * information, yet avoid using a prime hash-size or similar. + */ + +static unsigned int d_hash_shift __read_mostly; + +static struct hlist_bl_head *dentry_hashtable __read_mostly; + +static inline struct hlist_bl_head *d_hash(unsigned int hash) +{ + return dentry_hashtable + (hash >> d_hash_shift); +} + +#define IN_LOOKUP_SHIFT 10 +static struct hlist_bl_head in_lookup_hashtable[1 << IN_LOOKUP_SHIFT]; + +static inline struct hlist_bl_head *in_lookup_hash(const struct dentry *parent, + unsigned int hash) +{ + hash += (unsigned long) parent / L1_CACHE_BYTES; + return in_lookup_hashtable + hash_32(hash, IN_LOOKUP_SHIFT); +} + + +/* Statistics gathering. */ +struct dentry_stat_t dentry_stat = { + .age_limit = 45, +}; + +static DEFINE_PER_CPU(long, nr_dentry); +static DEFINE_PER_CPU(long, nr_dentry_unused); +static DEFINE_PER_CPU(long, nr_dentry_negative); + +#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) + +/* + * Here we resort to our own counters instead of using generic per-cpu counters + * for consistency with what the vfs inode code does. We are expected to harvest + * better code and performance by having our own specialized counters. 
+ * + * Please note that the loop is done over all possible CPUs, not over all online + * CPUs. The reason for this is that we don't want to play games with CPUs going + * on and off. If one of them goes off, we will just keep their counters. + * + * glommer: See cffbc8a for details, and if you ever intend to change this, + * please update all vfs counters to match. + */ +static long get_nr_dentry(void) +{ + int i; + long sum = 0; + for_each_possible_cpu(i) + sum += per_cpu(nr_dentry, i); + return sum < 0 ? 0 : sum; +} + +static long get_nr_dentry_unused(void) +{ + int i; + long sum = 0; + for_each_possible_cpu(i) + sum += per_cpu(nr_dentry_unused, i); + return sum < 0 ? 0 : sum; +} + +static long get_nr_dentry_negative(void) +{ + int i; + long sum = 0; + + for_each_possible_cpu(i) + sum += per_cpu(nr_dentry_negative, i); + return sum < 0 ? 0 : sum; +} + +int proc_nr_dentry(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) +{ + dentry_stat.nr_dentry = get_nr_dentry(); + dentry_stat.nr_unused = get_nr_dentry_unused(); + dentry_stat.nr_negative = get_nr_dentry_negative(); + return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); +} +#endif + +/* + * Compare 2 name strings, return 0 if they match, otherwise non-zero. + * The strings are both count bytes long, and count is non-zero. + */ +#ifdef CONFIG_DCACHE_WORD_ACCESS + +#include +/* + * NOTE! 'cs' and 'scount' come from a dentry, so it has a + * aligned allocation for this particular component. We don't + * strictly need the load_unaligned_zeropad() safety, but it + * doesn't hurt either. + * + * In contrast, 'ct' and 'tcount' can be from a pathname, and do + * need the careful unaligned handling. 
+ */ +static inline int dentry_string_cmp(const unsigned char *cs, const unsigned char *ct, unsigned tcount) +{ + unsigned long a,b,mask; + + for (;;) { + a = read_word_at_a_time(cs); + b = load_unaligned_zeropad(ct); + if (tcount < sizeof(unsigned long)) + break; + if (unlikely(a != b)) + return 1; + cs += sizeof(unsigned long); + ct += sizeof(unsigned long); + tcount -= sizeof(unsigned long); + if (!tcount) + return 0; + } + mask = bytemask_from_count(tcount); + return unlikely(!!((a ^ b) & mask)); +} + +#else + +static inline int dentry_string_cmp(const unsigned char *cs, const unsigned char *ct, unsigned tcount) +{ + do { + if (*cs != *ct) + return 1; + cs++; + ct++; + tcount--; + } while (tcount); + return 0; +} + +#endif + +static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *ct, unsigned tcount) +{ + /* + * Be careful about RCU walk racing with rename: + * use 'READ_ONCE' to fetch the name pointer. + * + * NOTE! Even if a rename will mean that the length + * was not loaded atomically, we don't care. The + * RCU walk will check the sequence count eventually, + * and catch it. And we won't overrun the buffer, + * because we're reading the name pointer atomically, + * and a dentry name is guaranteed to be properly + * terminated with a NUL byte. 
+ * + * End result: even if 'len' is wrong, we'll exit + * early because the data cannot match (there can + * be no NUL in the ct/tcount data) + */ + const unsigned char *cs = READ_ONCE(dentry->d_name.name); + + return dentry_string_cmp(cs, ct, tcount); +} + +struct external_name { + union { + atomic_t count; + struct rcu_head head; + } u; + unsigned char name[]; +}; + +static inline struct external_name *external_name(struct dentry *dentry) +{ + return container_of(dentry->d_name.name, struct external_name, name[0]); +} + +static void __d_free(struct rcu_head *head) +{ + struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu); + + kmem_cache_free(dentry_cache, dentry); +} + +static void __d_free_external(struct rcu_head *head) +{ + struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu); + kfree(external_name(dentry)); + kmem_cache_free(dentry_cache, dentry); +} + +static inline int dname_external(const struct dentry *dentry) +{ + return dentry->d_name.name != dentry->d_iname; +} + +void take_dentry_name_snapshot(struct name_snapshot *name, struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + name->name = dentry->d_name; + if (unlikely(dname_external(dentry))) { + atomic_inc(&external_name(dentry)->u.count); + } else { + memcpy(name->inline_name, dentry->d_iname, + dentry->d_name.len + 1); + name->name.name = name->inline_name; + } + spin_unlock(&dentry->d_lock); +} +EXPORT_SYMBOL(take_dentry_name_snapshot); + +void release_dentry_name_snapshot(struct name_snapshot *name) +{ + if (unlikely(name->name.name != name->inline_name)) { + struct external_name *p; + p = container_of(name->name.name, struct external_name, name[0]); + if (unlikely(atomic_dec_and_test(&p->u.count))) + kfree_rcu(p, u.head); + } +} +EXPORT_SYMBOL(release_dentry_name_snapshot); + +static inline void __d_set_inode_and_type(struct dentry *dentry, + struct inode *inode, + unsigned type_flags) +{ + unsigned flags; + + dentry->d_inode = inode; + flags = 
READ_ONCE(dentry->d_flags); + flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU); + flags |= type_flags; + smp_store_release(&dentry->d_flags, flags); +} + +static inline void __d_clear_type_and_inode(struct dentry *dentry) +{ + unsigned flags = READ_ONCE(dentry->d_flags); + + flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU); + WRITE_ONCE(dentry->d_flags, flags); + dentry->d_inode = NULL; + /* + * The negative counter only tracks dentries on the LRU. Don't inc if + * d_lru is on another list. + */ + if ((flags & (DCACHE_LRU_LIST|DCACHE_SHRINK_LIST)) == DCACHE_LRU_LIST) + this_cpu_inc(nr_dentry_negative); +} + +static void dentry_free(struct dentry *dentry) +{ + WARN_ON(!hlist_unhashed(&dentry->d_u.d_alias)); + if (unlikely(dname_external(dentry))) { + struct external_name *p = external_name(dentry); + if (likely(atomic_dec_and_test(&p->u.count))) { + call_rcu(&dentry->d_u.d_rcu, __d_free_external); + return; + } + } + /* if dentry was never visible to RCU, immediate free is OK */ + if (dentry->d_flags & DCACHE_NORCU) + __d_free(&dentry->d_u.d_rcu); + else + call_rcu(&dentry->d_u.d_rcu, __d_free); +} + +/* + * Release the dentry's inode, using the filesystem + * d_iput() operation if defined. + */ +static void dentry_unlink_inode(struct dentry * dentry) + __releases(dentry->d_lock) + __releases(dentry->d_inode->i_lock) +{ + struct inode *inode = dentry->d_inode; + + raw_write_seqcount_begin(&dentry->d_seq); + __d_clear_type_and_inode(dentry); + hlist_del_init(&dentry->d_u.d_alias); + raw_write_seqcount_end(&dentry->d_seq); + spin_unlock(&dentry->d_lock); + spin_unlock(&inode->i_lock); + if (!inode->i_nlink) + fsnotify_inoderemove(inode); + if (dentry->d_op && dentry->d_op->d_iput) + dentry->d_op->d_iput(dentry, inode); + else + iput(inode); +} + +/* + * The DCACHE_LRU_LIST bit is set whenever the 'd_lru' entry + * is in use - which includes both the "real" per-superblock + * LRU list _and_ the DCACHE_SHRINK_LIST use. 
+ * + * The DCACHE_SHRINK_LIST bit is set whenever the dentry is + * on the shrink list (ie not on the superblock LRU list). + * + * The per-cpu "nr_dentry_unused" counters are updated with + * the DCACHE_LRU_LIST bit. + * + * The per-cpu "nr_dentry_negative" counters are only updated + * when deleted from or added to the per-superblock LRU list, not + * from/to the shrink list. That is to avoid an unneeded dec/inc + * pair when moving from LRU to shrink list in select_collect(). + * + * These helper functions make sure we always follow the + * rules. d_lock must be held by the caller. + */ +#define D_FLAG_VERIFY(dentry,x) WARN_ON_ONCE(((dentry)->d_flags & (DCACHE_LRU_LIST | DCACHE_SHRINK_LIST)) != (x)) +static void d_lru_add(struct dentry *dentry) +{ + D_FLAG_VERIFY(dentry, 0); + dentry->d_flags |= DCACHE_LRU_LIST; + this_cpu_inc(nr_dentry_unused); + if (d_is_negative(dentry)) + this_cpu_inc(nr_dentry_negative); + WARN_ON_ONCE(!list_lru_add(&dentry->d_sb->s_dentry_lru, &dentry->d_lru)); +} + +static void d_lru_del(struct dentry *dentry) +{ + D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST); + dentry->d_flags &= ~DCACHE_LRU_LIST; + this_cpu_dec(nr_dentry_unused); + if (d_is_negative(dentry)) + this_cpu_dec(nr_dentry_negative); + WARN_ON_ONCE(!list_lru_del(&dentry->d_sb->s_dentry_lru, &dentry->d_lru)); +} + +static void d_shrink_del(struct dentry *dentry) +{ + D_FLAG_VERIFY(dentry, DCACHE_SHRINK_LIST | DCACHE_LRU_LIST); + list_del_init(&dentry->d_lru); + dentry->d_flags &= ~(DCACHE_SHRINK_LIST | DCACHE_LRU_LIST); + this_cpu_dec(nr_dentry_unused); +} + +static void d_shrink_add(struct dentry *dentry, struct list_head *list) +{ + D_FLAG_VERIFY(dentry, 0); + list_add(&dentry->d_lru, list); + dentry->d_flags |= DCACHE_SHRINK_LIST | DCACHE_LRU_LIST; + this_cpu_inc(nr_dentry_unused); +} + +/* + * These can only be called under the global LRU lock, ie during the + * callback for freeing the LRU list. 
"isolate" removes it from the + * LRU lists entirely, while shrink_move moves it to the indicated + * private list. + */ +static void d_lru_isolate(struct list_lru_one *lru, struct dentry *dentry) +{ + D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST); + dentry->d_flags &= ~DCACHE_LRU_LIST; + this_cpu_dec(nr_dentry_unused); + if (d_is_negative(dentry)) + this_cpu_dec(nr_dentry_negative); + list_lru_isolate(lru, &dentry->d_lru); +} + +static void d_lru_shrink_move(struct list_lru_one *lru, struct dentry *dentry, + struct list_head *list) +{ + D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST); + dentry->d_flags |= DCACHE_SHRINK_LIST; + if (d_is_negative(dentry)) + this_cpu_dec(nr_dentry_negative); + list_lru_isolate_move(lru, &dentry->d_lru, list); +} + +/** + * d_drop - drop a dentry + * @dentry: dentry to drop + * + * d_drop() unhashes the entry from the parent dentry hashes, so that it won't + * be found through a VFS lookup any more. Note that this is different from + * deleting the dentry - d_delete will try to mark the dentry negative if + * possible, giving a successful _negative_ lookup, while d_drop will + * just make the cache lookup fail. + * + * d_drop() is used mainly for stuff that wants to invalidate a dentry for some + * reason (NFS timeouts or autofs deletes). + * + * __d_drop requires dentry->d_lock + * ___d_drop doesn't mark dentry as "unhashed" + * (dentry->d_hash.pprev will be LIST_POISON2, not NULL). 
+ */ +static void ___d_drop(struct dentry *dentry) +{ + struct hlist_bl_head *b; + /* + * Hashed dentries are normally on the dentry hashtable, + * with the exception of those newly allocated by + * d_obtain_root, which are always IS_ROOT: + */ + if (unlikely(IS_ROOT(dentry))) + b = &dentry->d_sb->s_roots; + else + b = d_hash(dentry->d_name.hash); + + hlist_bl_lock(b); + __hlist_bl_del(&dentry->d_hash); + hlist_bl_unlock(b); +} + +void __d_drop(struct dentry *dentry) +{ + if (!d_unhashed(dentry)) { + ___d_drop(dentry); + dentry->d_hash.pprev = NULL; + write_seqcount_invalidate(&dentry->d_seq); + } +} +EXPORT_SYMBOL(__d_drop); + +void d_drop(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + __d_drop(dentry); + spin_unlock(&dentry->d_lock); +} +EXPORT_SYMBOL(d_drop); + +static inline void dentry_unlist(struct dentry *dentry, struct dentry *parent) +{ + struct dentry *next; + /* + * Inform d_walk() and shrink_dentry_list() that we are no longer + * attached to the dentry tree + */ + dentry->d_flags |= DCACHE_DENTRY_KILLED; + if (unlikely(list_empty(&dentry->d_child))) + return; + __list_del_entry(&dentry->d_child); + /* + * Cursors can move around the list of children. While we'd been + * a normal list member, it didn't matter - ->d_child.next would've + * been updated. However, from now on it won't be and for the + * things like d_walk() it might end up with a nasty surprise. + * Normally d_walk() doesn't care about cursors moving around - + * ->d_lock on parent prevents that and since a cursor has no children + * of its own, we get through it without ever unlocking the parent. + * There is one exception, though - if we ascend from a child that + * gets killed as soon as we unlock it, the next sibling is found + * using the value left in its ->d_child.next. And if _that_ + * pointed to a cursor, and cursor got moved (e.g. by lseek()) + * before d_walk() regains parent->d_lock, we'll end up skipping + * everything the cursor had been moved past. 
+ * + * Solution: make sure that the pointer left behind in ->d_child.next + * points to something that won't be moving around. I.e. skip the + * cursors. + */ + while (dentry->d_child.next != &parent->d_subdirs) { + next = list_entry(dentry->d_child.next, struct dentry, d_child); + if (likely(!(next->d_flags & DCACHE_DENTRY_CURSOR))) + break; + dentry->d_child.next = next->d_child.next; + } +} + +static void __dentry_kill(struct dentry *dentry) +{ + struct dentry *parent = NULL; + bool can_free = true; + if (!IS_ROOT(dentry)) + parent = dentry->d_parent; + + /* + * The dentry is now unrecoverably dead to the world. + */ + lockref_mark_dead(&dentry->d_lockref); + + /* + * inform the fs via d_prune that this dentry is about to be + * unhashed and destroyed. + */ + if (dentry->d_flags & DCACHE_OP_PRUNE) + dentry->d_op->d_prune(dentry); + + if (dentry->d_flags & DCACHE_LRU_LIST) { + if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) + d_lru_del(dentry); + } + /* if it was on the hash then remove it */ + __d_drop(dentry); + dentry_unlist(dentry, parent); + if (parent) + spin_unlock(&parent->d_lock); + if (dentry->d_inode) + dentry_unlink_inode(dentry); + else + spin_unlock(&dentry->d_lock); + this_cpu_dec(nr_dentry); + if (dentry->d_op && dentry->d_op->d_release) + dentry->d_op->d_release(dentry); + + spin_lock(&dentry->d_lock); + if (dentry->d_flags & DCACHE_SHRINK_LIST) { + dentry->d_flags |= DCACHE_MAY_FREE; + can_free = false; + } + spin_unlock(&dentry->d_lock); + if (likely(can_free)) + dentry_free(dentry); + cond_resched(); +} + +static struct dentry *__lock_parent(struct dentry *dentry) +{ + struct dentry *parent; + rcu_read_lock(); + spin_unlock(&dentry->d_lock); +again: + parent = READ_ONCE(dentry->d_parent); + spin_lock(&parent->d_lock); + /* + * We can't blindly lock dentry until we are sure + * that we won't violate the locking order. 
+ * Any changes of dentry->d_parent must have + * been done with parent->d_lock held, so + * spin_lock() above is enough of a barrier + * for checking if it's still our child. + */ + if (unlikely(parent != dentry->d_parent)) { + spin_unlock(&parent->d_lock); + goto again; + } + rcu_read_unlock(); + if (parent != dentry) + spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); + else + parent = NULL; + return parent; +} + +static inline struct dentry *lock_parent(struct dentry *dentry) +{ + struct dentry *parent = dentry->d_parent; + if (IS_ROOT(dentry)) + return NULL; + if (likely(spin_trylock(&parent->d_lock))) + return parent; + return __lock_parent(dentry); +} + +static inline bool retain_dentry(struct dentry *dentry) +{ + WARN_ON(d_in_lookup(dentry)); + + /* Unreachable? Get rid of it */ + if (unlikely(d_unhashed(dentry))) + return false; + + if (unlikely(dentry->d_flags & DCACHE_DISCONNECTED)) + return false; + + if (unlikely(dentry->d_flags & DCACHE_OP_DELETE)) { + if (dentry->d_op->d_delete(dentry)) + return false; + } + + if (unlikely(dentry->d_flags & DCACHE_DONTCACHE)) + return false; + + /* retain; LRU fodder */ + dentry->d_lockref.count--; + if (unlikely(!(dentry->d_flags & DCACHE_LRU_LIST))) + d_lru_add(dentry); + else if (unlikely(!(dentry->d_flags & DCACHE_REFERENCED))) + dentry->d_flags |= DCACHE_REFERENCED; + return true; +} + +void d_mark_dontcache(struct inode *inode) +{ + struct dentry *de; + + spin_lock(&inode->i_lock); + hlist_for_each_entry(de, &inode->i_dentry, d_u.d_alias) { + spin_lock(&de->d_lock); + de->d_flags |= DCACHE_DONTCACHE; + spin_unlock(&de->d_lock); + } + inode->i_state |= I_DONTCACHE; + spin_unlock(&inode->i_lock); +} +EXPORT_SYMBOL(d_mark_dontcache); + +/* + * Finish off a dentry we've decided to kill. + * dentry->d_lock must be held, returns with it unlocked. + * Returns dentry requiring refcount drop, or NULL if we're done. 
+ */ +static struct dentry *dentry_kill(struct dentry *dentry) + __releases(dentry->d_lock) +{ + struct inode *inode = dentry->d_inode; + struct dentry *parent = NULL; + + if (inode && unlikely(!spin_trylock(&inode->i_lock))) + goto slow_positive; + + if (!IS_ROOT(dentry)) { + parent = dentry->d_parent; + if (unlikely(!spin_trylock(&parent->d_lock))) { + parent = __lock_parent(dentry); + if (likely(inode || !dentry->d_inode)) + goto got_locks; + /* negative that became positive */ + if (parent) + spin_unlock(&parent->d_lock); + inode = dentry->d_inode; + goto slow_positive; + } + } + __dentry_kill(dentry); + return parent; + +slow_positive: + spin_unlock(&dentry->d_lock); + spin_lock(&inode->i_lock); + spin_lock(&dentry->d_lock); + parent = lock_parent(dentry); +got_locks: + if (unlikely(dentry->d_lockref.count != 1)) { + dentry->d_lockref.count--; + } else if (likely(!retain_dentry(dentry))) { + __dentry_kill(dentry); + return parent; + } + /* we are keeping it, after all */ + if (inode) + spin_unlock(&inode->i_lock); + if (parent) + spin_unlock(&parent->d_lock); + spin_unlock(&dentry->d_lock); + return NULL; +} + +/* + * Try to do a lockless dput(), and return whether that was successful. + * + * If unsuccessful, we return false, having already taken the dentry lock. + * + * The caller needs to hold the RCU read lock, so that the dentry is + * guaranteed to stay around even if the refcount goes down to zero! + */ +static inline bool fast_dput(struct dentry *dentry) +{ + int ret; + unsigned int d_flags; + + /* + * If we have a d_op->d_delete() operation, we sould not + * let the dentry count go to zero, so use "put_or_lock". + */ + if (unlikely(dentry->d_flags & DCACHE_OP_DELETE)) + return lockref_put_or_lock(&dentry->d_lockref); + + /* + * .. otherwise, we can try to just decrement the + * lockref optimistically. 
+ */ + ret = lockref_put_return(&dentry->d_lockref); + + /* + * If the lockref_put_return() failed due to the lock being held + * by somebody else, the fast path has failed. We will need to + * get the lock, and then check the count again. + */ + if (unlikely(ret < 0)) { + spin_lock(&dentry->d_lock); + if (WARN_ON_ONCE(dentry->d_lockref.count <= 0)) { + spin_unlock(&dentry->d_lock); + return true; + } + dentry->d_lockref.count--; + goto locked; + } + + /* + * If we weren't the last ref, we're done. + */ + if (ret) + return true; + + /* + * Careful, careful. The reference count went down + * to zero, but we don't hold the dentry lock, so + * somebody else could get it again, and do another + * dput(), and we need to not race with that. + * + * However, there is a very special and common case + * where we don't care, because there is nothing to + * do: the dentry is still hashed, it does not have + * a 'delete' op, and it's referenced and already on + * the LRU list. + * + * NOTE! Since we aren't locked, these values are + * not "stable". However, it is sufficient that at + * some point after we dropped the reference the + * dentry was hashed and the flags had the proper + * value. Other dentry users may have re-gotten + * a reference to the dentry and change that, but + * our work is done - we can leave the dentry + * around with a zero refcount. + */ + smp_rmb(); + d_flags = READ_ONCE(dentry->d_flags); + d_flags &= DCACHE_REFERENCED | DCACHE_LRU_LIST | DCACHE_DISCONNECTED; + + /* Nothing to do? Dropping the reference was all we needed? */ + if (d_flags == (DCACHE_REFERENCED | DCACHE_LRU_LIST) && !d_unhashed(dentry)) + return true; + + /* + * Not the fast normal case? Get the lock. We've already decremented + * the refcount, but we'll need to re-check the situation after + * getting the lock. + */ + spin_lock(&dentry->d_lock); + + /* + * Did somebody else grab a reference to it in the meantime, and + * we're no longer the last user after all? 
Alternatively, somebody + * else could have killed it and marked it dead. Either way, we + * don't need to do anything else. + */ +locked: + if (dentry->d_lockref.count) { + spin_unlock(&dentry->d_lock); + return true; + } + + /* + * Re-get the reference we optimistically dropped. We hold the + * lock, and we just tested that it was zero, so we can just + * set it to 1. + */ + dentry->d_lockref.count = 1; + return false; +} + + +/* + * This is dput + * + * This is complicated by the fact that we do not want to put + * dentries that are no longer on any hash chain on the unused + * list: we'd much rather just get rid of them immediately. + * + * However, that implies that we have to traverse the dentry + * tree upwards to the parents which might _also_ now be + * scheduled for deletion (it may have been only waiting for + * its last child to go away). + * + * This tail recursion is done by hand as we don't want to depend + * on the compiler to always get this right (gcc generally doesn't). + * Real recursion would eat up our stack space. + */ + +/* + * dput - release a dentry + * @dentry: dentry to release + * + * Release a dentry. This will drop the usage count and if appropriate + * call the dentry unlink method as well as removing it from the queues and + * releasing its resources. If the parent dentries were scheduled for release + * they too may now get deleted. 
 */
void dput(struct dentry *dentry)
{
	while (dentry) {
		might_sleep();

		rcu_read_lock();
		if (likely(fast_dput(dentry))) {
			rcu_read_unlock();
			return;
		}

		/* Slow case: now with the dentry lock held */
		rcu_read_unlock();

		if (likely(retain_dentry(dentry))) {
			spin_unlock(&dentry->d_lock);
			return;
		}

		/* dentry_kill() may return the parent to drop next */
		dentry = dentry_kill(dentry);
	}
}
EXPORT_SYMBOL(dput);

/*
 * Drop one reference on a dentry that has just become unreferenced:
 * if it already sits on somebody's shrink list, leave it to that list's
 * owner; otherwise queue it on @list for the caller to dispose of.
 * Caller holds ->d_lock.
 */
static void __dput_to_list(struct dentry *dentry, struct list_head *list)
__must_hold(&dentry->d_lock)
{
	if (dentry->d_flags & DCACHE_SHRINK_LIST) {
		/* let the owner of the list it's on deal with it */
		--dentry->d_lockref.count;
	} else {
		if (dentry->d_flags & DCACHE_LRU_LIST)
			d_lru_del(dentry);
		if (!--dentry->d_lockref.count)
			d_shrink_add(dentry, list);
	}
}

/*
 * Like dput(), but a dentry that loses its last reference is parked on
 * @list for later disposal (see shrink_dentry_list()) instead of being
 * killed here.
 */
void dput_to_list(struct dentry *dentry, struct list_head *list)
{
	rcu_read_lock();
	if (likely(fast_dput(dentry))) {
		rcu_read_unlock();
		return;
	}
	rcu_read_unlock();
	if (!retain_dentry(dentry))
		__dput_to_list(dentry, list);
	spin_unlock(&dentry->d_lock);
}

/* This must be called with d_lock held */
static inline void __dget_dlock(struct dentry *dentry)
{
	dentry->d_lockref.count++;
}

static inline void __dget(struct dentry *dentry)
{
	lockref_get(&dentry->d_lockref);
}

/*
 * dget_parent - grab a reference on the parent of @dentry
 *
 * Fast path: speculatively read ->d_parent under RCU and try a cmpxchg
 * refcount bump, validated against ->d_seq to catch a concurrent rename.
 * Falls back to taking the parent's ->d_lock on contention.
 */
struct dentry *dget_parent(struct dentry *dentry)
{
	int gotref;
	struct dentry *ret;
	unsigned seq;

	/*
	 * Do optimistic parent lookup without any
	 * locking.
	 */
	rcu_read_lock();
	seq = raw_seqcount_begin(&dentry->d_seq);
	ret = READ_ONCE(dentry->d_parent);
	gotref = lockref_get_not_zero(&ret->d_lockref);
	rcu_read_unlock();
	if (likely(gotref)) {
		if (!read_seqcount_retry(&dentry->d_seq, seq))
			return ret;
		dput(ret);
	}

repeat:
	/*
	 * Don't need rcu_dereference because we re-check it was correct under
	 * the lock.
	 */
	rcu_read_lock();
	ret = dentry->d_parent;
	spin_lock(&ret->d_lock);
	if (unlikely(ret != dentry->d_parent)) {
		spin_unlock(&ret->d_lock);
		rcu_read_unlock();
		goto repeat;
	}
	rcu_read_unlock();
	BUG_ON(!ret->d_lockref.count);
	ret->d_lockref.count++;
	spin_unlock(&ret->d_lock);
	return ret;
}
EXPORT_SYMBOL(dget_parent);

/* Grab a reference on any alias of @inode; caller holds ->i_lock. */
static struct dentry * __d_find_any_alias(struct inode *inode)
{
	struct dentry *alias;

	if (hlist_empty(&inode->i_dentry))
		return NULL;
	alias = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias);
	__dget(alias);
	return alias;
}

/**
 * d_find_any_alias - find any alias for a given inode
 * @inode: inode to find an alias for
 *
 * If any aliases exist for the given inode, take and return a
 * reference for one of them.  If no aliases exist, return %NULL.
 */
struct dentry *d_find_any_alias(struct inode *inode)
{
	struct dentry *de;

	spin_lock(&inode->i_lock);
	de = __d_find_any_alias(inode);
	spin_unlock(&inode->i_lock);
	return de;
}
EXPORT_SYMBOL(d_find_any_alias);

/**
 * d_find_alias - grab a hashed alias of inode
 * @inode: inode in question
 *
 * If inode has a hashed alias, or is a directory and has any alias,
 * acquire the reference to alias and return it. Otherwise return NULL.
 * Notice that if inode is a directory there can be only one alias and
 * it can be unhashed only if it has no children, or if it is the root
 * of a filesystem, or if the directory was renamed and d_revalidate
 * was the first vfs operation to notice.
 *
 * If the inode has an IS_ROOT, DCACHE_DISCONNECTED alias, then prefer
 * any other hashed alias over that one.
 */
static struct dentry *__d_find_alias(struct inode *inode)
{
	struct dentry *alias;

	if (S_ISDIR(inode->i_mode))
		return __d_find_any_alias(inode);

	hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
		spin_lock(&alias->d_lock);
		if (!d_unhashed(alias)) {
			__dget_dlock(alias);
			spin_unlock(&alias->d_lock);
			return alias;
		}
		spin_unlock(&alias->d_lock);
	}
	return NULL;
}

struct dentry *d_find_alias(struct inode *inode)
{
	struct dentry *de = NULL;

	if (!hlist_empty(&inode->i_dentry)) {
		spin_lock(&inode->i_lock);
		de = __d_find_alias(inode);
		spin_unlock(&inode->i_lock);
	}
	return de;
}
EXPORT_SYMBOL(d_find_alias);

/*
 * Try to kill dentries associated with this inode.
 * WARNING: you must own a reference to inode.
 */
void d_prune_aliases(struct inode *inode)
{
	struct dentry *dentry;
restart:
	spin_lock(&inode->i_lock);
	hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) {
		spin_lock(&dentry->d_lock);
		if (!dentry->d_lockref.count) {
			struct dentry *parent = lock_parent(dentry);
			/* recheck: lock_parent() may have dropped d_lock */
			if (likely(!dentry->d_lockref.count)) {
				__dentry_kill(dentry);
				dput(parent);
				goto restart;
			}
			if (parent)
				spin_unlock(&parent->d_lock);
		}
		spin_unlock(&dentry->d_lock);
	}
	spin_unlock(&inode->i_lock);
}
EXPORT_SYMBOL(d_prune_aliases);

/*
 * Lock a dentry from shrink list.
 * Called under rcu_read_lock() and dentry->d_lock; the former
 * guarantees that nothing we access will be freed under us.
 * Note that dentry is *not* protected from concurrent dentry_kill(),
 * d_delete(), etc.
 *
 * Return false if dentry has been disrupted or grabbed, leaving
 * the caller to kick it off-list.  Otherwise, return true and have
 * that dentry's inode and parent both locked.
 */
static bool shrink_lock_dentry(struct dentry *dentry)
{
	struct inode *inode;
	struct dentry *parent;

	if (dentry->d_lockref.count)
		return false;

	inode = dentry->d_inode;
	if (inode && unlikely(!spin_trylock(&inode->i_lock))) {
		/* drop and retake in the right lock order, then recheck */
		spin_unlock(&dentry->d_lock);
		spin_lock(&inode->i_lock);
		spin_lock(&dentry->d_lock);
		if (unlikely(dentry->d_lockref.count))
			goto out;
		/* changed inode means that somebody had grabbed it */
		if (unlikely(inode != dentry->d_inode))
			goto out;
	}

	parent = dentry->d_parent;
	if (IS_ROOT(dentry) || likely(spin_trylock(&parent->d_lock)))
		return true;

	spin_unlock(&dentry->d_lock);
	spin_lock(&parent->d_lock);
	if (unlikely(parent != dentry->d_parent)) {
		spin_unlock(&parent->d_lock);
		spin_lock(&dentry->d_lock);
		goto out;
	}
	spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
	if (likely(!dentry->d_lockref.count))
		return true;
	spin_unlock(&parent->d_lock);
out:
	if (inode)
		spin_unlock(&inode->i_lock);
	return false;
}

/*
 * Kill every dentry on @list (a private shrink list).  A parent that
 * loses its last reference in the process is appended to @list and
 * handled on a later iteration.
 */
void shrink_dentry_list(struct list_head *list)
{
	while (!list_empty(list)) {
		struct dentry *dentry, *parent;

		dentry = list_entry(list->prev, struct dentry, d_lru);
		spin_lock(&dentry->d_lock);
		rcu_read_lock();
		if (!shrink_lock_dentry(dentry)) {
			bool can_free = false;
			rcu_read_unlock();
			d_shrink_del(dentry);
			/* negative count: racing __dentry_kill() got there first */
			if (dentry->d_lockref.count < 0)
				can_free = dentry->d_flags & DCACHE_MAY_FREE;
			spin_unlock(&dentry->d_lock);
			if (can_free)
				dentry_free(dentry);
			continue;
		}
		rcu_read_unlock();
		d_shrink_del(dentry);
		parent = dentry->d_parent;
		if (parent != dentry)
			__dput_to_list(parent, list);
		__dentry_kill(dentry);
	}
}

static enum lru_status dentry_lru_isolate(struct list_head *item,
		struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
{
	struct list_head *freeable = arg;
	struct dentry	*dentry = container_of(item, struct dentry, d_lru);


	/*
	 * we are inverting the lru lock/dentry->d_lock here,
	 * so use a trylock. If we fail to get the lock, just skip
	 * it
	 */
	if (!spin_trylock(&dentry->d_lock))
		return LRU_SKIP;

	/*
	 * Referenced dentries are still in use. If they have active
	 * counts, just remove them from the LRU. Otherwise give them
	 * another pass through the LRU.
	 */
	if (dentry->d_lockref.count) {
		d_lru_isolate(lru, dentry);
		spin_unlock(&dentry->d_lock);
		return LRU_REMOVED;
	}

	if (dentry->d_flags & DCACHE_REFERENCED) {
		dentry->d_flags &= ~DCACHE_REFERENCED;
		spin_unlock(&dentry->d_lock);

		/*
		 * The list move itself will be made by the common LRU code. At
		 * this point, we've dropped the dentry->d_lock but keep the
		 * lru lock. This is safe to do, since every list movement is
		 * protected by the lru lock even if both locks are held.
		 *
		 * This is guaranteed by the fact that all LRU management
		 * functions are intermediated by the LRU API calls like
		 * list_lru_add and list_lru_del. List movement in this file
		 * only ever occur through this functions or through callbacks
		 * like this one, that are called from the LRU API.
		 *
		 * The only exceptions to this are functions like
		 * shrink_dentry_list, and code that first checks for the
		 * DCACHE_SHRINK_LIST flag.  Those are guaranteed to be
		 * operating only with stack provided lists after they are
		 * properly isolated from the main list.  It is thus, always a
		 * local access.
		 */
		return LRU_ROTATE;
	}

	d_lru_shrink_move(lru, dentry, freeable);
	spin_unlock(&dentry->d_lock);

	return LRU_REMOVED;
}

/**
 * prune_dcache_sb - shrink the dcache
 * @sb: superblock
 * @sc: shrink control, passed to list_lru_shrink_walk()
 *
 * Attempt to shrink the superblock dcache LRU by @sc->nr_to_scan entries. This
 * is done when we need more memory and called from the superblock shrinker
 * function.
 *
 * This function may fail to free any resources if all the dentries are in
 * use.
 */
long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc)
{
	LIST_HEAD(dispose);
	long freed;

	freed = list_lru_shrink_walk(&sb->s_dentry_lru, sc,
				     dentry_lru_isolate, &dispose);
	shrink_dentry_list(&dispose);
	return freed;
}

/*
 * list_lru walk callback for shrink_dcache_sb(): unconditionally move
 * every dentry we can lock onto the caller's dispose list.
 */
static enum lru_status dentry_lru_isolate_shrink(struct list_head *item,
		struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
{
	struct list_head *freeable = arg;
	struct dentry	*dentry = container_of(item, struct dentry, d_lru);

	/*
	 * we are inverting the lru lock/dentry->d_lock here,
	 * so use a trylock. If we fail to get the lock, just skip
	 * it
	 */
	if (!spin_trylock(&dentry->d_lock))
		return LRU_SKIP;

	d_lru_shrink_move(lru, dentry, freeable);
	spin_unlock(&dentry->d_lock);

	return LRU_REMOVED;
}


/**
 * shrink_dcache_sb - shrink dcache for a superblock
 * @sb: superblock
 *
 * Shrink the dcache for the specified super block. This is used to free
 * the dcache before unmounting a file system.
 */
void shrink_dcache_sb(struct super_block *sb)
{
	do {
		LIST_HEAD(dispose);

		list_lru_walk(&sb->s_dentry_lru,
			dentry_lru_isolate_shrink, &dispose, 1024);
		shrink_dentry_list(&dispose);
	} while (list_lru_count(&sb->s_dentry_lru) > 0);
}
EXPORT_SYMBOL(shrink_dcache_sb);

/**
 * enum d_walk_ret - action to take during tree walk
 * @D_WALK_CONTINUE:	continue walk
 * @D_WALK_QUIT:	quit walk
 * @D_WALK_NORETRY:	quit when retry is needed
 * @D_WALK_SKIP:	skip this dentry and its children
 */
enum d_walk_ret {
	D_WALK_CONTINUE,
	D_WALK_QUIT,
	D_WALK_NORETRY,
	D_WALK_SKIP,
};

/**
 * d_walk - walk the dentry tree
 * @parent:	start of walk
 * @data:	data passed to @enter()
 * @enter:	callback when first entering the dentry
 *
 * The @enter() callbacks are called with d_lock held.
 */
static void d_walk(struct dentry *parent, void *data,
		   enum d_walk_ret (*enter)(void *, struct dentry *))
{
	struct dentry *this_parent;
	struct list_head *next;
	unsigned seq = 0;
	enum d_walk_ret ret;
	bool retry = true;

again:
	read_seqbegin_or_lock(&rename_lock, &seq);
	this_parent = parent;
	spin_lock(&this_parent->d_lock);

	ret = enter(data, this_parent);
	switch (ret) {
	case D_WALK_CONTINUE:
		break;
	case D_WALK_QUIT:
	case D_WALK_SKIP:
		goto out_unlock;
	case D_WALK_NORETRY:
		retry = false;
		break;
	}
repeat:
	next = this_parent->d_subdirs.next;
resume:
	while (next != &this_parent->d_subdirs) {
		struct list_head *tmp = next;
		struct dentry *dentry = list_entry(tmp, struct dentry, d_child);
		next = tmp->next;

		/* readdir cursors are not real children - skip them */
		if (unlikely(dentry->d_flags & DCACHE_DENTRY_CURSOR))
			continue;

		spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);

		ret = enter(data, dentry);
		switch (ret) {
		case D_WALK_CONTINUE:
			break;
		case D_WALK_QUIT:
			spin_unlock(&dentry->d_lock);
			goto out_unlock;
		case D_WALK_NORETRY:
			retry = false;
			break;
		case D_WALK_SKIP:
			spin_unlock(&dentry->d_lock);
			continue;
		}

		if (!list_empty(&dentry->d_subdirs)) {
			/* descend: hand the lockdep annotation to the child */
			spin_unlock(&this_parent->d_lock);
			spin_release(&dentry->d_lock.dep_map, _RET_IP_);
			this_parent = dentry;
			spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_);
			goto repeat;
		}
		spin_unlock(&dentry->d_lock);
	}
	/*
	 * All done at this level ... ascend and resume the search.
	 */
	rcu_read_lock();
ascend:
	if (this_parent != parent) {
		struct dentry *child = this_parent;
		this_parent = child->d_parent;

		spin_unlock(&child->d_lock);
		spin_lock(&this_parent->d_lock);

		/* might go back up the wrong parent if we have had a rename. */
		if (need_seqretry(&rename_lock, seq))
			goto rename_retry;
		/* go into the first sibling still alive */
		do {
			next = child->d_child.next;
			if (next == &this_parent->d_subdirs)
				goto ascend;
			child = list_entry(next, struct dentry, d_child);
		} while (unlikely(child->d_flags & DCACHE_DENTRY_KILLED));
		rcu_read_unlock();
		goto resume;
	}
	if (need_seqretry(&rename_lock, seq))
		goto rename_retry;
	rcu_read_unlock();

out_unlock:
	spin_unlock(&this_parent->d_lock);
	done_seqretry(&rename_lock, seq);
	return;

rename_retry:
	spin_unlock(&this_parent->d_lock);
	rcu_read_unlock();
	BUG_ON(seq & 1);
	if (!retry)
		return;
	/* retry with rename_lock held exclusively */
	seq = 1;
	goto again;
}

struct check_mount {
	struct vfsmount *mnt;	/* mount the walk started from */
	unsigned int mounted;	/* set to 1 once a mountpoint is seen */
};

/* d_walk() callback: quit as soon as a mounted-on dentry is found. */
static enum d_walk_ret path_check_mount(void *data, struct dentry *dentry)
{
	struct check_mount *info = data;
	struct path path = { .mnt = info->mnt, .dentry = dentry };

	if (likely(!d_mountpoint(dentry)))
		return D_WALK_CONTINUE;
	if (__path_is_mountpoint(&path)) {
		info->mounted = 1;
		return D_WALK_QUIT;
	}
	return D_WALK_CONTINUE;
}

/**
 * path_has_submounts - check for mounts over a dentry in the
 *                      current namespace.
 * @parent: path to check.
 *
 * Return true if the parent or its subdirectories contain
 * a mount point in the current namespace.
 */
int path_has_submounts(const struct path *parent)
{
	struct check_mount data = { .mnt = parent->mnt, .mounted = 0 };

	read_seqlock_excl(&mount_lock);
	d_walk(parent->dentry, &data, path_check_mount);
	read_sequnlock_excl(&mount_lock);

	return data.mounted;
}
EXPORT_SYMBOL(path_has_submounts);

/*
 * Called by mount code to set a mountpoint and check if the mountpoint is
 * reachable (e.g. NFS can unhash a directory dentry and then the complete
 * subtree can become unreachable).
 *
 * Only one of d_invalidate() and d_set_mounted() must succeed.  For
 * this reason take rename_lock and d_lock on dentry and ancestors.
 */
int d_set_mounted(struct dentry *dentry)
{
	struct dentry *p;
	int ret = -ENOENT;
	write_seqlock(&rename_lock);
	for (p = dentry->d_parent; !IS_ROOT(p); p = p->d_parent) {
		/* Need exclusion wrt. d_invalidate() */
		spin_lock(&p->d_lock);
		if (unlikely(d_unhashed(p))) {
			spin_unlock(&p->d_lock);
			goto out;
		}
		spin_unlock(&p->d_lock);
	}
	spin_lock(&dentry->d_lock);
	if (!d_unlinked(dentry)) {
		ret = -EBUSY;
		if (!d_mountpoint(dentry)) {
			dentry->d_flags |= DCACHE_MOUNTED;
			ret = 0;
		}
	}
	spin_unlock(&dentry->d_lock);
out:
	write_sequnlock(&rename_lock);
	return ret;
}

/*
 * Search the dentry child list of the specified parent,
 * and move any unused dentries to the end of the unused
 * list for prune_dcache(). We descend to the next level
 * whenever the d_subdirs list is non-empty and continue
 * searching.
 *
 * It returns zero iff there are no unused children,
 * otherwise it returns the number of children moved to
 * the end of the unused list. This may not be the total
 * number of unused children, because select_parent can
 * drop the lock and return early due to latency
 * constraints.
 */

struct select_data {
	struct dentry *start;		/* root of the walk; never collected */
	union {
		long found;		/* select_collect(): dentries counted */
		struct dentry *victim;	/* select_collect2(): dentry to kill */
	};
	struct list_head dispose;	/* private shrink list being built */
};

/* d_walk() callback: gather unreferenced dentries onto data->dispose. */
static enum d_walk_ret select_collect(void *_data, struct dentry *dentry)
{
	struct select_data *data = _data;
	enum d_walk_ret ret = D_WALK_CONTINUE;

	if (data->start == dentry)
		goto out;

	if (dentry->d_flags & DCACHE_SHRINK_LIST) {
		data->found++;
	} else {
		if (dentry->d_flags & DCACHE_LRU_LIST)
			d_lru_del(dentry);
		if (!dentry->d_lockref.count) {
			d_shrink_add(dentry, &data->dispose);
			data->found++;
		}
	}
	/*
	 * We can return to the caller if we have found some (this
	 * ensures forward progress). We'll be coming back to find
	 * the rest.
	 */
	if (!list_empty(&data->dispose))
		ret = need_resched() ? D_WALK_QUIT : D_WALK_NORETRY;
out:
	return ret;
}

/*
 * d_walk() callback: like select_collect(), but an unreferenced dentry
 * already on somebody else's shrink list becomes data->victim and stops
 * the walk (returning with rcu_read_lock() held for the caller).
 */
static enum d_walk_ret select_collect2(void *_data, struct dentry *dentry)
{
	struct select_data *data = _data;
	enum d_walk_ret ret = D_WALK_CONTINUE;

	if (data->start == dentry)
		goto out;

	if (dentry->d_flags & DCACHE_SHRINK_LIST) {
		if (!dentry->d_lockref.count) {
			rcu_read_lock();
			data->victim = dentry;
			return D_WALK_QUIT;
		}
	} else {
		if (dentry->d_flags & DCACHE_LRU_LIST)
			d_lru_del(dentry);
		if (!dentry->d_lockref.count)
			d_shrink_add(dentry, &data->dispose);
	}
	/*
	 * We can return to the caller if we have found some (this
	 * ensures forward progress). We'll be coming back to find
	 * the rest.
	 */
	if (!list_empty(&data->dispose))
		ret = need_resched() ? D_WALK_QUIT : D_WALK_NORETRY;
out:
	return ret;
}

/**
 * shrink_dcache_parent - prune dcache
 * @parent: parent of entries to prune
 *
 * Prune the dcache to remove unused children of the parent dentry.
 */
void shrink_dcache_parent(struct dentry *parent)
{
	for (;;) {
		struct select_data data = {.start = parent};

		INIT_LIST_HEAD(&data.dispose);
		d_walk(parent, &data, select_collect);

		if (!list_empty(&data.dispose)) {
			shrink_dentry_list(&data.dispose);
			continue;
		}

		cond_resched();
		if (!data.found)
			break;
		data.victim = NULL;
		d_walk(parent, &data, select_collect2);
		if (data.victim) {
			struct dentry *parent;
			spin_lock(&data.victim->d_lock);
			if (!shrink_lock_dentry(data.victim)) {
				spin_unlock(&data.victim->d_lock);
				rcu_read_unlock();
			} else {
				rcu_read_unlock();
				parent = data.victim->d_parent;
				if (parent != data.victim)
					__dput_to_list(parent, &data.dispose);
				__dentry_kill(data.victim);
			}
		}
		if (!list_empty(&data.dispose))
			shrink_dentry_list(&data.dispose);
	}
}
EXPORT_SYMBOL(shrink_dcache_parent);

/* d_walk() callback: warn about dentries still in use at umount time. */
static enum d_walk_ret umount_check(void *_data, struct dentry *dentry)
{
	/* it has busy descendents; complain about those instead */
	if (!list_empty(&dentry->d_subdirs))
		return D_WALK_CONTINUE;

	/* root with refcount 1 is fine */
	if (dentry == _data && dentry->d_lockref.count == 1)
		return D_WALK_CONTINUE;

	printk(KERN_ERR "BUG: Dentry %p{i=%lx,n=%pd} "
			" still in use (%d) [unmount of %s %s]\n",
		       dentry,
		       dentry->d_inode ?
		       dentry->d_inode->i_ino : 0UL,
		       dentry,
		       dentry->d_lockref.count,
		       dentry->d_sb->s_type->name,
		       dentry->d_sb->s_id);
	WARN_ON(1);
	return D_WALK_CONTINUE;
}

/* Prune, sanity-check and drop one root dentry of a dying superblock. */
static void do_one_tree(struct dentry *dentry)
{
	shrink_dcache_parent(dentry);
	d_walk(dentry, dentry, umount_check);
	d_drop(dentry);
	dput(dentry);
}

/*
 * destroy the dentries attached to a superblock on unmounting
 */
void shrink_dcache_for_umount(struct super_block *sb)
{
	struct dentry *dentry;

	WARN(down_read_trylock(&sb->s_umount), "s_umount should've been locked");

	dentry = sb->s_root;
	sb->s_root = NULL;
	do_one_tree(dentry);

	while (!hlist_bl_empty(&sb->s_roots)) {
		dentry = dget(hlist_bl_entry(hlist_bl_first(&sb->s_roots), struct dentry, d_hash));
		do_one_tree(dentry);
	}
}

/* d_walk() callback: grab the first mountpoint found and stop the walk. */
static enum d_walk_ret find_submount(void *_data, struct dentry *dentry)
{
	struct dentry **victim = _data;
	if (d_mountpoint(dentry)) {
		__dget_dlock(dentry);
		*victim = dentry;
		return D_WALK_QUIT;
	}
	return D_WALK_CONTINUE;
}

/**
 * d_invalidate - detach submounts, prune dcache, and drop
 * @dentry: dentry to invalidate (aka detach, prune and drop)
 */
void d_invalidate(struct dentry *dentry)
{
	bool had_submounts = false;
	spin_lock(&dentry->d_lock);
	if (d_unhashed(dentry)) {
		spin_unlock(&dentry->d_lock);
		return;
	}
	__d_drop(dentry);
	spin_unlock(&dentry->d_lock);

	/* Negative dentries can be dropped without further checks */
	if (!dentry->d_inode)
		return;

	shrink_dcache_parent(dentry);
	for (;;) {
		struct dentry *victim = NULL;
		d_walk(dentry, &victim, find_submount);
		if (!victim) {
			if (had_submounts)
				shrink_dcache_parent(dentry);
			return;
		}
		had_submounts = true;
		detach_mounts(victim);
		dput(victim);
	}
}
EXPORT_SYMBOL(d_invalidate);

/**
 * __d_alloc	-	allocate a dcache entry
 * @sb: filesystem it will belong to
 * @name: qstr of the name
 *
 * Allocates a dentry. It returns %NULL if there is insufficient memory
 * available. On a success the dentry is returned. The name passed in is
 * copied and the copy passed in may be reused after this call.
 */

static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
{
	struct dentry *dentry;
	char *dname;
	int err;

	dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL);
	if (!dentry)
		return NULL;

	/*
	 * We guarantee that the inline name is always NUL-terminated.
	 * This way the memcpy() done by the name switching in rename
	 * will still always have a NUL at the end, even if we might
	 * be overwriting an internal NUL character
	 */
	dentry->d_iname[DNAME_INLINE_LEN-1] = 0;
	if (unlikely(!name)) {
		name = &slash_name;
		dname = dentry->d_iname;
	} else if (name->len > DNAME_INLINE_LEN-1) {
		/* name too long for the inline buffer: external allocation */
		size_t size = offsetof(struct external_name, name[1]);
		struct external_name *p = kmalloc(size + name->len,
						  GFP_KERNEL_ACCOUNT |
						  __GFP_RECLAIMABLE);
		if (!p) {
			kmem_cache_free(dentry_cache, dentry);
			return NULL;
		}
		atomic_set(&p->u.count, 1);
		dname = p->name;
	} else {
		dname = dentry->d_iname;
	}

	dentry->d_name.len = name->len;
	dentry->d_name.hash = name->hash;
	memcpy(dname, name->name, name->len);
	dname[name->len] = 0;

	/* Make sure we always see the terminating NUL character */
	smp_store_release(&dentry->d_name.name, dname); /* ^^^ */

	dentry->d_lockref.count = 1;
	dentry->d_flags = 0;
	spin_lock_init(&dentry->d_lock);
	seqcount_spinlock_init(&dentry->d_seq, &dentry->d_lock);
	dentry->d_inode = NULL;
	dentry->d_parent = dentry;
	dentry->d_sb = sb;
	dentry->d_op = NULL;
	dentry->d_fsdata = NULL;
	INIT_HLIST_BL_NODE(&dentry->d_hash);
	INIT_LIST_HEAD(&dentry->d_lru);
	INIT_LIST_HEAD(&dentry->d_subdirs);
	INIT_HLIST_NODE(&dentry->d_u.d_alias);
	INIT_LIST_HEAD(&dentry->d_child);
	d_set_d_op(dentry, dentry->d_sb->s_d_op);

	if (dentry->d_op && dentry->d_op->d_init) {
		err = dentry->d_op->d_init(dentry);
		if (err) {
			if (dname_external(dentry))
				kfree(external_name(dentry));
			kmem_cache_free(dentry_cache, dentry);
			return NULL;
		}
	}

	this_cpu_inc(nr_dentry);

	return dentry;
}

/**
 * d_alloc	-	allocate a dcache entry
 * @parent: parent of entry to allocate
 * @name: qstr of the name
 *
 * Allocates a dentry. It returns %NULL if there is insufficient memory
 * available. On a success the dentry is returned. The name passed in is
 * copied and the copy passed in may be reused after this call.
 */
struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
{
	struct dentry *dentry = __d_alloc(parent->d_sb, name);
	if (!dentry)
		return NULL;
	spin_lock(&parent->d_lock);
	/*
	 * don't need child lock because it is not subject
	 * to concurrency here
	 */
	__dget_dlock(parent);
	dentry->d_parent = parent;
	list_add(&dentry->d_child, &parent->d_subdirs);
	spin_unlock(&parent->d_lock);

	return dentry;
}
EXPORT_SYMBOL(d_alloc);

struct dentry *d_alloc_anon(struct super_block *sb)
{
	return __d_alloc(sb, NULL);
}
EXPORT_SYMBOL(d_alloc_anon);

/* Allocate a readdir cursor dentry for @parent (see dcache_dir_open()). */
struct dentry *d_alloc_cursor(struct dentry * parent)
{
	struct dentry *dentry = d_alloc_anon(parent->d_sb);
	if (dentry) {
		dentry->d_flags |= DCACHE_DENTRY_CURSOR;
		dentry->d_parent = dget(parent);
	}
	return dentry;
}

/**
 * d_alloc_pseudo - allocate a dentry (for lookup-less filesystems)
 * @sb: the superblock
 * @name: qstr of the name
 *
 * For a filesystem that just pins its dentries in memory and never
 * performs lookups at all, return an unhashed IS_ROOT dentry.
 * This is used for pipes, sockets et.al. - the stuff that should
 * never be anyone's children or parents.
Unlike all other + * dentries, these will not have RCU delay between dropping the + * last reference and freeing them. + * + * The only user is alloc_file_pseudo() and that's what should + * be considered a public interface. Don't use directly. + */ +struct dentry *d_alloc_pseudo(struct super_block *sb, const struct qstr *name) +{ + struct dentry *dentry = __d_alloc(sb, name); + if (likely(dentry)) + dentry->d_flags |= DCACHE_NORCU; + return dentry; +} + +struct dentry *d_alloc_name(struct dentry *parent, const char *name) +{ + struct qstr q; + + q.name = name; + q.hash_len = hashlen_string(parent, name); + return d_alloc(parent, &q); +} +EXPORT_SYMBOL(d_alloc_name); + +void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op) +{ + WARN_ON_ONCE(dentry->d_op); + WARN_ON_ONCE(dentry->d_flags & (DCACHE_OP_HASH | + DCACHE_OP_COMPARE | + DCACHE_OP_REVALIDATE | + DCACHE_OP_WEAK_REVALIDATE | + DCACHE_OP_DELETE | + DCACHE_OP_REAL)); + dentry->d_op = op; + if (!op) + return; + if (op->d_hash) + dentry->d_flags |= DCACHE_OP_HASH; + if (op->d_compare) + dentry->d_flags |= DCACHE_OP_COMPARE; + if (op->d_revalidate) + dentry->d_flags |= DCACHE_OP_REVALIDATE; + if (op->d_weak_revalidate) + dentry->d_flags |= DCACHE_OP_WEAK_REVALIDATE; + if (op->d_delete) + dentry->d_flags |= DCACHE_OP_DELETE; + if (op->d_prune) + dentry->d_flags |= DCACHE_OP_PRUNE; + if (op->d_real) + dentry->d_flags |= DCACHE_OP_REAL; + +} +EXPORT_SYMBOL(d_set_d_op); + + +/* + * d_set_fallthru - Mark a dentry as falling through to a lower layer + * @dentry - The dentry to mark + * + * Mark a dentry as falling through to the lower layer (as set with + * d_pin_lower()). This flag may be recorded on the medium. 
 */
void d_set_fallthru(struct dentry *dentry)
{
	spin_lock(&dentry->d_lock);
	dentry->d_flags |= DCACHE_FALLTHRU;
	spin_unlock(&dentry->d_lock);
}
EXPORT_SYMBOL(d_set_fallthru);

/* Compute the DCACHE_*_TYPE flags a dentry should carry for @inode. */
static unsigned d_flags_for_inode(struct inode *inode)
{
	unsigned add_flags = DCACHE_REGULAR_TYPE;

	if (!inode)
		return DCACHE_MISS_TYPE;

	if (S_ISDIR(inode->i_mode)) {
		add_flags = DCACHE_DIRECTORY_TYPE;
		if (unlikely(!(inode->i_opflags & IOP_LOOKUP))) {
			if (unlikely(!inode->i_op->lookup))
				add_flags = DCACHE_AUTODIR_TYPE;
			else
				inode->i_opflags |= IOP_LOOKUP;
		}
		goto type_determined;
	}

	if (unlikely(!(inode->i_opflags & IOP_NOFOLLOW))) {
		if (unlikely(inode->i_op->get_link)) {
			add_flags = DCACHE_SYMLINK_TYPE;
			goto type_determined;
		}
		inode->i_opflags |= IOP_NOFOLLOW;
	}

	if (unlikely(!S_ISREG(inode->i_mode)))
		add_flags = DCACHE_SPECIAL_TYPE;

type_determined:
	if (unlikely(IS_AUTOMOUNT(inode)))
		add_flags |= DCACHE_NEED_AUTOMOUNT;
	return add_flags;
}

/* Attach @inode to @dentry; caller holds ->i_lock. */
static void __d_instantiate(struct dentry *dentry, struct inode *inode)
{
	unsigned add_flags = d_flags_for_inode(inode);
	WARN_ON(d_in_lookup(dentry));

	spin_lock(&dentry->d_lock);
	/*
	 * The negative counter only tracks dentries on the LRU. Don't dec if
	 * d_lru is on another list.
	 */
	if ((dentry->d_flags &
	     (DCACHE_LRU_LIST|DCACHE_SHRINK_LIST)) == DCACHE_LRU_LIST)
		this_cpu_dec(nr_dentry_negative);
	hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry);
	raw_write_seqcount_begin(&dentry->d_seq);
	__d_set_inode_and_type(dentry, inode, add_flags);
	raw_write_seqcount_end(&dentry->d_seq);
	fsnotify_update_flags(dentry);
	spin_unlock(&dentry->d_lock);
}

/**
 * d_instantiate - fill in inode information for a dentry
 * @entry: dentry to complete
 * @inode: inode to attach to this dentry
 *
 * Fill in inode information in the entry.
 *
 * This turns negative dentries into productive full members
 * of society.
 *
 * NOTE! This assumes that the inode count has been incremented
 * (or otherwise set) by the caller to indicate that it is now
 * in use by the dcache.
 */

void d_instantiate(struct dentry *entry, struct inode * inode)
{
	BUG_ON(!hlist_unhashed(&entry->d_u.d_alias));
	if (inode) {
		security_d_instantiate(entry, inode);
		spin_lock(&inode->i_lock);
		__d_instantiate(entry, inode);
		spin_unlock(&inode->i_lock);
	}
}
EXPORT_SYMBOL(d_instantiate);

/*
 * This should be equivalent to d_instantiate() + unlock_new_inode(),
 * with lockdep-related part of unlock_new_inode() done before
 * anything else.  Use that instead of open-coding d_instantiate()/
 * unlock_new_inode() combinations.
 */
void d_instantiate_new(struct dentry *entry, struct inode *inode)
{
	BUG_ON(!hlist_unhashed(&entry->d_u.d_alias));
	BUG_ON(!inode);
	lockdep_annotate_inode_mutex_key(inode);
	security_d_instantiate(entry, inode);
	spin_lock(&inode->i_lock);
	__d_instantiate(entry, inode);
	WARN_ON(!(inode->i_state & I_NEW));
	inode->i_state &= ~I_NEW & ~I_CREATING;
	/* pairs with the barrier in prepare_to_wait() of the waiters */
	smp_mb();
	wake_up_bit(&inode->i_state, __I_NEW);
	spin_unlock(&inode->i_lock);
}
EXPORT_SYMBOL(d_instantiate_new);

struct dentry *d_make_root(struct inode *root_inode)
{
	struct dentry *res = NULL;

	if (root_inode) {
		res = d_alloc_anon(root_inode->i_sb);
		if (res)
			d_instantiate(res, root_inode);
		else
			iput(root_inode);
	}
	return res;
}
EXPORT_SYMBOL(d_make_root);

/*
 * Attach @inode to the (IS_ROOT) @dentry unless the inode already has an
 * alias, in which case that alias is returned instead and @dentry is
 * dropped.  Consumes the inode reference either way.
 */
static struct dentry *__d_instantiate_anon(struct dentry *dentry,
					   struct inode *inode,
					   bool disconnected)
{
	struct dentry *res;
	unsigned add_flags;

	security_d_instantiate(dentry, inode);
	spin_lock(&inode->i_lock);
	res = __d_find_any_alias(inode);
	if (res) {
		spin_unlock(&inode->i_lock);
		dput(dentry);
		goto out_iput;
	}

	/* attach a disconnected dentry */
	add_flags = d_flags_for_inode(inode);

	if (disconnected)
		add_flags |= DCACHE_DISCONNECTED;

	spin_lock(&dentry->d_lock);
	__d_set_inode_and_type(dentry, inode, add_flags);
	hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry);
	if (!disconnected) {
		/* a connected root goes on the superblock's s_roots list */
		hlist_bl_lock(&dentry->d_sb->s_roots);
		hlist_bl_add_head(&dentry->d_hash, &dentry->d_sb->s_roots);
		hlist_bl_unlock(&dentry->d_sb->s_roots);
	}
	spin_unlock(&dentry->d_lock);
	spin_unlock(&inode->i_lock);

	return dentry;

 out_iput:
	iput(inode);
	return res;
}

struct dentry *d_instantiate_anon(struct dentry *dentry, struct inode *inode)
{
	return __d_instantiate_anon(dentry, inode, true);
}
EXPORT_SYMBOL(d_instantiate_anon);

/*
 * Common helper for d_obtain_alias()/d_obtain_root(): return an existing
 * alias of @inode or attach a freshly allocated anonymous dentry to it.
 * Consumes the inode reference on all paths.
 */
static struct dentry *__d_obtain_alias(struct inode *inode, bool disconnected)
{
	struct dentry *tmp;
	struct dentry *res;

	if (!inode)
		return ERR_PTR(-ESTALE);
	if (IS_ERR(inode))
		return ERR_CAST(inode);

	res = d_find_any_alias(inode);
	if (res)
		goto out_iput;

	tmp = d_alloc_anon(inode->i_sb);
	if (!tmp) {
		res = ERR_PTR(-ENOMEM);
		goto out_iput;
	}

	return __d_instantiate_anon(tmp, inode, disconnected);

out_iput:
	iput(inode);
	return res;
}

/**
 * d_obtain_alias - find or allocate a DISCONNECTED dentry for a given inode
 * @inode: inode to allocate the dentry for
 *
 * Obtain a dentry for an inode resulting from NFS filehandle conversion or
 * similar open by handle operations.  The returned dentry may be anonymous,
 * or may have a full name (if the inode was already in the cache).
 *
 * When called on a directory inode, we must ensure that the inode only ever
 * has one dentry.  If a dentry is found, that is returned instead of
 * allocating a new one.
 *
 * On successful return, the reference to the inode has been transferred
 * to the dentry.  In case of an error the reference on the inode is released.
 * To make it easier to use in export operations a %NULL or IS_ERR inode may
 * be passed in and the error will be propagated to the return value,
 * with a %NULL @inode replaced by ERR_PTR(-ESTALE).
 */
struct dentry *d_obtain_alias(struct inode *inode)
{
	return __d_obtain_alias(inode, true);
}
EXPORT_SYMBOL_NS(d_obtain_alias, ANDROID_GKI_VFS_EXPORT_ONLY);

/**
 * d_obtain_root - find or allocate a dentry for a given inode
 * @inode: inode to allocate the dentry for
 *
 * Obtain an IS_ROOT dentry for the root of a filesystem.
 *
 * We must ensure that directory inodes only ever have one dentry.  If a
 * dentry is found, that is returned instead of allocating a new one.
 *
 * On successful return, the reference to the inode has been transferred
 * to the dentry.  In case of an error the reference on the inode is
 * released.  A %NULL or IS_ERR inode may be passed in and the error will
 * be propagated to the return value, with a %NULL @inode replaced by
 * ERR_PTR(-ESTALE).
 */
struct dentry *d_obtain_root(struct inode *inode)
{
	return __d_obtain_alias(inode, false);
}
EXPORT_SYMBOL(d_obtain_root);

/**
 * d_add_ci - lookup or allocate new dentry with case-exact name
 * @inode:  the inode case-insensitive lookup has found
 * @dentry: the negative dentry that was passed to the parent's lookup func
 * @name:   the case-exact name to be associated with the returned dentry
 *
 * This is to avoid filling the dcache with case-insensitive names to the
 * same inode, only the actual correct case is stored in the dcache for
 * case-insensitive filesystems.
 *
 * For a case-insensitive lookup match and if the case-exact dentry
 * already exists in the dcache, use it and return it.
 *
 * If no entry exists with the exact case name, allocate new dentry with
 * the exact case, and return the spliced entry.
 */
struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode,
			struct qstr *name)
{
	struct dentry *found, *res;

	/*
	 * First check if a dentry matching the name already exists,
	 * if not go ahead and create it now.
+ */ + found = d_hash_and_lookup(dentry->d_parent, name); + if (found) { + iput(inode); + return found; + } + if (d_in_lookup(dentry)) { + found = d_alloc_parallel(dentry->d_parent, name, + dentry->d_wait); + if (IS_ERR(found) || !d_in_lookup(found)) { + iput(inode); + return found; + } + } else { + found = d_alloc(dentry->d_parent, name); + if (!found) { + iput(inode); + return ERR_PTR(-ENOMEM); + } + } + res = d_splice_alias(inode, found); + if (res) { + dput(found); + return res; + } + return found; +} +EXPORT_SYMBOL_NS(d_add_ci, ANDROID_GKI_VFS_EXPORT_ONLY); + + +static inline bool d_same_name(const struct dentry *dentry, + const struct dentry *parent, + const struct qstr *name) +{ + if (likely(!(parent->d_flags & DCACHE_OP_COMPARE))) { + if (dentry->d_name.len != name->len) + return false; + return dentry_cmp(dentry, name->name, name->len) == 0; + } + return parent->d_op->d_compare(dentry, + dentry->d_name.len, dentry->d_name.name, + name) == 0; +} + +/** + * __d_lookup_rcu - search for a dentry (racy, store-free) + * @parent: parent dentry + * @name: qstr of name we wish to find + * @seqp: returns d_seq value at the point where the dentry was found + * Returns: dentry, or NULL + * + * __d_lookup_rcu is the dcache lookup function for rcu-walk name + * resolution (store-free path walking) design described in + * Documentation/filesystems/path-lookup.txt. + * + * This is not to be used outside core vfs. + * + * __d_lookup_rcu must only be used in rcu-walk mode, ie. with vfsmount lock + * held, and rcu_read_lock held. The returned dentry must not be stored into + * without taking d_lock and checking d_seq sequence count against @seq + * returned here. + * + * A refcount may be taken on the found dentry with the d_rcu_to_refcount + * function. + * + * Alternatively, __d_lookup_rcu may be called again to look up the child of + * the returned dentry, so long as its parent's seqlock is checked after the + * child is looked up. 
Thus, an interlocking stepping of sequence lock checks
+ * is formed, giving integrity down the path walk.
+ *
+ * NOTE! The caller *has* to check the resulting dentry against the sequence
+ * number we've returned before using any of the resulting dentry state!
+ */
+struct dentry *__d_lookup_rcu(const struct dentry *parent,
+				const struct qstr *name,
+				unsigned *seqp)
+{
+	u64 hashlen = name->hash_len;
+	const unsigned char *str = name->name;
+	struct hlist_bl_head *b = d_hash(hashlen_hash(hashlen));
+	struct hlist_bl_node *node;
+	struct dentry *dentry;
+
+	/*
+	 * Note: There is significant duplication with __d_lookup which is
+	 * required to prevent single threaded performance regressions
+	 * especially on architectures where smp_rmb (in seqcounts) are costly.
+	 * Keep the two functions in sync.
+	 */
+
+	/*
+	 * The hash list is protected using RCU.
+	 *
+	 * Carefully use d_seq when comparing a candidate dentry, to avoid
+	 * races with d_move().
+	 *
+	 * It is possible that concurrent renames can mess up our list
+	 * walk here and result in missing our dentry, resulting in the
+	 * false-negative result. d_lookup() protects against concurrent
+	 * renames using rename_lock seqlock.
+	 *
+	 * See Documentation/filesystems/path-lookup.txt for more details.
+	 */
+	hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) {
+		unsigned seq;
+
+seqretry:
+		/*
+		 * The dentry sequence count protects us from concurrent
+		 * renames, and thus protects parent and name fields.
+		 *
+		 * The caller must perform a seqcount check in order
+		 * to do anything useful with the returned dentry.
+		 *
+		 * NOTE! We do a "raw" seqcount_begin here. That means that
+		 * we don't wait for the sequence count to stabilize if it
+		 * is in the middle of a sequence change. If we do the slow
+		 * dentry compare, we will do seqretries until it is stable,
+		 * and if we end up with a successful lookup, we actually
+		 * want to exit RCU lookup anyway.
+ * + * Note that raw_seqcount_begin still *does* smp_rmb(), so + * we are still guaranteed NUL-termination of ->d_name.name. + */ + seq = raw_seqcount_begin(&dentry->d_seq); + if (dentry->d_parent != parent) + continue; + if (d_unhashed(dentry)) + continue; + + if (unlikely(parent->d_flags & DCACHE_OP_COMPARE)) { + int tlen; + const char *tname; + if (dentry->d_name.hash != hashlen_hash(hashlen)) + continue; + tlen = dentry->d_name.len; + tname = dentry->d_name.name; + /* we want a consistent (name,len) pair */ + if (read_seqcount_retry(&dentry->d_seq, seq)) { + cpu_relax(); + goto seqretry; + } + if (parent->d_op->d_compare(dentry, + tlen, tname, name) != 0) + continue; + } else { + if (dentry->d_name.hash_len != hashlen) + continue; + if (dentry_cmp(dentry, str, hashlen_len(hashlen)) != 0) + continue; + } + *seqp = seq; + return dentry; + } + return NULL; +} + +/** + * d_lookup - search for a dentry + * @parent: parent dentry + * @name: qstr of name we wish to find + * Returns: dentry, or NULL + * + * d_lookup searches the children of the parent dentry for the name in + * question. If the dentry is found its reference count is incremented and the + * dentry is returned. The caller must use dput to free the entry when it has + * finished using it. %NULL is returned if the dentry does not exist. + */ +struct dentry *d_lookup(const struct dentry *parent, const struct qstr *name) +{ + struct dentry *dentry; + unsigned seq; + + do { + seq = read_seqbegin(&rename_lock); + dentry = __d_lookup(parent, name); + if (dentry) + break; + } while (read_seqretry(&rename_lock, seq)); + return dentry; +} +EXPORT_SYMBOL(d_lookup); + +/** + * __d_lookup - search for a dentry (racy) + * @parent: parent dentry + * @name: qstr of name we wish to find + * Returns: dentry, or NULL + * + * __d_lookup is like d_lookup, however it may (rarely) return a + * false-negative result due to unrelated rename activity. 
+ * + * __d_lookup is slightly faster by avoiding rename_lock read seqlock, + * however it must be used carefully, eg. with a following d_lookup in + * the case of failure. + * + * __d_lookup callers must be commented. + */ +struct dentry *__d_lookup(const struct dentry *parent, const struct qstr *name) +{ + unsigned int hash = name->hash; + struct hlist_bl_head *b = d_hash(hash); + struct hlist_bl_node *node; + struct dentry *found = NULL; + struct dentry *dentry; + + /* + * Note: There is significant duplication with __d_lookup_rcu which is + * required to prevent single threaded performance regressions + * especially on architectures where smp_rmb (in seqcounts) are costly. + * Keep the two functions in sync. + */ + + /* + * The hash list is protected using RCU. + * + * Take d_lock when comparing a candidate dentry, to avoid races + * with d_move(). + * + * It is possible that concurrent renames can mess up our list + * walk here and result in missing our dentry, resulting in the + * false-negative result. d_lookup() protects against concurrent + * renames using rename_lock seqlock. + * + * See Documentation/filesystems/path-lookup.txt for more details. 
+ */ + rcu_read_lock(); + + hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) { + + if (dentry->d_name.hash != hash) + continue; + + spin_lock(&dentry->d_lock); + if (dentry->d_parent != parent) + goto next; + if (d_unhashed(dentry)) + goto next; + + if (!d_same_name(dentry, parent, name)) + goto next; + + dentry->d_lockref.count++; + found = dentry; + spin_unlock(&dentry->d_lock); + break; +next: + spin_unlock(&dentry->d_lock); + } + rcu_read_unlock(); + + return found; +} + +/** + * d_hash_and_lookup - hash the qstr then search for a dentry + * @dir: Directory to search in + * @name: qstr of name we wish to find + * + * On lookup failure NULL is returned; on bad name - ERR_PTR(-error) + */ +struct dentry *d_hash_and_lookup(struct dentry *dir, struct qstr *name) +{ + /* + * Check for a fs-specific hash function. Note that we must + * calculate the standard hash first, as the d_op->d_hash() + * routine may choose to leave the hash value unchanged. + */ + name->hash = full_name_hash(dir, name->name, name->len); + if (dir->d_flags & DCACHE_OP_HASH) { + int err = dir->d_op->d_hash(dir, name); + if (unlikely(err < 0)) + return ERR_PTR(err); + } + return d_lookup(dir, name); +} +EXPORT_SYMBOL(d_hash_and_lookup); + +/* + * When a file is deleted, we have two options: + * - turn this dentry into a negative dentry + * - unhash this dentry and free it. 
+ * + * Usually, we want to just turn this into + * a negative dentry, but if anybody else is + * currently using the dentry or the inode + * we can't do that and we fall back on removing + * it from the hash queues and waiting for + * it to be deleted later when it has no users + */ + +/** + * d_delete - delete a dentry + * @dentry: The dentry to delete + * + * Turn the dentry into a negative dentry if possible, otherwise + * remove it from the hash queues so it can be deleted later + */ + +void d_delete(struct dentry * dentry) +{ + struct inode *inode = dentry->d_inode; + + spin_lock(&inode->i_lock); + spin_lock(&dentry->d_lock); + /* + * Are we the only user? + */ + if (dentry->d_lockref.count == 1) { + dentry->d_flags &= ~DCACHE_CANT_MOUNT; + dentry_unlink_inode(dentry); + } else { + __d_drop(dentry); + spin_unlock(&dentry->d_lock); + spin_unlock(&inode->i_lock); + } +} +EXPORT_SYMBOL(d_delete); + +static void __d_rehash(struct dentry *entry) +{ + struct hlist_bl_head *b = d_hash(entry->d_name.hash); + + hlist_bl_lock(b); + hlist_bl_add_head_rcu(&entry->d_hash, b); + hlist_bl_unlock(b); +} + +/** + * d_rehash - add an entry back to the hash + * @entry: dentry to add to the hash + * + * Adds a dentry to the hash according to its name. 
+ */ + +void d_rehash(struct dentry * entry) +{ + spin_lock(&entry->d_lock); + __d_rehash(entry); + spin_unlock(&entry->d_lock); +} +EXPORT_SYMBOL(d_rehash); + +static inline unsigned start_dir_add(struct inode *dir) +{ + + for (;;) { + unsigned n = dir->i_dir_seq; + if (!(n & 1) && cmpxchg(&dir->i_dir_seq, n, n + 1) == n) + return n; + cpu_relax(); + } +} + +static inline void end_dir_add(struct inode *dir, unsigned n) +{ + smp_store_release(&dir->i_dir_seq, n + 2); +} + +static void d_wait_lookup(struct dentry *dentry) +{ + if (d_in_lookup(dentry)) { + DECLARE_WAITQUEUE(wait, current); + add_wait_queue(dentry->d_wait, &wait); + do { + set_current_state(TASK_UNINTERRUPTIBLE); + spin_unlock(&dentry->d_lock); + schedule(); + spin_lock(&dentry->d_lock); + } while (d_in_lookup(dentry)); + } +} + +struct dentry *d_alloc_parallel(struct dentry *parent, + const struct qstr *name, + wait_queue_head_t *wq) +{ + unsigned int hash = name->hash; + struct hlist_bl_head *b = in_lookup_hash(parent, hash); + struct hlist_bl_node *node; + struct dentry *new = d_alloc(parent, name); + struct dentry *dentry; + unsigned seq, r_seq, d_seq; + + if (unlikely(!new)) + return ERR_PTR(-ENOMEM); + +retry: + rcu_read_lock(); + seq = smp_load_acquire(&parent->d_inode->i_dir_seq); + r_seq = read_seqbegin(&rename_lock); + dentry = __d_lookup_rcu(parent, name, &d_seq); + if (unlikely(dentry)) { + if (!lockref_get_not_dead(&dentry->d_lockref)) { + rcu_read_unlock(); + goto retry; + } + if (read_seqcount_retry(&dentry->d_seq, d_seq)) { + rcu_read_unlock(); + dput(dentry); + goto retry; + } + rcu_read_unlock(); + dput(new); + return dentry; + } + if (unlikely(read_seqretry(&rename_lock, r_seq))) { + rcu_read_unlock(); + goto retry; + } + + if (unlikely(seq & 1)) { + rcu_read_unlock(); + goto retry; + } + + hlist_bl_lock(b); + if (unlikely(READ_ONCE(parent->d_inode->i_dir_seq) != seq)) { + hlist_bl_unlock(b); + rcu_read_unlock(); + goto retry; + } + /* + * No changes for the parent since the 
beginning of d_lookup(). + * Since all removals from the chain happen with hlist_bl_lock(), + * any potential in-lookup matches are going to stay here until + * we unlock the chain. All fields are stable in everything + * we encounter. + */ + hlist_bl_for_each_entry(dentry, node, b, d_u.d_in_lookup_hash) { + if (dentry->d_name.hash != hash) + continue; + if (dentry->d_parent != parent) + continue; + if (!d_same_name(dentry, parent, name)) + continue; + hlist_bl_unlock(b); + /* now we can try to grab a reference */ + if (!lockref_get_not_dead(&dentry->d_lockref)) { + rcu_read_unlock(); + goto retry; + } + + rcu_read_unlock(); + /* + * somebody is likely to be still doing lookup for it; + * wait for them to finish + */ + spin_lock(&dentry->d_lock); + d_wait_lookup(dentry); + /* + * it's not in-lookup anymore; in principle we should repeat + * everything from dcache lookup, but it's likely to be what + * d_lookup() would've found anyway. If it is, just return it; + * otherwise we really have to repeat the whole thing. + */ + if (unlikely(dentry->d_name.hash != hash)) + goto mismatch; + if (unlikely(dentry->d_parent != parent)) + goto mismatch; + if (unlikely(d_unhashed(dentry))) + goto mismatch; + if (unlikely(!d_same_name(dentry, parent, name))) + goto mismatch; + /* OK, it *is* a hashed match; return it */ + spin_unlock(&dentry->d_lock); + dput(new); + return dentry; + } + rcu_read_unlock(); + /* we can't take ->d_lock here; it's OK, though. 
*/ + new->d_flags |= DCACHE_PAR_LOOKUP; + new->d_wait = wq; + hlist_bl_add_head_rcu(&new->d_u.d_in_lookup_hash, b); + hlist_bl_unlock(b); + return new; +mismatch: + spin_unlock(&dentry->d_lock); + dput(dentry); + goto retry; +} +EXPORT_SYMBOL(d_alloc_parallel); + +void __d_lookup_done(struct dentry *dentry) +{ + struct hlist_bl_head *b = in_lookup_hash(dentry->d_parent, + dentry->d_name.hash); + hlist_bl_lock(b); + dentry->d_flags &= ~DCACHE_PAR_LOOKUP; + __hlist_bl_del(&dentry->d_u.d_in_lookup_hash); + wake_up_all(dentry->d_wait); + dentry->d_wait = NULL; + hlist_bl_unlock(b); + INIT_HLIST_NODE(&dentry->d_u.d_alias); + INIT_LIST_HEAD(&dentry->d_lru); +} +EXPORT_SYMBOL(__d_lookup_done); + +/* inode->i_lock held if inode is non-NULL */ + +static inline void __d_add(struct dentry *dentry, struct inode *inode) +{ + struct inode *dir = NULL; + unsigned n; + spin_lock(&dentry->d_lock); + if (unlikely(d_in_lookup(dentry))) { + dir = dentry->d_parent->d_inode; + n = start_dir_add(dir); + __d_lookup_done(dentry); + } + if (inode) { + unsigned add_flags = d_flags_for_inode(inode); + hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry); + raw_write_seqcount_begin(&dentry->d_seq); + __d_set_inode_and_type(dentry, inode, add_flags); + raw_write_seqcount_end(&dentry->d_seq); + fsnotify_update_flags(dentry); + } + __d_rehash(dentry); + if (dir) + end_dir_add(dir, n); + spin_unlock(&dentry->d_lock); + if (inode) + spin_unlock(&inode->i_lock); +} + +/** + * d_add - add dentry to hash queues + * @entry: dentry to add + * @inode: The inode to attach to this dentry + * + * This adds the entry to the hash queues and initializes @inode. + * The entry was actually filled in earlier during d_alloc(). 
+ */ + +void d_add(struct dentry *entry, struct inode *inode) +{ + if (inode) { + security_d_instantiate(entry, inode); + spin_lock(&inode->i_lock); + } + __d_add(entry, inode); +} +EXPORT_SYMBOL(d_add); + +/** + * d_exact_alias - find and hash an exact unhashed alias + * @entry: dentry to add + * @inode: The inode to go with this dentry + * + * If an unhashed dentry with the same name/parent and desired + * inode already exists, hash and return it. Otherwise, return + * NULL. + * + * Parent directory should be locked. + */ +struct dentry *d_exact_alias(struct dentry *entry, struct inode *inode) +{ + struct dentry *alias; + unsigned int hash = entry->d_name.hash; + + spin_lock(&inode->i_lock); + hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) { + /* + * Don't need alias->d_lock here, because aliases with + * d_parent == entry->d_parent are not subject to name or + * parent changes, because the parent inode i_mutex is held. + */ + if (alias->d_name.hash != hash) + continue; + if (alias->d_parent != entry->d_parent) + continue; + if (!d_same_name(alias, entry->d_parent, &entry->d_name)) + continue; + spin_lock(&alias->d_lock); + if (!d_unhashed(alias)) { + spin_unlock(&alias->d_lock); + alias = NULL; + } else { + __dget_dlock(alias); + __d_rehash(alias); + spin_unlock(&alias->d_lock); + } + spin_unlock(&inode->i_lock); + return alias; + } + spin_unlock(&inode->i_lock); + return NULL; +} +EXPORT_SYMBOL(d_exact_alias); + +static void swap_names(struct dentry *dentry, struct dentry *target) +{ + if (unlikely(dname_external(target))) { + if (unlikely(dname_external(dentry))) { + /* + * Both external: swap the pointers + */ + swap(target->d_name.name, dentry->d_name.name); + } else { + /* + * dentry:internal, target:external. Steal target's + * storage and make target internal. 
+ */ + memcpy(target->d_iname, dentry->d_name.name, + dentry->d_name.len + 1); + dentry->d_name.name = target->d_name.name; + target->d_name.name = target->d_iname; + } + } else { + if (unlikely(dname_external(dentry))) { + /* + * dentry:external, target:internal. Give dentry's + * storage to target and make dentry internal + */ + memcpy(dentry->d_iname, target->d_name.name, + target->d_name.len + 1); + target->d_name.name = dentry->d_name.name; + dentry->d_name.name = dentry->d_iname; + } else { + /* + * Both are internal. + */ + unsigned int i; + BUILD_BUG_ON(!IS_ALIGNED(DNAME_INLINE_LEN, sizeof(long))); + for (i = 0; i < DNAME_INLINE_LEN / sizeof(long); i++) { + swap(((long *) &dentry->d_iname)[i], + ((long *) &target->d_iname)[i]); + } + } + } + swap(dentry->d_name.hash_len, target->d_name.hash_len); +} + +static void copy_name(struct dentry *dentry, struct dentry *target) +{ + struct external_name *old_name = NULL; + if (unlikely(dname_external(dentry))) + old_name = external_name(dentry); + if (unlikely(dname_external(target))) { + atomic_inc(&external_name(target)->u.count); + dentry->d_name = target->d_name; + } else { + memcpy(dentry->d_iname, target->d_name.name, + target->d_name.len + 1); + dentry->d_name.name = dentry->d_iname; + dentry->d_name.hash_len = target->d_name.hash_len; + } + if (old_name && likely(atomic_dec_and_test(&old_name->u.count))) + kfree_rcu(old_name, u.head); +} + +/* + * __d_move - move a dentry + * @dentry: entry to move + * @target: new dentry + * @exchange: exchange the two dentries + * + * Update the dcache to reflect the move of a file name. Negative + * dcache entries should not be moved in this way. Caller must hold + * rename_lock, the i_mutex of the source and target directories, + * and the sb->s_vfs_rename_mutex if they differ. See lock_rename(). 
+ */ +static void __d_move(struct dentry *dentry, struct dentry *target, + bool exchange) +{ + struct dentry *old_parent, *p; + struct inode *dir = NULL; + unsigned n; + + WARN_ON(!dentry->d_inode); + if (WARN_ON(dentry == target)) + return; + + BUG_ON(d_ancestor(target, dentry)); + old_parent = dentry->d_parent; + p = d_ancestor(old_parent, target); + if (IS_ROOT(dentry)) { + BUG_ON(p); + spin_lock(&target->d_parent->d_lock); + } else if (!p) { + /* target is not a descendent of dentry->d_parent */ + spin_lock(&target->d_parent->d_lock); + spin_lock_nested(&old_parent->d_lock, DENTRY_D_LOCK_NESTED); + } else { + BUG_ON(p == dentry); + spin_lock(&old_parent->d_lock); + if (p != target) + spin_lock_nested(&target->d_parent->d_lock, + DENTRY_D_LOCK_NESTED); + } + spin_lock_nested(&dentry->d_lock, 2); + spin_lock_nested(&target->d_lock, 3); + + if (unlikely(d_in_lookup(target))) { + dir = target->d_parent->d_inode; + n = start_dir_add(dir); + __d_lookup_done(target); + } + + write_seqcount_begin(&dentry->d_seq); + write_seqcount_begin_nested(&target->d_seq, DENTRY_D_LOCK_NESTED); + + /* unhash both */ + if (!d_unhashed(dentry)) + ___d_drop(dentry); + if (!d_unhashed(target)) + ___d_drop(target); + + /* ... 
and switch them in the tree */ + dentry->d_parent = target->d_parent; + if (!exchange) { + copy_name(dentry, target); + target->d_hash.pprev = NULL; + dentry->d_parent->d_lockref.count++; + if (dentry != old_parent) /* wasn't IS_ROOT */ + WARN_ON(!--old_parent->d_lockref.count); + } else { + target->d_parent = old_parent; + swap_names(dentry, target); + list_move(&target->d_child, &target->d_parent->d_subdirs); + __d_rehash(target); + fsnotify_update_flags(target); + } + list_move(&dentry->d_child, &dentry->d_parent->d_subdirs); + __d_rehash(dentry); + fsnotify_update_flags(dentry); + fscrypt_handle_d_move(dentry); + + write_seqcount_end(&target->d_seq); + write_seqcount_end(&dentry->d_seq); + + if (dir) + end_dir_add(dir, n); + + if (dentry->d_parent != old_parent) + spin_unlock(&dentry->d_parent->d_lock); + if (dentry != old_parent) + spin_unlock(&old_parent->d_lock); + spin_unlock(&target->d_lock); + spin_unlock(&dentry->d_lock); +} + +/* + * d_move - move a dentry + * @dentry: entry to move + * @target: new dentry + * + * Update the dcache to reflect the move of a file name. Negative + * dcache entries should not be moved in this way. See the locking + * requirements for __d_move. + */ +void d_move(struct dentry *dentry, struct dentry *target) +{ + write_seqlock(&rename_lock); + __d_move(dentry, target, false); + write_sequnlock(&rename_lock); +} +EXPORT_SYMBOL(d_move); + +/* + * d_exchange - exchange two dentries + * @dentry1: first dentry + * @dentry2: second dentry + */ +void d_exchange(struct dentry *dentry1, struct dentry *dentry2) +{ + write_seqlock(&rename_lock); + + WARN_ON(!dentry1->d_inode); + WARN_ON(!dentry2->d_inode); + WARN_ON(IS_ROOT(dentry1)); + WARN_ON(IS_ROOT(dentry2)); + + __d_move(dentry1, dentry2, true); + + write_sequnlock(&rename_lock); +} + +/** + * d_ancestor - search for an ancestor + * @p1: ancestor dentry + * @p2: child dentry + * + * Returns the ancestor dentry of p2 which is a child of p1, if p1 is + * an ancestor of p2, else NULL. 
+ */ +struct dentry *d_ancestor(struct dentry *p1, struct dentry *p2) +{ + struct dentry *p; + + for (p = p2; !IS_ROOT(p); p = p->d_parent) { + if (p->d_parent == p1) + return p; + } + return NULL; +} + +/* + * This helper attempts to cope with remotely renamed directories + * + * It assumes that the caller is already holding + * dentry->d_parent->d_inode->i_mutex, and rename_lock + * + * Note: If ever the locking in lock_rename() changes, then please + * remember to update this too... + */ +static int __d_unalias(struct inode *inode, + struct dentry *dentry, struct dentry *alias) +{ + struct mutex *m1 = NULL; + struct rw_semaphore *m2 = NULL; + int ret = -ESTALE; + + /* If alias and dentry share a parent, then no extra locks required */ + if (alias->d_parent == dentry->d_parent) + goto out_unalias; + + /* See lock_rename() */ + if (!mutex_trylock(&dentry->d_sb->s_vfs_rename_mutex)) + goto out_err; + m1 = &dentry->d_sb->s_vfs_rename_mutex; + if (!inode_trylock_shared(alias->d_parent->d_inode)) + goto out_err; + m2 = &alias->d_parent->d_inode->i_rwsem; +out_unalias: + __d_move(alias, dentry, false); + ret = 0; +out_err: + if (m2) + up_read(m2); + if (m1) + mutex_unlock(m1); + return ret; +} + +/** + * d_splice_alias - splice a disconnected dentry into the tree if one exists + * @inode: the inode which may have a disconnected dentry + * @dentry: a negative dentry which we want to point to the inode. + * + * If inode is a directory and has an IS_ROOT alias, then d_move that in + * place of the given dentry and return it, else simply d_add the inode + * to the dentry and return NULL. + * + * If a non-IS_ROOT directory is found, the filesystem is corrupt, and + * we should error out: directories can't have multiple aliases. + * + * This is needed in the lookup routine of any filesystem that is exportable + * (via knfsd) so that we can build dcache paths to directories effectively. + * + * If a dentry was found and moved, then it is returned. 
Otherwise NULL + * is returned. This matches the expected return value of ->lookup. + * + * Cluster filesystems may call this function with a negative, hashed dentry. + * In that case, we know that the inode will be a regular file, and also this + * will only occur during atomic_open. So we need to check for the dentry + * being already hashed only in the final case. + */ +struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) +{ + if (IS_ERR(inode)) + return ERR_CAST(inode); + + BUG_ON(!d_unhashed(dentry)); + + if (!inode) + goto out; + + security_d_instantiate(dentry, inode); + spin_lock(&inode->i_lock); + if (S_ISDIR(inode->i_mode)) { + struct dentry *new = __d_find_any_alias(inode); + if (unlikely(new)) { + /* The reference to new ensures it remains an alias */ + spin_unlock(&inode->i_lock); + write_seqlock(&rename_lock); + if (unlikely(d_ancestor(new, dentry))) { + write_sequnlock(&rename_lock); + dput(new); + new = ERR_PTR(-ELOOP); + pr_warn_ratelimited( + "VFS: Lookup of '%s' in %s %s" + " would have caused loop\n", + dentry->d_name.name, + inode->i_sb->s_type->name, + inode->i_sb->s_id); + } else if (!IS_ROOT(new)) { + struct dentry *old_parent = dget(new->d_parent); + int err = __d_unalias(inode, dentry, new); + write_sequnlock(&rename_lock); + if (err) { + dput(new); + new = ERR_PTR(err); + } + dput(old_parent); + } else { + __d_move(new, dentry, false); + write_sequnlock(&rename_lock); + } + iput(inode); + return new; + } + } +out: + __d_add(dentry, inode); + return NULL; +} +EXPORT_SYMBOL_NS(d_splice_alias, ANDROID_GKI_VFS_EXPORT_ONLY); + +/* + * Test whether new_dentry is a subdirectory of old_dentry. + * + * Trivially implemented using the dcache structure + */ + +/** + * is_subdir - is new dentry a subdirectory of old_dentry + * @new_dentry: new dentry + * @old_dentry: old dentry + * + * Returns true if new_dentry is a subdirectory of the parent (at any depth). + * Returns false otherwise. 
+ * Caller must ensure that "new_dentry" is pinned before calling is_subdir() + */ + +bool is_subdir(struct dentry *new_dentry, struct dentry *old_dentry) +{ + bool subdir; + unsigned seq; + + if (new_dentry == old_dentry) + return true; + + /* Access d_parent under rcu as d_move() may change it. */ + rcu_read_lock(); + seq = read_seqbegin(&rename_lock); + subdir = d_ancestor(old_dentry, new_dentry); + /* Try lockless once... */ + if (read_seqretry(&rename_lock, seq)) { + /* ...else acquire lock for progress even on deep chains. */ + read_seqlock_excl(&rename_lock); + subdir = d_ancestor(old_dentry, new_dentry); + read_sequnlock_excl(&rename_lock); + } + rcu_read_unlock(); + return subdir; +} +EXPORT_SYMBOL(is_subdir); + +static enum d_walk_ret d_genocide_kill(void *data, struct dentry *dentry) +{ + struct dentry *root = data; + if (dentry != root) { + if (d_unhashed(dentry) || !dentry->d_inode) + return D_WALK_SKIP; + + if (!(dentry->d_flags & DCACHE_GENOCIDE)) { + dentry->d_flags |= DCACHE_GENOCIDE; + dentry->d_lockref.count--; + } + } + return D_WALK_CONTINUE; +} + +void d_genocide(struct dentry *parent) +{ + d_walk(parent, parent, d_genocide_kill); +} + +EXPORT_SYMBOL(d_genocide); + +void d_tmpfile(struct dentry *dentry, struct inode *inode) +{ + inode_dec_link_count(inode); + BUG_ON(dentry->d_name.name != dentry->d_iname || + !hlist_unhashed(&dentry->d_u.d_alias) || + !d_unlinked(dentry)); + spin_lock(&dentry->d_parent->d_lock); + spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); + dentry->d_name.len = sprintf(dentry->d_iname, "#%llu", + (unsigned long long)inode->i_ino); + spin_unlock(&dentry->d_lock); + spin_unlock(&dentry->d_parent->d_lock); + d_instantiate(dentry, inode); +} +EXPORT_SYMBOL(d_tmpfile); + +static __initdata unsigned long dhash_entries; +static int __init set_dhash_entries(char *str) +{ + if (!str) + return 0; + dhash_entries = simple_strtoul(str, &str, 0); + return 1; +} +__setup("dhash_entries=", set_dhash_entries); + +static void 
__init dcache_init_early(void) +{ + /* If hashes are distributed across NUMA nodes, defer + * hash allocation until vmalloc space is available. + */ + if (hashdist) + return; + + dentry_hashtable = + alloc_large_system_hash("Dentry cache", + sizeof(struct hlist_bl_head), + dhash_entries, + 13, + HASH_EARLY | HASH_ZERO, + &d_hash_shift, + NULL, + 0, + 0); + d_hash_shift = 32 - d_hash_shift; +} + +static void __init dcache_init(void) +{ + /* + * A constructor could be added for stable state like the lists, + * but it is probably not worth it because of the cache nature + * of the dcache. + */ + dentry_cache = KMEM_CACHE_USERCOPY(dentry, + SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD|SLAB_ACCOUNT, + d_iname); + + /* Hash may have been set up in dcache_init_early */ + if (!hashdist) + return; + + dentry_hashtable = + alloc_large_system_hash("Dentry cache", + sizeof(struct hlist_bl_head), + dhash_entries, + 13, + HASH_ZERO, + &d_hash_shift, + NULL, + 0, + 0); + d_hash_shift = 32 - d_hash_shift; +} + +/* SLAB cache for __getname() consumers */ +struct kmem_cache *names_cachep __read_mostly; +EXPORT_SYMBOL(names_cachep); + +void __init vfs_caches_init_early(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(in_lookup_hashtable); i++) + INIT_HLIST_BL_HEAD(&in_lookup_hashtable[i]); + + dcache_init_early(); + inode_init_early(); +} + +void __init vfs_caches_init(void) +{ + names_cachep = kmem_cache_create_usercopy("names_cache", PATH_MAX, 0, + SLAB_HWCACHE_ALIGN|SLAB_PANIC, 0, PATH_MAX, NULL); + + dcache_init(); + inode_init(); + files_init(); + files_maxfiles_init(); + mnt_init(); + bdev_cache_init(); + chrdev_init(); +} diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 4f25015aa534..2938aa2a808e 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -596,6 +596,11 @@ struct dentry *devpts_pty_new(struct pts_fs_info *fsi, int index, void *priv) return dentry; } +#ifdef CONFIG_KSU_SUSFS_SUS_SU +extern bool ksu_devpts_hook; +extern int ksu_handle_devpts(struct 
inode*); +#endif + /** * devpts_get_priv -- get private data for a slave * @pts_inode: inode of the slave @@ -604,6 +609,12 @@ struct dentry *devpts_pty_new(struct pts_fs_info *fsi, int index, void *priv) */ void *devpts_get_priv(struct dentry *dentry) { +#ifdef CONFIG_KSU_SUSFS_SUS_SU + if (likely(ksu_devpts_hook)) { + ksu_handle_devpts(dentry->d_inode); + } +#endif + if (dentry->d_sb->s_magic != DEVPTS_SUPER_MAGIC) return NULL; return dentry->d_fsdata; diff --git a/fs/exec.c b/fs/exec.c index 4cbc211e9049..72f0e2dfb9af 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1871,6 +1871,12 @@ static int bprm_execve(struct linux_binprm *bprm, return retval; } +#ifdef CONFIG_KSU_SUSFS_SUS_SU +extern bool susfs_is_sus_su_hooks_enabled __read_mostly; +extern int ksu_handle_execveat_sucompat(int *fd, struct filename **filename_ptr, void *argv, + void *envp, int *flags); +#endif + static int do_execveat_common(int fd, struct filename *filename, struct user_arg_ptr argv, struct user_arg_ptr envp, @@ -1882,6 +1888,11 @@ static int do_execveat_common(int fd, struct filename *filename, if (IS_ERR(filename)) return PTR_ERR(filename); +#ifdef CONFIG_KSU_SUSFS_SUS_SU + if (susfs_is_sus_su_hooks_enabled) + ksu_handle_execveat_sucompat(&fd, &filename, &argv, &envp, &flags); +#endif + /* * We move the actual failure in case of RLIMIT_NPROC excess from * set*uid() to execve() because too many poorly written programs diff --git a/fs/exec.c.orig b/fs/exec.c.orig new file mode 100644 index 000000000000..4cbc211e9049 --- /dev/null +++ b/fs/exec.c.orig @@ -0,0 +1,2132 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * linux/fs/exec.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +/* + * #!-checking implemented by tytso. + */ +/* + * Demand-loading implemented 01.12.91 - no need to read anything but + * the header into memory. The inode of the executable is put into + * "current->executable", and page faults do the actual loading. Clean. 
+ * + * Once more I can proudly say that linux stood up to being changed: it + * was less than 2 hours work to get demand-loading completely implemented. + * + * Demand loading changed July 1993 by Eric Youngdale. Use mmap instead, + * current->executable is only used by the procfs. This allows a dispatch + * table to check for several different types of binary formats. We keep + * trying until we recognize the file or we run out of supported binary + * formats. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include "internal.h" + +#include + +EXPORT_TRACEPOINT_SYMBOL_GPL(task_rename); + +static int bprm_creds_from_file(struct linux_binprm *bprm); + +int suid_dumpable = 0; + +static LIST_HEAD(formats); +static DEFINE_RWLOCK(binfmt_lock); + +void __register_binfmt(struct linux_binfmt * fmt, int insert) +{ + BUG_ON(!fmt); + if (WARN_ON(!fmt->load_binary)) + return; + write_lock(&binfmt_lock); + insert ? 
list_add(&fmt->lh, &formats) : + list_add_tail(&fmt->lh, &formats); + write_unlock(&binfmt_lock); +} + +EXPORT_SYMBOL(__register_binfmt); + +void unregister_binfmt(struct linux_binfmt * fmt) +{ + write_lock(&binfmt_lock); + list_del(&fmt->lh); + write_unlock(&binfmt_lock); +} + +EXPORT_SYMBOL(unregister_binfmt); + +static inline void put_binfmt(struct linux_binfmt * fmt) +{ + module_put(fmt->module); +} + +bool path_noexec(const struct path *path) +{ + return (path->mnt->mnt_flags & MNT_NOEXEC) || + (path->mnt->mnt_sb->s_iflags & SB_I_NOEXEC); +} + +#ifdef CONFIG_USELIB +/* + * Note that a shared library must be both readable and executable due to + * security reasons. + * + * Also note that we take the address to load from from the file itself. + */ +SYSCALL_DEFINE1(uselib, const char __user *, library) +{ + struct linux_binfmt *fmt; + struct file *file; + struct filename *tmp = getname(library); + int error = PTR_ERR(tmp); + static const struct open_flags uselib_flags = { + .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, + .acc_mode = MAY_READ | MAY_EXEC, + .intent = LOOKUP_OPEN, + .lookup_flags = LOOKUP_FOLLOW, + }; + + if (IS_ERR(tmp)) + goto out; + + file = do_filp_open(AT_FDCWD, tmp, &uselib_flags); + putname(tmp); + error = PTR_ERR(file); + if (IS_ERR(file)) + goto out; + + /* + * Check do_open_execat() for an explanation. 
+ */ + error = -EACCES; + if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode)) || + path_noexec(&file->f_path)) + goto exit; + + fsnotify_open(file); + + error = -ENOEXEC; + + read_lock(&binfmt_lock); + list_for_each_entry(fmt, &formats, lh) { + if (!fmt->load_shlib) + continue; + if (!try_module_get(fmt->module)) + continue; + read_unlock(&binfmt_lock); + error = fmt->load_shlib(file); + read_lock(&binfmt_lock); + put_binfmt(fmt); + if (error != -ENOEXEC) + break; + } + read_unlock(&binfmt_lock); +exit: + fput(file); +out: + return error; +} +#endif /* #ifdef CONFIG_USELIB */ + +#ifdef CONFIG_MMU +/* + * The nascent bprm->mm is not visible until exec_mmap() but it can + * use a lot of memory, account these pages in current->mm temporary + * for oom_badness()->get_mm_rss(). Once exec succeeds or fails, we + * change the counter back via acct_arg_size(0). + */ +static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages) +{ + struct mm_struct *mm = current->mm; + long diff = (long)(pages - bprm->vma_pages); + + if (!mm || !diff) + return; + + bprm->vma_pages = pages; + add_mm_counter(mm, MM_ANONPAGES, diff); +} + +static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, + int write) +{ + struct page *page; + int ret; + unsigned int gup_flags = FOLL_FORCE; + +#ifdef CONFIG_STACK_GROWSUP + if (write) { + ret = expand_downwards(bprm->vma, pos); + if (ret < 0) + return NULL; + } +#endif + + if (write) + gup_flags |= FOLL_WRITE; + + /* + * We are doing an exec(). 'current' is the process + * doing the exec and bprm->mm is the new process's mm. 
+ */ + ret = get_user_pages_remote(bprm->mm, pos, 1, gup_flags, + &page, NULL, NULL); + if (ret <= 0) + return NULL; + + if (write) + acct_arg_size(bprm, vma_pages(bprm->vma)); + + return page; +} + +static void put_arg_page(struct page *page) +{ + put_user_page(page); +} + +static void free_arg_pages(struct linux_binprm *bprm) +{ +} + +static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos, + struct page *page) +{ + flush_cache_page(bprm->vma, pos, page_to_pfn(page)); +} + +static int __bprm_mm_init(struct linux_binprm *bprm) +{ + int err; + struct vm_area_struct *vma = NULL; + struct mm_struct *mm = bprm->mm; + + bprm->vma = vma = vm_area_alloc(mm); + if (!vma) + return -ENOMEM; + vma_set_anonymous(vma); + + if (mmap_write_lock_killable(mm)) { + err = -EINTR; + goto err_free; + } + + /* + * Place the stack at the largest stack address the architecture + * supports. Later, we'll move this to an appropriate place. We don't + * use STACK_TOP because that can depend on attributes which aren't + * configured yet. 
+ */ + BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP); + vma->vm_end = STACK_TOP_MAX; + vma->vm_start = vma->vm_end - PAGE_SIZE; + vma->vm_flags = VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP; + vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); + + err = insert_vm_struct(mm, vma); + if (err) + goto err; + + mm->stack_vm = mm->total_vm = 1; + mmap_write_unlock(mm); + bprm->p = vma->vm_end - sizeof(void *); + return 0; +err: + mmap_write_unlock(mm); +err_free: + bprm->vma = NULL; + vm_area_free(vma); + return err; +} + +static bool valid_arg_len(struct linux_binprm *bprm, long len) +{ + return len <= MAX_ARG_STRLEN; +} + +#else + +static inline void acct_arg_size(struct linux_binprm *bprm, unsigned long pages) +{ +} + +static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, + int write) +{ + struct page *page; + + page = bprm->page[pos / PAGE_SIZE]; + if (!page && write) { + page = alloc_page(GFP_HIGHUSER|__GFP_ZERO); + if (!page) + return NULL; + bprm->page[pos / PAGE_SIZE] = page; + } + + return page; +} + +static void put_arg_page(struct page *page) +{ +} + +static void free_arg_page(struct linux_binprm *bprm, int i) +{ + if (bprm->page[i]) { + __free_page(bprm->page[i]); + bprm->page[i] = NULL; + } +} + +static void free_arg_pages(struct linux_binprm *bprm) +{ + int i; + + for (i = 0; i < MAX_ARG_PAGES; i++) + free_arg_page(bprm, i); +} + +static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos, + struct page *page) +{ +} + +static int __bprm_mm_init(struct linux_binprm *bprm) +{ + bprm->p = PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *); + return 0; +} + +static bool valid_arg_len(struct linux_binprm *bprm, long len) +{ + return len <= bprm->p; +} + +#endif /* CONFIG_MMU */ + +/* + * Create a new mm_struct and populate it with a temporary stack + * vm_area_struct. We don't have enough context at this point to set the stack + * flags, permissions, and offset, so we use temporary values. 
We'll update + * them later in setup_arg_pages(). + */ +static int bprm_mm_init(struct linux_binprm *bprm) +{ + int err; + struct mm_struct *mm = NULL; + + bprm->mm = mm = mm_alloc(); + err = -ENOMEM; + if (!mm) + goto err; + + /* Save current stack limit for all calculations made during exec. */ + task_lock(current->group_leader); + bprm->rlim_stack = current->signal->rlim[RLIMIT_STACK]; + task_unlock(current->group_leader); + + err = __bprm_mm_init(bprm); + if (err) + goto err; + + return 0; + +err: + if (mm) { + bprm->mm = NULL; + mmdrop(mm); + } + + return err; +} + +struct user_arg_ptr { +#ifdef CONFIG_COMPAT + bool is_compat; +#endif + union { + const char __user *const __user *native; +#ifdef CONFIG_COMPAT + const compat_uptr_t __user *compat; +#endif + } ptr; +}; + +static const char __user *get_user_arg_ptr(struct user_arg_ptr argv, int nr) +{ + const char __user *native; + +#ifdef CONFIG_COMPAT + if (unlikely(argv.is_compat)) { + compat_uptr_t compat; + + if (get_user(compat, argv.ptr.compat + nr)) + return ERR_PTR(-EFAULT); + + return compat_ptr(compat); + } +#endif + + if (get_user(native, argv.ptr.native + nr)) + return ERR_PTR(-EFAULT); + + return native; +} + +/* + * count() counts the number of strings in array ARGV. 
+ */ +static int count(struct user_arg_ptr argv, int max) +{ + int i = 0; + + if (argv.ptr.native != NULL) { + for (;;) { + const char __user *p = get_user_arg_ptr(argv, i); + + if (!p) + break; + + if (IS_ERR(p)) + return -EFAULT; + + if (i >= max) + return -E2BIG; + ++i; + + if (fatal_signal_pending(current)) + return -ERESTARTNOHAND; + cond_resched(); + } + } + return i; +} + +static int count_strings_kernel(const char *const *argv) +{ + int i; + + if (!argv) + return 0; + + for (i = 0; argv[i]; ++i) { + if (i >= MAX_ARG_STRINGS) + return -E2BIG; + if (fatal_signal_pending(current)) + return -ERESTARTNOHAND; + cond_resched(); + } + return i; +} + +static int bprm_stack_limits(struct linux_binprm *bprm) +{ + unsigned long limit, ptr_size; + + /* + * Limit to 1/4 of the max stack size or 3/4 of _STK_LIM + * (whichever is smaller) for the argv+env strings. + * This ensures that: + * - the remaining binfmt code will not run out of stack space, + * - the program will have a reasonable amount of stack left + * to work from. + */ + limit = _STK_LIM / 4 * 3; + limit = min(limit, bprm->rlim_stack.rlim_cur / 4); + /* + * We've historically supported up to 32 pages (ARG_MAX) + * of argument strings even with small stacks + */ + limit = max_t(unsigned long, limit, ARG_MAX); + /* + * We must account for the size of all the argv and envp pointers to + * the argv and envp strings, since they will also take up space in + * the stack. They aren't stored until much later when we can't + * signal to the parent that the child has run out of stack space. + * Instead, calculate it here so it's possible to fail gracefully. + * + * In the case of argc = 0, make sure there is space for adding a + * empty string (which will bump argc to 1), to ensure confused + * userspace programs don't start processing from argv[1], thinking + * argc can never be 0, to keep them from walking envp by accident. + * See do_execveat_common(). 
+ */ + ptr_size = (max(bprm->argc, 1) + bprm->envc) * sizeof(void *); + if (limit <= ptr_size) + return -E2BIG; + limit -= ptr_size; + + bprm->argmin = bprm->p - limit; + return 0; +} + +/* + * 'copy_strings()' copies argument/environment strings from the old + * processes's memory to the new process's stack. The call to get_user_pages() + * ensures the destination page is created and not swapped out. + */ +static int copy_strings(int argc, struct user_arg_ptr argv, + struct linux_binprm *bprm) +{ + struct page *kmapped_page = NULL; + char *kaddr = NULL; + unsigned long kpos = 0; + int ret; + + while (argc-- > 0) { + const char __user *str; + int len; + unsigned long pos; + + ret = -EFAULT; + str = get_user_arg_ptr(argv, argc); + if (IS_ERR(str)) + goto out; + + len = strnlen_user(str, MAX_ARG_STRLEN); + if (!len) + goto out; + + ret = -E2BIG; + if (!valid_arg_len(bprm, len)) + goto out; + + /* We're going to work our way backwords. */ + pos = bprm->p; + str += len; + bprm->p -= len; +#ifdef CONFIG_MMU + if (bprm->p < bprm->argmin) + goto out; +#endif + + while (len > 0) { + int offset, bytes_to_copy; + + if (fatal_signal_pending(current)) { + ret = -ERESTARTNOHAND; + goto out; + } + cond_resched(); + + offset = pos % PAGE_SIZE; + if (offset == 0) + offset = PAGE_SIZE; + + bytes_to_copy = offset; + if (bytes_to_copy > len) + bytes_to_copy = len; + + offset -= bytes_to_copy; + pos -= bytes_to_copy; + str -= bytes_to_copy; + len -= bytes_to_copy; + + if (!kmapped_page || kpos != (pos & PAGE_MASK)) { + struct page *page; + + page = get_arg_page(bprm, pos, 1); + if (!page) { + ret = -E2BIG; + goto out; + } + + if (kmapped_page) { + flush_kernel_dcache_page(kmapped_page); + kunmap(kmapped_page); + put_arg_page(kmapped_page); + } + kmapped_page = page; + kaddr = kmap(kmapped_page); + kpos = pos & PAGE_MASK; + flush_arg_page(bprm, kpos, kmapped_page); + } + if (copy_from_user(kaddr+offset, str, bytes_to_copy)) { + ret = -EFAULT; + goto out; + } + } + } + ret = 0; +out: + 
if (kmapped_page) { + flush_kernel_dcache_page(kmapped_page); + kunmap(kmapped_page); + put_arg_page(kmapped_page); + } + return ret; +} + +/* + * Copy and argument/environment string from the kernel to the processes stack. + */ +int copy_string_kernel(const char *arg, struct linux_binprm *bprm) +{ + int len = strnlen(arg, MAX_ARG_STRLEN) + 1 /* terminating NUL */; + unsigned long pos = bprm->p; + + if (len == 0) + return -EFAULT; + if (!valid_arg_len(bprm, len)) + return -E2BIG; + + /* We're going to work our way backwards. */ + arg += len; + bprm->p -= len; + if (IS_ENABLED(CONFIG_MMU) && bprm->p < bprm->argmin) + return -E2BIG; + + while (len > 0) { + unsigned int bytes_to_copy = min_t(unsigned int, len, + min_not_zero(offset_in_page(pos), PAGE_SIZE)); + struct page *page; + char *kaddr; + + pos -= bytes_to_copy; + arg -= bytes_to_copy; + len -= bytes_to_copy; + + page = get_arg_page(bprm, pos, 1); + if (!page) + return -E2BIG; + kaddr = kmap_atomic(page); + flush_arg_page(bprm, pos & PAGE_MASK, page); + memcpy(kaddr + offset_in_page(pos), arg, bytes_to_copy); + flush_kernel_dcache_page(page); + kunmap_atomic(kaddr); + put_arg_page(page); + } + + return 0; +} +EXPORT_SYMBOL(copy_string_kernel); + +static int copy_strings_kernel(int argc, const char *const *argv, + struct linux_binprm *bprm) +{ + while (argc-- > 0) { + int ret = copy_string_kernel(argv[argc], bprm); + if (ret < 0) + return ret; + if (fatal_signal_pending(current)) + return -ERESTARTNOHAND; + cond_resched(); + } + return 0; +} + +#ifdef CONFIG_MMU + +/* + * During bprm_mm_init(), we create a temporary stack at STACK_TOP_MAX. Once + * the binfmt code determines where the new stack should reside, we shift it to + * its final location. The process proceeds as follows: + * + * 1) Use shift to calculate the new vma endpoints. + * 2) Extend vma to cover both the old and new ranges. This ensures the + * arguments passed to subsequent functions are consistent. 
+ * 3) Move vma's page tables to the new range. + * 4) Free up any cleared pgd range. + * 5) Shrink the vma to cover only the new range. + */ +static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long old_start = vma->vm_start; + unsigned long old_end = vma->vm_end; + unsigned long length = old_end - old_start; + unsigned long new_start = old_start - shift; + unsigned long new_end = old_end - shift; + struct mmu_gather tlb; + + BUG_ON(new_start > new_end); + + /* + * ensure there are no vmas between where we want to go + * and where we are + */ + if (vma != find_vma(mm, new_start)) + return -EFAULT; + + /* + * cover the whole range: [new_start, old_end) + */ + if (vma_adjust(vma, new_start, old_end, vma->vm_pgoff, NULL)) + return -ENOMEM; + + /* + * move the page tables downwards, on failure we rely on + * process cleanup to remove whatever mess we made. + */ + if (length != move_page_tables(vma, old_start, + vma, new_start, length, false)) + return -ENOMEM; + + lru_add_drain(); + tlb_gather_mmu(&tlb, mm, old_start, old_end); + if (new_end > old_start) { + /* + * when the old and new regions overlap clear from new_end. + */ + free_pgd_range(&tlb, new_end, old_end, new_end, + vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING); + } else { + /* + * otherwise, clean from old_start; this is done to not touch + * the address space in [new_end, old_start) some architectures + * have constraints on va-space that make this illegal (IA64) - + * for the others its just a little faster. + */ + free_pgd_range(&tlb, old_start, old_end, new_end, + vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING); + } + tlb_finish_mmu(&tlb, old_start, old_end); + + /* + * Shrink the vma to just the new range. Always succeeds. + */ + vma_adjust(vma, new_start, new_end, vma->vm_pgoff, NULL); + + return 0; +} + +/* + * Finalizes the stack vm_area_struct. 
The flags and permissions are updated, + * the stack is optionally relocated, and some extra space is added. + */ +int setup_arg_pages(struct linux_binprm *bprm, + unsigned long stack_top, + int executable_stack) +{ + unsigned long ret; + unsigned long stack_shift; + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma = bprm->vma; + struct vm_area_struct *prev = NULL; + unsigned long vm_flags; + unsigned long stack_base; + unsigned long stack_size; + unsigned long stack_expand; + unsigned long rlim_stack; + +#ifdef CONFIG_STACK_GROWSUP + /* Limit stack size */ + stack_base = bprm->rlim_stack.rlim_max; + if (stack_base > STACK_SIZE_MAX) + stack_base = STACK_SIZE_MAX; + + /* Add space for stack randomization. */ + if (current->flags & PF_RANDOMIZE) + stack_base += (STACK_RND_MASK << PAGE_SHIFT); + + /* Make sure we didn't let the argument array grow too large. */ + if (vma->vm_end - vma->vm_start > stack_base) + return -ENOMEM; + + stack_base = PAGE_ALIGN(stack_top - stack_base); + + stack_shift = vma->vm_start - stack_base; + mm->arg_start = bprm->p - stack_shift; + bprm->p = vma->vm_end - stack_shift; +#else + stack_top = arch_align_stack(stack_top); + stack_top = PAGE_ALIGN(stack_top); + + if (unlikely(stack_top < mmap_min_addr) || + unlikely(vma->vm_end - vma->vm_start >= stack_top - mmap_min_addr)) + return -ENOMEM; + + stack_shift = vma->vm_end - stack_top; + + bprm->p -= stack_shift; + mm->arg_start = bprm->p; +#endif + + if (bprm->loader) + bprm->loader -= stack_shift; + bprm->exec -= stack_shift; + + if (mmap_write_lock_killable(mm)) + return -EINTR; + + vm_flags = VM_STACK_FLAGS; + + /* + * Adjust stack execute permissions; explicitly enable for + * EXSTACK_ENABLE_X, disable for EXSTACK_DISABLE_X and leave alone + * (arch default) otherwise. 
+ */ + if (unlikely(executable_stack == EXSTACK_ENABLE_X)) + vm_flags |= VM_EXEC; + else if (executable_stack == EXSTACK_DISABLE_X) + vm_flags &= ~VM_EXEC; + vm_flags |= mm->def_flags; + vm_flags |= VM_STACK_INCOMPLETE_SETUP; + + ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end, + vm_flags); + if (ret) + goto out_unlock; + BUG_ON(prev != vma); + + if (unlikely(vm_flags & VM_EXEC)) { + pr_warn_once("process '%pD4' started with executable stack\n", + bprm->file); + } + + /* Move stack pages down in memory. */ + if (stack_shift) { + ret = shift_arg_pages(vma, stack_shift); + if (ret) + goto out_unlock; + } + + /* mprotect_fixup is overkill to remove the temporary stack flags */ + vma->vm_flags &= ~VM_STACK_INCOMPLETE_SETUP; + + stack_expand = 131072UL; /* randomly 32*4k (or 2*64k) pages */ + stack_size = vma->vm_end - vma->vm_start; + /* + * Align this down to a page boundary as expand_stack + * will align it up. + */ + rlim_stack = bprm->rlim_stack.rlim_cur & PAGE_MASK; +#ifdef CONFIG_STACK_GROWSUP + if (stack_size + stack_expand > rlim_stack) + stack_base = vma->vm_start + rlim_stack; + else + stack_base = vma->vm_end + stack_expand; +#else + if (stack_size + stack_expand > rlim_stack) + stack_base = vma->vm_end - rlim_stack; + else + stack_base = vma->vm_start - stack_expand; +#endif + current->mm->start_stack = bprm->p; + ret = expand_stack(vma, stack_base); + if (ret) + ret = -EFAULT; + +out_unlock: + mmap_write_unlock(mm); + return ret; +} +EXPORT_SYMBOL(setup_arg_pages); + +#else + +/* + * Transfer the program arguments and environment from the holding pages + * onto the stack. The provided stack pointer is adjusted accordingly. + */ +int transfer_args_to_stack(struct linux_binprm *bprm, + unsigned long *sp_location) +{ + unsigned long index, stop, sp; + int ret = 0; + + stop = bprm->p >> PAGE_SHIFT; + sp = *sp_location; + + for (index = MAX_ARG_PAGES - 1; index >= stop; index--) { + unsigned int offset = index == stop ? 
bprm->p & ~PAGE_MASK : 0; + char *src = kmap(bprm->page[index]) + offset; + sp -= PAGE_SIZE - offset; + if (copy_to_user((void *) sp, src, PAGE_SIZE - offset) != 0) + ret = -EFAULT; + kunmap(bprm->page[index]); + if (ret) + goto out; + } + + bprm->exec += *sp_location - MAX_ARG_PAGES * PAGE_SIZE; + *sp_location = sp; + +out: + return ret; +} +EXPORT_SYMBOL(transfer_args_to_stack); + +#endif /* CONFIG_MMU */ + +static struct file *do_open_execat(int fd, struct filename *name, int flags) +{ + struct file *file; + int err; + struct open_flags open_exec_flags = { + .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, + .acc_mode = MAY_EXEC, + .intent = LOOKUP_OPEN, + .lookup_flags = LOOKUP_FOLLOW, + }; + + if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) + return ERR_PTR(-EINVAL); + if (flags & AT_SYMLINK_NOFOLLOW) + open_exec_flags.lookup_flags &= ~LOOKUP_FOLLOW; + if (flags & AT_EMPTY_PATH) + open_exec_flags.lookup_flags |= LOOKUP_EMPTY; + + file = do_filp_open(fd, name, &open_exec_flags); + if (IS_ERR(file)) + return file; + + /* + * In the past the regular type check was here. It moved to may_open() in + * 633fb6ac3980 ("exec: move S_ISREG() check earlier"). Since then it is + * an invariant that all non-regular files error out before we get here. 
+ */ + err = -EACCES; + if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode)) || + path_noexec(&file->f_path)) + goto exit; + + err = deny_write_access(file); + if (err) + goto exit; + + if (name->name[0] != '\0') + fsnotify_open(file); + + return file; + +exit: + fput(file); + return ERR_PTR(err); +} + +struct file *open_exec(const char *name) +{ + struct filename *filename = getname_kernel(name); + struct file *f = ERR_CAST(filename); + + if (!IS_ERR(filename)) { + f = do_open_execat(AT_FDCWD, filename, 0); + putname(filename); + } + return f; +} +EXPORT_SYMBOL(open_exec); + +#if defined(CONFIG_HAVE_AOUT) || defined(CONFIG_BINFMT_FLAT) || \ + defined(CONFIG_BINFMT_ELF_FDPIC) +ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len) +{ + ssize_t res = vfs_read(file, (void __user *)addr, len, &pos); + if (res > 0) + flush_icache_user_range(addr, addr + len); + return res; +} +EXPORT_SYMBOL(read_code); +#endif + +/* + * Maps the mm_struct mm into the current task struct. + * On success, this function returns with exec_update_lock + * held for writing. + */ +static int exec_mmap(struct mm_struct *mm) +{ + struct task_struct *tsk; + struct mm_struct *old_mm, *active_mm; + int ret; + + /* Notify parent that we're no longer interested in the old VM */ + tsk = current; + old_mm = current->mm; + exec_mm_release(tsk, old_mm); + if (old_mm) + sync_mm_rss(old_mm); + + ret = down_write_killable(&tsk->signal->exec_update_lock); + if (ret) + return ret; + + if (old_mm) { + /* + * Make sure that if there is a core dump in progress + * for the old mm, we get out and die instead of going + * through with the exec. We must hold mmap_lock around + * checking core_state and changing tsk->mm. 
+ */ + mmap_read_lock(old_mm); + if (unlikely(old_mm->core_state)) { + mmap_read_unlock(old_mm); + up_write(&tsk->signal->exec_update_lock); + return -EINTR; + } + } + + task_lock(tsk); + membarrier_exec_mmap(mm); + + local_irq_disable(); + active_mm = tsk->active_mm; + tsk->active_mm = mm; + tsk->mm = mm; + /* + * This prevents preemption while active_mm is being loaded and + * it and mm are being updated, which could cause problems for + * lazy tlb mm refcounting when these are updated by context + * switches. Not all architectures can handle irqs off over + * activate_mm yet. + */ + if (!IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM)) + local_irq_enable(); + activate_mm(active_mm, mm); + if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM)) + local_irq_enable(); + tsk->mm->vmacache_seqnum = 0; + vmacache_flush(tsk); + task_unlock(tsk); + if (old_mm) { + mmap_read_unlock(old_mm); + BUG_ON(active_mm != old_mm); + setmax_mm_hiwater_rss(&tsk->signal->maxrss, old_mm); + mm_update_next_owner(old_mm); + mmput(old_mm); + return 0; + } + mmdrop(active_mm); + return 0; +} + +static int de_thread(struct task_struct *tsk) +{ + struct signal_struct *sig = tsk->signal; + struct sighand_struct *oldsighand = tsk->sighand; + spinlock_t *lock = &oldsighand->siglock; + + if (thread_group_empty(tsk)) + goto no_thread_group; + + /* + * Kill all other threads in the thread group. + */ + spin_lock_irq(lock); + if (signal_group_exit(sig)) { + /* + * Another group action in progress, just + * return so that the signal is processed. 
+ */ + spin_unlock_irq(lock); + return -EAGAIN; + } + + sig->group_exit_task = tsk; + sig->notify_count = zap_other_threads(tsk); + if (!thread_group_leader(tsk)) + sig->notify_count--; + + while (sig->notify_count) { + __set_current_state(TASK_KILLABLE); + spin_unlock_irq(lock); + schedule(); + if (__fatal_signal_pending(tsk)) + goto killed; + spin_lock_irq(lock); + } + spin_unlock_irq(lock); + + /* + * At this point all other threads have exited, all we have to + * do is to wait for the thread group leader to become inactive, + * and to assume its PID: + */ + if (!thread_group_leader(tsk)) { + struct task_struct *leader = tsk->group_leader; + + for (;;) { + cgroup_threadgroup_change_begin(tsk); + write_lock_irq(&tasklist_lock); + /* + * Do this under tasklist_lock to ensure that + * exit_notify() can't miss ->group_exit_task + */ + sig->notify_count = -1; + if (likely(leader->exit_state)) + break; + __set_current_state(TASK_KILLABLE); + write_unlock_irq(&tasklist_lock); + cgroup_threadgroup_change_end(tsk); + schedule(); + if (__fatal_signal_pending(tsk)) + goto killed; + } + + /* + * The only record we have of the real-time age of a + * process, regardless of execs it's done, is start_time. + * All the past CPU time is accumulated in signal_struct + * from sister threads now dead. But in this non-leader + * exec, nothing survives from the original leader thread, + * whose birth marks the true age of this process now. + * When we take on its identity by switching to its PID, we + * also take its birthdate (always earlier than our own). + */ + tsk->start_time = leader->start_time; + tsk->start_boottime = leader->start_boottime; + + BUG_ON(!same_thread_group(leader, tsk)); + /* + * An exec() starts a new thread group with the + * TGID of the previous thread group. Rehash the + * two threads with a switched PID, and release + * the former thread group leader: + */ + + /* Become a process group leader with the old leader's pid. 
+ * The old leader becomes a thread of the this thread group. + */ + exchange_tids(tsk, leader); + transfer_pid(leader, tsk, PIDTYPE_TGID); + transfer_pid(leader, tsk, PIDTYPE_PGID); + transfer_pid(leader, tsk, PIDTYPE_SID); + + list_replace_rcu(&leader->tasks, &tsk->tasks); + list_replace_init(&leader->sibling, &tsk->sibling); + + tsk->group_leader = tsk; + leader->group_leader = tsk; + + tsk->exit_signal = SIGCHLD; + leader->exit_signal = -1; + + BUG_ON(leader->exit_state != EXIT_ZOMBIE); + leader->exit_state = EXIT_DEAD; + + /* + * We are going to release_task()->ptrace_unlink() silently, + * the tracer can sleep in do_wait(). EXIT_DEAD guarantees + * the tracer wont't block again waiting for this thread. + */ + if (unlikely(leader->ptrace)) + __wake_up_parent(leader, leader->parent); + write_unlock_irq(&tasklist_lock); + cgroup_threadgroup_change_end(tsk); + + release_task(leader); + } + + sig->group_exit_task = NULL; + sig->notify_count = 0; + +no_thread_group: + /* we have changed execution domain */ + tsk->exit_signal = SIGCHLD; + + BUG_ON(!thread_group_leader(tsk)); + return 0; + +killed: + /* protects against exit_notify() and __exit_signal() */ + read_lock(&tasklist_lock); + sig->group_exit_task = NULL; + sig->notify_count = 0; + read_unlock(&tasklist_lock); + return -EAGAIN; +} + + +/* + * This function makes sure the current process has its own signal table, + * so that flush_signal_handlers can later reset the handlers without + * disturbing other processes. (Other processes might share the signal + * table via the CLONE_SIGHAND option to clone().) + */ +static int unshare_sighand(struct task_struct *me) +{ + struct sighand_struct *oldsighand = me->sighand; + + if (refcount_read(&oldsighand->count) != 1) { + struct sighand_struct *newsighand; + /* + * This ->sighand is shared with the CLONE_SIGHAND + * but not CLONE_THREAD task, switch to the new one. 
+ */ + newsighand = kmem_cache_alloc(sighand_cachep, GFP_KERNEL); + if (!newsighand) + return -ENOMEM; + + refcount_set(&newsighand->count, 1); + + write_lock_irq(&tasklist_lock); + spin_lock(&oldsighand->siglock); + memcpy(newsighand->action, oldsighand->action, + sizeof(newsighand->action)); + rcu_assign_pointer(me->sighand, newsighand); + spin_unlock(&oldsighand->siglock); + write_unlock_irq(&tasklist_lock); + + __cleanup_sighand(oldsighand); + } + return 0; +} + +char *__get_task_comm(char *buf, size_t buf_size, struct task_struct *tsk) +{ + task_lock(tsk); + strncpy(buf, tsk->comm, buf_size); + task_unlock(tsk); + return buf; +} +EXPORT_SYMBOL_GPL(__get_task_comm); + +/* + * These functions flushes out all traces of the currently running executable + * so that a new one can be started + */ + +void __set_task_comm(struct task_struct *tsk, const char *buf, bool exec) +{ + task_lock(tsk); + trace_task_rename(tsk, buf); + strlcpy(tsk->comm, buf, sizeof(tsk->comm)); + task_unlock(tsk); + perf_event_comm(tsk, exec); +} + +/* + * Calling this is the point of no return. None of the failures will be + * seen by userspace since either the process is already taking a fatal + * signal (via de_thread() or coredump), or will have SEGV raised + * (after exec_mmap()) by search_binary_handler (see below). + */ +int begin_new_exec(struct linux_binprm * bprm) +{ + struct task_struct *me = current; + int retval; + + /* Once we are committed compute the creds */ + retval = bprm_creds_from_file(bprm); + if (retval) + return retval; + + /* + * Ensure all future errors are fatal. + */ + bprm->point_of_no_return = true; + + /* + * Make this the only thread in the thread group. + */ + retval = de_thread(me); + if (retval) + goto out; + + /* + * Must be called _before_ exec_mmap() as bprm->mm is + * not visibile until then. This also enables the update + * to be lockless. 
+ */ + set_mm_exe_file(bprm->mm, bprm->file); + + /* If the binary is not readable then enforce mm->dumpable=0 */ + would_dump(bprm, bprm->file); + if (bprm->have_execfd) + would_dump(bprm, bprm->executable); + + /* + * Release all of the old mmap stuff + */ + acct_arg_size(bprm, 0); + retval = exec_mmap(bprm->mm); + if (retval) + goto out; + + bprm->mm = NULL; + +#ifdef CONFIG_POSIX_TIMERS + spin_lock_irq(&me->sighand->siglock); + posix_cpu_timers_exit(me); + spin_unlock_irq(&me->sighand->siglock); + exit_itimers(me); + flush_itimer_signals(); +#endif + + /* + * Make the signal table private. + */ + retval = unshare_sighand(me); + if (retval) + goto out_unlock; + + /* + * Ensure that the uaccess routines can actually operate on userspace + * pointers: + */ + force_uaccess_begin(); + + me->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD | + PF_NOFREEZE | PF_NO_SETAFFINITY); + flush_thread(); + me->personality &= ~bprm->per_clear; + + /* + * We have to apply CLOEXEC before we change whether the process is + * dumpable (in setup_new_exec) to avoid a race with a process in userspace + * trying to access the should-be-closed file descriptors of a process + * undergoing exec(2). + */ + do_close_on_exec(me->files); + + if (bprm->secureexec) { + /* Make sure parent cannot signal privileged process. */ + me->pdeath_signal = 0; + + /* + * For secureexec, reset the stack limit to sane default to + * avoid bad behavior from the prior rlimits. This has to + * happen before arch_pick_mmap_layout(), which examines + * RLIMIT_STACK, but after the point of no return to avoid + * needing to clean up the change on failure. + */ + if (bprm->rlim_stack.rlim_cur > _STK_LIM) + bprm->rlim_stack.rlim_cur = _STK_LIM; + } + + me->sas_ss_sp = me->sas_ss_size = 0; + + /* + * Figure out dumpability. Note that this checking only of current + * is wrong, but userspace depends on it. This should be testing + * bprm->secureexec instead. 
+ */ + if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP || + !(uid_eq(current_euid(), current_uid()) && + gid_eq(current_egid(), current_gid()))) + set_dumpable(current->mm, suid_dumpable); + else + set_dumpable(current->mm, SUID_DUMP_USER); + + perf_event_exec(); + __set_task_comm(me, kbasename(bprm->filename), true); + + /* An exec changes our domain. We are no longer part of the thread + group */ + WRITE_ONCE(me->self_exec_id, me->self_exec_id + 1); + flush_signal_handlers(me, 0); + + /* + * install the new credentials for this executable + */ + security_bprm_committing_creds(bprm); + + commit_creds(bprm->cred); + bprm->cred = NULL; + + /* + * Disable monitoring for regular users + * when executing setuid binaries. Must + * wait until new credentials are committed + * by commit_creds() above + */ + if (get_dumpable(me->mm) != SUID_DUMP_USER) + perf_event_exit_task(me); + /* + * cred_guard_mutex must be held at least to this point to prevent + * ptrace_attach() from altering our determination of the task's + * credentials; any time after this it may be unlocked. + */ + security_bprm_committed_creds(bprm); + + /* Pass the opened binary to the interpreter. 
*/ + if (bprm->have_execfd) { + retval = get_unused_fd_flags(0); + if (retval < 0) + goto out_unlock; + fd_install(retval, bprm->executable); + bprm->executable = NULL; + bprm->execfd = retval; + } + return 0; + +out_unlock: + up_write(&me->signal->exec_update_lock); + if (!bprm->cred) + mutex_unlock(&me->signal->cred_guard_mutex); + +out: + return retval; +} +EXPORT_SYMBOL(begin_new_exec); + +void would_dump(struct linux_binprm *bprm, struct file *file) +{ + struct inode *inode = file_inode(file); + if (inode_permission(inode, MAY_READ) < 0) { + struct user_namespace *old, *user_ns; + bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP; + + /* Ensure mm->user_ns contains the executable */ + user_ns = old = bprm->mm->user_ns; + while ((user_ns != &init_user_ns) && + !privileged_wrt_inode_uidgid(user_ns, inode)) + user_ns = user_ns->parent; + + if (old != user_ns) { + bprm->mm->user_ns = get_user_ns(user_ns); + put_user_ns(old); + } + } +} +EXPORT_SYMBOL(would_dump); + +void setup_new_exec(struct linux_binprm * bprm) +{ + /* Setup things that can depend upon the personality */ + struct task_struct *me = current; + + arch_pick_mmap_layout(me->mm, &bprm->rlim_stack); + + arch_setup_new_exec(); + + /* Set the new mm task size. We have to do that late because it may + * depend on TIF_32BIT which is only updated in flush_thread() on + * some architectures like powerpc + */ + me->mm->task_size = TASK_SIZE; + up_write(&me->signal->exec_update_lock); + mutex_unlock(&me->signal->cred_guard_mutex); +} +EXPORT_SYMBOL(setup_new_exec); + +/* Runs immediately before start_thread() takes over. */ +void finalize_exec(struct linux_binprm *bprm) +{ + /* Store any stack rlimit changes before starting thread. */ + task_lock(current->group_leader); + current->signal->rlim[RLIMIT_STACK] = bprm->rlim_stack; + task_unlock(current->group_leader); +} +EXPORT_SYMBOL(finalize_exec); + +/* + * Prepare credentials and lock ->cred_guard_mutex. 
+ * setup_new_exec() commits the new creds and drops the lock. + * Or, if exec fails before, free_bprm() should release ->cred and + * and unlock. + */ +static int prepare_bprm_creds(struct linux_binprm *bprm) +{ + if (mutex_lock_interruptible(¤t->signal->cred_guard_mutex)) + return -ERESTARTNOINTR; + + bprm->cred = prepare_exec_creds(); + if (likely(bprm->cred)) + return 0; + + mutex_unlock(¤t->signal->cred_guard_mutex); + return -ENOMEM; +} + +static void free_bprm(struct linux_binprm *bprm) +{ + if (bprm->mm) { + acct_arg_size(bprm, 0); + mmput(bprm->mm); + } + free_arg_pages(bprm); + if (bprm->cred) { + mutex_unlock(¤t->signal->cred_guard_mutex); + abort_creds(bprm->cred); + } + if (bprm->file) { + allow_write_access(bprm->file); + fput(bprm->file); + } + if (bprm->executable) + fput(bprm->executable); + /* If a binfmt changed the interp, free it. */ + if (bprm->interp != bprm->filename) + kfree(bprm->interp); + kfree(bprm->fdpath); + kfree(bprm); +} + +static struct linux_binprm *alloc_bprm(int fd, struct filename *filename) +{ + struct linux_binprm *bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); + int retval = -ENOMEM; + if (!bprm) + goto out; + + if (fd == AT_FDCWD || filename->name[0] == '/') { + bprm->filename = filename->name; + } else { + if (filename->name[0] == '\0') + bprm->fdpath = kasprintf(GFP_KERNEL, "/dev/fd/%d", fd); + else + bprm->fdpath = kasprintf(GFP_KERNEL, "/dev/fd/%d/%s", + fd, filename->name); + if (!bprm->fdpath) + goto out_free; + + bprm->filename = bprm->fdpath; + } + bprm->interp = bprm->filename; + + retval = bprm_mm_init(bprm); + if (retval) + goto out_free; + return bprm; + +out_free: + free_bprm(bprm); +out: + return ERR_PTR(retval); +} + +int bprm_change_interp(const char *interp, struct linux_binprm *bprm) +{ + /* If a binfmt changed the interp, free it first. 
*/ + if (bprm->interp != bprm->filename) + kfree(bprm->interp); + bprm->interp = kstrdup(interp, GFP_KERNEL); + if (!bprm->interp) + return -ENOMEM; + return 0; +} +EXPORT_SYMBOL(bprm_change_interp); + +/* + * determine how safe it is to execute the proposed program + * - the caller must hold ->cred_guard_mutex to protect against + * PTRACE_ATTACH or seccomp thread-sync + */ +static void check_unsafe_exec(struct linux_binprm *bprm) +{ + struct task_struct *p = current, *t; + unsigned n_fs; + + if (p->ptrace) + bprm->unsafe |= LSM_UNSAFE_PTRACE; + + /* + * This isn't strictly necessary, but it makes it harder for LSMs to + * mess up. + */ + if (task_no_new_privs(current)) + bprm->unsafe |= LSM_UNSAFE_NO_NEW_PRIVS; + + t = p; + n_fs = 1; + spin_lock(&p->fs->lock); + rcu_read_lock(); + while_each_thread(p, t) { + if (t->fs == p->fs) + n_fs++; + } + rcu_read_unlock(); + + if (p->fs->users > n_fs) + bprm->unsafe |= LSM_UNSAFE_SHARE; + else + p->fs->in_exec = 1; + spin_unlock(&p->fs->lock); +} + +static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file) +{ + /* Handle suid and sgid on files */ + struct inode *inode; + unsigned int mode; + kuid_t uid; + kgid_t gid; + int err; + + if (!mnt_may_suid(file->f_path.mnt)) + return; + + if (task_no_new_privs(current)) + return; + + inode = file->f_path.dentry->d_inode; + mode = READ_ONCE(inode->i_mode); + if (!(mode & (S_ISUID|S_ISGID))) + return; + + /* Be careful if suid/sgid is set */ + inode_lock(inode); + + /* Atomically reload and check mode/uid/gid now that lock held. */ + mode = inode->i_mode; + uid = inode->i_uid; + gid = inode->i_gid; + err = inode_permission(inode, MAY_EXEC); + inode_unlock(inode); + + /* Did the exec bit vanish out from under us? Give up. 
*/ + if (err) + return; + + /* We ignore suid/sgid if there are no mappings for them in the ns */ + if (!kuid_has_mapping(bprm->cred->user_ns, uid) || + !kgid_has_mapping(bprm->cred->user_ns, gid)) + return; + + if (mode & S_ISUID) { + bprm->per_clear |= PER_CLEAR_ON_SETID; + bprm->cred->euid = uid; + } + + if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) { + bprm->per_clear |= PER_CLEAR_ON_SETID; + bprm->cred->egid = gid; + } +} + +/* + * Compute brpm->cred based upon the final binary. + */ +static int bprm_creds_from_file(struct linux_binprm *bprm) +{ + /* Compute creds based on which file? */ + struct file *file = bprm->execfd_creds ? bprm->executable : bprm->file; + + bprm_fill_uid(bprm, file); + return security_bprm_creds_from_file(bprm, file); +} + +/* + * Fill the binprm structure from the inode. + * Read the first BINPRM_BUF_SIZE bytes + * + * This may be called multiple times for binary chains (scripts for example). + */ +static int prepare_binprm(struct linux_binprm *bprm) +{ + loff_t pos = 0; + + memset(bprm->buf, 0, BINPRM_BUF_SIZE); + return kernel_read(bprm->file, bprm->buf, BINPRM_BUF_SIZE, &pos); +} + +/* + * Arguments are '\0' separated strings found at the location bprm->p + * points to; chop off the first by relocating brpm->p to right after + * the first '\0' encountered. 
+ */ +int remove_arg_zero(struct linux_binprm *bprm) +{ + int ret = 0; + unsigned long offset; + char *kaddr; + struct page *page; + + if (!bprm->argc) + return 0; + + do { + offset = bprm->p & ~PAGE_MASK; + page = get_arg_page(bprm, bprm->p, 0); + if (!page) { + ret = -EFAULT; + goto out; + } + kaddr = kmap_atomic(page); + + for (; offset < PAGE_SIZE && kaddr[offset]; + offset++, bprm->p++) + ; + + kunmap_atomic(kaddr); + put_arg_page(page); + } while (offset == PAGE_SIZE); + + bprm->p++; + bprm->argc--; + ret = 0; + +out: + return ret; +} +EXPORT_SYMBOL(remove_arg_zero); + +#define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e)) +/* + * cycle the list of binary formats handler, until one recognizes the image + */ +static int search_binary_handler(struct linux_binprm *bprm) +{ + bool need_retry = IS_ENABLED(CONFIG_MODULES); + struct linux_binfmt *fmt; + int retval; + + retval = prepare_binprm(bprm); + if (retval < 0) + return retval; + + retval = security_bprm_check(bprm); + if (retval) + return retval; + + retval = -ENOENT; + retry: + read_lock(&binfmt_lock); + list_for_each_entry(fmt, &formats, lh) { + if (!try_module_get(fmt->module)) + continue; + read_unlock(&binfmt_lock); + + retval = fmt->load_binary(bprm); + + read_lock(&binfmt_lock); + put_binfmt(fmt); + if (bprm->point_of_no_return || (retval != -ENOEXEC)) { + read_unlock(&binfmt_lock); + return retval; + } + } + read_unlock(&binfmt_lock); + + if (need_retry) { + if (printable(bprm->buf[0]) && printable(bprm->buf[1]) && + printable(bprm->buf[2]) && printable(bprm->buf[3])) + return retval; + if (request_module("binfmt-%04x", *(ushort *)(bprm->buf + 2)) < 0) + return retval; + need_retry = false; + goto retry; + } + + return retval; +} + +static int exec_binprm(struct linux_binprm *bprm) +{ + pid_t old_pid, old_vpid; + int ret, depth; + + /* Need to fetch pid before load_binary changes it */ + old_pid = current->pid; + rcu_read_lock(); + old_vpid = task_pid_nr_ns(current, 
task_active_pid_ns(current->parent)); + rcu_read_unlock(); + + /* This allows 4 levels of binfmt rewrites before failing hard. */ + for (depth = 0;; depth++) { + struct file *exec; + if (depth > 5) + return -ELOOP; + + ret = search_binary_handler(bprm); + if (ret < 0) + return ret; + if (!bprm->interpreter) + break; + + exec = bprm->file; + bprm->file = bprm->interpreter; + bprm->interpreter = NULL; + + allow_write_access(exec); + if (unlikely(bprm->have_execfd)) { + if (bprm->executable) { + fput(exec); + return -ENOEXEC; + } + bprm->executable = exec; + } else + fput(exec); + } + + audit_bprm(bprm); + trace_sched_process_exec(current, old_pid, bprm); + ptrace_event(PTRACE_EVENT_EXEC, old_vpid); + proc_exec_connector(current); + return 0; +} + +/* + * sys_execve() executes a new program. + */ +static int bprm_execve(struct linux_binprm *bprm, + int fd, struct filename *filename, int flags) +{ + struct file *file; + struct files_struct *displaced; + int retval; + + /* + * Cancel any io_uring activity across execve + */ + io_uring_task_cancel(); + + retval = unshare_files(&displaced); + if (retval) + return retval; + + retval = prepare_bprm_creds(bprm); + if (retval) + goto out_files; + + check_unsafe_exec(bprm); + current->in_execve = 1; + + file = do_open_execat(fd, filename, flags); + retval = PTR_ERR(file); + if (IS_ERR(file)) + goto out_unmark; + + sched_exec(); + + bprm->file = file; + /* + * Record that a name derived from an O_CLOEXEC fd will be + * inaccessible after exec. Relies on having exclusive access to + * current->files (due to unshare_files above). 
+ */ + if (bprm->fdpath && + close_on_exec(fd, rcu_dereference_raw(current->files->fdt))) + bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE; + + /* Set the unchanging part of bprm->cred */ + retval = security_bprm_creds_for_exec(bprm); + if (retval) + goto out; + + retval = exec_binprm(bprm); + if (retval < 0) + goto out; + + /* execve succeeded */ + current->fs->in_exec = 0; + current->in_execve = 0; + rseq_execve(current); + acct_update_integrals(current); + task_numa_free(current, false); + if (displaced) + put_files_struct(displaced); + return retval; + +out: + /* + * If past the point of no return ensure the the code never + * returns to the userspace process. Use an existing fatal + * signal if present otherwise terminate the process with + * SIGSEGV. + */ + if (bprm->point_of_no_return && !fatal_signal_pending(current)) + force_sigsegv(SIGSEGV); + +out_unmark: + current->fs->in_exec = 0; + current->in_execve = 0; + +out_files: + if (displaced) + reset_files_struct(displaced); + + return retval; +} + +static int do_execveat_common(int fd, struct filename *filename, + struct user_arg_ptr argv, + struct user_arg_ptr envp, + int flags) +{ + struct linux_binprm *bprm; + int retval; + + if (IS_ERR(filename)) + return PTR_ERR(filename); + + /* + * We move the actual failure in case of RLIMIT_NPROC excess from + * set*uid() to execve() because too many poorly written programs + * don't check setuid() return code. Here we additionally recheck + * whether NPROC limit is still exceeded. + */ + if ((current->flags & PF_NPROC_EXCEEDED) && + atomic_read(¤t_user()->processes) > rlimit(RLIMIT_NPROC)) { + retval = -EAGAIN; + goto out_ret; + } + + /* We're below the limit (still or again), so we don't want to make + * further execve() calls fail. 
*/ + current->flags &= ~PF_NPROC_EXCEEDED; + + bprm = alloc_bprm(fd, filename); + if (IS_ERR(bprm)) { + retval = PTR_ERR(bprm); + goto out_ret; + } + + retval = count(argv, MAX_ARG_STRINGS); + if (retval == 0) + pr_warn_once("process '%s' launched '%s' with NULL argv: empty string added\n", + current->comm, bprm->filename); + if (retval < 0) + goto out_free; + bprm->argc = retval; + + retval = count(envp, MAX_ARG_STRINGS); + if (retval < 0) + goto out_free; + bprm->envc = retval; + + retval = bprm_stack_limits(bprm); + if (retval < 0) + goto out_free; + + retval = copy_string_kernel(bprm->filename, bprm); + if (retval < 0) + goto out_free; + bprm->exec = bprm->p; + + retval = copy_strings(bprm->envc, envp, bprm); + if (retval < 0) + goto out_free; + + retval = copy_strings(bprm->argc, argv, bprm); + if (retval < 0) + goto out_free; + + /* + * When argv is empty, add an empty string ("") as argv[0] to + * ensure confused userspace programs that start processing + * from argv[1] won't end up walking envp. See also + * bprm_stack_limits(). 
+ */ + if (bprm->argc == 0) { + retval = copy_string_kernel("", bprm); + if (retval < 0) + goto out_free; + bprm->argc = 1; + } + + retval = bprm_execve(bprm, fd, filename, flags); +out_free: + free_bprm(bprm); + +out_ret: + putname(filename); + return retval; +} + +int kernel_execve(const char *kernel_filename, + const char *const *argv, const char *const *envp) +{ + struct filename *filename; + struct linux_binprm *bprm; + int fd = AT_FDCWD; + int retval; + + filename = getname_kernel(kernel_filename); + if (IS_ERR(filename)) + return PTR_ERR(filename); + + bprm = alloc_bprm(fd, filename); + if (IS_ERR(bprm)) { + retval = PTR_ERR(bprm); + goto out_ret; + } + + retval = count_strings_kernel(argv); + if (WARN_ON_ONCE(retval == 0)) + retval = -EINVAL; + if (retval < 0) + goto out_free; + bprm->argc = retval; + + retval = count_strings_kernel(envp); + if (retval < 0) + goto out_free; + bprm->envc = retval; + + retval = bprm_stack_limits(bprm); + if (retval < 0) + goto out_free; + + retval = copy_string_kernel(bprm->filename, bprm); + if (retval < 0) + goto out_free; + bprm->exec = bprm->p; + + retval = copy_strings_kernel(bprm->envc, envp, bprm); + if (retval < 0) + goto out_free; + + retval = copy_strings_kernel(bprm->argc, argv, bprm); + if (retval < 0) + goto out_free; + + retval = bprm_execve(bprm, fd, filename, 0); +out_free: + free_bprm(bprm); +out_ret: + putname(filename); + return retval; +} + +static int do_execve(struct filename *filename, + const char __user *const __user *__argv, + const char __user *const __user *__envp) +{ + struct user_arg_ptr argv = { .ptr.native = __argv }; + struct user_arg_ptr envp = { .ptr.native = __envp }; + return do_execveat_common(AT_FDCWD, filename, argv, envp, 0); +} + +static int do_execveat(int fd, struct filename *filename, + const char __user *const __user *__argv, + const char __user *const __user *__envp, + int flags) +{ + struct user_arg_ptr argv = { .ptr.native = __argv }; + struct user_arg_ptr envp = { .ptr.native 
= __envp }; + + return do_execveat_common(fd, filename, argv, envp, flags); +} + +#ifdef CONFIG_COMPAT +static int compat_do_execve(struct filename *filename, + const compat_uptr_t __user *__argv, + const compat_uptr_t __user *__envp) +{ + struct user_arg_ptr argv = { + .is_compat = true, + .ptr.compat = __argv, + }; + struct user_arg_ptr envp = { + .is_compat = true, + .ptr.compat = __envp, + }; + return do_execveat_common(AT_FDCWD, filename, argv, envp, 0); +} + +static int compat_do_execveat(int fd, struct filename *filename, + const compat_uptr_t __user *__argv, + const compat_uptr_t __user *__envp, + int flags) +{ + struct user_arg_ptr argv = { + .is_compat = true, + .ptr.compat = __argv, + }; + struct user_arg_ptr envp = { + .is_compat = true, + .ptr.compat = __envp, + }; + return do_execveat_common(fd, filename, argv, envp, flags); +} +#endif + +void set_binfmt(struct linux_binfmt *new) +{ + struct mm_struct *mm = current->mm; + + if (mm->binfmt) + module_put(mm->binfmt->module); + + mm->binfmt = new; + if (new) + __module_get(new->module); +} +EXPORT_SYMBOL(set_binfmt); + +/* + * set_dumpable stores three-value SUID_DUMP_* into mm->flags. + */ +void set_dumpable(struct mm_struct *mm, int value) +{ + if (WARN_ON((unsigned)value > SUID_DUMP_ROOT)) + return; + + set_mask_bits(&mm->flags, MMF_DUMPABLE_MASK, value); +} + +SYSCALL_DEFINE3(execve, + const char __user *, filename, + const char __user *const __user *, argv, + const char __user *const __user *, envp) +{ + return do_execve(getname(filename), argv, envp); +} + +SYSCALL_DEFINE5(execveat, + int, fd, const char __user *, filename, + const char __user *const __user *, argv, + const char __user *const __user *, envp, + int, flags) +{ + int lookup_flags = (flags & AT_EMPTY_PATH) ? 
LOOKUP_EMPTY : 0; + + return do_execveat(fd, + getname_flags(filename, lookup_flags, NULL), + argv, envp, flags); +} + +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE3(execve, const char __user *, filename, + const compat_uptr_t __user *, argv, + const compat_uptr_t __user *, envp) +{ + return compat_do_execve(getname(filename), argv, envp); +} + +COMPAT_SYSCALL_DEFINE5(execveat, int, fd, + const char __user *, filename, + const compat_uptr_t __user *, argv, + const compat_uptr_t __user *, envp, + int, flags) +{ + int lookup_flags = (flags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0; + + return compat_do_execveat(fd, + getname_flags(filename, lookup_flags, NULL), + argv, envp, flags); +} +#endif diff --git a/fs/inode.c b/fs/inode.c index 5d835021fd7b..b532c848f065 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -24,6 +24,10 @@ #include #include "internal.h" +#ifdef CONFIG_KSU_SUSFS_SUS_KSTAT +extern bool susfs_is_current_ksu_domain(void); +#endif + /* * Inode locking rules: * @@ -1829,6 +1833,11 @@ int generic_update_time(struct inode *inode, struct timespec64 *time, int flags) int iflags = I_DIRTY_TIME; bool dirty = false; +#ifdef CONFIG_KSU_SUSFS_SUS_KSTAT + if (susfs_is_current_ksu_domain()) { + return 0; + } +#endif if (flags & S_ATIME) inode->i_atime = *time; if (flags & S_VERSION) @@ -1854,6 +1863,11 @@ EXPORT_SYMBOL(generic_update_time); */ int inode_update_time(struct inode *inode, struct timespec64 *time, int flags) { +#ifdef CONFIG_KSU_SUSFS_SUS_KSTAT + if (susfs_is_current_ksu_domain()) { + return 0; + } +#endif if (inode->i_op->update_time) return inode->i_op->update_time(inode, time, flags); return generic_update_time(inode, time, flags); @@ -1910,6 +1924,12 @@ void touch_atime(const struct path *path) struct inode *inode = d_inode(path->dentry); struct timespec64 now; +#ifdef CONFIG_KSU_SUSFS_SUS_KSTAT + if (susfs_is_current_ksu_domain()) { + return; + } +#endif + if (!atime_needs_update(path, inode)) return; diff --git a/fs/inode.c.orig b/fs/inode.c.orig new file 
mode 100644 index 000000000000..5d835021fd7b --- /dev/null +++ b/fs/inode.c.orig @@ -0,0 +1,2493 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * (C) 1997 Linus Torvalds + * (C) 1999 Andrea Arcangeli (dynamic inode allocation) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for inode_has_buffers */ +#include +#include +#include +#include +#include "internal.h" + +/* + * Inode locking rules: + * + * inode->i_lock protects: + * inode->i_state, inode->i_hash, __iget() + * Inode LRU list locks protect: + * inode->i_sb->s_inode_lru, inode->i_lru + * inode->i_sb->s_inode_list_lock protects: + * inode->i_sb->s_inodes, inode->i_sb_list + * bdi->wb.list_lock protects: + * bdi->wb.b_{dirty,io,more_io,dirty_time}, inode->i_io_list + * inode_hash_lock protects: + * inode_hashtable, inode->i_hash + * + * Lock ordering: + * + * inode->i_sb->s_inode_list_lock + * inode->i_lock + * Inode LRU list locks + * + * bdi->wb.list_lock + * inode->i_lock + * + * inode_hash_lock + * inode->i_sb->s_inode_list_lock + * inode->i_lock + * + * iunique_lock + * inode_hash_lock + */ + +static unsigned int i_hash_mask __read_mostly; +static unsigned int i_hash_shift __read_mostly; +static struct hlist_head *inode_hashtable __read_mostly; +static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock); + +/* + * Empty aops. Can be used for the cases where the user does not + * define any of the address_space operations. + */ +const struct address_space_operations empty_aops = { +}; +EXPORT_SYMBOL(empty_aops); + +/* + * Statistics gathering.. + */ +struct inodes_stat_t inodes_stat; + +static DEFINE_PER_CPU(unsigned long, nr_inodes); +static DEFINE_PER_CPU(unsigned long, nr_unused); + +static struct kmem_cache *inode_cachep __read_mostly; + +static long get_nr_inodes(void) +{ + int i; + long sum = 0; + for_each_possible_cpu(i) + sum += per_cpu(nr_inodes, i); + return sum < 0 ? 
0 : sum; +} + +static inline long get_nr_inodes_unused(void) +{ + int i; + long sum = 0; + for_each_possible_cpu(i) + sum += per_cpu(nr_unused, i); + return sum < 0 ? 0 : sum; +} + +long get_nr_dirty_inodes(void) +{ + /* not actually dirty inodes, but a wild approximation */ + long nr_dirty = get_nr_inodes() - get_nr_inodes_unused(); + return nr_dirty > 0 ? nr_dirty : 0; +} + +/* + * Handle nr_inode sysctl + */ +#ifdef CONFIG_SYSCTL +int proc_nr_inodes(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + inodes_stat.nr_inodes = get_nr_inodes(); + inodes_stat.nr_unused = get_nr_inodes_unused(); + return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); +} +#endif + +static int no_open(struct inode *inode, struct file *file) +{ + return -ENXIO; +} + +/** + * inode_init_always - perform inode structure initialisation + * @sb: superblock inode belongs to + * @inode: inode to initialise + * + * These are initializations that need to be done on every inode + * allocation as the fields are not initialised by slab allocation. 
+ */ +int inode_init_always(struct super_block *sb, struct inode *inode) +{ + static const struct inode_operations empty_iops; + static const struct file_operations no_open_fops = {.open = no_open}; + struct address_space *const mapping = &inode->i_data; + + inode->i_sb = sb; + inode->i_blkbits = sb->s_blocksize_bits; + inode->i_flags = 0; + atomic64_set(&inode->i_sequence, 0); + atomic_set(&inode->i_count, 1); + inode->i_op = &empty_iops; + inode->i_fop = &no_open_fops; + inode->__i_nlink = 1; + inode->i_opflags = 0; + if (sb->s_xattr) + inode->i_opflags |= IOP_XATTR; + i_uid_write(inode, 0); + i_gid_write(inode, 0); + atomic_set(&inode->i_writecount, 0); + inode->i_size = 0; + inode->i_write_hint = WRITE_LIFE_NOT_SET; + inode->i_blocks = 0; + inode->i_bytes = 0; + inode->i_generation = 0; + inode->i_pipe = NULL; + inode->i_bdev = NULL; + inode->i_cdev = NULL; + inode->i_link = NULL; + inode->i_dir_seq = 0; + inode->i_rdev = 0; + inode->dirtied_when = 0; + +#ifdef CONFIG_CGROUP_WRITEBACK + inode->i_wb_frn_winner = 0; + inode->i_wb_frn_avg_time = 0; + inode->i_wb_frn_history = 0; +#endif + + spin_lock_init(&inode->i_lock); + lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key); + + init_rwsem(&inode->i_rwsem); + lockdep_set_class(&inode->i_rwsem, &sb->s_type->i_mutex_key); + + atomic_set(&inode->i_dio_count, 0); + + mapping->a_ops = &empty_aops; + mapping->host = inode; + mapping->flags = 0; + if (sb->s_type->fs_flags & FS_THP_SUPPORT) + __set_bit(AS_THP_SUPPORT, &mapping->flags); + mapping->wb_err = 0; + atomic_set(&mapping->i_mmap_writable, 0); +#ifdef CONFIG_READ_ONLY_THP_FOR_FS + atomic_set(&mapping->nr_thps, 0); +#endif + mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE); + mapping->private_data = NULL; + mapping->writeback_index = 0; + inode->i_private = NULL; + inode->i_mapping = mapping; + INIT_HLIST_HEAD(&inode->i_dentry); /* buggered by rcu freeing */ +#ifdef CONFIG_FS_POSIX_ACL + inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED; +#endif + 
+#ifdef CONFIG_FSNOTIFY + inode->i_fsnotify_mask = 0; +#endif + inode->i_flctx = NULL; + + if (unlikely(security_inode_alloc(inode))) + return -ENOMEM; + this_cpu_inc(nr_inodes); + + return 0; +} +EXPORT_SYMBOL(inode_init_always); + +void free_inode_nonrcu(struct inode *inode) +{ + kmem_cache_free(inode_cachep, inode); +} +EXPORT_SYMBOL(free_inode_nonrcu); + +static void i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + if (inode->free_inode) + inode->free_inode(inode); + else + free_inode_nonrcu(inode); +} + +static struct inode *alloc_inode(struct super_block *sb) +{ + const struct super_operations *ops = sb->s_op; + struct inode *inode; + + if (ops->alloc_inode) + inode = ops->alloc_inode(sb); + else + inode = kmem_cache_alloc(inode_cachep, GFP_KERNEL); + + if (!inode) + return NULL; + + if (unlikely(inode_init_always(sb, inode))) { + if (ops->destroy_inode) { + ops->destroy_inode(inode); + if (!ops->free_inode) + return NULL; + } + inode->free_inode = ops->free_inode; + i_callback(&inode->i_rcu); + return NULL; + } + + return inode; +} + +void __destroy_inode(struct inode *inode) +{ + BUG_ON(inode_has_buffers(inode)); + inode_detach_wb(inode); + security_inode_free(inode); + fsnotify_inode_delete(inode); + locks_free_lock_context(inode); + if (!inode->i_nlink) { + WARN_ON(atomic_long_read(&inode->i_sb->s_remove_count) == 0); + atomic_long_dec(&inode->i_sb->s_remove_count); + } + +#ifdef CONFIG_FS_POSIX_ACL + if (inode->i_acl && !is_uncached_acl(inode->i_acl)) + posix_acl_release(inode->i_acl); + if (inode->i_default_acl && !is_uncached_acl(inode->i_default_acl)) + posix_acl_release(inode->i_default_acl); +#endif + this_cpu_dec(nr_inodes); +} +EXPORT_SYMBOL(__destroy_inode); + +static void destroy_inode(struct inode *inode) +{ + const struct super_operations *ops = inode->i_sb->s_op; + + BUG_ON(!list_empty(&inode->i_lru)); + __destroy_inode(inode); + if (ops->destroy_inode) { + ops->destroy_inode(inode); + if 
(!ops->free_inode) + return; + } + inode->free_inode = ops->free_inode; + call_rcu(&inode->i_rcu, i_callback); +} + +/** + * drop_nlink - directly drop an inode's link count + * @inode: inode + * + * This is a low-level filesystem helper to replace any + * direct filesystem manipulation of i_nlink. In cases + * where we are attempting to track writes to the + * filesystem, a decrement to zero means an imminent + * write when the file is truncated and actually unlinked + * on the filesystem. + */ +void drop_nlink(struct inode *inode) +{ + WARN_ON(inode->i_nlink == 0); + inode->__i_nlink--; + if (!inode->i_nlink) + atomic_long_inc(&inode->i_sb->s_remove_count); +} +EXPORT_SYMBOL_NS(drop_nlink, ANDROID_GKI_VFS_EXPORT_ONLY); + +/** + * clear_nlink - directly zero an inode's link count + * @inode: inode + * + * This is a low-level filesystem helper to replace any + * direct filesystem manipulation of i_nlink. See + * drop_nlink() for why we care about i_nlink hitting zero. + */ +void clear_nlink(struct inode *inode) +{ + if (inode->i_nlink) { + inode->__i_nlink = 0; + atomic_long_inc(&inode->i_sb->s_remove_count); + } +} +EXPORT_SYMBOL(clear_nlink); + +/** + * set_nlink - directly set an inode's link count + * @inode: inode + * @nlink: new nlink (should be non-zero) + * + * This is a low-level filesystem helper to replace any + * direct filesystem manipulation of i_nlink. + */ +void set_nlink(struct inode *inode, unsigned int nlink) +{ + if (!nlink) { + clear_nlink(inode); + } else { + /* Yes, some filesystems do change nlink from zero to one */ + if (inode->i_nlink == 0) + atomic_long_dec(&inode->i_sb->s_remove_count); + + inode->__i_nlink = nlink; + } +} +EXPORT_SYMBOL_NS(set_nlink, ANDROID_GKI_VFS_EXPORT_ONLY); + +/** + * inc_nlink - directly increment an inode's link count + * @inode: inode + * + * This is a low-level filesystem helper to replace any + * direct filesystem manipulation of i_nlink. Currently, + * it is only here for parity with dec_nlink(). 
+ */ +void inc_nlink(struct inode *inode) +{ + if (unlikely(inode->i_nlink == 0)) { + WARN_ON(!(inode->i_state & I_LINKABLE)); + atomic_long_dec(&inode->i_sb->s_remove_count); + } + + inode->__i_nlink++; +} +EXPORT_SYMBOL(inc_nlink); + +static void __address_space_init_once(struct address_space *mapping) +{ + xa_init_flags(&mapping->i_pages, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT); + init_rwsem(&mapping->i_mmap_rwsem); + INIT_LIST_HEAD(&mapping->private_list); + spin_lock_init(&mapping->private_lock); + mapping->i_mmap = RB_ROOT_CACHED; +} + +void address_space_init_once(struct address_space *mapping) +{ + memset(mapping, 0, sizeof(*mapping)); + __address_space_init_once(mapping); +} +EXPORT_SYMBOL(address_space_init_once); + +/* + * These are initializations that only need to be done + * once, because the fields are idempotent across use + * of the inode, so let the slab aware of that. + */ +void inode_init_once(struct inode *inode) +{ + memset(inode, 0, sizeof(*inode)); + INIT_HLIST_NODE(&inode->i_hash); + INIT_LIST_HEAD(&inode->i_devices); + INIT_LIST_HEAD(&inode->i_io_list); + INIT_LIST_HEAD(&inode->i_wb_list); + INIT_LIST_HEAD(&inode->i_lru); + __address_space_init_once(&inode->i_data); + i_size_ordered_init(inode); +} +EXPORT_SYMBOL_NS(inode_init_once, ANDROID_GKI_VFS_EXPORT_ONLY); + +static void init_once(void *foo) +{ + struct inode *inode = (struct inode *) foo; + + inode_init_once(inode); +} + +/* + * inode->i_lock must be held + */ +void __iget(struct inode *inode) +{ + atomic_inc(&inode->i_count); +} + +/* + * get additional reference to inode; caller must already hold one. 
+ */ +void ihold(struct inode *inode) +{ + WARN_ON(atomic_inc_return(&inode->i_count) < 2); +} +EXPORT_SYMBOL_NS(ihold, ANDROID_GKI_VFS_EXPORT_ONLY); + +static void inode_lru_list_add(struct inode *inode) +{ + if (list_lru_add(&inode->i_sb->s_inode_lru, &inode->i_lru)) + this_cpu_inc(nr_unused); + else + inode->i_state |= I_REFERENCED; +} + +/* + * Add inode to LRU if needed (inode is unused and clean). + * + * Needs inode->i_lock held. + */ +void inode_add_lru(struct inode *inode) +{ + if (!(inode->i_state & (I_DIRTY_ALL | I_SYNC | + I_FREEING | I_WILL_FREE)) && + !atomic_read(&inode->i_count) && inode->i_sb->s_flags & SB_ACTIVE) + inode_lru_list_add(inode); +} + + +static void inode_lru_list_del(struct inode *inode) +{ + + if (list_lru_del(&inode->i_sb->s_inode_lru, &inode->i_lru)) + this_cpu_dec(nr_unused); +} + +static void inode_pin_lru_isolating(struct inode *inode) +{ + lockdep_assert_held(&inode->i_lock); + WARN_ON(inode->i_state & (I_LRU_ISOLATING | I_FREEING | I_WILL_FREE)); + inode->i_state |= I_LRU_ISOLATING; +} + +static void inode_unpin_lru_isolating(struct inode *inode) +{ + spin_lock(&inode->i_lock); + WARN_ON(!(inode->i_state & I_LRU_ISOLATING)); + inode->i_state &= ~I_LRU_ISOLATING; + smp_mb(); + wake_up_bit(&inode->i_state, __I_LRU_ISOLATING); + spin_unlock(&inode->i_lock); +} + +static void inode_wait_for_lru_isolating(struct inode *inode) +{ + spin_lock(&inode->i_lock); + if (inode->i_state & I_LRU_ISOLATING) { + DEFINE_WAIT_BIT(wq, &inode->i_state, __I_LRU_ISOLATING); + wait_queue_head_t *wqh; + + wqh = bit_waitqueue(&inode->i_state, __I_LRU_ISOLATING); + spin_unlock(&inode->i_lock); + __wait_on_bit(wqh, &wq, bit_wait, TASK_UNINTERRUPTIBLE); + spin_lock(&inode->i_lock); + WARN_ON(inode->i_state & I_LRU_ISOLATING); + } + spin_unlock(&inode->i_lock); +} + +/** + * inode_sb_list_add - add inode to the superblock list of inodes + * @inode: inode to add + */ +void inode_sb_list_add(struct inode *inode) +{ + 
spin_lock(&inode->i_sb->s_inode_list_lock); + list_add(&inode->i_sb_list, &inode->i_sb->s_inodes); + spin_unlock(&inode->i_sb->s_inode_list_lock); +} +EXPORT_SYMBOL_GPL(inode_sb_list_add); + +static inline void inode_sb_list_del(struct inode *inode) +{ + if (!list_empty(&inode->i_sb_list)) { + spin_lock(&inode->i_sb->s_inode_list_lock); + list_del_init(&inode->i_sb_list); + spin_unlock(&inode->i_sb->s_inode_list_lock); + } +} + +static unsigned long hash(struct super_block *sb, unsigned long hashval) +{ + unsigned long tmp; + + tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) / + L1_CACHE_BYTES; + tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> i_hash_shift); + return tmp & i_hash_mask; +} + +/** + * __insert_inode_hash - hash an inode + * @inode: unhashed inode + * @hashval: unsigned long value used to locate this object in the + * inode_hashtable. + * + * Add an inode to the inode hash for this superblock. + */ +void __insert_inode_hash(struct inode *inode, unsigned long hashval) +{ + struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval); + + spin_lock(&inode_hash_lock); + spin_lock(&inode->i_lock); + hlist_add_head_rcu(&inode->i_hash, b); + spin_unlock(&inode->i_lock); + spin_unlock(&inode_hash_lock); +} +EXPORT_SYMBOL_NS(__insert_inode_hash, ANDROID_GKI_VFS_EXPORT_ONLY); + +/** + * __remove_inode_hash - remove an inode from the hash + * @inode: inode to unhash + * + * Remove an inode from the superblock. + */ +void __remove_inode_hash(struct inode *inode) +{ + spin_lock(&inode_hash_lock); + spin_lock(&inode->i_lock); + hlist_del_init_rcu(&inode->i_hash); + spin_unlock(&inode->i_lock); + spin_unlock(&inode_hash_lock); +} +EXPORT_SYMBOL_NS(__remove_inode_hash, ANDROID_GKI_VFS_EXPORT_ONLY); + +void clear_inode(struct inode *inode) +{ + /* + * We have to cycle the i_pages lock here because reclaim can be in the + * process of removing the last page (in __delete_from_page_cache()) + * and we must not free the mapping under it. 
+ */ + xa_lock_irq(&inode->i_data.i_pages); + BUG_ON(inode->i_data.nrpages); + BUG_ON(inode->i_data.nrexceptional); + xa_unlock_irq(&inode->i_data.i_pages); + BUG_ON(!list_empty(&inode->i_data.private_list)); + BUG_ON(!(inode->i_state & I_FREEING)); + BUG_ON(inode->i_state & I_CLEAR); + BUG_ON(!list_empty(&inode->i_wb_list)); + /* don't need i_lock here, no concurrent mods to i_state */ + inode->i_state = I_FREEING | I_CLEAR; +} +EXPORT_SYMBOL_NS(clear_inode, ANDROID_GKI_VFS_EXPORT_ONLY); + +/* + * Free the inode passed in, removing it from the lists it is still connected + * to. We remove any pages still attached to the inode and wait for any IO that + * is still in progress before finally destroying the inode. + * + * An inode must already be marked I_FREEING so that we avoid the inode being + * moved back onto lists if we race with other code that manipulates the lists + * (e.g. writeback_single_inode). The caller is responsible for setting this. + * + * An inode must already be removed from the LRU list before being evicted from + * the cache. This should occur atomically with setting the I_FREEING state + * flag, so no inodes here should ever be on the LRU when being evicted. + */ +static void evict(struct inode *inode) +{ + const struct super_operations *op = inode->i_sb->s_op; + + BUG_ON(!(inode->i_state & I_FREEING)); + BUG_ON(!list_empty(&inode->i_lru)); + + if (!list_empty(&inode->i_io_list)) + inode_io_list_del(inode); + + inode_sb_list_del(inode); + + inode_wait_for_lru_isolating(inode); + + /* + * Wait for flusher thread to be done with the inode so that filesystem + * does not start destroying it while writeback is still running. Since + * the inode has I_FREEING set, flusher thread won't start new work on + * the inode. We just have to wait for running writeback to finish. 
+ */ + inode_wait_for_writeback(inode); + + if (op->evict_inode) { + op->evict_inode(inode); + } else { + truncate_inode_pages_final(&inode->i_data); + clear_inode(inode); + } + if (S_ISBLK(inode->i_mode) && inode->i_bdev) + bd_forget(inode); + if (S_ISCHR(inode->i_mode) && inode->i_cdev) + cd_forget(inode); + + remove_inode_hash(inode); + + spin_lock(&inode->i_lock); + wake_up_bit(&inode->i_state, __I_NEW); + BUG_ON(inode->i_state != (I_FREEING | I_CLEAR)); + spin_unlock(&inode->i_lock); + + destroy_inode(inode); +} + +/* + * dispose_list - dispose of the contents of a local list + * @head: the head of the list to free + * + * Dispose-list gets a local list with local inodes in it, so it doesn't + * need to worry about list corruption and SMP locks. + */ +static void dispose_list(struct list_head *head) +{ + while (!list_empty(head)) { + struct inode *inode; + + inode = list_first_entry(head, struct inode, i_lru); + list_del_init(&inode->i_lru); + + evict(inode); + cond_resched(); + } +} + +/** + * evict_inodes - evict all evictable inodes for a superblock + * @sb: superblock to operate on + * + * Make sure that no inodes with zero refcount are retained. This is + * called by superblock shutdown after having SB_ACTIVE flag removed, + * so any inode reaching zero refcount during or after that call will + * be immediately evicted. 
+ */ +void evict_inodes(struct super_block *sb) +{ + struct inode *inode, *next; + LIST_HEAD(dispose); + +again: + spin_lock(&sb->s_inode_list_lock); + list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { + if (atomic_read(&inode->i_count)) + continue; + + spin_lock(&inode->i_lock); + if (atomic_read(&inode->i_count)) { + spin_unlock(&inode->i_lock); + continue; + } + if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { + spin_unlock(&inode->i_lock); + continue; + } + + inode->i_state |= I_FREEING; + inode_lru_list_del(inode); + spin_unlock(&inode->i_lock); + list_add(&inode->i_lru, &dispose); + + /* + * We can have a ton of inodes to evict at unmount time given + * enough memory, check to see if we need to go to sleep for a + * bit so we don't livelock. + */ + if (need_resched()) { + spin_unlock(&sb->s_inode_list_lock); + cond_resched(); + dispose_list(&dispose); + goto again; + } + } + spin_unlock(&sb->s_inode_list_lock); + + dispose_list(&dispose); +} +EXPORT_SYMBOL_GPL(evict_inodes); + +/** + * invalidate_inodes - attempt to free all inodes on a superblock + * @sb: superblock to operate on + * @kill_dirty: flag to guide handling of dirty inodes + * + * Attempts to free all inodes for a given superblock. If there were any + * busy inodes return a non-zero value, else zero. + * If @kill_dirty is set, discard dirty inodes too, otherwise treat + * them as busy. 
+ */ +int invalidate_inodes(struct super_block *sb, bool kill_dirty) +{ + int busy = 0; + struct inode *inode, *next; + LIST_HEAD(dispose); + +again: + spin_lock(&sb->s_inode_list_lock); + list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { + spin_lock(&inode->i_lock); + if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { + spin_unlock(&inode->i_lock); + continue; + } + if (inode->i_state & I_DIRTY_ALL && !kill_dirty) { + spin_unlock(&inode->i_lock); + busy = 1; + continue; + } + if (atomic_read(&inode->i_count)) { + spin_unlock(&inode->i_lock); + busy = 1; + continue; + } + + inode->i_state |= I_FREEING; + inode_lru_list_del(inode); + spin_unlock(&inode->i_lock); + list_add(&inode->i_lru, &dispose); + if (need_resched()) { + spin_unlock(&sb->s_inode_list_lock); + cond_resched(); + dispose_list(&dispose); + goto again; + } + } + spin_unlock(&sb->s_inode_list_lock); + + dispose_list(&dispose); + + return busy; +} + +/* + * Isolate the inode from the LRU in preparation for freeing it. + * + * Any inodes which are pinned purely because of attached pagecache have their + * pagecache removed. If the inode has metadata buffers attached to + * mapping->private_list then try to remove them. + * + * If the inode has the I_REFERENCED flag set, then it means that it has been + * used recently - the flag is set in iput_final(). When we encounter such an + * inode, clear the flag and move it to the back of the LRU so it gets another + * pass through the LRU before it gets reclaimed. This is necessary because of + * the fact we are doing lazy LRU updates to minimise lock contention so the + * LRU does not have strict ordering. Hence we don't want to reclaim inodes + * with this flag set because they are the inodes that are out of order. 
+ */ +static enum lru_status inode_lru_isolate(struct list_head *item, + struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) +{ + struct list_head *freeable = arg; + struct inode *inode = container_of(item, struct inode, i_lru); + + /* + * we are inverting the lru lock/inode->i_lock here, so use a trylock. + * If we fail to get the lock, just skip it. + */ + if (!spin_trylock(&inode->i_lock)) + return LRU_SKIP; + + /* + * Referenced or dirty inodes are still in use. Give them another pass + * through the LRU as we canot reclaim them now. + */ + if (atomic_read(&inode->i_count) || + (inode->i_state & ~I_REFERENCED)) { + list_lru_isolate(lru, &inode->i_lru); + spin_unlock(&inode->i_lock); + this_cpu_dec(nr_unused); + return LRU_REMOVED; + } + + /* recently referenced inodes get one more pass */ + if (inode->i_state & I_REFERENCED) { + inode->i_state &= ~I_REFERENCED; + spin_unlock(&inode->i_lock); + return LRU_ROTATE; + } + + if (inode_has_buffers(inode) || inode->i_data.nrpages) { + inode_pin_lru_isolating(inode); + spin_unlock(&inode->i_lock); + spin_unlock(lru_lock); + if (remove_inode_buffers(inode)) { + unsigned long reap; + reap = invalidate_mapping_pages(&inode->i_data, 0, -1); + if (current_is_kswapd()) + __count_vm_events(KSWAPD_INODESTEAL, reap); + else + __count_vm_events(PGINODESTEAL, reap); + if (current->reclaim_state) + current->reclaim_state->reclaimed_slab += reap; + } + inode_unpin_lru_isolating(inode); + spin_lock(lru_lock); + return LRU_RETRY; + } + + WARN_ON(inode->i_state & I_NEW); + inode->i_state |= I_FREEING; + list_lru_isolate_move(lru, &inode->i_lru, freeable); + spin_unlock(&inode->i_lock); + + this_cpu_dec(nr_unused); + return LRU_REMOVED; +} + +/* + * Walk the superblock inode LRU for freeable inodes and attempt to free them. + * This is called from the superblock shrinker function with a number of inodes + * to trim from the LRU. 
Inodes to be freed are moved to a temporary list and + * then are freed outside inode_lock by dispose_list(). + */ +long prune_icache_sb(struct super_block *sb, struct shrink_control *sc) +{ + LIST_HEAD(freeable); + long freed; + + freed = list_lru_shrink_walk(&sb->s_inode_lru, sc, + inode_lru_isolate, &freeable); + dispose_list(&freeable); + return freed; +} + +static void __wait_on_freeing_inode(struct inode *inode); +/* + * Called with the inode lock held. + */ +static struct inode *find_inode(struct super_block *sb, + struct hlist_head *head, + int (*test)(struct inode *, void *), + void *data) +{ + struct inode *inode = NULL; + +repeat: + hlist_for_each_entry(inode, head, i_hash) { + if (inode->i_sb != sb) + continue; + if (!test(inode, data)) + continue; + spin_lock(&inode->i_lock); + if (inode->i_state & (I_FREEING|I_WILL_FREE)) { + __wait_on_freeing_inode(inode); + goto repeat; + } + if (unlikely(inode->i_state & I_CREATING)) { + spin_unlock(&inode->i_lock); + return ERR_PTR(-ESTALE); + } + __iget(inode); + spin_unlock(&inode->i_lock); + return inode; + } + return NULL; +} + +/* + * find_inode_fast is the fast path version of find_inode, see the comment at + * iget_locked for details. + */ +static struct inode *find_inode_fast(struct super_block *sb, + struct hlist_head *head, unsigned long ino) +{ + struct inode *inode = NULL; + +repeat: + hlist_for_each_entry(inode, head, i_hash) { + if (inode->i_ino != ino) + continue; + if (inode->i_sb != sb) + continue; + spin_lock(&inode->i_lock); + if (inode->i_state & (I_FREEING|I_WILL_FREE)) { + __wait_on_freeing_inode(inode); + goto repeat; + } + if (unlikely(inode->i_state & I_CREATING)) { + spin_unlock(&inode->i_lock); + return ERR_PTR(-ESTALE); + } + __iget(inode); + spin_unlock(&inode->i_lock); + return inode; + } + return NULL; +} + +/* + * Each cpu owns a range of LAST_INO_BATCH numbers. + * 'shared_last_ino' is dirtied only once out of LAST_INO_BATCH allocations, + * to renew the exhausted range. 
+ * + * This does not significantly increase overflow rate because every CPU can + * consume at most LAST_INO_BATCH-1 unused inode numbers. So there is + * NR_CPUS*(LAST_INO_BATCH-1) wastage. At 4096 and 1024, this is ~0.1% of the + * 2^32 range, and is a worst-case. Even a 50% wastage would only increase + * overflow rate by 2x, which does not seem too significant. + * + * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW + * error if st_ino won't fit in target struct field. Use 32bit counter + * here to attempt to avoid that. + */ +#define LAST_INO_BATCH 1024 +static DEFINE_PER_CPU(unsigned int, last_ino); + +unsigned int get_next_ino(void) +{ + unsigned int *p = &get_cpu_var(last_ino); + unsigned int res = *p; + +#ifdef CONFIG_SMP + if (unlikely((res & (LAST_INO_BATCH-1)) == 0)) { + static atomic_t shared_last_ino; + int next = atomic_add_return(LAST_INO_BATCH, &shared_last_ino); + + res = next - LAST_INO_BATCH; + } +#endif + + res++; + /* get_next_ino should not provide a 0 inode number */ + if (unlikely(!res)) + res++; + *p = res; + put_cpu_var(last_ino); + return res; +} +EXPORT_SYMBOL(get_next_ino); + +/** + * new_inode_pseudo - obtain an inode + * @sb: superblock + * + * Allocates a new inode for given superblock. + * Inode wont be chained in superblock s_inodes list + * This means : + * - fs can't be unmount + * - quotas, fsnotify, writeback can't work + */ +struct inode *new_inode_pseudo(struct super_block *sb) +{ + struct inode *inode = alloc_inode(sb); + + if (inode) { + spin_lock(&inode->i_lock); + inode->i_state = 0; + spin_unlock(&inode->i_lock); + INIT_LIST_HEAD(&inode->i_sb_list); + } + return inode; +} + +/** + * new_inode - obtain an inode + * @sb: superblock + * + * Allocates a new inode for given superblock. The default gfp_mask + * for allocations related to inode->i_mapping is GFP_HIGHUSER_MOVABLE. 
+ * If HIGHMEM pages are unsuitable or it is known that pages allocated + * for the page cache are not reclaimable or migratable, + * mapping_set_gfp_mask() must be called with suitable flags on the + * newly created inode's mapping + * + */ +struct inode *new_inode(struct super_block *sb) +{ + struct inode *inode; + + spin_lock_prefetch(&sb->s_inode_list_lock); + + inode = new_inode_pseudo(sb); + if (inode) + inode_sb_list_add(inode); + return inode; +} +EXPORT_SYMBOL(new_inode); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +void lockdep_annotate_inode_mutex_key(struct inode *inode) +{ + if (S_ISDIR(inode->i_mode)) { + struct file_system_type *type = inode->i_sb->s_type; + + /* Set new key only if filesystem hasn't already changed it */ + if (lockdep_match_class(&inode->i_rwsem, &type->i_mutex_key)) { + /* + * ensure nobody is actually holding i_mutex + */ + // mutex_destroy(&inode->i_mutex); + init_rwsem(&inode->i_rwsem); + lockdep_set_class(&inode->i_rwsem, + &type->i_mutex_dir_key); + } + } +} +EXPORT_SYMBOL(lockdep_annotate_inode_mutex_key); +#endif + +/** + * unlock_new_inode - clear the I_NEW state and wake up any waiters + * @inode: new inode to unlock + * + * Called when the inode is fully initialised to clear the new state of the + * inode and wake up anyone waiting for the inode to finish initialisation. 
+ */ +void unlock_new_inode(struct inode *inode) +{ + lockdep_annotate_inode_mutex_key(inode); + spin_lock(&inode->i_lock); + WARN_ON(!(inode->i_state & I_NEW)); + inode->i_state &= ~I_NEW & ~I_CREATING; + smp_mb(); + wake_up_bit(&inode->i_state, __I_NEW); + spin_unlock(&inode->i_lock); +} +EXPORT_SYMBOL_NS(unlock_new_inode, ANDROID_GKI_VFS_EXPORT_ONLY); + +void discard_new_inode(struct inode *inode) +{ + lockdep_annotate_inode_mutex_key(inode); + spin_lock(&inode->i_lock); + WARN_ON(!(inode->i_state & I_NEW)); + inode->i_state &= ~I_NEW; + smp_mb(); + wake_up_bit(&inode->i_state, __I_NEW); + spin_unlock(&inode->i_lock); + iput(inode); +} +EXPORT_SYMBOL(discard_new_inode); + +/** + * lock_two_inodes - lock two inodes (may be regular files but also dirs) + * + * Lock any non-NULL argument. The caller must make sure that if he is passing + * in two directories, one is not ancestor of the other. Zero, one or two + * objects may be locked by this function. + * + * @inode1: first inode to lock + * @inode2: second inode to lock + * @subclass1: inode lock subclass for the first lock obtained + * @subclass2: inode lock subclass for the second lock obtained + */ +void lock_two_inodes(struct inode *inode1, struct inode *inode2, + unsigned subclass1, unsigned subclass2) +{ + if (!inode1 || !inode2) { + /* + * Make sure @subclass1 will be used for the acquired lock. + * This is not strictly necessary (no current caller cares) but + * let's keep things consistent. + */ + if (!inode1) + swap(inode1, inode2); + goto lock; + } + + /* + * If one object is directory and the other is not, we must make sure + * to lock directory first as the other object may be its child. 
+ */ + if (S_ISDIR(inode2->i_mode) == S_ISDIR(inode1->i_mode)) { + if (inode1 > inode2) + swap(inode1, inode2); + } else if (!S_ISDIR(inode1->i_mode)) + swap(inode1, inode2); +lock: + if (inode1) + inode_lock_nested(inode1, subclass1); + if (inode2 && inode2 != inode1) + inode_lock_nested(inode2, subclass2); +} + +/** + * lock_two_nondirectories - take two i_mutexes on non-directory objects + * + * Lock any non-NULL argument that is not a directory. + * Zero, one or two objects may be locked by this function. + * + * @inode1: first inode to lock + * @inode2: second inode to lock + */ +void lock_two_nondirectories(struct inode *inode1, struct inode *inode2) +{ + if (inode1 > inode2) + swap(inode1, inode2); + + if (inode1 && !S_ISDIR(inode1->i_mode)) + inode_lock(inode1); + if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1) + inode_lock_nested(inode2, I_MUTEX_NONDIR2); +} +EXPORT_SYMBOL(lock_two_nondirectories); + +/** + * unlock_two_nondirectories - release locks from lock_two_nondirectories() + * @inode1: first inode to unlock + * @inode2: second inode to unlock + */ +void unlock_two_nondirectories(struct inode *inode1, struct inode *inode2) +{ + if (inode1 && !S_ISDIR(inode1->i_mode)) + inode_unlock(inode1); + if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1) + inode_unlock(inode2); +} +EXPORT_SYMBOL(unlock_two_nondirectories); + +/** + * inode_insert5 - obtain an inode from a mounted file system + * @inode: pre-allocated inode to use for insert to cache + * @hashval: hash value (usually inode number) to get + * @test: callback used for comparisons between inodes + * @set: callback used to initialize a new struct inode + * @data: opaque data pointer to pass to @test and @set + * + * Search for the inode specified by @hashval and @data in the inode cache, + * and if present it is return it with an increased reference count. This is + * a variant of iget5_locked() for callers that don't want to fail on memory + * allocation of inode. 
+ * + * If the inode is not in cache, insert the pre-allocated inode to cache and + * return it locked, hashed, and with the I_NEW flag set. The file system gets + * to fill it in before unlocking it via unlock_new_inode(). + * + * Note both @test and @set are called with the inode_hash_lock held, so can't + * sleep. + */ +struct inode *inode_insert5(struct inode *inode, unsigned long hashval, + int (*test)(struct inode *, void *), + int (*set)(struct inode *, void *), void *data) +{ + struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval); + struct inode *old; + bool creating = inode->i_state & I_CREATING; + +again: + spin_lock(&inode_hash_lock); + old = find_inode(inode->i_sb, head, test, data); + if (unlikely(old)) { + /* + * Uhhuh, somebody else created the same inode under us. + * Use the old inode instead of the preallocated one. + */ + spin_unlock(&inode_hash_lock); + if (IS_ERR(old)) + return NULL; + wait_on_inode(old); + if (unlikely(inode_unhashed(old))) { + iput(old); + goto again; + } + return old; + } + + if (set && unlikely(set(inode, data))) { + inode = NULL; + goto unlock; + } + + /* + * Return the locked inode with I_NEW set, the + * caller is responsible for filling in the contents + */ + spin_lock(&inode->i_lock); + inode->i_state |= I_NEW; + hlist_add_head_rcu(&inode->i_hash, head); + spin_unlock(&inode->i_lock); + if (!creating) + inode_sb_list_add(inode); +unlock: + spin_unlock(&inode_hash_lock); + + return inode; +} +EXPORT_SYMBOL(inode_insert5); + +/** + * iget5_locked - obtain an inode from a mounted file system + * @sb: super block of file system + * @hashval: hash value (usually inode number) to get + * @test: callback used for comparisons between inodes + * @set: callback used to initialize a new struct inode + * @data: opaque data pointer to pass to @test and @set + * + * Search for the inode specified by @hashval and @data in the inode cache, + * and if present it is return it with an increased reference count. 
This is + * a generalized version of iget_locked() for file systems where the inode + * number is not sufficient for unique identification of an inode. + * + * If the inode is not in cache, allocate a new inode and return it locked, + * hashed, and with the I_NEW flag set. The file system gets to fill it in + * before unlocking it via unlock_new_inode(). + * + * Note both @test and @set are called with the inode_hash_lock held, so can't + * sleep. + */ +struct inode *iget5_locked(struct super_block *sb, unsigned long hashval, + int (*test)(struct inode *, void *), + int (*set)(struct inode *, void *), void *data) +{ + struct inode *inode = ilookup5(sb, hashval, test, data); + + if (!inode) { + struct inode *new = alloc_inode(sb); + + if (new) { + new->i_state = 0; + inode = inode_insert5(new, hashval, test, set, data); + if (unlikely(inode != new)) + destroy_inode(new); + } + } + return inode; +} +EXPORT_SYMBOL_NS(iget5_locked, ANDROID_GKI_VFS_EXPORT_ONLY); + +/** + * iget_locked - obtain an inode from a mounted file system + * @sb: super block of file system + * @ino: inode number to get + * + * Search for the inode specified by @ino in the inode cache and if present + * return it with an increased reference count. This is for file systems + * where the inode number is sufficient for unique identification of an inode. + * + * If the inode is not in cache, allocate a new inode and return it locked, + * hashed, and with the I_NEW flag set. The file system gets to fill it in + * before unlocking it via unlock_new_inode(). 
+ */ +struct inode *iget_locked(struct super_block *sb, unsigned long ino) +{ + struct hlist_head *head = inode_hashtable + hash(sb, ino); + struct inode *inode; +again: + spin_lock(&inode_hash_lock); + inode = find_inode_fast(sb, head, ino); + spin_unlock(&inode_hash_lock); + if (inode) { + if (IS_ERR(inode)) + return NULL; + wait_on_inode(inode); + if (unlikely(inode_unhashed(inode))) { + iput(inode); + goto again; + } + return inode; + } + + inode = alloc_inode(sb); + if (inode) { + struct inode *old; + + spin_lock(&inode_hash_lock); + /* We released the lock, so.. */ + old = find_inode_fast(sb, head, ino); + if (!old) { + inode->i_ino = ino; + spin_lock(&inode->i_lock); + inode->i_state = I_NEW; + hlist_add_head_rcu(&inode->i_hash, head); + spin_unlock(&inode->i_lock); + inode_sb_list_add(inode); + spin_unlock(&inode_hash_lock); + + /* Return the locked inode with I_NEW set, the + * caller is responsible for filling in the contents + */ + return inode; + } + + /* + * Uhhuh, somebody else created the same inode under + * us. Use the old inode instead of the one we just + * allocated. + */ + spin_unlock(&inode_hash_lock); + destroy_inode(inode); + if (IS_ERR(old)) + return NULL; + inode = old; + wait_on_inode(inode); + if (unlikely(inode_unhashed(inode))) { + iput(inode); + goto again; + } + } + return inode; +} +EXPORT_SYMBOL(iget_locked); + +/* + * search the inode cache for a matching inode number. + * If we find one, then the inode number we are trying to + * allocate is not unique and so we should not use it. + * + * Returns 1 if the inode number is unique, 0 if it is not. 
+ */ +static int test_inode_iunique(struct super_block *sb, unsigned long ino) +{ + struct hlist_head *b = inode_hashtable + hash(sb, ino); + struct inode *inode; + + hlist_for_each_entry_rcu(inode, b, i_hash) { + if (inode->i_ino == ino && inode->i_sb == sb) + return 0; + } + return 1; +} + +/** + * iunique - get a unique inode number + * @sb: superblock + * @max_reserved: highest reserved inode number + * + * Obtain an inode number that is unique on the system for a given + * superblock. This is used by file systems that have no natural + * permanent inode numbering system. An inode number is returned that + * is higher than the reserved limit but unique. + * + * BUGS: + * With a large number of inodes live on the file system this function + * currently becomes quite slow. + */ +ino_t iunique(struct super_block *sb, ino_t max_reserved) +{ + /* + * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW + * error if st_ino won't fit in target struct field. Use 32bit counter + * here to attempt to avoid that. + */ + static DEFINE_SPINLOCK(iunique_lock); + static unsigned int counter; + ino_t res; + + rcu_read_lock(); + spin_lock(&iunique_lock); + do { + if (counter <= max_reserved) + counter = max_reserved + 1; + res = counter++; + } while (!test_inode_iunique(sb, res)); + spin_unlock(&iunique_lock); + rcu_read_unlock(); + + return res; +} +EXPORT_SYMBOL_NS(iunique, ANDROID_GKI_VFS_EXPORT_ONLY); + +struct inode *igrab(struct inode *inode) +{ + spin_lock(&inode->i_lock); + if (!(inode->i_state & (I_FREEING|I_WILL_FREE))) { + __iget(inode); + spin_unlock(&inode->i_lock); + } else { + spin_unlock(&inode->i_lock); + /* + * Handle the case where s_op->clear_inode is not been + * called yet, and somebody is calling igrab + * while the inode is getting freed. 
+ */ + inode = NULL; + } + return inode; +} +EXPORT_SYMBOL(igrab); + +/** + * ilookup5_nowait - search for an inode in the inode cache + * @sb: super block of file system to search + * @hashval: hash value (usually inode number) to search for + * @test: callback used for comparisons between inodes + * @data: opaque data pointer to pass to @test + * + * Search for the inode specified by @hashval and @data in the inode cache. + * If the inode is in the cache, the inode is returned with an incremented + * reference count. + * + * Note: I_NEW is not waited upon so you have to be very careful what you do + * with the returned inode. You probably should be using ilookup5() instead. + * + * Note2: @test is called with the inode_hash_lock held, so can't sleep. + */ +struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval, + int (*test)(struct inode *, void *), void *data) +{ + struct hlist_head *head = inode_hashtable + hash(sb, hashval); + struct inode *inode; + + spin_lock(&inode_hash_lock); + inode = find_inode(sb, head, test, data); + spin_unlock(&inode_hash_lock); + + return IS_ERR(inode) ? NULL : inode; +} +EXPORT_SYMBOL(ilookup5_nowait); + +/** + * ilookup5 - search for an inode in the inode cache + * @sb: super block of file system to search + * @hashval: hash value (usually inode number) to search for + * @test: callback used for comparisons between inodes + * @data: opaque data pointer to pass to @test + * + * Search for the inode specified by @hashval and @data in the inode cache, + * and if the inode is in the cache, return the inode with an incremented + * reference count. Waits on I_NEW before returning the inode. + * returned with an incremented reference count. + * + * This is a generalized version of ilookup() for file systems where the + * inode number is not sufficient for unique identification of an inode. + * + * Note: @test is called with the inode_hash_lock held, so can't sleep. 
+ */ +struct inode *ilookup5(struct super_block *sb, unsigned long hashval, + int (*test)(struct inode *, void *), void *data) +{ + struct inode *inode; +again: + inode = ilookup5_nowait(sb, hashval, test, data); + if (inode) { + wait_on_inode(inode); + if (unlikely(inode_unhashed(inode))) { + iput(inode); + goto again; + } + } + return inode; +} +EXPORT_SYMBOL_NS(ilookup5, ANDROID_GKI_VFS_EXPORT_ONLY); + +/** + * ilookup - search for an inode in the inode cache + * @sb: super block of file system to search + * @ino: inode number to search for + * + * Search for the inode @ino in the inode cache, and if the inode is in the + * cache, the inode is returned with an incremented reference count. + */ +struct inode *ilookup(struct super_block *sb, unsigned long ino) +{ + struct hlist_head *head = inode_hashtable + hash(sb, ino); + struct inode *inode; +again: + spin_lock(&inode_hash_lock); + inode = find_inode_fast(sb, head, ino); + spin_unlock(&inode_hash_lock); + + if (inode) { + if (IS_ERR(inode)) + return NULL; + wait_on_inode(inode); + if (unlikely(inode_unhashed(inode))) { + iput(inode); + goto again; + } + } + return inode; +} +EXPORT_SYMBOL(ilookup); + +/** + * find_inode_nowait - find an inode in the inode cache + * @sb: super block of file system to search + * @hashval: hash value (usually inode number) to search for + * @match: callback used for comparisons between inodes + * @data: opaque data pointer to pass to @match + * + * Search for the inode specified by @hashval and @data in the inode + * cache, where the helper function @match will return 0 if the inode + * does not match, 1 if the inode does match, and -1 if the search + * should be stopped. The @match function must be responsible for + * taking the i_lock spin_lock and checking i_state for an inode being + * freed or being initialized, and incrementing the reference count + * before returning 1. It also must not sleep, since it is called with + * the inode_hash_lock spinlock held. 
+ * + * This is a even more generalized version of ilookup5() when the + * function must never block --- find_inode() can block in + * __wait_on_freeing_inode() --- or when the caller can not increment + * the reference count because the resulting iput() might cause an + * inode eviction. The tradeoff is that the @match funtion must be + * very carefully implemented. + */ +struct inode *find_inode_nowait(struct super_block *sb, + unsigned long hashval, + int (*match)(struct inode *, unsigned long, + void *), + void *data) +{ + struct hlist_head *head = inode_hashtable + hash(sb, hashval); + struct inode *inode, *ret_inode = NULL; + int mval; + + spin_lock(&inode_hash_lock); + hlist_for_each_entry(inode, head, i_hash) { + if (inode->i_sb != sb) + continue; + mval = match(inode, hashval, data); + if (mval == 0) + continue; + if (mval == 1) + ret_inode = inode; + goto out; + } +out: + spin_unlock(&inode_hash_lock); + return ret_inode; +} +EXPORT_SYMBOL(find_inode_nowait); + +/** + * find_inode_rcu - find an inode in the inode cache + * @sb: Super block of file system to search + * @hashval: Key to hash + * @test: Function to test match on an inode + * @data: Data for test function + * + * Search for the inode specified by @hashval and @data in the inode cache, + * where the helper function @test will return 0 if the inode does not match + * and 1 if it does. The @test function must be responsible for taking the + * i_lock spin_lock and checking i_state for an inode being freed or being + * initialized. + * + * If successful, this will return the inode for which the @test function + * returned 1 and NULL otherwise. + * + * The @test function is not permitted to take a ref on any inode presented. + * It is also not permitted to sleep. + * + * The caller must hold the RCU read lock. 
+ */ +struct inode *find_inode_rcu(struct super_block *sb, unsigned long hashval, + int (*test)(struct inode *, void *), void *data) +{ + struct hlist_head *head = inode_hashtable + hash(sb, hashval); + struct inode *inode; + + RCU_LOCKDEP_WARN(!rcu_read_lock_held(), + "suspicious find_inode_rcu() usage"); + + hlist_for_each_entry_rcu(inode, head, i_hash) { + if (inode->i_sb == sb && + !(READ_ONCE(inode->i_state) & (I_FREEING | I_WILL_FREE)) && + test(inode, data)) + return inode; + } + return NULL; +} +EXPORT_SYMBOL(find_inode_rcu); + +/** + * find_inode_by_rcu - Find an inode in the inode cache + * @sb: Super block of file system to search + * @ino: The inode number to match + * + * Search for the inode specified by @hashval and @data in the inode cache, + * where the helper function @test will return 0 if the inode does not match + * and 1 if it does. The @test function must be responsible for taking the + * i_lock spin_lock and checking i_state for an inode being freed or being + * initialized. + * + * If successful, this will return the inode for which the @test function + * returned 1 and NULL otherwise. + * + * The @test function is not permitted to take a ref on any inode presented. + * It is also not permitted to sleep. + * + * The caller must hold the RCU read lock. 
+ */ +struct inode *find_inode_by_ino_rcu(struct super_block *sb, + unsigned long ino) +{ + struct hlist_head *head = inode_hashtable + hash(sb, ino); + struct inode *inode; + + RCU_LOCKDEP_WARN(!rcu_read_lock_held(), + "suspicious find_inode_by_ino_rcu() usage"); + + hlist_for_each_entry_rcu(inode, head, i_hash) { + if (inode->i_ino == ino && + inode->i_sb == sb && + !(READ_ONCE(inode->i_state) & (I_FREEING | I_WILL_FREE))) + return inode; + } + return NULL; +} +EXPORT_SYMBOL(find_inode_by_ino_rcu); + +int insert_inode_locked(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + ino_t ino = inode->i_ino; + struct hlist_head *head = inode_hashtable + hash(sb, ino); + + while (1) { + struct inode *old = NULL; + spin_lock(&inode_hash_lock); + hlist_for_each_entry(old, head, i_hash) { + if (old->i_ino != ino) + continue; + if (old->i_sb != sb) + continue; + spin_lock(&old->i_lock); + if (old->i_state & (I_FREEING|I_WILL_FREE)) { + spin_unlock(&old->i_lock); + continue; + } + break; + } + if (likely(!old)) { + spin_lock(&inode->i_lock); + inode->i_state |= I_NEW | I_CREATING; + hlist_add_head_rcu(&inode->i_hash, head); + spin_unlock(&inode->i_lock); + spin_unlock(&inode_hash_lock); + return 0; + } + if (unlikely(old->i_state & I_CREATING)) { + spin_unlock(&old->i_lock); + spin_unlock(&inode_hash_lock); + return -EBUSY; + } + __iget(old); + spin_unlock(&old->i_lock); + spin_unlock(&inode_hash_lock); + wait_on_inode(old); + if (unlikely(!inode_unhashed(old))) { + iput(old); + return -EBUSY; + } + iput(old); + } +} +EXPORT_SYMBOL(insert_inode_locked); + +int insert_inode_locked4(struct inode *inode, unsigned long hashval, + int (*test)(struct inode *, void *), void *data) +{ + struct inode *old; + + inode->i_state |= I_CREATING; + old = inode_insert5(inode, hashval, test, NULL, data); + + if (old != inode) { + iput(old); + return -EBUSY; + } + return 0; +} +EXPORT_SYMBOL(insert_inode_locked4); + + +int generic_delete_inode(struct inode *inode) +{ + return 1; 
+} +EXPORT_SYMBOL(generic_delete_inode); + +/* + * Called when we're dropping the last reference + * to an inode. + * + * Call the FS "drop_inode()" function, defaulting to + * the legacy UNIX filesystem behaviour. If it tells + * us to evict inode, do so. Otherwise, retain inode + * in cache if fs is alive, sync and evict if fs is + * shutting down. + */ +static void iput_final(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + const struct super_operations *op = inode->i_sb->s_op; + unsigned long state; + int drop; + + WARN_ON(inode->i_state & I_NEW); + + if (op->drop_inode) + drop = op->drop_inode(inode); + else + drop = generic_drop_inode(inode); + + if (!drop && + !(inode->i_state & I_DONTCACHE) && + (sb->s_flags & SB_ACTIVE)) { + inode_add_lru(inode); + spin_unlock(&inode->i_lock); + return; + } + + state = inode->i_state; + if (!drop) { + WRITE_ONCE(inode->i_state, state | I_WILL_FREE); + spin_unlock(&inode->i_lock); + + write_inode_now(inode, 1); + + spin_lock(&inode->i_lock); + state = inode->i_state; + WARN_ON(state & I_NEW); + state &= ~I_WILL_FREE; + } + + WRITE_ONCE(inode->i_state, state | I_FREEING); + if (!list_empty(&inode->i_lru)) + inode_lru_list_del(inode); + spin_unlock(&inode->i_lock); + + evict(inode); +} + +/** + * iput - put an inode + * @inode: inode to put + * + * Puts an inode, dropping its usage count. If the inode use count hits + * zero, the inode is then freed and may also be destroyed. + * + * Consequently, iput() can sleep. 
+ */ +void iput(struct inode *inode) +{ + if (!inode) + return; + BUG_ON(inode->i_state & I_CLEAR); +retry: + if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) { + if (inode->i_nlink && (inode->i_state & I_DIRTY_TIME)) { + atomic_inc(&inode->i_count); + spin_unlock(&inode->i_lock); + trace_writeback_lazytime_iput(inode); + mark_inode_dirty_sync(inode); + goto retry; + } + iput_final(inode); + } +} +EXPORT_SYMBOL(iput); + +#ifdef CONFIG_BLOCK +/** + * bmap - find a block number in a file + * @inode: inode owning the block number being requested + * @block: pointer containing the block to find + * + * Replaces the value in ``*block`` with the block number on the device holding + * corresponding to the requested block number in the file. + * That is, asked for block 4 of inode 1 the function will replace the + * 4 in ``*block``, with disk block relative to the disk start that holds that + * block of the file. + * + * Returns -EINVAL in case of error, 0 otherwise. If mapping falls into a + * hole, returns 0 and ``*block`` is also set to 0. + */ +int bmap(struct inode *inode, sector_t *block) +{ + if (!inode->i_mapping->a_ops->bmap) + return -EINVAL; + + *block = inode->i_mapping->a_ops->bmap(inode->i_mapping, *block); + return 0; +} +EXPORT_SYMBOL(bmap); +#endif + +/* + * With relative atime, only update atime if the previous atime is + * earlier than either the ctime or mtime or if at least a day has + * passed since the last atime update. + */ +static int relatime_need_update(struct vfsmount *mnt, struct inode *inode, + struct timespec64 now) +{ + + if (!(mnt->mnt_flags & MNT_RELATIME)) + return 1; + /* + * Is mtime younger than atime? If yes, update atime: + */ + if (timespec64_compare(&inode->i_mtime, &inode->i_atime) >= 0) + return 1; + /* + * Is ctime younger than atime? If yes, update atime: + */ + if (timespec64_compare(&inode->i_ctime, &inode->i_atime) >= 0) + return 1; + + /* + * Is the previous atime value older than a day? 
If yes, + * update atime: + */ + if ((long)(now.tv_sec - inode->i_atime.tv_sec) >= 24*60*60) + return 1; + /* + * Good, we can skip the atime update: + */ + return 0; +} + +int generic_update_time(struct inode *inode, struct timespec64 *time, int flags) +{ + int iflags = I_DIRTY_TIME; + bool dirty = false; + + if (flags & S_ATIME) + inode->i_atime = *time; + if (flags & S_VERSION) + dirty = inode_maybe_inc_iversion(inode, false); + if (flags & S_CTIME) + inode->i_ctime = *time; + if (flags & S_MTIME) + inode->i_mtime = *time; + if ((flags & (S_ATIME | S_CTIME | S_MTIME)) && + !(inode->i_sb->s_flags & SB_LAZYTIME)) + dirty = true; + + if (dirty) + iflags |= I_DIRTY_SYNC; + __mark_inode_dirty(inode, iflags); + return 0; +} +EXPORT_SYMBOL(generic_update_time); + +/* + * This does the actual work of updating an inodes time or version. Must have + * had called mnt_want_write() before calling this. + */ +int inode_update_time(struct inode *inode, struct timespec64 *time, int flags) +{ + if (inode->i_op->update_time) + return inode->i_op->update_time(inode, time, flags); + return generic_update_time(inode, time, flags); +} +EXPORT_SYMBOL(inode_update_time); + +/** + * touch_atime - update the access time + * @path: the &struct path to update + * @inode: inode to update + * + * Update the accessed time on an inode and mark it for writeback. + * This function automatically handles read only file systems and media, + * as well as the "noatime" flag and inode specific "noatime" markers. + */ +bool atime_needs_update(const struct path *path, struct inode *inode) +{ + struct vfsmount *mnt = path->mnt; + struct timespec64 now; + + if (inode->i_flags & S_NOATIME) + return false; + + /* Atime updates will likely cause i_uid and i_gid to be written + * back improprely if their true value is unknown to the vfs. 
+ */ + if (HAS_UNMAPPED_ID(inode)) + return false; + + if (IS_NOATIME(inode)) + return false; + if ((inode->i_sb->s_flags & SB_NODIRATIME) && S_ISDIR(inode->i_mode)) + return false; + + if (mnt->mnt_flags & MNT_NOATIME) + return false; + if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)) + return false; + + now = current_time(inode); + + if (!relatime_need_update(mnt, inode, now)) + return false; + + if (timespec64_equal(&inode->i_atime, &now)) + return false; + + return true; +} + +void touch_atime(const struct path *path) +{ + struct vfsmount *mnt = path->mnt; + struct inode *inode = d_inode(path->dentry); + struct timespec64 now; + + if (!atime_needs_update(path, inode)) + return; + + if (!sb_start_write_trylock(inode->i_sb)) + return; + + if (__mnt_want_write(mnt) != 0) + goto skip_update; + /* + * File systems can error out when updating inodes if they need to + * allocate new space to modify an inode (such is the case for + * Btrfs), but since we touch atime while walking down the path we + * really don't care if we failed to update the atime of the file, + * so just ignore the return value. + * We may also fail on filesystems that have the ability to make parts + * of the fs read only, e.g. subvolumes in Btrfs. + */ + now = current_time(inode); + inode_update_time(inode, &now, S_ATIME); + __mnt_drop_write(mnt); +skip_update: + sb_end_write(inode->i_sb); +} +EXPORT_SYMBOL_NS(touch_atime, ANDROID_GKI_VFS_EXPORT_ONLY); + +/* + * Return mask of changes for notify_change() that need to be done as a + * response to write or truncate. Return 0 if nothing has to be changed. + * Negative value on error (change should be denied). 
+ */ +int dentry_needs_remove_privs(struct dentry *dentry) +{ + struct inode *inode = d_inode(dentry); + int mask = 0; + int ret; + + if (IS_NOSEC(inode)) + return 0; + + mask = setattr_should_drop_suidgid(inode); + ret = security_inode_need_killpriv(dentry); + if (ret < 0) + return ret; + if (ret) + mask |= ATTR_KILL_PRIV; + return mask; +} + +static int __remove_privs(struct dentry *dentry, int kill) +{ + struct iattr newattrs; + + newattrs.ia_valid = ATTR_FORCE | kill; + /* + * Note we call this on write, so notify_change will not + * encounter any conflicting delegations: + */ + return notify_change(dentry, &newattrs, NULL); +} + +/* + * Remove special file priviledges (suid, capabilities) when file is written + * to or truncated. + */ +int file_remove_privs(struct file *file) +{ + struct dentry *dentry = file_dentry(file); + struct inode *inode = file_inode(file); + int kill; + int error = 0; + + /* + * Fast path for nothing security related. + * As well for non-regular files, e.g. blkdev inodes. + * For example, blkdev_write_iter() might get here + * trying to remove privs which it is not allowed to. + */ + if (IS_NOSEC(inode) || !S_ISREG(inode->i_mode)) + return 0; + + kill = dentry_needs_remove_privs(dentry); + if (kill < 0) + return kill; + if (kill) + error = __remove_privs(dentry, kill); + if (!error) + inode_has_no_xattr(inode); + + return error; +} +EXPORT_SYMBOL_NS(file_remove_privs, ANDROID_GKI_VFS_EXPORT_ONLY); + +/** + * file_update_time - update mtime and ctime time + * @file: file accessed + * + * Update the mtime and ctime members of an inode and mark the inode + * for writeback. Note that this function is meant exclusively for + * usage in the file write path of filesystems, and filesystems may + * choose to explicitly ignore update via this function with the + * S_NOCMTIME inode flag, e.g. for network filesystem where these + * timestamps are handled by the server. 
This can return an error for + * file systems who need to allocate space in order to update an inode. + */ + +int file_update_time(struct file *file) +{ + struct inode *inode = file_inode(file); + struct timespec64 now; + int sync_it = 0; + int ret; + + /* First try to exhaust all avenues to not sync */ + if (IS_NOCMTIME(inode)) + return 0; + + now = current_time(inode); + if (!timespec64_equal(&inode->i_mtime, &now)) + sync_it = S_MTIME; + + if (!timespec64_equal(&inode->i_ctime, &now)) + sync_it |= S_CTIME; + + if (IS_I_VERSION(inode) && inode_iversion_need_inc(inode)) + sync_it |= S_VERSION; + + if (!sync_it) + return 0; + + /* Finally allowed to write? Takes lock. */ + if (__mnt_want_write_file(file)) + return 0; + + ret = inode_update_time(inode, &now, sync_it); + __mnt_drop_write_file(file); + + return ret; +} +EXPORT_SYMBOL(file_update_time); + +/* Caller must hold the file's inode lock */ +int file_modified(struct file *file) +{ + int err; + + /* + * Clear the security bits if the process is not being run by root. + * This keeps people from modifying setuid and setgid binaries. + */ + err = file_remove_privs(file); + if (err) + return err; + + if (unlikely(file->f_mode & FMODE_NOCMTIME)) + return 0; + + return file_update_time(file); +} +EXPORT_SYMBOL(file_modified); + +int inode_needs_sync(struct inode *inode) +{ + if (IS_SYNC(inode)) + return 1; + if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode)) + return 1; + return 0; +} +EXPORT_SYMBOL(inode_needs_sync); + +/* + * If we try to find an inode in the inode hash while it is being + * deleted, we have to wait until the filesystem completes its + * deletion before reporting that it isn't found. This function waits + * until the deletion _might_ have completed. Callers are responsible + * to recheck inode state. + * + * It doesn't matter if I_NEW is not set initially, a call to + * wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list + * will DTRT. 
+ */ +static void __wait_on_freeing_inode(struct inode *inode) +{ + wait_queue_head_t *wq; + DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW); + wq = bit_waitqueue(&inode->i_state, __I_NEW); + prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); + spin_unlock(&inode->i_lock); + spin_unlock(&inode_hash_lock); + schedule(); + finish_wait(wq, &wait.wq_entry); + spin_lock(&inode_hash_lock); +} + +static __initdata unsigned long ihash_entries; +static int __init set_ihash_entries(char *str) +{ + if (!str) + return 0; + ihash_entries = simple_strtoul(str, &str, 0); + return 1; +} +__setup("ihash_entries=", set_ihash_entries); + +/* + * Initialize the waitqueues and inode hash table. + */ +void __init inode_init_early(void) +{ + /* If hashes are distributed across NUMA nodes, defer + * hash allocation until vmalloc space is available. + */ + if (hashdist) + return; + + inode_hashtable = + alloc_large_system_hash("Inode-cache", + sizeof(struct hlist_head), + ihash_entries, + 14, + HASH_EARLY | HASH_ZERO, + &i_hash_shift, + &i_hash_mask, + 0, + 0); +} + +void __init inode_init(void) +{ + /* inode slab cache */ + inode_cachep = kmem_cache_create("inode_cache", + sizeof(struct inode), + 0, + (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC| + SLAB_MEM_SPREAD|SLAB_ACCOUNT), + init_once); + + /* Hash may have been set up in inode_init_early */ + if (!hashdist) + return; + + inode_hashtable = + alloc_large_system_hash("Inode-cache", + sizeof(struct hlist_head), + ihash_entries, + 14, + HASH_ZERO, + &i_hash_shift, + &i_hash_mask, + 0, + 0); +} + +void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev) +{ + inode->i_mode = mode; + if (S_ISCHR(mode)) { + inode->i_fop = &def_chr_fops; + inode->i_rdev = rdev; + } else if (S_ISBLK(mode)) { + inode->i_fop = &def_blk_fops; + inode->i_rdev = rdev; + } else if (S_ISFIFO(mode)) + inode->i_fop = &pipefifo_fops; + else if (S_ISSOCK(mode)) + ; /* leave it no_open_fops */ + else + printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o) 
for" + " inode %s:%lu\n", mode, inode->i_sb->s_id, + inode->i_ino); +} +EXPORT_SYMBOL_NS(init_special_inode, ANDROID_GKI_VFS_EXPORT_ONLY); + +/** + * inode_init_owner - Init uid,gid,mode for new inode according to posix standards + * @inode: New inode + * @dir: Directory inode + * @mode: mode of the new inode + */ +void inode_init_owner(struct inode *inode, const struct inode *dir, + umode_t mode) +{ + inode->i_uid = current_fsuid(); + if (dir && dir->i_mode & S_ISGID) { + inode->i_gid = dir->i_gid; + + /* Directories are special, and always inherit S_ISGID */ + if (S_ISDIR(mode)) + mode |= S_ISGID; + } else + inode->i_gid = current_fsgid(); + inode->i_mode = mode; +} +EXPORT_SYMBOL_NS(inode_init_owner, ANDROID_GKI_VFS_EXPORT_ONLY); + +/** + * inode_owner_or_capable - check current task permissions to inode + * @inode: inode being checked + * + * Return true if current either has CAP_FOWNER in a namespace with the + * inode owner uid mapped, or owns the file. + */ +bool inode_owner_or_capable(const struct inode *inode) +{ + struct user_namespace *ns; + + if (uid_eq(current_fsuid(), inode->i_uid)) + return true; + + ns = current_user_ns(); + if (kuid_has_mapping(ns, inode->i_uid) && ns_capable(ns, CAP_FOWNER)) + return true; + return false; +} +EXPORT_SYMBOL(inode_owner_or_capable); + +/* + * Direct i/o helper functions + */ +static void __inode_dio_wait(struct inode *inode) +{ + wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_DIO_WAKEUP); + DEFINE_WAIT_BIT(q, &inode->i_state, __I_DIO_WAKEUP); + + do { + prepare_to_wait(wq, &q.wq_entry, TASK_UNINTERRUPTIBLE); + if (atomic_read(&inode->i_dio_count)) + schedule(); + } while (atomic_read(&inode->i_dio_count)); + finish_wait(wq, &q.wq_entry); +} + +/** + * inode_dio_wait - wait for outstanding DIO requests to finish + * @inode: inode to wait for + * + * Waits for all pending direct I/O requests to finish so that we can + * proceed with a truncate or equivalent operation. 
+ * + * Must be called under a lock that serializes taking new references + * to i_dio_count, usually by inode->i_mutex. + */ +void inode_dio_wait(struct inode *inode) +{ + if (atomic_read(&inode->i_dio_count)) + __inode_dio_wait(inode); +} +EXPORT_SYMBOL_NS(inode_dio_wait, ANDROID_GKI_VFS_EXPORT_ONLY); + +/* + * inode_set_flags - atomically set some inode flags + * + * Note: the caller should be holding i_mutex, or else be sure that + * they have exclusive access to the inode structure (i.e., while the + * inode is being instantiated). The reason for the cmpxchg() loop + * --- which wouldn't be necessary if all code paths which modify + * i_flags actually followed this rule, is that there is at least one + * code path which doesn't today so we use cmpxchg() out of an abundance + * of caution. + * + * In the long run, i_mutex is overkill, and we should probably look + * at using the i_lock spinlock to protect i_flags, and then make sure + * it is so documented in include/linux/fs.h and that all code follows + * the locking convention!! + */ +void inode_set_flags(struct inode *inode, unsigned int flags, + unsigned int mask) +{ + WARN_ON_ONCE(flags & ~mask); + set_mask_bits(&inode->i_flags, mask, flags); +} +EXPORT_SYMBOL_NS(inode_set_flags, ANDROID_GKI_VFS_EXPORT_ONLY); + +void inode_nohighmem(struct inode *inode) +{ + mapping_set_gfp_mask(inode->i_mapping, GFP_USER); +} +EXPORT_SYMBOL(inode_nohighmem); + +/** + * timestamp_truncate - Truncate timespec to a granularity + * @t: Timespec + * @inode: inode being updated + * + * Truncate a timespec to the granularity supported by the fs + * containing the inode. Always rounds down. gran must + * not be 0 nor greater than a second (NSEC_PER_SEC, or 10^9 ns). 
+ */ +struct timespec64 timestamp_truncate(struct timespec64 t, struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + unsigned int gran = sb->s_time_gran; + + t.tv_sec = clamp(t.tv_sec, sb->s_time_min, sb->s_time_max); + if (unlikely(t.tv_sec == sb->s_time_max || t.tv_sec == sb->s_time_min)) + t.tv_nsec = 0; + + /* Avoid division in the common cases 1 ns and 1 s. */ + if (gran == 1) + ; /* nothing */ + else if (gran == NSEC_PER_SEC) + t.tv_nsec = 0; + else if (gran > 1 && gran < NSEC_PER_SEC) + t.tv_nsec -= t.tv_nsec % gran; + else + WARN(1, "invalid file time granularity: %u", gran); + return t; +} +EXPORT_SYMBOL_NS(timestamp_truncate, ANDROID_GKI_VFS_EXPORT_ONLY); + +/** + * current_time - Return FS time + * @inode: inode. + * + * Return the current time truncated to the time granularity supported by + * the fs. + * + * Note that inode and inode->sb cannot be NULL. + * Otherwise, the function warns and returns time without truncation. + */ +struct timespec64 current_time(struct inode *inode) +{ + struct timespec64 now; + + ktime_get_coarse_real_ts64(&now); + + if (unlikely(!inode->i_sb)) { + WARN(1, "current_time() called with uninitialized super_block in the inode"); + return now; + } + + return timestamp_truncate(now, inode); +} +EXPORT_SYMBOL(current_time); + +/* + * Generic function to check FS_IOC_SETFLAGS values and reject any invalid + * configurations. + * + * Note: the caller should be holding i_mutex, or else be sure that they have + * exclusive access to the inode structure. + */ +int vfs_ioc_setflags_prepare(struct inode *inode, unsigned int oldflags, + unsigned int flags) +{ + /* + * The IMMUTABLE and APPEND_ONLY flags can only be changed by + * the relevant capability. + * + * This test looks nicer. 
Thanks to Pauline Middelink + */ + if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL) && + !capable(CAP_LINUX_IMMUTABLE)) + return -EPERM; + + return fscrypt_prepare_setflags(inode, oldflags, flags); +} +EXPORT_SYMBOL(vfs_ioc_setflags_prepare); + +/* + * Generic function to check FS_IOC_FSSETXATTR values and reject any invalid + * configurations. + * + * Note: the caller should be holding i_mutex, or else be sure that they have + * exclusive access to the inode structure. + */ +int vfs_ioc_fssetxattr_check(struct inode *inode, const struct fsxattr *old_fa, + struct fsxattr *fa) +{ + /* + * Can't modify an immutable/append-only file unless we have + * appropriate permission. + */ + if ((old_fa->fsx_xflags ^ fa->fsx_xflags) & + (FS_XFLAG_IMMUTABLE | FS_XFLAG_APPEND) && + !capable(CAP_LINUX_IMMUTABLE)) + return -EPERM; + + /* + * Project Quota ID state is only allowed to change from within the init + * namespace. Enforce that restriction only if we are trying to change + * the quota ID state. Everything else is allowed in user namespaces. + */ + if (current_user_ns() != &init_user_ns) { + if (old_fa->fsx_projid != fa->fsx_projid) + return -EINVAL; + if ((old_fa->fsx_xflags ^ fa->fsx_xflags) & + FS_XFLAG_PROJINHERIT) + return -EINVAL; + } + + /* Check extent size hints. */ + if ((fa->fsx_xflags & FS_XFLAG_EXTSIZE) && !S_ISREG(inode->i_mode)) + return -EINVAL; + + if ((fa->fsx_xflags & FS_XFLAG_EXTSZINHERIT) && + !S_ISDIR(inode->i_mode)) + return -EINVAL; + + if ((fa->fsx_xflags & FS_XFLAG_COWEXTSIZE) && + !S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) + return -EINVAL; + + /* + * It is only valid to set the DAX flag on regular files and + * directories on filesystems. + */ + if ((fa->fsx_xflags & FS_XFLAG_DAX) && + !(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) + return -EINVAL; + + /* Extent size hints of zero turn off the flags. 
*/ + if (fa->fsx_extsize == 0) + fa->fsx_xflags &= ~(FS_XFLAG_EXTSIZE | FS_XFLAG_EXTSZINHERIT); + if (fa->fsx_cowextsize == 0) + fa->fsx_xflags &= ~FS_XFLAG_COWEXTSIZE; + + return 0; +} +EXPORT_SYMBOL(vfs_ioc_fssetxattr_check); + +/** + * inode_set_ctime_current - set the ctime to current_time + * @inode: inode + * + * Set the inode->i_ctime to the current value for the inode. Returns + * the current value that was assigned to i_ctime. + */ +struct timespec64 inode_set_ctime_current(struct inode *inode) +{ + struct timespec64 now = current_time(inode); + + inode_set_ctime(inode, now.tv_sec, now.tv_nsec); + return now; +} +EXPORT_SYMBOL(inode_set_ctime_current); + +/** + * in_group_or_capable - check whether caller is CAP_FSETID privileged + * @inode: inode to check + * @gid: the new/current gid of @inode + * + * Check wether @gid is in the caller's group list or if the caller is + * privileged with CAP_FSETID over @inode. This can be used to determine + * whether the setgid bit can be kept or must be dropped. + * + * Return: true if the caller is sufficiently privileged, false if not. + */ +bool in_group_or_capable(const struct inode *inode, kgid_t gid) +{ + if (in_group_p(gid)) + return true; + if (capable_wrt_inode_uidgid(inode, CAP_FSETID)) + return true; + return false; +} + +/** + * mode_strip_sgid - handle the sgid bit for non-directories + * @dir: parent directory inode + * @mode: mode of the file to be created in @dir + * + * If the @mode of the new file has both the S_ISGID and S_IXGRP bit + * raised and @dir has the S_ISGID bit raised ensure that the caller is + * either in the group of the parent directory or they have CAP_FSETID + * in their user namespace and are privileged over the parent directory. + * In all other cases, strip the S_ISGID bit from @mode. 
+ * + * Return: the new mode to use for the file + */ +umode_t mode_strip_sgid(const struct inode *dir, umode_t mode) +{ + if ((mode & (S_ISGID | S_IXGRP)) != (S_ISGID | S_IXGRP)) + return mode; + if (S_ISDIR(mode) || !dir || !(dir->i_mode & S_ISGID)) + return mode; + if (in_group_or_capable(dir, dir->i_gid)) + return mode; + return mode & ~S_ISGID; +} +EXPORT_SYMBOL(mode_strip_sgid); diff --git a/fs/namei.c b/fs/namei.c index 8cea84ecbf56..1cb8c046074d 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1048,6 +1048,12 @@ int sysctl_protected_regular __read_mostly; */ static inline int may_follow_link(struct nameidata *nd, const struct inode *inode) { +#ifdef CONFIG_KSU_SUSFS_SUS_PATH + if (nd->inode && unlikely(nd->inode->i_state & 16777216) && likely(current_cred()->user->android_kabi_reserved2 & 16777216)) { + return -ENOENT; + } +#endif + if (!sysctl_protected_symlinks) return 0; @@ -1122,6 +1128,12 @@ int may_linkat(struct path *link) { struct inode *inode = link->dentry->d_inode; +#ifdef CONFIG_KSU_SUSFS_SUS_PATH + if (inode && unlikely(inode->i_state & 16777216) && likely(current_cred()->user->android_kabi_reserved2 & 16777216)) { + return -ENOENT; + } +#endif + /* Inode writeback is not safe when the uid or gid are invalid. 
*/ if (!uid_valid(inode->i_uid) || !gid_valid(inode->i_gid)) return -EOVERFLOW; @@ -1163,6 +1175,12 @@ int may_linkat(struct path *link) static int may_create_in_sticky(umode_t dir_mode, kuid_t dir_uid, struct inode * const inode) { +#ifdef CONFIG_KSU_SUSFS_SUS_PATH + if (unlikely(inode->i_state & 16777216) && likely(current_cred()->user->android_kabi_reserved2 & 16777216)) { + return -ENOENT; + } +#endif + if ((!sysctl_protected_fifos && S_ISFIFO(inode->i_mode)) || (!sysctl_protected_regular && S_ISREG(inode->i_mode)) || likely(!(dir_mode & S_ISVTX)) || @@ -1526,6 +1544,9 @@ static struct dentry *__lookup_hash(const struct qstr *name, struct dentry *dentry = lookup_dcache(name, base, flags); struct dentry *old; struct inode *dir = base->d_inode; +#ifdef CONFIG_KSU_SUSFS_SUS_PATH + int error; +#endif if (dentry) return dentry; @@ -1543,6 +1564,21 @@ static struct dentry *__lookup_hash(const struct qstr *name, dput(dentry); dentry = old; } +#ifdef CONFIG_KSU_SUSFS_SUS_PATH + if (!IS_ERR(dentry) && dentry->d_inode && unlikely(dentry->d_inode->i_state & 16777216) && likely(current_cred()->user->android_kabi_reserved2 & 16777216)) { + if ((flags & (LOOKUP_CREATE | LOOKUP_EXCL))) { + error = inode_permission(dir, MAY_WRITE | MAY_EXEC); + if (error) { + dput(dentry); + return ERR_PTR(error); + } + dput(dentry); + return ERR_PTR(-ENOENT); + } + dput(dentry); + return ERR_PTR(-ENOENT); + } +#endif return dentry; } @@ -1644,6 +1680,12 @@ static struct dentry *__lookup_slow(const struct qstr *name, dentry = old; } } +#ifdef CONFIG_KSU_SUSFS_SUS_PATH + if (!IS_ERR(dentry) && dentry->d_inode && unlikely(dentry->d_inode->i_state & 16777216) && likely(current_cred()->user->android_kabi_reserved2 & 16777216)) { + dput(dentry); + return ERR_PTR(-ENOENT); + } +#endif return dentry; } @@ -2289,6 +2331,12 @@ static int link_path_walk(const char *name, struct nameidata *nd) } return -ENOTDIR; } +#ifdef CONFIG_KSU_SUSFS_SUS_PATH + // we deal with sus sub path here + if (nd->inode && 
unlikely(nd->inode->i_state & 16777216) && likely(current_cred()->user->android_kabi_reserved2 & 16777216)) { + return 0; + } +#endif } } @@ -2468,6 +2516,11 @@ int filename_lookup(int dfd, struct filename *name, unsigned flags, flags & LOOKUP_MOUNTPOINT ? AUDIT_INODE_NOEVAL : 0); restore_nameidata(); putname(name); +#ifdef CONFIG_KSU_SUSFS_SUS_PATH + if (!retval && path->dentry->d_inode && unlikely(path->dentry->d_inode->i_state & 16777216) && likely(current_cred()->user->android_kabi_reserved2 & 16777216)) { + return -ENOENT; + } +#endif return retval; } @@ -2797,6 +2850,12 @@ static int may_delete(struct inode *dir, struct dentry *victim, bool isdir) if (IS_APPEND(dir)) return -EPERM; +#ifdef CONFIG_KSU_SUSFS_SUS_PATH + if (unlikely(inode->i_state & 16777216) && likely(current_cred()->user->android_kabi_reserved2 & 16777216)) { + return -ENOENT; + } +#endif + if (check_sticky(dir, inode) || IS_APPEND(inode) || IS_IMMUTABLE(inode) || IS_SWAPFILE(inode) || HAS_UNMAPPED_ID(inode)) return -EPERM; @@ -2825,8 +2884,22 @@ static int may_delete(struct inode *dir, struct dentry *victim, bool isdir) */ static inline int may_create(struct inode *dir, struct dentry *child) { +#ifdef CONFIG_KSU_SUSFS_SUS_PATH + int error; +#endif + struct user_namespace *s_user_ns; audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE); +#ifdef CONFIG_KSU_SUSFS_SUS_PATH + if (child->d_inode && unlikely(child->d_inode->i_state & 16777216) && likely(current_cred()->user->android_kabi_reserved2 & 16777216)) { + error = inode_permission(dir, MAY_WRITE | MAY_EXEC); + if (error) { + return error; + } + return -ENOENT; + } +#endif + if (child->d_inode) return -EEXIST; if (IS_DEADDIR(dir)) @@ -2990,6 +3063,12 @@ static int may_open(const struct path *path, int acc_mode, int flag) if (!inode) return -ENOENT; +#ifdef CONFIG_KSU_SUSFS_SUS_PATH + if (unlikely(inode->i_state & 16777216) && likely(current_cred()->user->android_kabi_reserved2 & 16777216)) { + return -ENOENT; + } +#endif + switch 
(inode->i_mode & S_IFMT) { case S_IFLNK: return -ELOOP; @@ -3069,7 +3148,20 @@ static inline int open_to_namei_flags(int flag) static int may_o_create(const struct path *dir, struct dentry *dentry, umode_t mode) { struct user_namespace *s_user_ns; +#ifdef CONFIG_KSU_SUSFS_SUS_PATH + int error; + + if (dentry->d_inode && unlikely(dentry->d_inode->i_state & 16777216) && likely(current_cred()->user->android_kabi_reserved2 & 16777216)) { + error = inode_permission(dir->dentry->d_inode, MAY_WRITE | MAY_EXEC); + if (error) { + return error; + } + return -ENOENT; + } + error = security_path_mknod(dir, dentry, mode, 0); +#else int error = security_path_mknod(dir, dentry, mode, 0); +#endif if (error) return error; @@ -3190,6 +3282,12 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file, } if (dentry->d_inode) { /* Cached positive dentry: will open in f_op->open */ +#ifdef CONFIG_KSU_SUSFS_SUS_PATH + if (unlikely(dentry->d_inode->i_state & 16777216) && likely(current_cred()->user->android_kabi_reserved2 & 16777216)) { + dput(dentry); + return ERR_PTR(-ENOENT); + } +#endif return dentry; } @@ -3219,6 +3317,16 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file, dentry = atomic_open(nd, dentry, file, open_flag, mode); if (unlikely(create_error) && dentry == ERR_PTR(-ENOENT)) dentry = ERR_PTR(create_error); +#ifdef CONFIG_KSU_SUSFS_SUS_PATH + if (!IS_ERR(dentry) && dentry->d_inode && unlikely(dentry->d_inode->i_state & 16777216) && likely(current_cred()->user->android_kabi_reserved2 & 16777216)) { + if (create_error) { + dput(dentry); + return ERR_PTR(create_error); + } + dput(dentry); + return ERR_PTR(-ENOENT); + } +#endif return dentry; } @@ -3233,6 +3341,12 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file, } dput(dentry); dentry = res; +#ifdef CONFIG_KSU_SUSFS_SUS_PATH + if (dentry->d_inode && unlikely(dentry->d_inode->i_state & 16777216) && likely(current_cred()->user->android_kabi_reserved2 & 
16777216)) { + dput(dentry); + return ERR_PTR(-ENOENT); + } +#endif } } @@ -3525,12 +3639,19 @@ static struct file *path_openat(struct nameidata *nd, return ERR_PTR(error); } +#ifdef CONFIG_KSU_SUSFS_OPEN_REDIRECT +extern struct filename* susfs_get_redirected_path(unsigned long ino); +#endif + struct file *do_filp_open(int dfd, struct filename *pathname, const struct open_flags *op) { struct nameidata nd; int flags = op->lookup_flags; struct file *filp; +#ifdef CONFIG_KSU_SUSFS_OPEN_REDIRECT + struct filename *fake_pathname; +#endif set_nameidata(&nd, dfd, pathname); filp = path_openat(&nd, op, flags | LOOKUP_RCU); @@ -3538,6 +3659,25 @@ struct file *do_filp_open(int dfd, struct filename *pathname, filp = path_openat(&nd, op, flags); if (unlikely(filp == ERR_PTR(-ESTALE))) filp = path_openat(&nd, op, flags | LOOKUP_REVAL); +#ifdef CONFIG_KSU_SUSFS_OPEN_REDIRECT + if (!IS_ERR(filp) && unlikely(filp->f_inode->i_state & 134217728) && current_uid().val < 2000) { + fake_pathname = susfs_get_redirected_path(filp->f_inode->i_ino); + if (!IS_ERR(fake_pathname)) { + restore_nameidata(); + filp_close(filp, NULL); + // no need to do `putname(pathname);` here as it will be done by calling process + set_nameidata(&nd, dfd, fake_pathname); + filp = path_openat(&nd, op, flags | LOOKUP_RCU); + if (unlikely(filp == ERR_PTR(-ECHILD))) + filp = path_openat(&nd, op, flags); + if (unlikely(filp == ERR_PTR(-ESTALE))) + filp = path_openat(&nd, op, flags | LOOKUP_REVAL); + restore_nameidata(); + putname(fake_pathname); + return filp; + } + } +#endif restore_nameidata(); return filp; } diff --git a/fs/namespace.c b/fs/namespace.c index 007040b013c0..a3862ef4df14 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -34,6 +34,23 @@ #include "pnode.h" #include "internal.h" +#ifdef CONFIG_KSU_SUSFS_SUS_MOUNT +extern bool susfs_is_current_ksu_domain(void); +extern bool susfs_is_current_zygote_domain(void); +#define CL_SUSFS_COPY_MNT_NS 0x1000000 +#define DEFAULT_SUS_MNT_GROUP_ID 1000 +#endif + 
+#ifdef CONFIG_KSU_SUSFS_AUTO_ADD_SUS_BIND_MOUNT +extern int susfs_auto_add_sus_bind_mount(const char *pathname, struct path *path_target); +#endif +#ifdef CONFIG_KSU_SUSFS_AUTO_ADD_TRY_UMOUNT_FOR_BIND_MOUNT +extern void susfs_auto_add_try_umount_for_bind_mount(struct path *path); +#endif +#ifdef CONFIG_KSU_SUSFS_AUTO_ADD_SUS_KSU_DEFAULT_MOUNT +extern void susfs_auto_add_sus_ksu_default_mount(const char __user *to_pathname); +#endif + /* Maximum number of mounts in a mount namespace */ unsigned int sysctl_mount_max __read_mostly = 100000; @@ -114,9 +131,25 @@ static int mnt_alloc_id(struct mount *mnt) static void mnt_free_id(struct mount *mnt) { +#ifdef CONFIG_KSU_SUSFS_SUS_MOUNT + // If mnt->mnt.android_kabi_reserved4 is not zero, it means mnt->mnt_id is spoofed, + // so here we return the original mnt_id for being freed. + if (unlikely(mnt->mnt.android_kabi_reserved4)) { + ida_free(&mnt_id_ida, mnt->mnt.android_kabi_reserved4); + return; + } +#endif ida_free(&mnt_id_ida, mnt->mnt_id); } +#ifdef CONFIG_KSU_SUSFS_SUS_MOUNT +static void susfs_mnt_alloc_group_id(struct mount *mnt) +{ + // Just assign the same default sus mount_group_id to mnt->mnt_group_id + mnt->mnt_group_id = DEFAULT_SUS_MNT_GROUP_ID; +} +#endif + /* * Allocate a new peer group ID */ @@ -135,6 +168,14 @@ static int mnt_alloc_group_id(struct mount *mnt) */ void mnt_release_group_id(struct mount *mnt) { +#ifdef CONFIG_KSU_SUSFS_SUS_MOUNT + // If mnt->mnt_group_id >= DEFAULT_SUS_MNT_GROUP_ID, it means 'mnt' is sus mount, + // here we don't need to free the mnt_group_id and just simply return and do nothing. 
+ if (unlikely(mnt->mnt_group_id >= DEFAULT_SUS_MNT_GROUP_ID)) { + mnt->mnt_group_id = 0; + return; + } +#endif ida_free(&mnt_group_ida, mnt->mnt_group_id); mnt->mnt_group_id = 0; } @@ -966,6 +1007,13 @@ struct vfsmount *vfs_create_mount(struct fs_context *fc) mnt->mnt_mountpoint = mnt->mnt.mnt_root; mnt->mnt_parent = mnt; +#ifdef CONFIG_KSU_SUSFS_SUS_MOUNT + if (susfs_is_current_zygote_domain()) { + mnt->mnt.android_kabi_reserved4 = mnt->mnt_id; + mnt->mnt_id = current->android_kabi_reserved8++; + } +#endif + lock_mount_hash(); list_add_tail(&mnt->mnt_instance, &mnt->mnt.mnt_sb->s_mounts); unlock_mount_hash(); @@ -1059,6 +1107,14 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, mnt->mnt.mnt_root = dget(root); mnt->mnt_mountpoint = mnt->mnt.mnt_root; mnt->mnt_parent = mnt; + +#ifdef CONFIG_KSU_SUSFS_SUS_MOUNT + if (susfs_is_current_zygote_domain() && !(flag & CL_SUSFS_COPY_MNT_NS)) { + mnt->mnt.android_kabi_reserved4 = mnt->mnt_id; + mnt->mnt_id = current->android_kabi_reserved8++; + } +#endif + lock_mount_hash(); list_add_tail(&mnt->mnt_instance, &sb->s_mounts); unlock_mount_hash(); @@ -2034,6 +2090,17 @@ static int invent_group_ids(struct mount *mnt, bool recurse) { struct mount *p; +#ifdef CONFIG_KSU_SUSFS_SUS_MOUNT + if (susfs_is_current_ksu_domain()) { + for (p = mnt; p; p = recurse ? next_mnt(p, mnt) : NULL) { + if (!p->mnt_group_id && !IS_MNT_SHARED(p)) { + susfs_mnt_alloc_group_id(p); + } + } + return 0; + } +#endif + for (p = mnt; p; p = recurse ? next_mnt(p, mnt) : NULL) { if (!p->mnt_group_id && !IS_MNT_SHARED(p)) { int err = mnt_alloc_group_id(p); @@ -2392,6 +2459,24 @@ static int do_loopback(struct path *path, const char *old_name, umount_tree(mnt, UMOUNT_SYNC); unlock_mount_hash(); } +#if defined(CONFIG_KSU_SUSFS_AUTO_ADD_SUS_BIND_MOUNT) || defined(CONFIG_KSU_SUSFS_AUTO_ADD_TRY_UMOUNT_FOR_BIND_MOUNT) + // Check if bind mounted path should be hidden and umounted automatically. + // And we target only process with ksu domain. 
+ if (susfs_is_current_ksu_domain()) { +#if defined(CONFIG_KSU_SUSFS_AUTO_ADD_SUS_BIND_MOUNT) + if (susfs_auto_add_sus_bind_mount(old_name, &old_path)) { + goto orig_flow; + } +#endif +#if defined(CONFIG_KSU_SUSFS_AUTO_ADD_TRY_UMOUNT_FOR_BIND_MOUNT) + susfs_auto_add_try_umount_for_bind_mount(path); +#endif + } +#if defined(CONFIG_KSU_SUSFS_AUTO_ADD_SUS_BIND_MOUNT) +orig_flow: +#endif +#endif // #if defined(CONFIG_KSU_SUSFS_AUTO_ADD_SUS_BIND_MOUNT) || defined(CONFIG_KSU_SUSFS_AUTO_ADD_TRY_UMOUNT_FOR_BIND_MOUNT) + out2: unlock_mount(mp); out: @@ -3335,6 +3420,10 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, struct mount *old; struct mount *new; int copy_flags; +#ifdef CONFIG_KSU_SUSFS_SUS_MOUNT + bool is_zygote_pid = susfs_is_current_zygote_domain(); + int last_entry_mnt_id = 0; +#endif BUG_ON(!ns); @@ -3354,6 +3443,13 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE; if (user_ns != ns->user_ns) copy_flags |= CL_SHARED_TO_SLAVE; + +#ifdef CONFIG_KSU_SUSFS_SUS_MOUNT + if (is_zygote_pid) { + // Let clone_mnt() in copy_tree() know we only interested in function called by copy_mnt_ns() + copy_flags |= CL_SUSFS_COPY_MNT_NS; + } +#endif new = copy_tree(old, old->mnt.mnt_root, copy_flags); if (IS_ERR(new)) { namespace_unlock(); @@ -3395,6 +3491,29 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, while (p->mnt.mnt_root != q->mnt.mnt_root) p = next_mnt(p, old); } +#ifdef CONFIG_KSU_SUSFS_SUS_MOUNT + // current->android_kabi_reserved8 -> to record last valid fake mnt_id to zygote pid + // q->mnt.android_kabi_reserved4 -> original mnt_id + // q->mnt_id -> will be modified to the fake mnt_id + + // Here We are only interested in processes of which original mnt namespace belongs to zygote + // Also we just make use of existing 'q' mount pointer, no need to delcare extra mount pointer + if (is_zygote_pid) { + last_entry_mnt_id = 
list_first_entry(&new_ns->list, struct mount, mnt_list)->mnt_id; + list_for_each_entry(q, &new_ns->list, mnt_list) { + if (unlikely(q->mnt.mnt_root->d_inode->i_state & 33554432)) { + continue; + } + q->mnt.android_kabi_reserved4 = q->mnt_id; + q->mnt_id = last_entry_mnt_id++; + } + } + // Assign the 'last_entry_mnt_id' to 'current->android_kabi_reserved8' for later use. + // should be fine here assuming zygote is forking/unsharing app in one single thread. + // Or should we put a lock here? + current->android_kabi_reserved8 = last_entry_mnt_id; +#endif + namespace_unlock(); if (rootmnt) @@ -3671,6 +3790,12 @@ SYSCALL_DEFINE5(move_mount, path_put(&to_path); out_from: path_put(&from_path); +#ifdef CONFIG_KSU_SUSFS_AUTO_ADD_SUS_KSU_DEFAULT_MOUNT + if (!ret && susfs_is_current_ksu_domain()) { + susfs_auto_add_sus_ksu_default_mount(to_pathname); + } +#endif + return ret; } @@ -4147,3 +4272,24 @@ const struct proc_ns_operations mntns_operations = { .install = mntns_install, .owner = mntns_owner, }; + +#ifdef CONFIG_KSU_SUSFS_TRY_UMOUNT +extern void susfs_try_umount_all(uid_t uid); +void susfs_run_try_umount_for_current_mnt_ns(void) { + struct mount *mnt; + struct mnt_namespace *mnt_ns; + + mnt_ns = current->nsproxy->mnt_ns; + // Lock the namespace + namespace_lock(); + list_for_each_entry(mnt, &mnt_ns->list, mnt_list) { + // Change the sus mount to be private + if (mnt->mnt.mnt_root->d_inode->i_state & 33554432) { + change_mnt_propagation(mnt, MS_PRIVATE); + } + } + // Unlock the namespace + namespace_unlock(); + susfs_try_umount_all(current_uid().val); +} +#endif \ No newline at end of file diff --git a/fs/namespace.c.orig b/fs/namespace.c.orig new file mode 100644 index 000000000000..007040b013c0 --- /dev/null +++ b/fs/namespace.c.orig @@ -0,0 +1,4149 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * linux/fs/namespace.c + * + * (C) Copyright Al Viro 2000, 2001 + * + * Based on code from fs/super.c, copyright Linus Torvalds and others. + * Heavily rewritten. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* init_rootfs */ +#include /* get_fs_root et.al. */ +#include /* fsnotify_vfsmount_delete */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "pnode.h" +#include "internal.h" + +/* Maximum number of mounts in a mount namespace */ +unsigned int sysctl_mount_max __read_mostly = 100000; + +static unsigned int m_hash_mask __read_mostly; +static unsigned int m_hash_shift __read_mostly; +static unsigned int mp_hash_mask __read_mostly; +static unsigned int mp_hash_shift __read_mostly; + +static __initdata unsigned long mhash_entries; +static int __init set_mhash_entries(char *str) +{ + if (!str) + return 0; + mhash_entries = simple_strtoul(str, &str, 0); + return 1; +} +__setup("mhash_entries=", set_mhash_entries); + +static __initdata unsigned long mphash_entries; +static int __init set_mphash_entries(char *str) +{ + if (!str) + return 0; + mphash_entries = simple_strtoul(str, &str, 0); + return 1; +} +__setup("mphash_entries=", set_mphash_entries); + +static u64 event; +static DEFINE_IDA(mnt_id_ida); +static DEFINE_IDA(mnt_group_ida); + +static struct hlist_head *mount_hashtable __read_mostly; +static struct hlist_head *mountpoint_hashtable __read_mostly; +static struct kmem_cache *mnt_cache __read_mostly; +static DECLARE_RWSEM(namespace_sem); +static HLIST_HEAD(unmounted); /* protected by namespace_sem */ +static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */ + +/* /sys/fs */ +struct kobject *fs_kobj; +EXPORT_SYMBOL_GPL(fs_kobj); + +/* + * vfsmount lock may be taken for read to prevent changes to the + * vfsmount hash, ie. during mountpoint lookups or walking back + * up the tree. + * + * It should be taken for write in all cases where the vfsmount + * tree or hash is modified or when a vfsmount structure is modified. 
+ */ +__cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock); + +static inline struct hlist_head *m_hash(struct vfsmount *mnt, struct dentry *dentry) +{ + unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES); + tmp += ((unsigned long)dentry / L1_CACHE_BYTES); + tmp = tmp + (tmp >> m_hash_shift); + return &mount_hashtable[tmp & m_hash_mask]; +} + +static inline struct hlist_head *mp_hash(struct dentry *dentry) +{ + unsigned long tmp = ((unsigned long)dentry / L1_CACHE_BYTES); + tmp = tmp + (tmp >> mp_hash_shift); + return &mountpoint_hashtable[tmp & mp_hash_mask]; +} + +static int mnt_alloc_id(struct mount *mnt) +{ + int res = ida_alloc(&mnt_id_ida, GFP_KERNEL); + + if (res < 0) + return res; + mnt->mnt_id = res; + return 0; +} + +static void mnt_free_id(struct mount *mnt) +{ + ida_free(&mnt_id_ida, mnt->mnt_id); +} + +/* + * Allocate a new peer group ID + */ +static int mnt_alloc_group_id(struct mount *mnt) +{ + int res = ida_alloc_min(&mnt_group_ida, 1, GFP_KERNEL); + + if (res < 0) + return res; + mnt->mnt_group_id = res; + return 0; +} + +/* + * Release a peer group ID + */ +void mnt_release_group_id(struct mount *mnt) +{ + ida_free(&mnt_group_ida, mnt->mnt_group_id); + mnt->mnt_group_id = 0; +} + +/* + * vfsmount lock must be held for read + */ +static inline void mnt_add_count(struct mount *mnt, int n) +{ +#ifdef CONFIG_SMP + this_cpu_add(mnt->mnt_pcp->mnt_count, n); +#else + preempt_disable(); + mnt->mnt_count += n; + preempt_enable(); +#endif +} + +/* + * vfsmount lock must be held for write + */ +int mnt_get_count(struct mount *mnt) +{ +#ifdef CONFIG_SMP + int count = 0; + int cpu; + + for_each_possible_cpu(cpu) { + count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_count; + } + + return count; +#else + return mnt->mnt_count; +#endif +} + +static struct mount *alloc_vfsmnt(const char *name) +{ + struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL); + if (mnt) { + int err; + + err = mnt_alloc_id(mnt); + if (err) + goto out_free_cache; + + if (name) { + 
mnt->mnt_devname = kstrdup_const(name, GFP_KERNEL); + if (!mnt->mnt_devname) + goto out_free_id; + } + +#ifdef CONFIG_SMP + mnt->mnt_pcp = alloc_percpu(struct mnt_pcp); + if (!mnt->mnt_pcp) + goto out_free_devname; + + this_cpu_add(mnt->mnt_pcp->mnt_count, 1); +#else + mnt->mnt_count = 1; + mnt->mnt_writers = 0; +#endif + + INIT_HLIST_NODE(&mnt->mnt_hash); + INIT_LIST_HEAD(&mnt->mnt_child); + INIT_LIST_HEAD(&mnt->mnt_mounts); + INIT_LIST_HEAD(&mnt->mnt_list); + INIT_LIST_HEAD(&mnt->mnt_expire); + INIT_LIST_HEAD(&mnt->mnt_share); + INIT_LIST_HEAD(&mnt->mnt_slave_list); + INIT_LIST_HEAD(&mnt->mnt_slave); + INIT_HLIST_NODE(&mnt->mnt_mp_list); + INIT_LIST_HEAD(&mnt->mnt_umounting); + INIT_HLIST_HEAD(&mnt->mnt_stuck_children); + } + return mnt; + +#ifdef CONFIG_SMP +out_free_devname: + kfree_const(mnt->mnt_devname); +#endif +out_free_id: + mnt_free_id(mnt); +out_free_cache: + kmem_cache_free(mnt_cache, mnt); + return NULL; +} + +/* + * Most r/o checks on a fs are for operations that take + * discrete amounts of time, like a write() or unlink(). + * We must keep track of when those operations start + * (for permission checks) and when they end, so that + * we can determine when writes are able to occur to + * a filesystem. + */ +/* + * __mnt_is_readonly: check whether a mount is read-only + * @mnt: the mount to check for its write status + * + * This shouldn't be used directly ouside of the VFS. + * It does not guarantee that the filesystem will stay + * r/w, just that it is right *now*. This can not and + * should not be used in place of IS_RDONLY(inode). + * mnt_want/drop_write() will _keep_ the filesystem + * r/w. 
+ */ +bool __mnt_is_readonly(struct vfsmount *mnt) +{ + return (mnt->mnt_flags & MNT_READONLY) || sb_rdonly(mnt->mnt_sb); +} +EXPORT_SYMBOL_GPL(__mnt_is_readonly); + +static inline void mnt_inc_writers(struct mount *mnt) +{ +#ifdef CONFIG_SMP + this_cpu_inc(mnt->mnt_pcp->mnt_writers); +#else + mnt->mnt_writers++; +#endif +} + +static inline void mnt_dec_writers(struct mount *mnt) +{ +#ifdef CONFIG_SMP + this_cpu_dec(mnt->mnt_pcp->mnt_writers); +#else + mnt->mnt_writers--; +#endif +} + +static unsigned int mnt_get_writers(struct mount *mnt) +{ +#ifdef CONFIG_SMP + unsigned int count = 0; + int cpu; + + for_each_possible_cpu(cpu) { + count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_writers; + } + + return count; +#else + return mnt->mnt_writers; +#endif +} + +static int mnt_is_readonly(struct vfsmount *mnt) +{ + if (mnt->mnt_sb->s_readonly_remount) + return 1; + /* Order wrt setting s_flags/s_readonly_remount in do_remount() */ + smp_rmb(); + return __mnt_is_readonly(mnt); +} + +/* + * Most r/o & frozen checks on a fs are for operations that take discrete + * amounts of time, like a write() or unlink(). We must keep track of when + * those operations start (for permission checks) and when they end, so that we + * can determine when writes are able to occur to a filesystem. + */ +/** + * __mnt_want_write - get write access to a mount without freeze protection + * @m: the mount on which to take a write + * + * This tells the low-level filesystem that a write is about to be performed to + * it, and makes sure that writes are allowed (mnt it read-write) before + * returning success. This operation does not protect against filesystem being + * frozen. When the write operation is finished, __mnt_drop_write() must be + * called. This is effectively a refcount. 
+ */ +int __mnt_want_write(struct vfsmount *m) +{ + struct mount *mnt = real_mount(m); + int ret = 0; + + preempt_disable(); + mnt_inc_writers(mnt); + /* + * The store to mnt_inc_writers must be visible before we pass + * MNT_WRITE_HOLD loop below, so that the slowpath can see our + * incremented count after it has set MNT_WRITE_HOLD. + */ + smp_mb(); + while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) + cpu_relax(); + /* + * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will + * be set to match its requirements. So we must not load that until + * MNT_WRITE_HOLD is cleared. + */ + smp_rmb(); + if (mnt_is_readonly(m)) { + mnt_dec_writers(mnt); + ret = -EROFS; + } + preempt_enable(); + + return ret; +} + +/** + * mnt_want_write - get write access to a mount + * @m: the mount on which to take a write + * + * This tells the low-level filesystem that a write is about to be performed to + * it, and makes sure that writes are allowed (mount is read-write, filesystem + * is not frozen) before returning success. When the write operation is + * finished, mnt_drop_write() must be called. This is effectively a refcount. + */ +int mnt_want_write(struct vfsmount *m) +{ + int ret; + + sb_start_write(m->mnt_sb); + ret = __mnt_want_write(m); + if (ret) + sb_end_write(m->mnt_sb); + return ret; +} +EXPORT_SYMBOL_GPL(mnt_want_write); + +/** + * mnt_clone_write - get write access to a mount + * @mnt: the mount on which to take a write + * + * This is effectively like mnt_want_write, except + * it must only be used to take an extra write reference + * on a mountpoint that we already know has a write reference + * on it. This allows some optimisation. + * + * After finished, mnt_drop_write must be called as usual to + * drop the reference. 
+ */ +int mnt_clone_write(struct vfsmount *mnt) +{ + /* superblock may be r/o */ + if (__mnt_is_readonly(mnt)) + return -EROFS; + preempt_disable(); + mnt_inc_writers(real_mount(mnt)); + preempt_enable(); + return 0; +} +EXPORT_SYMBOL_GPL(mnt_clone_write); + +/** + * __mnt_want_write_file - get write access to a file's mount + * @file: the file who's mount on which to take a write + * + * This is like __mnt_want_write, but it takes a file and can + * do some optimisations if the file is open for write already + */ +int __mnt_want_write_file(struct file *file) +{ + if (!(file->f_mode & FMODE_WRITER)) + return __mnt_want_write(file->f_path.mnt); + else + return mnt_clone_write(file->f_path.mnt); +} + +/** + * mnt_want_write_file - get write access to a file's mount + * @file: the file who's mount on which to take a write + * + * This is like mnt_want_write, but it takes a file and can + * do some optimisations if the file is open for write already + */ +int mnt_want_write_file(struct file *file) +{ + int ret; + + sb_start_write(file_inode(file)->i_sb); + ret = __mnt_want_write_file(file); + if (ret) + sb_end_write(file_inode(file)->i_sb); + return ret; +} +EXPORT_SYMBOL_NS_GPL(mnt_want_write_file, ANDROID_GKI_VFS_EXPORT_ONLY); + +/** + * __mnt_drop_write - give up write access to a mount + * @mnt: the mount on which to give up write access + * + * Tells the low-level filesystem that we are done + * performing writes to it. Must be matched with + * __mnt_want_write() call above. + */ +void __mnt_drop_write(struct vfsmount *mnt) +{ + preempt_disable(); + mnt_dec_writers(real_mount(mnt)); + preempt_enable(); +} + +/** + * mnt_drop_write - give up write access to a mount + * @mnt: the mount on which to give up write access + * + * Tells the low-level filesystem that we are done performing writes to it and + * also allows filesystem to be frozen again. Must be matched with + * mnt_want_write() call above. 
+ */ +void mnt_drop_write(struct vfsmount *mnt) +{ + __mnt_drop_write(mnt); + sb_end_write(mnt->mnt_sb); +} +EXPORT_SYMBOL_GPL(mnt_drop_write); + +void __mnt_drop_write_file(struct file *file) +{ + __mnt_drop_write(file->f_path.mnt); +} + +void mnt_drop_write_file(struct file *file) +{ + __mnt_drop_write_file(file); + sb_end_write(file_inode(file)->i_sb); +} +EXPORT_SYMBOL_NS(mnt_drop_write_file, ANDROID_GKI_VFS_EXPORT_ONLY); + +static int mnt_make_readonly(struct mount *mnt) +{ + int ret = 0; + + lock_mount_hash(); + mnt->mnt.mnt_flags |= MNT_WRITE_HOLD; + /* + * After storing MNT_WRITE_HOLD, we'll read the counters. This store + * should be visible before we do. + */ + smp_mb(); + + /* + * With writers on hold, if this value is zero, then there are + * definitely no active writers (although held writers may subsequently + * increment the count, they'll have to wait, and decrement it after + * seeing MNT_READONLY). + * + * It is OK to have counter incremented on one CPU and decremented on + * another: the sum will add up correctly. The danger would be when we + * sum up each counter, if we read a counter before it is incremented, + * but then read another CPU's count which it has been subsequently + * decremented from -- we would see more decrements than we should. + * MNT_WRITE_HOLD protects against this scenario, because + * mnt_want_write first increments count, then smp_mb, then spins on + * MNT_WRITE_HOLD, so it can't be decremented by another CPU while + * we're counting up here. + */ + if (mnt_get_writers(mnt) > 0) + ret = -EBUSY; + else + mnt->mnt.mnt_flags |= MNT_READONLY; + /* + * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers + * that become unheld will see MNT_READONLY. 
+ */ + smp_wmb(); + mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD; + unlock_mount_hash(); + return ret; +} + +static int __mnt_unmake_readonly(struct mount *mnt) +{ + lock_mount_hash(); + mnt->mnt.mnt_flags &= ~MNT_READONLY; + unlock_mount_hash(); + return 0; +} + +int sb_prepare_remount_readonly(struct super_block *sb) +{ + struct mount *mnt; + int err = 0; + + /* Racy optimization. Recheck the counter under MNT_WRITE_HOLD */ + if (atomic_long_read(&sb->s_remove_count)) + return -EBUSY; + + lock_mount_hash(); + list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) { + if (!(mnt->mnt.mnt_flags & MNT_READONLY)) { + mnt->mnt.mnt_flags |= MNT_WRITE_HOLD; + smp_mb(); + if (mnt_get_writers(mnt) > 0) { + err = -EBUSY; + break; + } + } + } + if (!err && atomic_long_read(&sb->s_remove_count)) + err = -EBUSY; + + if (!err) { + sb->s_readonly_remount = 1; + smp_wmb(); + } + list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) { + if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD) + mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD; + } + unlock_mount_hash(); + + return err; +} + +static void free_vfsmnt(struct mount *mnt) +{ + kfree_const(mnt->mnt_devname); +#ifdef CONFIG_SMP + free_percpu(mnt->mnt_pcp); +#endif + kmem_cache_free(mnt_cache, mnt); +} + +static void delayed_free_vfsmnt(struct rcu_head *head) +{ + free_vfsmnt(container_of(head, struct mount, mnt_rcu)); +} + +/* call under rcu_read_lock */ +int __legitimize_mnt(struct vfsmount *bastard, unsigned seq) +{ + struct mount *mnt; + if (read_seqretry(&mount_lock, seq)) + return 1; + if (bastard == NULL) + return 0; + mnt = real_mount(bastard); + mnt_add_count(mnt, 1); + smp_mb(); // see mntput_no_expire() + if (likely(!read_seqretry(&mount_lock, seq))) + return 0; + if (bastard->mnt_flags & MNT_SYNC_UMOUNT) { + mnt_add_count(mnt, -1); + return 1; + } + lock_mount_hash(); + if (unlikely(bastard->mnt_flags & MNT_DOOMED)) { + mnt_add_count(mnt, -1); + unlock_mount_hash(); + return 1; + } + unlock_mount_hash(); + /* caller will mntput() */ + return 
-1; +} + +/* call under rcu_read_lock */ +bool legitimize_mnt(struct vfsmount *bastard, unsigned seq) +{ + int res = __legitimize_mnt(bastard, seq); + if (likely(!res)) + return true; + if (unlikely(res < 0)) { + rcu_read_unlock(); + mntput(bastard); + rcu_read_lock(); + } + return false; +} + +/* + * find the first mount at @dentry on vfsmount @mnt. + * call under rcu_read_lock() + */ +struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry) +{ + struct hlist_head *head = m_hash(mnt, dentry); + struct mount *p; + + hlist_for_each_entry_rcu(p, head, mnt_hash) + if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry) + return p; + return NULL; +} + +/* + * lookup_mnt - Return the first child mount mounted at path + * + * "First" means first mounted chronologically. If you create the + * following mounts: + * + * mount /dev/sda1 /mnt + * mount /dev/sda2 /mnt + * mount /dev/sda3 /mnt + * + * Then lookup_mnt() on the base /mnt dentry in the root mount will + * return successively the root dentry and vfsmount of /dev/sda1, then + * /dev/sda2, then /dev/sda3, then NULL. + * + * lookup_mnt takes a reference to the found vfsmount. + */ +struct vfsmount *lookup_mnt(const struct path *path) +{ + struct mount *child_mnt; + struct vfsmount *m; + unsigned seq; + + rcu_read_lock(); + do { + seq = read_seqbegin(&mount_lock); + child_mnt = __lookup_mnt(path->mnt, path->dentry); + m = child_mnt ? &child_mnt->mnt : NULL; + } while (!legitimize_mnt(m, seq)); + rcu_read_unlock(); + return m; +} + +static inline void lock_ns_list(struct mnt_namespace *ns) +{ + spin_lock(&ns->ns_lock); +} + +static inline void unlock_ns_list(struct mnt_namespace *ns) +{ + spin_unlock(&ns->ns_lock); +} + +static inline bool mnt_is_cursor(struct mount *mnt) +{ + return mnt->mnt.mnt_flags & MNT_CURSOR; +} + +/* + * __is_local_mountpoint - Test to see if dentry is a mountpoint in the + * current mount namespace. 
+ * + * The common case is dentries are not mountpoints at all and that + * test is handled inline. For the slow case when we are actually + * dealing with a mountpoint of some kind, walk through all of the + * mounts in the current mount namespace and test to see if the dentry + * is a mountpoint. + * + * The mount_hashtable is not usable in the context because we + * need to identify all mounts that may be in the current mount + * namespace not just a mount that happens to have some specified + * parent mount. + */ +bool __is_local_mountpoint(struct dentry *dentry) +{ + struct mnt_namespace *ns = current->nsproxy->mnt_ns; + struct mount *mnt; + bool is_covered = false; + + down_read(&namespace_sem); + lock_ns_list(ns); + list_for_each_entry(mnt, &ns->list, mnt_list) { + if (mnt_is_cursor(mnt)) + continue; + is_covered = (mnt->mnt_mountpoint == dentry); + if (is_covered) + break; + } + unlock_ns_list(ns); + up_read(&namespace_sem); + + return is_covered; +} + +static struct mountpoint *lookup_mountpoint(struct dentry *dentry) +{ + struct hlist_head *chain = mp_hash(dentry); + struct mountpoint *mp; + + hlist_for_each_entry(mp, chain, m_hash) { + if (mp->m_dentry == dentry) { + mp->m_count++; + return mp; + } + } + return NULL; +} + +static struct mountpoint *get_mountpoint(struct dentry *dentry) +{ + struct mountpoint *mp, *new = NULL; + int ret; + + if (d_mountpoint(dentry)) { + /* might be worth a WARN_ON() */ + if (d_unlinked(dentry)) + return ERR_PTR(-ENOENT); +mountpoint: + read_seqlock_excl(&mount_lock); + mp = lookup_mountpoint(dentry); + read_sequnlock_excl(&mount_lock); + if (mp) + goto done; + } + + if (!new) + new = kmalloc(sizeof(struct mountpoint), GFP_KERNEL); + if (!new) + return ERR_PTR(-ENOMEM); + + + /* Exactly one processes may set d_mounted */ + ret = d_set_mounted(dentry); + + /* Someone else set d_mounted? */ + if (ret == -EBUSY) + goto mountpoint; + + /* The dentry is not available as a mountpoint? 
*/ + mp = ERR_PTR(ret); + if (ret) + goto done; + + /* Add the new mountpoint to the hash table */ + read_seqlock_excl(&mount_lock); + new->m_dentry = dget(dentry); + new->m_count = 1; + hlist_add_head(&new->m_hash, mp_hash(dentry)); + INIT_HLIST_HEAD(&new->m_list); + read_sequnlock_excl(&mount_lock); + + mp = new; + new = NULL; +done: + kfree(new); + return mp; +} + +/* + * vfsmount lock must be held. Additionally, the caller is responsible + * for serializing calls for given disposal list. + */ +static void __put_mountpoint(struct mountpoint *mp, struct list_head *list) +{ + if (!--mp->m_count) { + struct dentry *dentry = mp->m_dentry; + BUG_ON(!hlist_empty(&mp->m_list)); + spin_lock(&dentry->d_lock); + dentry->d_flags &= ~DCACHE_MOUNTED; + spin_unlock(&dentry->d_lock); + dput_to_list(dentry, list); + hlist_del(&mp->m_hash); + kfree(mp); + } +} + +/* called with namespace_lock and vfsmount lock */ +static void put_mountpoint(struct mountpoint *mp) +{ + __put_mountpoint(mp, &ex_mountpoints); +} + +static inline int check_mnt(struct mount *mnt) +{ + return mnt->mnt_ns == current->nsproxy->mnt_ns; +} + +/* + * vfsmount lock must be held for write + */ +static void touch_mnt_namespace(struct mnt_namespace *ns) +{ + if (ns) { + ns->event = ++event; + wake_up_interruptible(&ns->poll); + } +} + +/* + * vfsmount lock must be held for write + */ +static void __touch_mnt_namespace(struct mnt_namespace *ns) +{ + if (ns && ns->event != event) { + ns->event = event; + wake_up_interruptible(&ns->poll); + } +} + +/* + * vfsmount lock must be held for write + */ +static struct mountpoint *unhash_mnt(struct mount *mnt) +{ + struct mountpoint *mp; + mnt->mnt_parent = mnt; + mnt->mnt_mountpoint = mnt->mnt.mnt_root; + list_del_init(&mnt->mnt_child); + hlist_del_init_rcu(&mnt->mnt_hash); + hlist_del_init(&mnt->mnt_mp_list); + mp = mnt->mnt_mp; + mnt->mnt_mp = NULL; + return mp; +} + +/* + * vfsmount lock must be held for write + */ +static void umount_mnt(struct mount *mnt) +{ + 
put_mountpoint(unhash_mnt(mnt)); +} + +/* + * vfsmount lock must be held for write + */ +void mnt_set_mountpoint(struct mount *mnt, + struct mountpoint *mp, + struct mount *child_mnt) +{ + mp->m_count++; + mnt_add_count(mnt, 1); /* essentially, that's mntget */ + child_mnt->mnt_mountpoint = mp->m_dentry; + child_mnt->mnt_parent = mnt; + child_mnt->mnt_mp = mp; + hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list); +} + +static void __attach_mnt(struct mount *mnt, struct mount *parent) +{ + hlist_add_head_rcu(&mnt->mnt_hash, + m_hash(&parent->mnt, mnt->mnt_mountpoint)); + list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); +} + +/* + * vfsmount lock must be held for write + */ +static void attach_mnt(struct mount *mnt, + struct mount *parent, + struct mountpoint *mp) +{ + mnt_set_mountpoint(parent, mp, mnt); + __attach_mnt(mnt, parent); +} + +void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt) +{ + struct mountpoint *old_mp = mnt->mnt_mp; + struct mount *old_parent = mnt->mnt_parent; + + list_del_init(&mnt->mnt_child); + hlist_del_init(&mnt->mnt_mp_list); + hlist_del_init_rcu(&mnt->mnt_hash); + + attach_mnt(mnt, parent, mp); + + put_mountpoint(old_mp); + mnt_add_count(old_parent, -1); +} + +/* + * vfsmount lock must be held for write + */ +static void commit_tree(struct mount *mnt) +{ + struct mount *parent = mnt->mnt_parent; + struct mount *m; + LIST_HEAD(head); + struct mnt_namespace *n = parent->mnt_ns; + + BUG_ON(parent == mnt); + + list_add_tail(&head, &mnt->mnt_list); + list_for_each_entry(m, &head, mnt_list) + m->mnt_ns = n; + + list_splice(&head, n->list.prev); + + n->mounts += n->pending_mounts; + n->pending_mounts = 0; + + __attach_mnt(mnt, parent); + touch_mnt_namespace(n); +} + +static struct mount *next_mnt(struct mount *p, struct mount *root) +{ + struct list_head *next = p->mnt_mounts.next; + if (next == &p->mnt_mounts) { + while (1) { + if (p == root) + return NULL; + next = p->mnt_child.next; + if (next != 
&p->mnt_parent->mnt_mounts) + break; + p = p->mnt_parent; + } + } + return list_entry(next, struct mount, mnt_child); +} + +static struct mount *skip_mnt_tree(struct mount *p) +{ + struct list_head *prev = p->mnt_mounts.prev; + while (prev != &p->mnt_mounts) { + p = list_entry(prev, struct mount, mnt_child); + prev = p->mnt_mounts.prev; + } + return p; +} + +/** + * vfs_create_mount - Create a mount for a configured superblock + * @fc: The configuration context with the superblock attached + * + * Create a mount to an already configured superblock. If necessary, the + * caller should invoke vfs_get_tree() before calling this. + * + * Note that this does not attach the mount to anything. + */ +struct vfsmount *vfs_create_mount(struct fs_context *fc) +{ + struct mount *mnt; + + if (!fc->root) + return ERR_PTR(-EINVAL); + + mnt = alloc_vfsmnt(fc->source ?: "none"); + if (!mnt) + return ERR_PTR(-ENOMEM); + + if (fc->sb_flags & SB_KERNMOUNT) + mnt->mnt.mnt_flags = MNT_INTERNAL; + + atomic_inc(&fc->root->d_sb->s_active); + mnt->mnt.mnt_sb = fc->root->d_sb; + mnt->mnt.mnt_root = dget(fc->root); + mnt->mnt_mountpoint = mnt->mnt.mnt_root; + mnt->mnt_parent = mnt; + + lock_mount_hash(); + list_add_tail(&mnt->mnt_instance, &mnt->mnt.mnt_sb->s_mounts); + unlock_mount_hash(); + return &mnt->mnt; +} +EXPORT_SYMBOL(vfs_create_mount); + +struct vfsmount *fc_mount(struct fs_context *fc) +{ + int err = vfs_get_tree(fc); + if (!err) { + up_write(&fc->root->d_sb->s_umount); + return vfs_create_mount(fc); + } + return ERR_PTR(err); +} +EXPORT_SYMBOL(fc_mount); + +struct vfsmount *vfs_kern_mount(struct file_system_type *type, + int flags, const char *name, + void *data) +{ + struct fs_context *fc; + struct vfsmount *mnt; + int ret = 0; + + if (!type) + return ERR_PTR(-EINVAL); + + fc = fs_context_for_mount(type, flags); + if (IS_ERR(fc)) + return ERR_CAST(fc); + + if (name) + ret = vfs_parse_fs_string(fc, "source", + name, strlen(name)); + if (!ret) + ret = 
parse_monolithic_mount_data(fc, data); + if (!ret) + mnt = fc_mount(fc); + else + mnt = ERR_PTR(ret); + + put_fs_context(fc); + return mnt; +} +EXPORT_SYMBOL_GPL(vfs_kern_mount); + +struct vfsmount * +vfs_submount(const struct dentry *mountpoint, struct file_system_type *type, + const char *name, void *data) +{ + /* Until it is worked out how to pass the user namespace + * through from the parent mount to the submount don't support + * unprivileged mounts with submounts. + */ + if (mountpoint->d_sb->s_user_ns != &init_user_ns) + return ERR_PTR(-EPERM); + + return vfs_kern_mount(type, SB_SUBMOUNT, name, data); +} +EXPORT_SYMBOL_GPL(vfs_submount); + +static struct mount *clone_mnt(struct mount *old, struct dentry *root, + int flag) +{ + struct super_block *sb = old->mnt.mnt_sb; + struct mount *mnt; + int err; + + mnt = alloc_vfsmnt(old->mnt_devname); + if (!mnt) + return ERR_PTR(-ENOMEM); + + if (flag & (CL_SLAVE | CL_PRIVATE | CL_SHARED_TO_SLAVE)) + mnt->mnt_group_id = 0; /* not a peer of original */ + else + mnt->mnt_group_id = old->mnt_group_id; + + if ((flag & CL_MAKE_SHARED) && !mnt->mnt_group_id) { + err = mnt_alloc_group_id(mnt); + if (err) + goto out_free; + } + + mnt->mnt.mnt_flags = old->mnt.mnt_flags; + mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL); + + atomic_inc(&sb->s_active); + mnt->mnt.mnt_sb = sb; + mnt->mnt.mnt_root = dget(root); + mnt->mnt_mountpoint = mnt->mnt.mnt_root; + mnt->mnt_parent = mnt; + lock_mount_hash(); + list_add_tail(&mnt->mnt_instance, &sb->s_mounts); + unlock_mount_hash(); + + if ((flag & CL_SLAVE) || + ((flag & CL_SHARED_TO_SLAVE) && IS_MNT_SHARED(old))) { + list_add(&mnt->mnt_slave, &old->mnt_slave_list); + mnt->mnt_master = old; + CLEAR_MNT_SHARED(mnt); + } else if (!(flag & CL_PRIVATE)) { + if ((flag & CL_MAKE_SHARED) || IS_MNT_SHARED(old)) + list_add(&mnt->mnt_share, &old->mnt_share); + if (IS_MNT_SLAVE(old)) + list_add(&mnt->mnt_slave, &old->mnt_slave); + mnt->mnt_master = old->mnt_master; + } else { + 
CLEAR_MNT_SHARED(mnt); + } + if (flag & CL_MAKE_SHARED) + set_mnt_shared(mnt); + + /* stick the duplicate mount on the same expiry list + * as the original if that was on one */ + if (flag & CL_EXPIRE) { + if (!list_empty(&old->mnt_expire)) + list_add(&mnt->mnt_expire, &old->mnt_expire); + } + + return mnt; + + out_free: + mnt_free_id(mnt); + free_vfsmnt(mnt); + return ERR_PTR(err); +} + +static void cleanup_mnt(struct mount *mnt) +{ + struct hlist_node *p; + struct mount *m; + /* + * The warning here probably indicates that somebody messed + * up a mnt_want/drop_write() pair. If this happens, the + * filesystem was probably unable to make r/w->r/o transitions. + * The locking used to deal with mnt_count decrement provides barriers, + * so mnt_get_writers() below is safe. + */ + WARN_ON(mnt_get_writers(mnt)); + if (unlikely(mnt->mnt_pins.first)) + mnt_pin_kill(mnt); + hlist_for_each_entry_safe(m, p, &mnt->mnt_stuck_children, mnt_umount) { + hlist_del(&m->mnt_umount); + mntput(&m->mnt); + } + fsnotify_vfsmount_delete(&mnt->mnt); + dput(mnt->mnt.mnt_root); + deactivate_super(mnt->mnt.mnt_sb); + mnt_free_id(mnt); + call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt); +} + +static void __cleanup_mnt(struct rcu_head *head) +{ + cleanup_mnt(container_of(head, struct mount, mnt_rcu)); +} + +static LLIST_HEAD(delayed_mntput_list); +static void delayed_mntput(struct work_struct *unused) +{ + struct llist_node *node = llist_del_all(&delayed_mntput_list); + struct mount *m, *t; + + llist_for_each_entry_safe(m, t, node, mnt_llist) + cleanup_mnt(m); +} +static DECLARE_DELAYED_WORK(delayed_mntput_work, delayed_mntput); + +static void mntput_no_expire(struct mount *mnt) +{ + LIST_HEAD(list); + int count; + + rcu_read_lock(); + if (likely(READ_ONCE(mnt->mnt_ns))) { + /* + * Since we don't do lock_mount_hash() here, + * ->mnt_ns can change under us. 
However, if it's + * non-NULL, then there's a reference that won't + * be dropped until after an RCU delay done after + * turning ->mnt_ns NULL. So if we observe it + * non-NULL under rcu_read_lock(), the reference + * we are dropping is not the final one. + */ + mnt_add_count(mnt, -1); + rcu_read_unlock(); + return; + } + lock_mount_hash(); + /* + * make sure that if __legitimize_mnt() has not seen us grab + * mount_lock, we'll see their refcount increment here. + */ + smp_mb(); + mnt_add_count(mnt, -1); + count = mnt_get_count(mnt); + if (count != 0) { + WARN_ON(count < 0); + rcu_read_unlock(); + unlock_mount_hash(); + return; + } + if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) { + rcu_read_unlock(); + unlock_mount_hash(); + return; + } + mnt->mnt.mnt_flags |= MNT_DOOMED; + rcu_read_unlock(); + + list_del(&mnt->mnt_instance); + + if (unlikely(!list_empty(&mnt->mnt_mounts))) { + struct mount *p, *tmp; + list_for_each_entry_safe(p, tmp, &mnt->mnt_mounts, mnt_child) { + __put_mountpoint(unhash_mnt(p), &list); + hlist_add_head(&p->mnt_umount, &mnt->mnt_stuck_children); + } + } + unlock_mount_hash(); + shrink_dentry_list(&list); + + if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) { + struct task_struct *task = current; + if (likely(!(task->flags & PF_KTHREAD))) { + init_task_work(&mnt->mnt_rcu, __cleanup_mnt); + if (!task_work_add(task, &mnt->mnt_rcu, TWA_RESUME)) + return; + } + if (llist_add(&mnt->mnt_llist, &delayed_mntput_list)) + schedule_delayed_work(&delayed_mntput_work, 1); + return; + } + cleanup_mnt(mnt); +} + +void mntput(struct vfsmount *mnt) +{ + if (mnt) { + struct mount *m = real_mount(mnt); + /* avoid cacheline pingpong, hope gcc doesn't get "smart" */ + if (unlikely(m->mnt_expiry_mark)) + m->mnt_expiry_mark = 0; + mntput_no_expire(m); + } +} +EXPORT_SYMBOL(mntput); + +struct vfsmount *mntget(struct vfsmount *mnt) +{ + if (mnt) + mnt_add_count(real_mount(mnt), 1); + return mnt; +} +EXPORT_SYMBOL(mntget); + +/* path_is_mountpoint() - Check if path is 
a mount in the current + * namespace. + * + * d_mountpoint() can only be used reliably to establish if a dentry is + * not mounted in any namespace and that common case is handled inline. + * d_mountpoint() isn't aware of the possibility there may be multiple + * mounts using a given dentry in a different namespace. This function + * checks if the passed in path is a mountpoint rather than the dentry + * alone. + */ +bool path_is_mountpoint(const struct path *path) +{ + unsigned seq; + bool res; + + if (!d_mountpoint(path->dentry)) + return false; + + rcu_read_lock(); + do { + seq = read_seqbegin(&mount_lock); + res = __path_is_mountpoint(path); + } while (read_seqretry(&mount_lock, seq)); + rcu_read_unlock(); + + return res; +} +EXPORT_SYMBOL(path_is_mountpoint); + +struct vfsmount *mnt_clone_internal(const struct path *path) +{ + struct mount *p; + p = clone_mnt(real_mount(path->mnt), path->dentry, CL_PRIVATE); + if (IS_ERR(p)) + return ERR_CAST(p); + p->mnt.mnt_flags |= MNT_INTERNAL; + return &p->mnt; +} + +#ifdef CONFIG_PROC_FS +static struct mount *mnt_list_next(struct mnt_namespace *ns, + struct list_head *p) +{ + struct mount *mnt, *ret = NULL; + + lock_ns_list(ns); + list_for_each_continue(p, &ns->list) { + mnt = list_entry(p, typeof(*mnt), mnt_list); + if (!mnt_is_cursor(mnt)) { + ret = mnt; + break; + } + } + unlock_ns_list(ns); + + return ret; +} + +/* iterator; we want it to have access to namespace_sem, thus here... */ +static void *m_start(struct seq_file *m, loff_t *pos) +{ + struct proc_mounts *p = m->private; + struct list_head *prev; + + down_read(&namespace_sem); + if (!*pos) { + prev = &p->ns->list; + } else { + prev = &p->cursor.mnt_list; + + /* Read after we'd reached the end? 
*/ + if (list_empty(prev)) + return NULL; + } + + return mnt_list_next(p->ns, prev); +} + +static void *m_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct proc_mounts *p = m->private; + struct mount *mnt = v; + + ++*pos; + return mnt_list_next(p->ns, &mnt->mnt_list); +} + +static void m_stop(struct seq_file *m, void *v) +{ + struct proc_mounts *p = m->private; + struct mount *mnt = v; + + lock_ns_list(p->ns); + if (mnt) + list_move_tail(&p->cursor.mnt_list, &mnt->mnt_list); + else + list_del_init(&p->cursor.mnt_list); + unlock_ns_list(p->ns); + up_read(&namespace_sem); +} + +static int m_show(struct seq_file *m, void *v) +{ + struct proc_mounts *p = m->private; + struct mount *r = v; + return p->show(m, &r->mnt); +} + +const struct seq_operations mounts_op = { + .start = m_start, + .next = m_next, + .stop = m_stop, + .show = m_show, +}; + +void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor) +{ + down_read(&namespace_sem); + lock_ns_list(ns); + list_del(&cursor->mnt_list); + unlock_ns_list(ns); + up_read(&namespace_sem); +} +#endif /* CONFIG_PROC_FS */ + +/** + * may_umount_tree - check if a mount tree is busy + * @mnt: root of mount tree + * + * This is called to check if a tree of mounts has any + * open files, pwds, chroots or sub mounts that are + * busy. + */ +int may_umount_tree(struct vfsmount *m) +{ + struct mount *mnt = real_mount(m); + int actual_refs = 0; + int minimum_refs = 0; + struct mount *p; + BUG_ON(!m); + + /* write lock needed for mnt_get_count */ + lock_mount_hash(); + for (p = mnt; p; p = next_mnt(p, mnt)) { + actual_refs += mnt_get_count(p); + minimum_refs += 2; + } + unlock_mount_hash(); + + if (actual_refs > minimum_refs) + return 0; + + return 1; +} + +EXPORT_SYMBOL(may_umount_tree); + +/** + * may_umount - check if a mount point is busy + * @mnt: root of mount + * + * This is called to check if a mount point has any + * open files, pwds, chroots or sub mounts. 
If the + * mount has sub mounts this will return busy + * regardless of whether the sub mounts are busy. + * + * Doesn't take quota and stuff into account. IOW, in some cases it will + * give false negatives. The main reason why it's here is that we need + * a non-destructive way to look for easily umountable filesystems. + */ +int may_umount(struct vfsmount *mnt) +{ + int ret = 1; + down_read(&namespace_sem); + lock_mount_hash(); + if (propagate_mount_busy(real_mount(mnt), 2)) + ret = 0; + unlock_mount_hash(); + up_read(&namespace_sem); + return ret; +} + +EXPORT_SYMBOL(may_umount); + +static void namespace_unlock(void) +{ + struct hlist_head head; + struct hlist_node *p; + struct mount *m; + LIST_HEAD(list); + + hlist_move_list(&unmounted, &head); + list_splice_init(&ex_mountpoints, &list); + + up_write(&namespace_sem); + + shrink_dentry_list(&list); + + if (likely(hlist_empty(&head))) + return; + + synchronize_rcu_expedited(); + + hlist_for_each_entry_safe(m, p, &head, mnt_umount) { + hlist_del(&m->mnt_umount); + mntput(&m->mnt); + } +} + +static inline void namespace_lock(void) +{ + down_write(&namespace_sem); +} + +enum umount_tree_flags { + UMOUNT_SYNC = 1, + UMOUNT_PROPAGATE = 2, + UMOUNT_CONNECTED = 4, +}; + +static bool disconnect_mount(struct mount *mnt, enum umount_tree_flags how) +{ + /* Leaving mounts connected is only valid for lazy umounts */ + if (how & UMOUNT_SYNC) + return true; + + /* A mount without a parent has nothing to be connected to */ + if (!mnt_has_parent(mnt)) + return true; + + /* Because the reference counting rules change when mounts are + * unmounted and connected, umounted mounts may not be + * connected to mounted mounts. + */ + if (!(mnt->mnt_parent->mnt.mnt_flags & MNT_UMOUNT)) + return true; + + /* Has it been requested that the mount remain connected? */ + if (how & UMOUNT_CONNECTED) + return false; + + /* Is the mount locked such that it needs to remain connected? 
*/ + if (IS_MNT_LOCKED(mnt)) + return false; + + /* By default disconnect the mount */ + return true; +} + +/* + * mount_lock must be held + * namespace_sem must be held for write + */ +static void umount_tree(struct mount *mnt, enum umount_tree_flags how) +{ + LIST_HEAD(tmp_list); + struct mount *p; + + if (how & UMOUNT_PROPAGATE) + propagate_mount_unlock(mnt); + + /* Gather the mounts to umount */ + for (p = mnt; p; p = next_mnt(p, mnt)) { + p->mnt.mnt_flags |= MNT_UMOUNT; + list_move(&p->mnt_list, &tmp_list); + } + + /* Hide the mounts from mnt_mounts */ + list_for_each_entry(p, &tmp_list, mnt_list) { + list_del_init(&p->mnt_child); + } + + /* Add propogated mounts to the tmp_list */ + if (how & UMOUNT_PROPAGATE) + propagate_umount(&tmp_list); + + while (!list_empty(&tmp_list)) { + struct mnt_namespace *ns; + bool disconnect; + p = list_first_entry(&tmp_list, struct mount, mnt_list); + list_del_init(&p->mnt_expire); + list_del_init(&p->mnt_list); + ns = p->mnt_ns; + if (ns) { + ns->mounts--; + __touch_mnt_namespace(ns); + } + p->mnt_ns = NULL; + if (how & UMOUNT_SYNC) + p->mnt.mnt_flags |= MNT_SYNC_UMOUNT; + + disconnect = disconnect_mount(p, how); + if (mnt_has_parent(p)) { + mnt_add_count(p->mnt_parent, -1); + if (!disconnect) { + /* Don't forget about p */ + list_add_tail(&p->mnt_child, &p->mnt_parent->mnt_mounts); + } else { + umount_mnt(p); + } + } + change_mnt_propagation(p, MS_PRIVATE); + if (disconnect) + hlist_add_head(&p->mnt_umount, &unmounted); + } +} + +static void shrink_submounts(struct mount *mnt); + +static int do_umount_root(struct super_block *sb) +{ + int ret = 0; + + down_write(&sb->s_umount); + if (!sb_rdonly(sb)) { + struct fs_context *fc; + + fc = fs_context_for_reconfigure(sb->s_root, SB_RDONLY, + SB_RDONLY); + if (IS_ERR(fc)) { + ret = PTR_ERR(fc); + } else { + ret = parse_monolithic_mount_data(fc, NULL); + if (!ret) + ret = reconfigure_super(fc); + put_fs_context(fc); + } + } + up_write(&sb->s_umount); + return ret; +} + +static int 
do_umount(struct mount *mnt, int flags) +{ + struct super_block *sb = mnt->mnt.mnt_sb; + int retval; + + retval = security_sb_umount(&mnt->mnt, flags); + if (retval) + return retval; + + /* + * Allow userspace to request a mountpoint be expired rather than + * unmounting unconditionally. Unmount only happens if: + * (1) the mark is already set (the mark is cleared by mntput()) + * (2) the usage count == 1 [parent vfsmount] + 1 [sys_umount] + */ + if (flags & MNT_EXPIRE) { + if (&mnt->mnt == current->fs->root.mnt || + flags & (MNT_FORCE | MNT_DETACH)) + return -EINVAL; + + /* + * probably don't strictly need the lock here if we examined + * all race cases, but it's a slowpath. + */ + lock_mount_hash(); + if (mnt_get_count(mnt) != 2) { + unlock_mount_hash(); + return -EBUSY; + } + unlock_mount_hash(); + + if (!xchg(&mnt->mnt_expiry_mark, 1)) + return -EAGAIN; + } + + /* + * If we may have to abort operations to get out of this + * mount, and they will themselves hold resources we must + * allow the fs to do things. In the Unix tradition of + * 'Gee thats tricky lets do it in userspace' the umount_begin + * might fail to complete on the first run through as other tasks + * must return, and the like. Thats for the mount program to worry + * about for the moment. + */ + + if (flags & MNT_FORCE && sb->s_op->umount_begin) { + sb->s_op->umount_begin(sb); + } + + /* + * No sense to grab the lock for this test, but test itself looks + * somewhat bogus. Suggestions for better replacement? + * Ho-hum... In principle, we might treat that as umount + switch + * to rootfs. GC would eventually take care of the old vfsmount. + * Actually it makes sense, especially if rootfs would contain a + * /reboot - static binary that would close all descriptors and + * call reboot(9). Then init(8) could umount root and exec /reboot. + */ + if (&mnt->mnt == current->fs->root.mnt && !(flags & MNT_DETACH)) { + /* + * Special case for "unmounting" root ... + * we just try to remount it readonly. 
+ */ + if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) + return -EPERM; + return do_umount_root(sb); + } + + namespace_lock(); + lock_mount_hash(); + + /* Recheck MNT_LOCKED with the locks held */ + retval = -EINVAL; + if (mnt->mnt.mnt_flags & MNT_LOCKED) + goto out; + + event++; + if (flags & MNT_DETACH) { + if (!list_empty(&mnt->mnt_list)) + umount_tree(mnt, UMOUNT_PROPAGATE); + retval = 0; + } else { + shrink_submounts(mnt); + retval = -EBUSY; + if (!propagate_mount_busy(mnt, 2)) { + if (!list_empty(&mnt->mnt_list)) + umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC); + retval = 0; + } + } +out: + unlock_mount_hash(); + namespace_unlock(); + return retval; +} + +/* + * __detach_mounts - lazily unmount all mounts on the specified dentry + * + * During unlink, rmdir, and d_drop it is possible to loose the path + * to an existing mountpoint, and wind up leaking the mount. + * detach_mounts allows lazily unmounting those mounts instead of + * leaking them. + * + * The caller may hold dentry->d_inode->i_mutex. + */ +void __detach_mounts(struct dentry *dentry) +{ + struct mountpoint *mp; + struct mount *mnt; + + namespace_lock(); + lock_mount_hash(); + mp = lookup_mountpoint(dentry); + if (!mp) + goto out_unlock; + + event++; + while (!hlist_empty(&mp->m_list)) { + mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list); + if (mnt->mnt.mnt_flags & MNT_UMOUNT) { + umount_mnt(mnt); + hlist_add_head(&mnt->mnt_umount, &unmounted); + } + else umount_tree(mnt, UMOUNT_CONNECTED); + } + put_mountpoint(mp); +out_unlock: + unlock_mount_hash(); + namespace_unlock(); +} + +/* + * Is the caller allowed to modify his namespace? 
+ */ +static inline bool may_mount(void) +{ + return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN); +} + +#ifdef CONFIG_MANDATORY_FILE_LOCKING +static bool may_mandlock(void) +{ + pr_warn_once("======================================================\n" + "WARNING: the mand mount option is being deprecated and\n" + " will be removed in v5.15!\n" + "======================================================\n"); + return capable(CAP_SYS_ADMIN); +} +#else +static inline bool may_mandlock(void) +{ + pr_warn("VFS: \"mand\" mount option not supported"); + return false; +} +#endif + +static int can_umount(const struct path *path, int flags) +{ + struct mount *mnt = real_mount(path->mnt); + + if (!may_mount()) + return -EPERM; + if (path->dentry != path->mnt->mnt_root) + return -EINVAL; + if (!check_mnt(mnt)) + return -EINVAL; + if (mnt->mnt.mnt_flags & MNT_LOCKED) /* Check optimistically */ + return -EINVAL; + if (flags & MNT_FORCE && !capable(CAP_SYS_ADMIN)) + return -EPERM; + return 0; +} + +// caller is responsible for flags being sane +int path_umount(struct path *path, int flags) +{ + struct mount *mnt = real_mount(path->mnt); + int ret; + + ret = can_umount(path, flags); + if (!ret) + ret = do_umount(mnt, flags); + + /* we mustn't call path_put() as that would clear mnt_expiry_mark */ + dput(path->dentry); + mntput_no_expire(mnt); + return ret; +} + +static int ksys_umount(char __user *name, int flags) +{ + int lookup_flags = LOOKUP_MOUNTPOINT; + struct path path; + int ret; + + // basic validity checks done first + if (flags & ~(MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW)) + return -EINVAL; + + if (!(flags & UMOUNT_NOFOLLOW)) + lookup_flags |= LOOKUP_FOLLOW; + ret = user_path_at(AT_FDCWD, name, lookup_flags, &path); + if (ret) + return ret; + return path_umount(&path, flags); +} + +SYSCALL_DEFINE2(umount, char __user *, name, int, flags) +{ + return ksys_umount(name, flags); +} + +#ifdef __ARCH_WANT_SYS_OLDUMOUNT + +/* + * The 2.0 compatible 
umount. No flags. + */ +SYSCALL_DEFINE1(oldumount, char __user *, name) +{ + return ksys_umount(name, 0); +} + +#endif + +static bool is_mnt_ns_file(struct dentry *dentry) +{ + /* Is this a proxy for a mount namespace? */ + return dentry->d_op == &ns_dentry_operations && + dentry->d_fsdata == &mntns_operations; +} + +static struct mnt_namespace *to_mnt_ns(struct ns_common *ns) +{ + return container_of(ns, struct mnt_namespace, ns); +} + +struct ns_common *from_mnt_ns(struct mnt_namespace *mnt) +{ + return &mnt->ns; +} + +static bool mnt_ns_loop(struct dentry *dentry) +{ + /* Could bind mounting the mount namespace inode cause a + * mount namespace loop? + */ + struct mnt_namespace *mnt_ns; + if (!is_mnt_ns_file(dentry)) + return false; + + mnt_ns = to_mnt_ns(get_proc_ns(dentry->d_inode)); + return current->nsproxy->mnt_ns->seq >= mnt_ns->seq; +} + +struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, + int flag) +{ + struct mount *res, *p, *q, *r, *parent; + + if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(mnt)) + return ERR_PTR(-EINVAL); + + if (!(flag & CL_COPY_MNT_NS_FILE) && is_mnt_ns_file(dentry)) + return ERR_PTR(-EINVAL); + + res = q = clone_mnt(mnt, dentry, flag); + if (IS_ERR(q)) + return q; + + q->mnt_mountpoint = mnt->mnt_mountpoint; + + p = mnt; + list_for_each_entry(r, &mnt->mnt_mounts, mnt_child) { + struct mount *s; + if (!is_subdir(r->mnt_mountpoint, dentry)) + continue; + + for (s = r; s; s = next_mnt(s, r)) { + if (!(flag & CL_COPY_UNBINDABLE) && + IS_MNT_UNBINDABLE(s)) { + if (s->mnt.mnt_flags & MNT_LOCKED) { + /* Both unbindable and locked. 
*/ + q = ERR_PTR(-EPERM); + goto out; + } else { + s = skip_mnt_tree(s); + continue; + } + } + if (!(flag & CL_COPY_MNT_NS_FILE) && + is_mnt_ns_file(s->mnt.mnt_root)) { + s = skip_mnt_tree(s); + continue; + } + while (p != s->mnt_parent) { + p = p->mnt_parent; + q = q->mnt_parent; + } + p = s; + parent = q; + q = clone_mnt(p, p->mnt.mnt_root, flag); + if (IS_ERR(q)) + goto out; + lock_mount_hash(); + list_add_tail(&q->mnt_list, &res->mnt_list); + attach_mnt(q, parent, p->mnt_mp); + unlock_mount_hash(); + } + } + return res; +out: + if (res) { + lock_mount_hash(); + umount_tree(res, UMOUNT_SYNC); + unlock_mount_hash(); + } + return q; +} + +/* Caller should check returned pointer for errors */ + +struct vfsmount *collect_mounts(const struct path *path) +{ + struct mount *tree; + namespace_lock(); + if (!check_mnt(real_mount(path->mnt))) + tree = ERR_PTR(-EINVAL); + else + tree = copy_tree(real_mount(path->mnt), path->dentry, + CL_COPY_ALL | CL_PRIVATE); + namespace_unlock(); + if (IS_ERR(tree)) + return ERR_CAST(tree); + return &tree->mnt; +} + +static void free_mnt_ns(struct mnt_namespace *); +static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *, bool); + +void dissolve_on_fput(struct vfsmount *mnt) +{ + struct mnt_namespace *ns; + namespace_lock(); + lock_mount_hash(); + ns = real_mount(mnt)->mnt_ns; + if (ns) { + if (is_anon_ns(ns)) + umount_tree(real_mount(mnt), UMOUNT_CONNECTED); + else + ns = NULL; + } + unlock_mount_hash(); + namespace_unlock(); + if (ns) + free_mnt_ns(ns); +} + +void drop_collected_mounts(struct vfsmount *mnt) +{ + namespace_lock(); + lock_mount_hash(); + umount_tree(real_mount(mnt), 0); + unlock_mount_hash(); + namespace_unlock(); +} + +static bool has_locked_children(struct mount *mnt, struct dentry *dentry) +{ + struct mount *child; + + list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) { + if (!is_subdir(child->mnt_mountpoint, dentry)) + continue; + + if (child->mnt.mnt_flags & MNT_LOCKED) + return true; + } + return 
false; +} + +/** + * clone_private_mount - create a private clone of a path + * + * This creates a new vfsmount, which will be the clone of @path. The new will + * not be attached anywhere in the namespace and will be private (i.e. changes + * to the originating mount won't be propagated into this). + * + * Release with mntput(). + */ +struct vfsmount *clone_private_mount(const struct path *path) +{ + struct mount *old_mnt = real_mount(path->mnt); + struct mount *new_mnt; + + down_read(&namespace_sem); + if (IS_MNT_UNBINDABLE(old_mnt)) + goto invalid; + + if (!check_mnt(old_mnt)) + goto invalid; + + if (has_locked_children(old_mnt, path->dentry)) + goto invalid; + + new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE); + up_read(&namespace_sem); + + if (IS_ERR(new_mnt)) + return ERR_CAST(new_mnt); + + /* Longterm mount to be removed by kern_unmount*() */ + new_mnt->mnt_ns = MNT_NS_INTERNAL; + + return &new_mnt->mnt; + +invalid: + up_read(&namespace_sem); + return ERR_PTR(-EINVAL); +} +EXPORT_SYMBOL_GPL(clone_private_mount); + +int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg, + struct vfsmount *root) +{ + struct mount *mnt; + int res = f(root, arg); + if (res) + return res; + list_for_each_entry(mnt, &real_mount(root)->mnt_list, mnt_list) { + res = f(&mnt->mnt, arg); + if (res) + return res; + } + return 0; +} + +static void lock_mnt_tree(struct mount *mnt) +{ + struct mount *p; + + for (p = mnt; p; p = next_mnt(p, mnt)) { + int flags = p->mnt.mnt_flags; + /* Don't allow unprivileged users to change mount flags */ + flags |= MNT_LOCK_ATIME; + + if (flags & MNT_READONLY) + flags |= MNT_LOCK_READONLY; + + if (flags & MNT_NODEV) + flags |= MNT_LOCK_NODEV; + + if (flags & MNT_NOSUID) + flags |= MNT_LOCK_NOSUID; + + if (flags & MNT_NOEXEC) + flags |= MNT_LOCK_NOEXEC; + /* Don't allow unprivileged users to reveal what is under a mount */ + if (list_empty(&p->mnt_expire)) + flags |= MNT_LOCKED; + p->mnt.mnt_flags = flags; + } +} + +static void 
cleanup_group_ids(struct mount *mnt, struct mount *end) +{ + struct mount *p; + + for (p = mnt; p != end; p = next_mnt(p, mnt)) { + if (p->mnt_group_id && !IS_MNT_SHARED(p)) + mnt_release_group_id(p); + } +} + +static int invent_group_ids(struct mount *mnt, bool recurse) +{ + struct mount *p; + + for (p = mnt; p; p = recurse ? next_mnt(p, mnt) : NULL) { + if (!p->mnt_group_id && !IS_MNT_SHARED(p)) { + int err = mnt_alloc_group_id(p); + if (err) { + cleanup_group_ids(mnt, p); + return err; + } + } + } + + return 0; +} + +int count_mounts(struct mnt_namespace *ns, struct mount *mnt) +{ + unsigned int max = READ_ONCE(sysctl_mount_max); + unsigned int mounts = 0, old, pending, sum; + struct mount *p; + + for (p = mnt; p; p = next_mnt(p, mnt)) + mounts++; + + old = ns->mounts; + pending = ns->pending_mounts; + sum = old + pending; + if ((old > sum) || + (pending > sum) || + (max < sum) || + (mounts > (max - sum))) + return -ENOSPC; + + ns->pending_mounts = pending + mounts; + return 0; +} + +/* + * @source_mnt : mount tree to be attached + * @nd : place the mount tree @source_mnt is attached + * @parent_nd : if non-null, detach the source_mnt from its parent and + * store the parent mount and mountpoint dentry. + * (done when source_mnt is moved) + * + * NOTE: in the table below explains the semantics when a source mount + * of a given type is attached to a destination mount of a given type. 
+ * --------------------------------------------------------------------------- + * | BIND MOUNT OPERATION | + * |************************************************************************** + * | source-->| shared | private | slave | unbindable | + * | dest | | | | | + * | | | | | | | + * | v | | | | | + * |************************************************************************** + * | shared | shared (++) | shared (+) | shared(+++)| invalid | + * | | | | | | + * |non-shared| shared (+) | private | slave (*) | invalid | + * *************************************************************************** + * A bind operation clones the source mount and mounts the clone on the + * destination mount. + * + * (++) the cloned mount is propagated to all the mounts in the propagation + * tree of the destination mount and the cloned mount is added to + * the peer group of the source mount. + * (+) the cloned mount is created under the destination mount and is marked + * as shared. The cloned mount is added to the peer group of the source + * mount. + * (+++) the mount is propagated to all the mounts in the propagation tree + * of the destination mount and the cloned mount is made slave + * of the same master as that of the source mount. The cloned mount + * is marked as 'shared and slave'. + * (*) the cloned mount is made a slave of the same master as that of the + * source mount. 
+ * + * --------------------------------------------------------------------------- + * | MOVE MOUNT OPERATION | + * |************************************************************************** + * | source-->| shared | private | slave | unbindable | + * | dest | | | | | + * | | | | | | | + * | v | | | | | + * |************************************************************************** + * | shared | shared (+) | shared (+) | shared(+++) | invalid | + * | | | | | | + * |non-shared| shared (+*) | private | slave (*) | unbindable | + * *************************************************************************** + * + * (+) the mount is moved to the destination. And is then propagated to + * all the mounts in the propagation tree of the destination mount. + * (+*) the mount is moved to the destination. + * (+++) the mount is moved to the destination and is then propagated to + * all the mounts belonging to the destination mount's propagation tree. + * the mount is marked as 'shared and slave'. + * (*) the mount continues to be a slave at the new location. + * + * if the source mount is a tree, the operations explained above is + * applied to each mount in the tree. + * Must be called without spinlocks held, since this function can sleep + * in allocations. + */ +static int attach_recursive_mnt(struct mount *source_mnt, + struct mount *dest_mnt, + struct mountpoint *dest_mp, + bool moving) +{ + struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns; + HLIST_HEAD(tree_list); + struct mnt_namespace *ns = dest_mnt->mnt_ns; + struct mountpoint *smp; + struct mount *child, *p; + struct hlist_node *n; + int err; + + /* Preallocate a mountpoint in case the new mounts need + * to be tucked under other mounts. + */ + smp = get_mountpoint(source_mnt->mnt.mnt_root); + if (IS_ERR(smp)) + return PTR_ERR(smp); + + /* Is there space to add these mounts to the mount namespace? 
*/ + if (!moving) { + err = count_mounts(ns, source_mnt); + if (err) + goto out; + } + + if (IS_MNT_SHARED(dest_mnt)) { + err = invent_group_ids(source_mnt, true); + if (err) + goto out; + err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list); + lock_mount_hash(); + if (err) + goto out_cleanup_ids; + for (p = source_mnt; p; p = next_mnt(p, source_mnt)) + set_mnt_shared(p); + } else { + lock_mount_hash(); + } + if (moving) { + unhash_mnt(source_mnt); + attach_mnt(source_mnt, dest_mnt, dest_mp); + touch_mnt_namespace(source_mnt->mnt_ns); + } else { + if (source_mnt->mnt_ns) { + /* move from anon - the caller will destroy */ + list_del_init(&source_mnt->mnt_ns->list); + } + mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt); + commit_tree(source_mnt); + } + + hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) { + struct mount *q; + hlist_del_init(&child->mnt_hash); + q = __lookup_mnt(&child->mnt_parent->mnt, + child->mnt_mountpoint); + if (q) + mnt_change_mountpoint(child, smp, q); + /* Notice when we are propagating across user namespaces */ + if (child->mnt_parent->mnt_ns->user_ns != user_ns) + lock_mnt_tree(child); + child->mnt.mnt_flags &= ~MNT_LOCKED; + commit_tree(child); + } + put_mountpoint(smp); + unlock_mount_hash(); + + return 0; + + out_cleanup_ids: + while (!hlist_empty(&tree_list)) { + child = hlist_entry(tree_list.first, struct mount, mnt_hash); + child->mnt_parent->mnt_ns->pending_mounts = 0; + umount_tree(child, UMOUNT_SYNC); + } + unlock_mount_hash(); + cleanup_group_ids(source_mnt, NULL); + out: + ns->pending_mounts = 0; + + read_seqlock_excl(&mount_lock); + put_mountpoint(smp); + read_sequnlock_excl(&mount_lock); + + return err; +} + +static struct mountpoint *lock_mount(struct path *path) +{ + struct vfsmount *mnt; + struct dentry *dentry = path->dentry; +retry: + inode_lock(dentry->d_inode); + if (unlikely(cant_mount(dentry))) { + inode_unlock(dentry->d_inode); + return ERR_PTR(-ENOENT); + } + namespace_lock(); + mnt = 
lookup_mnt(path); + if (likely(!mnt)) { + struct mountpoint *mp = get_mountpoint(dentry); + if (IS_ERR(mp)) { + namespace_unlock(); + inode_unlock(dentry->d_inode); + return mp; + } + return mp; + } + namespace_unlock(); + inode_unlock(path->dentry->d_inode); + path_put(path); + path->mnt = mnt; + dentry = path->dentry = dget(mnt->mnt_root); + goto retry; +} + +static void unlock_mount(struct mountpoint *where) +{ + struct dentry *dentry = where->m_dentry; + + read_seqlock_excl(&mount_lock); + put_mountpoint(where); + read_sequnlock_excl(&mount_lock); + + namespace_unlock(); + inode_unlock(dentry->d_inode); +} + +static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp) +{ + if (mnt->mnt.mnt_sb->s_flags & SB_NOUSER) + return -EINVAL; + + if (d_is_dir(mp->m_dentry) != + d_is_dir(mnt->mnt.mnt_root)) + return -ENOTDIR; + + return attach_recursive_mnt(mnt, p, mp, false); +} + +/* + * Sanity check the flags to change_mnt_propagation. + */ + +static int flags_to_propagation_type(int ms_flags) +{ + int type = ms_flags & ~(MS_REC | MS_SILENT); + + /* Fail if any non-propagation flags are set */ + if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) + return 0; + /* Only one propagation flag should be set */ + if (!is_power_of_2(type)) + return 0; + return type; +} + +/* + * recursively change the type of the mountpoint. + */ +static int do_change_type(struct path *path, int ms_flags) +{ + struct mount *m; + struct mount *mnt = real_mount(path->mnt); + int recurse = ms_flags & MS_REC; + int type; + int err = 0; + + if (path->dentry != path->mnt->mnt_root) + return -EINVAL; + + type = flags_to_propagation_type(ms_flags); + if (!type) + return -EINVAL; + + namespace_lock(); + if (type == MS_SHARED) { + err = invent_group_ids(mnt, recurse); + if (err) + goto out_unlock; + } + + lock_mount_hash(); + for (m = mnt; m; m = (recurse ? 
next_mnt(m, mnt) : NULL)) + change_mnt_propagation(m, type); + unlock_mount_hash(); + + out_unlock: + namespace_unlock(); + return err; +} + +static struct mount *__do_loopback(struct path *old_path, int recurse) +{ + struct mount *mnt = ERR_PTR(-EINVAL), *old = real_mount(old_path->mnt); + + if (IS_MNT_UNBINDABLE(old)) + return mnt; + + if (!check_mnt(old) && old_path->dentry->d_op != &ns_dentry_operations) + return mnt; + + if (!recurse && has_locked_children(old, old_path->dentry)) + return mnt; + + if (recurse) + mnt = copy_tree(old, old_path->dentry, CL_COPY_MNT_NS_FILE); + else + mnt = clone_mnt(old, old_path->dentry, 0); + + if (!IS_ERR(mnt)) + mnt->mnt.mnt_flags &= ~MNT_LOCKED; + + return mnt; +} + +/* + * do loopback mount. + */ +static int do_loopback(struct path *path, const char *old_name, + int recurse) +{ + struct path old_path; + struct mount *mnt = NULL, *parent; + struct mountpoint *mp; + int err; + if (!old_name || !*old_name) + return -EINVAL; + err = kern_path(old_name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &old_path); + if (err) + return err; + + err = -EINVAL; + if (mnt_ns_loop(old_path.dentry)) + goto out; + + mp = lock_mount(path); + if (IS_ERR(mp)) { + err = PTR_ERR(mp); + goto out; + } + + parent = real_mount(path->mnt); + if (!check_mnt(parent)) + goto out2; + + mnt = __do_loopback(&old_path, recurse); + if (IS_ERR(mnt)) { + err = PTR_ERR(mnt); + goto out2; + } + + err = graft_tree(mnt, parent, mp); + if (err) { + lock_mount_hash(); + umount_tree(mnt, UMOUNT_SYNC); + unlock_mount_hash(); + } +out2: + unlock_mount(mp); +out: + path_put(&old_path); + return err; +} + +static struct file *open_detached_copy(struct path *path, bool recursive) +{ + struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns; + struct mnt_namespace *ns = alloc_mnt_ns(user_ns, true); + struct mount *mnt, *p; + struct file *file; + + if (IS_ERR(ns)) + return ERR_CAST(ns); + + namespace_lock(); + mnt = __do_loopback(path, recursive); + if (IS_ERR(mnt)) { + 
namespace_unlock(); + free_mnt_ns(ns); + return ERR_CAST(mnt); + } + + lock_mount_hash(); + for (p = mnt; p; p = next_mnt(p, mnt)) { + p->mnt_ns = ns; + ns->mounts++; + } + ns->root = mnt; + list_add_tail(&ns->list, &mnt->mnt_list); + mntget(&mnt->mnt); + unlock_mount_hash(); + namespace_unlock(); + + mntput(path->mnt); + path->mnt = &mnt->mnt; + file = dentry_open(path, O_PATH, current_cred()); + if (IS_ERR(file)) + dissolve_on_fput(path->mnt); + else + file->f_mode |= FMODE_NEED_UNMOUNT; + return file; +} + +SYSCALL_DEFINE3(open_tree, int, dfd, const char __user *, filename, unsigned, flags) +{ + struct file *file; + struct path path; + int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW; + bool detached = flags & OPEN_TREE_CLONE; + int error; + int fd; + + BUILD_BUG_ON(OPEN_TREE_CLOEXEC != O_CLOEXEC); + + if (flags & ~(AT_EMPTY_PATH | AT_NO_AUTOMOUNT | AT_RECURSIVE | + AT_SYMLINK_NOFOLLOW | OPEN_TREE_CLONE | + OPEN_TREE_CLOEXEC)) + return -EINVAL; + + if ((flags & (AT_RECURSIVE | OPEN_TREE_CLONE)) == AT_RECURSIVE) + return -EINVAL; + + if (flags & AT_NO_AUTOMOUNT) + lookup_flags &= ~LOOKUP_AUTOMOUNT; + if (flags & AT_SYMLINK_NOFOLLOW) + lookup_flags &= ~LOOKUP_FOLLOW; + if (flags & AT_EMPTY_PATH) + lookup_flags |= LOOKUP_EMPTY; + + if (detached && !may_mount()) + return -EPERM; + + fd = get_unused_fd_flags(flags & O_CLOEXEC); + if (fd < 0) + return fd; + + error = user_path_at(dfd, filename, lookup_flags, &path); + if (unlikely(error)) { + file = ERR_PTR(error); + } else { + if (detached) + file = open_detached_copy(&path, flags & AT_RECURSIVE); + else + file = dentry_open(&path, O_PATH, current_cred()); + path_put(&path); + } + if (IS_ERR(file)) { + put_unused_fd(fd); + return PTR_ERR(file); + } + fd_install(fd, file); + return fd; +} + +/* + * Don't allow locked mount flags to be cleared. + * + * No locks need to be held here while testing the various MNT_LOCK + * flags because those flags can never be cleared once they are set. 
+ */ +static bool can_change_locked_flags(struct mount *mnt, unsigned int mnt_flags) +{ + unsigned int fl = mnt->mnt.mnt_flags; + + if ((fl & MNT_LOCK_READONLY) && + !(mnt_flags & MNT_READONLY)) + return false; + + if ((fl & MNT_LOCK_NODEV) && + !(mnt_flags & MNT_NODEV)) + return false; + + if ((fl & MNT_LOCK_NOSUID) && + !(mnt_flags & MNT_NOSUID)) + return false; + + if ((fl & MNT_LOCK_NOEXEC) && + !(mnt_flags & MNT_NOEXEC)) + return false; + + if ((fl & MNT_LOCK_ATIME) && + ((fl & MNT_ATIME_MASK) != (mnt_flags & MNT_ATIME_MASK))) + return false; + + return true; +} + +static int change_mount_ro_state(struct mount *mnt, unsigned int mnt_flags) +{ + bool readonly_request = (mnt_flags & MNT_READONLY); + + if (readonly_request == __mnt_is_readonly(&mnt->mnt)) + return 0; + + if (readonly_request) + return mnt_make_readonly(mnt); + + return __mnt_unmake_readonly(mnt); +} + +/* + * Update the user-settable attributes on a mount. The caller must hold + * sb->s_umount for writing. + */ +static void set_mount_attributes(struct mount *mnt, unsigned int mnt_flags) +{ + lock_mount_hash(); + mnt_flags |= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK; + mnt->mnt.mnt_flags = mnt_flags; + touch_mnt_namespace(mnt->mnt_ns); + unlock_mount_hash(); +} + +static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount *mnt) +{ + struct super_block *sb = mnt->mnt_sb; + + if (!__mnt_is_readonly(mnt) && + (!(sb->s_iflags & SB_I_TS_EXPIRY_WARNED)) && + (ktime_get_real_seconds() + TIME_UPTIME_SEC_MAX > sb->s_time_max)) { + char *buf, *mntpath; + + buf = (char *)__get_free_page(GFP_KERNEL); + if (buf) + mntpath = d_path(mountpoint, buf, PAGE_SIZE); + else + mntpath = ERR_PTR(-ENOMEM); + if (IS_ERR(mntpath)) + mntpath = "(unknown)"; + + pr_warn("%s filesystem being %s at %s supports timestamps until %ptTd (0x%llx)\n", + sb->s_type->name, + is_mounted(mnt) ? 
"remounted" : "mounted", + mntpath, &sb->s_time_max, + (unsigned long long)sb->s_time_max); + + sb->s_iflags |= SB_I_TS_EXPIRY_WARNED; + if (buf) + free_page((unsigned long)buf); + } +} + +/* + * Handle reconfiguration of the mountpoint only without alteration of the + * superblock it refers to. This is triggered by specifying MS_REMOUNT|MS_BIND + * to mount(2). + */ +static int do_reconfigure_mnt(struct path *path, unsigned int mnt_flags) +{ + struct super_block *sb = path->mnt->mnt_sb; + struct mount *mnt = real_mount(path->mnt); + int ret; + + if (!check_mnt(mnt)) + return -EINVAL; + + if (path->dentry != mnt->mnt.mnt_root) + return -EINVAL; + + if (!can_change_locked_flags(mnt, mnt_flags)) + return -EPERM; + + down_write(&sb->s_umount); + ret = change_mount_ro_state(mnt, mnt_flags); + if (ret == 0) + set_mount_attributes(mnt, mnt_flags); + up_write(&sb->s_umount); + + mnt_warn_timestamp_expiry(path, &mnt->mnt); + + return ret; +} + +/* + * change filesystem flags. dir should be a physical root of filesystem. + * If you've mounted a non-root directory somewhere and want to do remount + * on it - tough luck. + */ +static int do_remount(struct path *path, int ms_flags, int sb_flags, + int mnt_flags, void *data) +{ + int err; + struct super_block *sb = path->mnt->mnt_sb; + struct mount *mnt = real_mount(path->mnt); + struct fs_context *fc; + + if (!check_mnt(mnt)) + return -EINVAL; + + if (path->dentry != path->mnt->mnt_root) + return -EINVAL; + + if (!can_change_locked_flags(mnt, mnt_flags)) + return -EPERM; + + fc = fs_context_for_reconfigure(path->dentry, sb_flags, MS_RMT_MASK); + if (IS_ERR(fc)) + return PTR_ERR(fc); + + /* + * Indicate to the filesystem that the remount request is coming + * from the legacy mount system call. 
+ */ + fc->oldapi = true; + + err = parse_monolithic_mount_data(fc, data); + if (!err) { + down_write(&sb->s_umount); + err = -EPERM; + if (ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) { + err = reconfigure_super(fc); + if (!err) + set_mount_attributes(mnt, mnt_flags); + } + up_write(&sb->s_umount); + } + + mnt_warn_timestamp_expiry(path, &mnt->mnt); + + put_fs_context(fc); + return err; +} + +static inline int tree_contains_unbindable(struct mount *mnt) +{ + struct mount *p; + for (p = mnt; p; p = next_mnt(p, mnt)) { + if (IS_MNT_UNBINDABLE(p)) + return 1; + } + return 0; +} + +/* + * Check that there aren't references to earlier/same mount namespaces in the + * specified subtree. Such references can act as pins for mount namespaces + * that aren't checked by the mount-cycle checking code, thereby allowing + * cycles to be made. + */ +static bool check_for_nsfs_mounts(struct mount *subtree) +{ + struct mount *p; + bool ret = false; + + lock_mount_hash(); + for (p = subtree; p; p = next_mnt(p, subtree)) + if (mnt_ns_loop(p->mnt.mnt_root)) + goto out; + + ret = true; +out: + unlock_mount_hash(); + return ret; +} + +static int do_move_mount(struct path *old_path, struct path *new_path) +{ + struct mnt_namespace *ns; + struct mount *p; + struct mount *old; + struct mount *parent; + struct mountpoint *mp, *old_mp; + int err; + bool attached; + + mp = lock_mount(new_path); + if (IS_ERR(mp)) + return PTR_ERR(mp); + + old = real_mount(old_path->mnt); + p = real_mount(new_path->mnt); + parent = old->mnt_parent; + attached = mnt_has_parent(old); + old_mp = old->mnt_mp; + ns = old->mnt_ns; + + err = -EINVAL; + /* The mountpoint must be in our namespace. */ + if (!check_mnt(p)) + goto out; + + /* The thing moved must be mounted... */ + if (!is_mounted(&old->mnt)) + goto out; + + /* ... and either ours or the root of anon namespace */ + if (!(attached ? 
check_mnt(old) : is_anon_ns(ns))) + goto out; + + if (old->mnt.mnt_flags & MNT_LOCKED) + goto out; + + if (old_path->dentry != old_path->mnt->mnt_root) + goto out; + + if (d_is_dir(new_path->dentry) != + d_is_dir(old_path->dentry)) + goto out; + /* + * Don't move a mount residing in a shared parent. + */ + if (attached && IS_MNT_SHARED(parent)) + goto out; + /* + * Don't move a mount tree containing unbindable mounts to a destination + * mount which is shared. + */ + if (IS_MNT_SHARED(p) && tree_contains_unbindable(old)) + goto out; + err = -ELOOP; + if (!check_for_nsfs_mounts(old)) + goto out; + for (; mnt_has_parent(p); p = p->mnt_parent) + if (p == old) + goto out; + + err = attach_recursive_mnt(old, real_mount(new_path->mnt), mp, + attached); + if (err) + goto out; + + /* if the mount is moved, it should no longer be expire + * automatically */ + list_del_init(&old->mnt_expire); + if (attached) + put_mountpoint(old_mp); +out: + unlock_mount(mp); + if (!err) { + if (attached) + mntput_no_expire(parent); + else + free_mnt_ns(ns); + } + return err; +} + +static int do_move_mount_old(struct path *path, const char *old_name) +{ + struct path old_path; + int err; + + if (!old_name || !*old_name) + return -EINVAL; + + err = kern_path(old_name, LOOKUP_FOLLOW, &old_path); + if (err) + return err; + + err = do_move_mount(&old_path, path); + path_put(&old_path); + return err; +} + +/* + * add a mount into a namespace's mount tree + */ +static int do_add_mount(struct mount *newmnt, struct mountpoint *mp, + struct path *path, int mnt_flags) +{ + struct mount *parent = real_mount(path->mnt); + + mnt_flags &= ~MNT_INTERNAL_FLAGS; + + if (unlikely(!check_mnt(parent))) { + /* that's acceptable only for automounts done in private ns */ + if (!(mnt_flags & MNT_SHRINKABLE)) + return -EINVAL; + /* ... 
and for those we'd better have mountpoint still alive */ + if (!parent->mnt_ns) + return -EINVAL; + } + + /* Refuse the same filesystem on the same mount point */ + if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb && + path->mnt->mnt_root == path->dentry) + return -EBUSY; + + if (d_is_symlink(newmnt->mnt.mnt_root)) + return -EINVAL; + + newmnt->mnt.mnt_flags = mnt_flags; + return graft_tree(newmnt, parent, mp); +} + +static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags); + +/* + * Create a new mount using a superblock configuration and request it + * be added to the namespace tree. + */ +static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint, + unsigned int mnt_flags) +{ + struct vfsmount *mnt; + struct mountpoint *mp; + struct super_block *sb = fc->root->d_sb; + int error; + + error = security_sb_kern_mount(sb); + if (!error && mount_too_revealing(sb, &mnt_flags)) + error = -EPERM; + + if (unlikely(error)) { + fc_drop_locked(fc); + return error; + } + + up_write(&sb->s_umount); + + mnt = vfs_create_mount(fc); + if (IS_ERR(mnt)) + return PTR_ERR(mnt); + + mnt_warn_timestamp_expiry(mountpoint, mnt); + + mp = lock_mount(mountpoint); + if (IS_ERR(mp)) { + mntput(mnt); + return PTR_ERR(mp); + } + error = do_add_mount(real_mount(mnt), mp, mountpoint, mnt_flags); + unlock_mount(mp); + if (error < 0) + mntput(mnt); + return error; +} + +/* + * create a new mount for userspace and request it to be added into the + * namespace's tree + */ +static int do_new_mount(struct path *path, const char *fstype, int sb_flags, + int mnt_flags, const char *name, void *data) +{ + struct file_system_type *type; + struct fs_context *fc; + const char *subtype = NULL; + int err = 0; + + if (!fstype) + return -EINVAL; + + type = get_fs_type(fstype); + if (!type) + return -ENODEV; + + if (type->fs_flags & FS_HAS_SUBTYPE) { + subtype = strchr(fstype, '.'); + if (subtype) { + subtype++; + if (!*subtype) { + put_filesystem(type); + return -EINVAL; + } + } 
+ } + + fc = fs_context_for_mount(type, sb_flags); + put_filesystem(type); + if (IS_ERR(fc)) + return PTR_ERR(fc); + + /* + * Indicate to the filesystem that the mount request is coming + * from the legacy mount system call. + */ + fc->oldapi = true; + + if (subtype) + err = vfs_parse_fs_string(fc, "subtype", + subtype, strlen(subtype)); + if (!err && name) + err = vfs_parse_fs_string(fc, "source", name, strlen(name)); + if (!err) + err = parse_monolithic_mount_data(fc, data); + if (!err && !mount_capable(fc)) + err = -EPERM; + if (!err) + err = vfs_get_tree(fc); + if (!err) + err = do_new_mount_fc(fc, path, mnt_flags); + + put_fs_context(fc); + return err; +} + +int finish_automount(struct vfsmount *m, struct path *path) +{ + struct dentry *dentry = path->dentry; + struct mountpoint *mp; + struct mount *mnt; + int err; + + if (!m) + return 0; + if (IS_ERR(m)) + return PTR_ERR(m); + + mnt = real_mount(m); + /* The new mount record should have at least 2 refs to prevent it being + * expired before we get a chance to add it + */ + BUG_ON(mnt_get_count(mnt) < 2); + + if (m->mnt_sb == path->mnt->mnt_sb && + m->mnt_root == dentry) { + err = -ELOOP; + goto discard; + } + + /* + * we don't want to use lock_mount() - in this case finding something + * that overmounts our mountpoint to be means "quitely drop what we've + * got", not "try to mount it on top". 
+ */ + inode_lock(dentry->d_inode); + namespace_lock(); + if (unlikely(cant_mount(dentry))) { + err = -ENOENT; + goto discard_locked; + } + rcu_read_lock(); + if (unlikely(__lookup_mnt(path->mnt, dentry))) { + rcu_read_unlock(); + err = 0; + goto discard_locked; + } + rcu_read_unlock(); + mp = get_mountpoint(dentry); + if (IS_ERR(mp)) { + err = PTR_ERR(mp); + goto discard_locked; + } + + err = do_add_mount(mnt, mp, path, path->mnt->mnt_flags | MNT_SHRINKABLE); + unlock_mount(mp); + if (unlikely(err)) + goto discard; + mntput(m); + return 0; + +discard_locked: + namespace_unlock(); + inode_unlock(dentry->d_inode); +discard: + /* remove m from any expiration list it may be on */ + if (!list_empty(&mnt->mnt_expire)) { + namespace_lock(); + list_del_init(&mnt->mnt_expire); + namespace_unlock(); + } + mntput(m); + mntput(m); + return err; +} + +/** + * mnt_set_expiry - Put a mount on an expiration list + * @mnt: The mount to list. + * @expiry_list: The list to add the mount to. + */ +void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list) +{ + namespace_lock(); + + list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list); + + namespace_unlock(); +} +EXPORT_SYMBOL(mnt_set_expiry); + +/* + * process a list of expirable mountpoints with the intent of discarding any + * mountpoints that aren't in use and haven't been touched since last we came + * here + */ +void mark_mounts_for_expiry(struct list_head *mounts) +{ + struct mount *mnt, *next; + LIST_HEAD(graveyard); + + if (list_empty(mounts)) + return; + + namespace_lock(); + lock_mount_hash(); + + /* extract from the expiration list every vfsmount that matches the + * following criteria: + * - only referenced by its parent vfsmount + * - still marked for expiry (marked on the last call here; marks are + * cleared by mntput()) + */ + list_for_each_entry_safe(mnt, next, mounts, mnt_expire) { + if (!xchg(&mnt->mnt_expiry_mark, 1) || + propagate_mount_busy(mnt, 1)) + continue; + list_move(&mnt->mnt_expire, 
&graveyard); + } + while (!list_empty(&graveyard)) { + mnt = list_first_entry(&graveyard, struct mount, mnt_expire); + touch_mnt_namespace(mnt->mnt_ns); + umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC); + } + unlock_mount_hash(); + namespace_unlock(); +} + +EXPORT_SYMBOL_GPL(mark_mounts_for_expiry); + +/* + * Ripoff of 'select_parent()' + * + * search the list of submounts for a given mountpoint, and move any + * shrinkable submounts to the 'graveyard' list. + */ +static int select_submounts(struct mount *parent, struct list_head *graveyard) +{ + struct mount *this_parent = parent; + struct list_head *next; + int found = 0; + +repeat: + next = this_parent->mnt_mounts.next; +resume: + while (next != &this_parent->mnt_mounts) { + struct list_head *tmp = next; + struct mount *mnt = list_entry(tmp, struct mount, mnt_child); + + next = tmp->next; + if (!(mnt->mnt.mnt_flags & MNT_SHRINKABLE)) + continue; + /* + * Descend a level if the d_mounts list is non-empty. + */ + if (!list_empty(&mnt->mnt_mounts)) { + this_parent = mnt; + goto repeat; + } + + if (!propagate_mount_busy(mnt, 1)) { + list_move_tail(&mnt->mnt_expire, graveyard); + found++; + } + } + /* + * All done at this level ... 
ascend and resume the search + */ + if (this_parent != parent) { + next = this_parent->mnt_child.next; + this_parent = this_parent->mnt_parent; + goto resume; + } + return found; +} + +/* + * process a list of expirable mountpoints with the intent of discarding any + * submounts of a specific parent mountpoint + * + * mount_lock must be held for write + */ +static void shrink_submounts(struct mount *mnt) +{ + LIST_HEAD(graveyard); + struct mount *m; + + /* extract submounts of 'mountpoint' from the expiration list */ + while (select_submounts(mnt, &graveyard)) { + while (!list_empty(&graveyard)) { + m = list_first_entry(&graveyard, struct mount, + mnt_expire); + touch_mnt_namespace(m->mnt_ns); + umount_tree(m, UMOUNT_PROPAGATE|UMOUNT_SYNC); + } + } +} + +static void *copy_mount_options(const void __user * data) +{ + char *copy; + unsigned left, offset; + + if (!data) + return NULL; + + copy = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!copy) + return ERR_PTR(-ENOMEM); + + left = copy_from_user(copy, data, PAGE_SIZE); + + /* + * Not all architectures have an exact copy_from_user(). Resort to + * byte at a time. + */ + offset = PAGE_SIZE - left; + while (left) { + char c; + if (get_user(c, (const char __user *)data + offset)) + break; + copy[offset] = c; + left--; + offset++; + } + + if (left == PAGE_SIZE) { + kfree(copy); + return ERR_PTR(-EFAULT); + } + + return copy; +} + +static char *copy_mount_string(const void __user *data) +{ + return data ? strndup_user(data, PATH_MAX) : NULL; +} + +/* + * Flags is a 32-bit value that allows up to 31 non-fs dependent flags to + * be given to the mount() call (ie: read-only, no-dev, no-suid etc). + * + * data is a (void *) that can point to any structure up to + * PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent + * information (or be NULL). + * + * Pre-0.97 versions of mount() didn't have a flags word. 
+ * When the flags word was introduced its top half was required + * to have the magic value 0xC0ED, and this remained so until 2.4.0-test9. + * Therefore, if this magic number is present, it carries no information + * and must be discarded. + */ +int path_mount(const char *dev_name, struct path *path, + const char *type_page, unsigned long flags, void *data_page) +{ + unsigned int mnt_flags = 0, sb_flags; + int ret; + + /* Discard magic */ + if ((flags & MS_MGC_MSK) == MS_MGC_VAL) + flags &= ~MS_MGC_MSK; + + /* Basic sanity checks */ + if (data_page) + ((char *)data_page)[PAGE_SIZE - 1] = 0; + + if (flags & MS_NOUSER) + return -EINVAL; + + ret = security_sb_mount(dev_name, path, type_page, flags, data_page); + if (ret) + return ret; + if (!may_mount()) + return -EPERM; + if ((flags & SB_MANDLOCK) && !may_mandlock()) + return -EPERM; + + /* Default to relatime unless overriden */ + if (!(flags & MS_NOATIME)) + mnt_flags |= MNT_RELATIME; + + /* Separate the per-mountpoint flags */ + if (flags & MS_NOSUID) + mnt_flags |= MNT_NOSUID; + if (flags & MS_NODEV) + mnt_flags |= MNT_NODEV; + if (flags & MS_NOEXEC) + mnt_flags |= MNT_NOEXEC; + if (flags & MS_NOATIME) + mnt_flags |= MNT_NOATIME; + if (flags & MS_NODIRATIME) + mnt_flags |= MNT_NODIRATIME; + if (flags & MS_STRICTATIME) + mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME); + if (flags & MS_RDONLY) + mnt_flags |= MNT_READONLY; + if (flags & MS_NOSYMFOLLOW) + mnt_flags |= MNT_NOSYMFOLLOW; + + /* The default atime for remount is preservation */ + if ((flags & MS_REMOUNT) && + ((flags & (MS_NOATIME | MS_NODIRATIME | MS_RELATIME | + MS_STRICTATIME)) == 0)) { + mnt_flags &= ~MNT_ATIME_MASK; + mnt_flags |= path->mnt->mnt_flags & MNT_ATIME_MASK; + } + + sb_flags = flags & (SB_RDONLY | + SB_SYNCHRONOUS | + SB_MANDLOCK | + SB_DIRSYNC | + SB_SILENT | + SB_POSIXACL | + SB_LAZYTIME | + SB_I_VERSION); + + if ((flags & (MS_REMOUNT | MS_BIND)) == (MS_REMOUNT | MS_BIND)) + return do_reconfigure_mnt(path, mnt_flags); + if (flags & 
MS_REMOUNT) + return do_remount(path, flags, sb_flags, mnt_flags, data_page); + if (flags & MS_BIND) + return do_loopback(path, dev_name, flags & MS_REC); + if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) + return do_change_type(path, flags); + if (flags & MS_MOVE) + return do_move_mount_old(path, dev_name); + + return do_new_mount(path, type_page, sb_flags, mnt_flags, dev_name, + data_page); +} + +long do_mount(const char *dev_name, const char __user *dir_name, + const char *type_page, unsigned long flags, void *data_page) +{ + struct path path; + int ret; + + ret = user_path_at(AT_FDCWD, dir_name, LOOKUP_FOLLOW, &path); + if (ret) + return ret; + ret = path_mount(dev_name, &path, type_page, flags, data_page); + path_put(&path); + return ret; +} + +static struct ucounts *inc_mnt_namespaces(struct user_namespace *ns) +{ + return inc_ucount(ns, current_euid(), UCOUNT_MNT_NAMESPACES); +} + +static void dec_mnt_namespaces(struct ucounts *ucounts) +{ + dec_ucount(ucounts, UCOUNT_MNT_NAMESPACES); +} + +static void free_mnt_ns(struct mnt_namespace *ns) +{ + if (!is_anon_ns(ns)) + ns_free_inum(&ns->ns); + dec_mnt_namespaces(ns->ucounts); + put_user_ns(ns->user_ns); + kfree(ns); +} + +/* + * Assign a sequence number so we can detect when we attempt to bind + * mount a reference to an older mount namespace into the current + * mount namespace, preventing reference counting loops. A 64bit + * number incrementing at 10Ghz will take 12,427 years to wrap which + * is effectively never, so we can ignore the possibility. 
+ */ +static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1); + +static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool anon) +{ + struct mnt_namespace *new_ns; + struct ucounts *ucounts; + int ret; + + ucounts = inc_mnt_namespaces(user_ns); + if (!ucounts) + return ERR_PTR(-ENOSPC); + + new_ns = kzalloc(sizeof(struct mnt_namespace), GFP_KERNEL); + if (!new_ns) { + dec_mnt_namespaces(ucounts); + return ERR_PTR(-ENOMEM); + } + if (!anon) { + ret = ns_alloc_inum(&new_ns->ns); + if (ret) { + kfree(new_ns); + dec_mnt_namespaces(ucounts); + return ERR_PTR(ret); + } + } + new_ns->ns.ops = &mntns_operations; + if (!anon) + new_ns->seq = atomic64_add_return(1, &mnt_ns_seq); + atomic_set(&new_ns->count, 1); + INIT_LIST_HEAD(&new_ns->list); + init_waitqueue_head(&new_ns->poll); + spin_lock_init(&new_ns->ns_lock); + new_ns->user_ns = get_user_ns(user_ns); + new_ns->ucounts = ucounts; + return new_ns; +} + +__latent_entropy +struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, + struct user_namespace *user_ns, struct fs_struct *new_fs) +{ + struct mnt_namespace *new_ns; + struct vfsmount *rootmnt = NULL, *pwdmnt = NULL; + struct mount *p, *q; + struct mount *old; + struct mount *new; + int copy_flags; + + BUG_ON(!ns); + + if (likely(!(flags & CLONE_NEWNS))) { + get_mnt_ns(ns); + return ns; + } + + old = ns->root; + + new_ns = alloc_mnt_ns(user_ns, false); + if (IS_ERR(new_ns)) + return new_ns; + + namespace_lock(); + /* First pass: copy the tree topology */ + copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE; + if (user_ns != ns->user_ns) + copy_flags |= CL_SHARED_TO_SLAVE; + new = copy_tree(old, old->mnt.mnt_root, copy_flags); + if (IS_ERR(new)) { + namespace_unlock(); + free_mnt_ns(new_ns); + return ERR_CAST(new); + } + if (user_ns != ns->user_ns) { + lock_mount_hash(); + lock_mnt_tree(new); + unlock_mount_hash(); + } + new_ns->root = new; + list_add_tail(&new_ns->list, &new->mnt_list); + + /* + * Second pass: switch the tsk->fs->* 
elements and mark new vfsmounts + * as belonging to new namespace. We have already acquired a private + * fs_struct, so tsk->fs->lock is not needed. + */ + p = old; + q = new; + while (p) { + q->mnt_ns = new_ns; + new_ns->mounts++; + if (new_fs) { + if (&p->mnt == new_fs->root.mnt) { + new_fs->root.mnt = mntget(&q->mnt); + rootmnt = &p->mnt; + } + if (&p->mnt == new_fs->pwd.mnt) { + new_fs->pwd.mnt = mntget(&q->mnt); + pwdmnt = &p->mnt; + } + } + p = next_mnt(p, old); + q = next_mnt(q, new); + if (!q) + break; + while (p->mnt.mnt_root != q->mnt.mnt_root) + p = next_mnt(p, old); + } + namespace_unlock(); + + if (rootmnt) + mntput(rootmnt); + if (pwdmnt) + mntput(pwdmnt); + + return new_ns; +} + +struct dentry *mount_subtree(struct vfsmount *m, const char *name) +{ + struct mount *mnt = real_mount(m); + struct mnt_namespace *ns; + struct super_block *s; + struct path path; + int err; + + ns = alloc_mnt_ns(&init_user_ns, true); + if (IS_ERR(ns)) { + mntput(m); + return ERR_CAST(ns); + } + mnt->mnt_ns = ns; + ns->root = mnt; + ns->mounts++; + list_add(&mnt->mnt_list, &ns->list); + + err = vfs_path_lookup(m->mnt_root, m, + name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path); + + put_mnt_ns(ns); + + if (err) + return ERR_PTR(err); + + /* trade a vfsmount reference for active sb one */ + s = path.mnt->mnt_sb; + atomic_inc(&s->s_active); + mntput(path.mnt); + /* lock the sucker */ + down_write(&s->s_umount); + /* ... 
and return the root of (sub)tree on it */ + return path.dentry; +} +EXPORT_SYMBOL(mount_subtree); + +SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name, + char __user *, type, unsigned long, flags, void __user *, data) +{ + int ret; + char *kernel_type; + char *kernel_dev; + void *options; + + kernel_type = copy_mount_string(type); + ret = PTR_ERR(kernel_type); + if (IS_ERR(kernel_type)) + goto out_type; + + kernel_dev = copy_mount_string(dev_name); + ret = PTR_ERR(kernel_dev); + if (IS_ERR(kernel_dev)) + goto out_dev; + + options = copy_mount_options(data); + ret = PTR_ERR(options); + if (IS_ERR(options)) + goto out_data; + + ret = do_mount(kernel_dev, dir_name, kernel_type, flags, options); + + kfree(options); +out_data: + kfree(kernel_dev); +out_dev: + kfree(kernel_type); +out_type: + return ret; +} + +/* + * Create a kernel mount representation for a new, prepared superblock + * (specified by fs_fd) and attach to an open_tree-like file descriptor. + */ +SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags, + unsigned int, attr_flags) +{ + struct mnt_namespace *ns; + struct fs_context *fc; + struct file *file; + struct path newmount; + struct mount *mnt; + struct fd f; + unsigned int mnt_flags = 0; + long ret; + + if (!may_mount()) + return -EPERM; + + if ((flags & ~(FSMOUNT_CLOEXEC)) != 0) + return -EINVAL; + + if (attr_flags & ~(MOUNT_ATTR_RDONLY | + MOUNT_ATTR_NOSUID | + MOUNT_ATTR_NODEV | + MOUNT_ATTR_NOEXEC | + MOUNT_ATTR__ATIME | + MOUNT_ATTR_NODIRATIME)) + return -EINVAL; + + if (attr_flags & MOUNT_ATTR_RDONLY) + mnt_flags |= MNT_READONLY; + if (attr_flags & MOUNT_ATTR_NOSUID) + mnt_flags |= MNT_NOSUID; + if (attr_flags & MOUNT_ATTR_NODEV) + mnt_flags |= MNT_NODEV; + if (attr_flags & MOUNT_ATTR_NOEXEC) + mnt_flags |= MNT_NOEXEC; + if (attr_flags & MOUNT_ATTR_NODIRATIME) + mnt_flags |= MNT_NODIRATIME; + + switch (attr_flags & MOUNT_ATTR__ATIME) { + case MOUNT_ATTR_STRICTATIME: + break; + case MOUNT_ATTR_NOATIME: + mnt_flags |= 
MNT_NOATIME; + break; + case MOUNT_ATTR_RELATIME: + mnt_flags |= MNT_RELATIME; + break; + default: + return -EINVAL; + } + + f = fdget(fs_fd); + if (!f.file) + return -EBADF; + + ret = -EINVAL; + if (f.file->f_op != &fscontext_fops) + goto err_fsfd; + + fc = f.file->private_data; + + ret = mutex_lock_interruptible(&fc->uapi_mutex); + if (ret < 0) + goto err_fsfd; + + /* There must be a valid superblock or we can't mount it */ + ret = -EINVAL; + if (!fc->root) + goto err_unlock; + + ret = -EPERM; + if (mount_too_revealing(fc->root->d_sb, &mnt_flags)) { + pr_warn("VFS: Mount too revealing\n"); + goto err_unlock; + } + + ret = -EBUSY; + if (fc->phase != FS_CONTEXT_AWAITING_MOUNT) + goto err_unlock; + + ret = -EPERM; + if ((fc->sb_flags & SB_MANDLOCK) && !may_mandlock()) + goto err_unlock; + + newmount.mnt = vfs_create_mount(fc); + if (IS_ERR(newmount.mnt)) { + ret = PTR_ERR(newmount.mnt); + goto err_unlock; + } + newmount.dentry = dget(fc->root); + newmount.mnt->mnt_flags = mnt_flags; + + /* We've done the mount bit - now move the file context into more or + * less the same state as if we'd done an fspick(). We don't want to + * do any memory allocation or anything like that at this point as we + * don't want to have to handle any errors incurred. + */ + vfs_clean_context(fc); + + ns = alloc_mnt_ns(current->nsproxy->mnt_ns->user_ns, true); + if (IS_ERR(ns)) { + ret = PTR_ERR(ns); + goto err_path; + } + mnt = real_mount(newmount.mnt); + mnt->mnt_ns = ns; + ns->root = mnt; + ns->mounts = 1; + list_add(&mnt->mnt_list, &ns->list); + mntget(newmount.mnt); + + /* Attach to an apparent O_PATH fd with a note that we need to unmount + * it, not just simply put it. + */ + file = dentry_open(&newmount, O_PATH, fc->cred); + if (IS_ERR(file)) { + dissolve_on_fput(newmount.mnt); + ret = PTR_ERR(file); + goto err_path; + } + file->f_mode |= FMODE_NEED_UNMOUNT; + + ret = get_unused_fd_flags((flags & FSMOUNT_CLOEXEC) ? 
O_CLOEXEC : 0); + if (ret >= 0) + fd_install(ret, file); + else + fput(file); + +err_path: + path_put(&newmount); +err_unlock: + mutex_unlock(&fc->uapi_mutex); +err_fsfd: + fdput(f); + return ret; +} + +/* + * Move a mount from one place to another. In combination with + * fsopen()/fsmount() this is used to install a new mount and in combination + * with open_tree(OPEN_TREE_CLONE [| AT_RECURSIVE]) it can be used to copy + * a mount subtree. + * + * Note the flags value is a combination of MOVE_MOUNT_* flags. + */ +SYSCALL_DEFINE5(move_mount, + int, from_dfd, const char __user *, from_pathname, + int, to_dfd, const char __user *, to_pathname, + unsigned int, flags) +{ + struct path from_path, to_path; + unsigned int lflags; + int ret = 0; + + if (!may_mount()) + return -EPERM; + + if (flags & ~MOVE_MOUNT__MASK) + return -EINVAL; + + /* If someone gives a pathname, they aren't permitted to move + * from an fd that requires unmount as we can't get at the flag + * to clear it afterwards. + */ + lflags = 0; + if (flags & MOVE_MOUNT_F_SYMLINKS) lflags |= LOOKUP_FOLLOW; + if (flags & MOVE_MOUNT_F_AUTOMOUNTS) lflags |= LOOKUP_AUTOMOUNT; + if (flags & MOVE_MOUNT_F_EMPTY_PATH) lflags |= LOOKUP_EMPTY; + + ret = user_path_at(from_dfd, from_pathname, lflags, &from_path); + if (ret < 0) + return ret; + + lflags = 0; + if (flags & MOVE_MOUNT_T_SYMLINKS) lflags |= LOOKUP_FOLLOW; + if (flags & MOVE_MOUNT_T_AUTOMOUNTS) lflags |= LOOKUP_AUTOMOUNT; + if (flags & MOVE_MOUNT_T_EMPTY_PATH) lflags |= LOOKUP_EMPTY; + + ret = user_path_at(to_dfd, to_pathname, lflags, &to_path); + if (ret < 0) + goto out_from; + + ret = security_move_mount(&from_path, &to_path); + if (ret < 0) + goto out_to; + + ret = do_move_mount(&from_path, &to_path); + +out_to: + path_put(&to_path); +out_from: + path_put(&from_path); + return ret; +} + +/* + * Return true if path is reachable from root + * + * namespace_sem or mount_lock is held + */ +bool is_path_reachable(struct mount *mnt, struct dentry *dentry, + 
const struct path *root) +{ + while (&mnt->mnt != root->mnt && mnt_has_parent(mnt)) { + dentry = mnt->mnt_mountpoint; + mnt = mnt->mnt_parent; + } + return &mnt->mnt == root->mnt && is_subdir(dentry, root->dentry); +} + +bool path_is_under(const struct path *path1, const struct path *path2) +{ + bool res; + read_seqlock_excl(&mount_lock); + res = is_path_reachable(real_mount(path1->mnt), path1->dentry, path2); + read_sequnlock_excl(&mount_lock); + return res; +} +EXPORT_SYMBOL(path_is_under); + +/* + * pivot_root Semantics: + * Moves the root file system of the current process to the directory put_old, + * makes new_root as the new root file system of the current process, and sets + * root/cwd of all processes which had them on the current root to new_root. + * + * Restrictions: + * The new_root and put_old must be directories, and must not be on the + * same file system as the current process root. The put_old must be + * underneath new_root, i.e. adding a non-zero number of /.. to the string + * pointed to by put_old must yield the same directory as new_root. No other + * file system may be mounted on put_old. After all, new_root is a mountpoint. + * + * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem. + * See Documentation/filesystems/ramfs-rootfs-initramfs.rst for alternatives + * in this situation. + * + * Notes: + * - we don't move root/cwd if they are not at the root (reason: if something + * cared enough to change them, it's probably wrong to force them elsewhere) + * - it's okay to pick a root that isn't the root of a file system, e.g. + * /nfs/my_root where /nfs is the mount point. It must be a mountpoint, + * though, so you may need to say mount --bind /nfs/my_root /nfs/my_root + * first. 
+ */ +SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, + const char __user *, put_old) +{ + struct path new, old, root; + struct mount *new_mnt, *root_mnt, *old_mnt, *root_parent, *ex_parent; + struct mountpoint *old_mp, *root_mp; + int error; + + if (!may_mount()) + return -EPERM; + + error = user_path_at(AT_FDCWD, new_root, + LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &new); + if (error) + goto out0; + + error = user_path_at(AT_FDCWD, put_old, + LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &old); + if (error) + goto out1; + + error = security_sb_pivotroot(&old, &new); + if (error) + goto out2; + + get_fs_root(current->fs, &root); + old_mp = lock_mount(&old); + error = PTR_ERR(old_mp); + if (IS_ERR(old_mp)) + goto out3; + + error = -EINVAL; + new_mnt = real_mount(new.mnt); + root_mnt = real_mount(root.mnt); + old_mnt = real_mount(old.mnt); + ex_parent = new_mnt->mnt_parent; + root_parent = root_mnt->mnt_parent; + if (IS_MNT_SHARED(old_mnt) || + IS_MNT_SHARED(ex_parent) || + IS_MNT_SHARED(root_parent)) + goto out4; + if (!check_mnt(root_mnt) || !check_mnt(new_mnt)) + goto out4; + if (new_mnt->mnt.mnt_flags & MNT_LOCKED) + goto out4; + error = -ENOENT; + if (d_unlinked(new.dentry)) + goto out4; + error = -EBUSY; + if (new_mnt == root_mnt || old_mnt == root_mnt) + goto out4; /* loop, on the same file system */ + error = -EINVAL; + if (root.mnt->mnt_root != root.dentry) + goto out4; /* not a mountpoint */ + if (!mnt_has_parent(root_mnt)) + goto out4; /* not attached */ + if (new.mnt->mnt_root != new.dentry) + goto out4; /* not a mountpoint */ + if (!mnt_has_parent(new_mnt)) + goto out4; /* not attached */ + /* make sure we can reach put_old from new_root */ + if (!is_path_reachable(old_mnt, old.dentry, &new)) + goto out4; + /* make certain new is below the root */ + if (!is_path_reachable(new_mnt, new.dentry, &root)) + goto out4; + lock_mount_hash(); + umount_mnt(new_mnt); + root_mp = unhash_mnt(root_mnt); /* we'll need its mountpoint */ + if (root_mnt->mnt.mnt_flags & 
MNT_LOCKED) { + new_mnt->mnt.mnt_flags |= MNT_LOCKED; + root_mnt->mnt.mnt_flags &= ~MNT_LOCKED; + } + /* mount old root on put_old */ + attach_mnt(root_mnt, old_mnt, old_mp); + /* mount new_root on / */ + attach_mnt(new_mnt, root_parent, root_mp); + mnt_add_count(root_parent, -1); + touch_mnt_namespace(current->nsproxy->mnt_ns); + /* A moved mount should not expire automatically */ + list_del_init(&new_mnt->mnt_expire); + put_mountpoint(root_mp); + unlock_mount_hash(); + chroot_fs_refs(&root, &new); + error = 0; +out4: + unlock_mount(old_mp); + if (!error) + mntput_no_expire(ex_parent); +out3: + path_put(&root); +out2: + path_put(&old); +out1: + path_put(&new); +out0: + return error; +} + +static void __init init_mount_tree(void) +{ + struct vfsmount *mnt; + struct mount *m; + struct mnt_namespace *ns; + struct path root; + + mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", NULL); + if (IS_ERR(mnt)) + panic("Can't create rootfs"); + + ns = alloc_mnt_ns(&init_user_ns, false); + if (IS_ERR(ns)) + panic("Can't allocate initial namespace"); + m = real_mount(mnt); + m->mnt_ns = ns; + ns->root = m; + ns->mounts = 1; + list_add(&m->mnt_list, &ns->list); + init_task.nsproxy->mnt_ns = ns; + get_mnt_ns(ns); + + root.mnt = mnt; + root.dentry = mnt->mnt_root; + mnt->mnt_flags |= MNT_LOCKED; + + set_fs_pwd(current->fs, &root); + set_fs_root(current->fs, &root); +} + +void __init mnt_init(void) +{ + int err; + + mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount), + 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + + mount_hashtable = alloc_large_system_hash("Mount-cache", + sizeof(struct hlist_head), + mhash_entries, 19, + HASH_ZERO, + &m_hash_shift, &m_hash_mask, 0, 0); + mountpoint_hashtable = alloc_large_system_hash("Mountpoint-cache", + sizeof(struct hlist_head), + mphash_entries, 19, + HASH_ZERO, + &mp_hash_shift, &mp_hash_mask, 0, 0); + + if (!mount_hashtable || !mountpoint_hashtable) + panic("Failed to allocate mount hash table\n"); + + kernfs_init(); + + err = 
sysfs_init(); + if (err) + printk(KERN_WARNING "%s: sysfs_init error: %d\n", + __func__, err); + fs_kobj = kobject_create_and_add("fs", NULL); + if (!fs_kobj) + printk(KERN_WARNING "%s: kobj create error\n", __func__); + shmem_init(); + init_rootfs(); + init_mount_tree(); +} + +void put_mnt_ns(struct mnt_namespace *ns) +{ + if (!atomic_dec_and_test(&ns->count)) + return; + drop_collected_mounts(&ns->root->mnt); + free_mnt_ns(ns); +} + +struct vfsmount *kern_mount(struct file_system_type *type) +{ + struct vfsmount *mnt; + mnt = vfs_kern_mount(type, SB_KERNMOUNT, type->name, NULL); + if (!IS_ERR(mnt)) { + /* + * it is a longterm mount, don't release mnt until + * we unmount before file sys is unregistered + */ + real_mount(mnt)->mnt_ns = MNT_NS_INTERNAL; + } + return mnt; +} +EXPORT_SYMBOL_GPL(kern_mount); + +void kern_unmount(struct vfsmount *mnt) +{ + /* release long term mount so mount point can be released */ + if (!IS_ERR_OR_NULL(mnt)) { + real_mount(mnt)->mnt_ns = NULL; + synchronize_rcu(); /* yecchhh... 
*/ + mntput(mnt); + } +} +EXPORT_SYMBOL(kern_unmount); + +void kern_unmount_array(struct vfsmount *mnt[], unsigned int num) +{ + unsigned int i; + + for (i = 0; i < num; i++) + if (mnt[i]) + real_mount(mnt[i])->mnt_ns = NULL; + synchronize_rcu_expedited(); + for (i = 0; i < num; i++) + mntput(mnt[i]); +} +EXPORT_SYMBOL(kern_unmount_array); + +bool our_mnt(struct vfsmount *mnt) +{ + return check_mnt(real_mount(mnt)); +} + +bool current_chrooted(void) +{ + /* Does the current process have a non-standard root */ + struct path ns_root; + struct path fs_root; + bool chrooted; + + /* Find the namespace root */ + ns_root.mnt = &current->nsproxy->mnt_ns->root->mnt; + ns_root.dentry = ns_root.mnt->mnt_root; + path_get(&ns_root); + while (d_mountpoint(ns_root.dentry) && follow_down_one(&ns_root)) + ; + + get_fs_root(current->fs, &fs_root); + + chrooted = !path_equal(&fs_root, &ns_root); + + path_put(&fs_root); + path_put(&ns_root); + + return chrooted; +} + +static bool mnt_already_visible(struct mnt_namespace *ns, + const struct super_block *sb, + int *new_mnt_flags) +{ + int new_flags = *new_mnt_flags; + struct mount *mnt; + bool visible = false; + + down_read(&namespace_sem); + lock_ns_list(ns); + list_for_each_entry(mnt, &ns->list, mnt_list) { + struct mount *child; + int mnt_flags; + + if (mnt_is_cursor(mnt)) + continue; + + if (mnt->mnt.mnt_sb->s_type != sb->s_type) + continue; + + /* This mount is not fully visible if it's root directory + * is not the root directory of the filesystem. + */ + if (mnt->mnt.mnt_root != mnt->mnt.mnt_sb->s_root) + continue; + + /* A local view of the mount flags */ + mnt_flags = mnt->mnt.mnt_flags; + + /* Don't miss readonly hidden in the superblock flags */ + if (sb_rdonly(mnt->mnt.mnt_sb)) + mnt_flags |= MNT_LOCK_READONLY; + + /* Verify the mount flags are equal to or more permissive + * than the proposed new mount. 
+ */ + if ((mnt_flags & MNT_LOCK_READONLY) && + !(new_flags & MNT_READONLY)) + continue; + if ((mnt_flags & MNT_LOCK_ATIME) && + ((mnt_flags & MNT_ATIME_MASK) != (new_flags & MNT_ATIME_MASK))) + continue; + + /* This mount is not fully visible if there are any + * locked child mounts that cover anything except for + * empty directories. + */ + list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) { + struct inode *inode = child->mnt_mountpoint->d_inode; + /* Only worry about locked mounts */ + if (!(child->mnt.mnt_flags & MNT_LOCKED)) + continue; + /* Is the directory permanetly empty? */ + if (!is_empty_dir_inode(inode)) + goto next; + } + /* Preserve the locked attributes */ + *new_mnt_flags |= mnt_flags & (MNT_LOCK_READONLY | \ + MNT_LOCK_ATIME); + visible = true; + goto found; + next: ; + } +found: + unlock_ns_list(ns); + up_read(&namespace_sem); + return visible; +} + +static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags) +{ + const unsigned long required_iflags = SB_I_NOEXEC | SB_I_NODEV; + struct mnt_namespace *ns = current->nsproxy->mnt_ns; + unsigned long s_iflags; + + if (ns->user_ns == &init_user_ns) + return false; + + /* Can this filesystem be too revealing? */ + s_iflags = sb->s_iflags; + if (!(s_iflags & SB_I_USERNS_VISIBLE)) + return false; + + if ((s_iflags & required_iflags) != required_iflags) { + WARN_ONCE(1, "Expected s_iflags to contain 0x%lx\n", + required_iflags); + return true; + } + + return !mnt_already_visible(ns, sb, new_mnt_flags); +} + +bool mnt_may_suid(struct vfsmount *mnt) +{ + /* + * Foreign mounts (accessed via fchdir or through /proc + * symlinks) are always treated as if they are nosuid. This + * prevents namespaces from trusting potentially unsafe + * suid/sgid bits, file caps, or security labels that originate + * in other namespaces. 
+ */ + return !(mnt->mnt_flags & MNT_NOSUID) && check_mnt(real_mount(mnt)) && + current_in_userns(mnt->mnt_sb->s_user_ns); +} + +static struct ns_common *mntns_get(struct task_struct *task) +{ + struct ns_common *ns = NULL; + struct nsproxy *nsproxy; + + task_lock(task); + nsproxy = task->nsproxy; + if (nsproxy) { + ns = &nsproxy->mnt_ns->ns; + get_mnt_ns(to_mnt_ns(ns)); + } + task_unlock(task); + + return ns; +} + +static void mntns_put(struct ns_common *ns) +{ + put_mnt_ns(to_mnt_ns(ns)); +} + +static int mntns_install(struct nsset *nsset, struct ns_common *ns) +{ + struct nsproxy *nsproxy = nsset->nsproxy; + struct fs_struct *fs = nsset->fs; + struct mnt_namespace *mnt_ns = to_mnt_ns(ns), *old_mnt_ns; + struct user_namespace *user_ns = nsset->cred->user_ns; + struct path root; + int err; + + if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) || + !ns_capable(user_ns, CAP_SYS_CHROOT) || + !ns_capable(user_ns, CAP_SYS_ADMIN)) + return -EPERM; + + if (is_anon_ns(mnt_ns)) + return -EINVAL; + + if (fs->users != 1) + return -EINVAL; + + get_mnt_ns(mnt_ns); + old_mnt_ns = nsproxy->mnt_ns; + nsproxy->mnt_ns = mnt_ns; + + /* Find the root */ + err = vfs_path_lookup(mnt_ns->root->mnt.mnt_root, &mnt_ns->root->mnt, + "/", LOOKUP_DOWN, &root); + if (err) { + /* revert to old namespace */ + nsproxy->mnt_ns = old_mnt_ns; + put_mnt_ns(mnt_ns); + return err; + } + + put_mnt_ns(old_mnt_ns); + + /* Update the pwd and root */ + set_fs_pwd(fs, &root); + set_fs_root(fs, &root); + + path_put(&root); + return 0; +} + +static struct user_namespace *mntns_owner(struct ns_common *ns) +{ + return to_mnt_ns(ns)->user_ns; +} + +const struct proc_ns_operations mntns_operations = { + .name = "mnt", + .type = CLONE_NEWNS, + .get = mntns_get, + .put = mntns_put, + .install = mntns_install, + .owner = mntns_owner, +}; diff --git a/fs/open.c b/fs/open.c index 61745b288dac..5395ce5fb73d 100644 --- a/fs/open.c +++ b/fs/open.c @@ -395,6 +395,12 @@ static const struct cred *access_override_creds(void) 
return old_cred; } +#ifdef CONFIG_KSU_SUSFS_SUS_SU +extern bool susfs_is_sus_su_hooks_enabled __read_mostly; +extern int ksu_handle_faccessat(int *dfd, const char __user **filename_user, int *mode, + int *flags); +#endif + static long do_faccessat(int dfd, const char __user *filename, int mode, int flags) { struct path path; @@ -403,6 +409,12 @@ static long do_faccessat(int dfd, const char __user *filename, int mode, int fla unsigned int lookup_flags = LOOKUP_FOLLOW; const struct cred *old_cred = NULL; +#ifdef CONFIG_KSU_SUSFS_SUS_SU + if (susfs_is_sus_su_hooks_enabled) { + ksu_handle_faccessat(&dfd, &filename, &mode, NULL); + } +#endif + if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */ return -EINVAL; diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index df35954d73d2..f784def63476 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -168,6 +168,15 @@ int ovl_getattr(const struct path *path, struct kstat *stat, metacopy_blocks = ovl_is_metacopy_dentry(dentry); +#ifdef CONFIG_KSU_SUSFS_SUS_OVERLAYFS + ovl_path_lowerdata(dentry, &realpath); + if (likely(realpath.mnt && realpath.dentry)) { + old_cred = ovl_override_creds(dentry->d_sb); + err = vfs_getattr(&realpath, stat, request_mask, flags); + goto out; + } +#endif + type = ovl_path_real(dentry, &realpath); old_cred = ovl_override_creds(dentry->d_sb); err = vfs_getattr(&realpath, stat, request_mask, flags); diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 5a35f87dd66f..2688730b168a 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -930,7 +930,19 @@ static int ovl_dir_open(struct inode *inode, struct file *file) if (!od) return -ENOMEM; +#ifdef CONFIG_KSU_SUSFS_SUS_OVERLAYFS + ovl_path_lowerdata(file->f_path.dentry, &realpath); + if (likely(realpath.mnt && realpath.dentry)) { + // We still use '__OVL_PATH_UPPER' here which should be fine. 
+ type = __OVL_PATH_UPPER; + goto bypass_orig_flow; + } +#endif + type = ovl_path_real(file->f_path.dentry, &realpath); +#ifdef CONFIG_KSU_SUSFS_SUS_OVERLAYFS +bypass_orig_flow: +#endif realfile = ovl_dir_open_realfile(file, &realpath); if (IS_ERR(realfile)) { kfree(od); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index d59624b439f9..0622a660b0c6 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -324,6 +324,18 @@ static int ovl_statfs(struct dentry *dentry, struct kstatfs *buf) struct path path; int err; +#ifdef CONFIG_KSU_SUSFS_SUS_OVERLAYFS + ovl_path_lowerdata(root_dentry, &path); + if (likely(path.mnt && path.dentry)) { + err = vfs_statfs(&path, buf); + if (!err) { + buf->f_namelen = 255; // 255 for erofs, ext2/4, f2fs + buf->f_type = path.dentry->d_sb->s_magic; + } + return err; + } +#endif + ovl_path_real(root_dentry, &path); err = vfs_statfs(&path, buf); diff --git a/fs/proc/bootconfig.c b/fs/proc/bootconfig.c index 2e244ada1f97..baf86392d1a2 100644 --- a/fs/proc/bootconfig.c +++ b/fs/proc/bootconfig.c @@ -12,8 +12,19 @@ static char *saved_boot_config; +#ifdef CONFIG_KSU_SUSFS_SPOOF_BOOTCONFIG +extern int susfs_spoof_bootconfig(struct seq_file *m); +#endif + static int boot_config_proc_show(struct seq_file *m, void *v) { +#ifdef CONFIG_KSU_SUSFS_SPOOF_BOOTCONFIG + if (saved_boot_config) { + if (!susfs_spoof_bootconfig(m)) { + return 0; + } + } +#endif if (saved_boot_config) seq_puts(m, saved_boot_config); return 0; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index f96a51999489..fcaf56f4e83f 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -318,6 +318,10 @@ static void show_vma_header_prefix(struct seq_file *m, seq_putc(m, ' '); } +#ifdef CONFIG_KSU_SUSFS_SUS_KSTAT +extern void susfs_sus_ino_for_show_map_vma(unsigned long ino, dev_t *out_dev, unsigned long *out_ino); +#endif + static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma) { @@ -332,8 +336,17 @@ show_map_vma(struct seq_file *m, struct 
vm_area_struct *vma) if (file) { struct inode *inode = file_inode(vma->vm_file); +#ifdef CONFIG_KSU_SUSFS_SUS_KSTAT + if (unlikely(inode->i_state & 67108864)) { + susfs_sus_ino_for_show_map_vma(inode->i_ino, &dev, &ino); + goto bypass_orig_flow; + } +#endif dev = inode->i_sb->s_dev; ino = inode->i_ino; +#ifdef CONFIG_KSU_SUSFS_SUS_KSTAT +bypass_orig_flow: +#endif pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT; } diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c index eafb75755fa3..362b2bf59b88 100644 --- a/fs/proc_namespace.c +++ b/fs/proc_namespace.c @@ -18,6 +18,10 @@ #include "pnode.h" #include "internal.h" +#ifdef CONFIG_KSU_SUSFS_SUS_MOUNT +extern bool susfs_is_current_ksu_domain(void); +#endif + static __poll_t mounts_poll(struct file *file, poll_table *wait) { struct seq_file *m = file->private_data; @@ -103,6 +107,11 @@ static int show_vfsmnt(struct seq_file *m, struct vfsmount *mnt) struct super_block *sb = mnt_path.dentry->d_sb; int err; +#ifdef CONFIG_KSU_SUSFS_SUS_MOUNT + if (unlikely((r->mnt.mnt_root->d_inode->i_state & 33554432) && !susfs_is_current_ksu_domain())) + return 0; +#endif + if (sb->s_op->show_devname) { err = sb->s_op->show_devname(m, mnt_path.dentry); if (err) @@ -137,6 +146,10 @@ static int show_mountinfo(struct seq_file *m, struct vfsmount *mnt) struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; int err; +#ifdef CONFIG_KSU_SUSFS_SUS_MOUNT + if (unlikely((r->mnt.mnt_root->d_inode->i_state & 33554432) && !susfs_is_current_ksu_domain())) + return 0; +#endif seq_printf(m, "%i %i %u:%u ", r->mnt_id, r->mnt_parent->mnt_id, MAJOR(sb->s_dev), MINOR(sb->s_dev)); if (sb->s_op->show_path) { @@ -199,6 +212,11 @@ static int show_vfsstat(struct seq_file *m, struct vfsmount *mnt) struct super_block *sb = mnt_path.dentry->d_sb; int err; +#ifdef CONFIG_KSU_SUSFS_SUS_MOUNT + if (unlikely((r->mnt.mnt_root->d_inode->i_state & 33554432) && !susfs_is_current_ksu_domain())) + return 0; +#endif + /* device */ if (sb->s_op->show_devname) { 
seq_puts(m, "device "); diff --git a/fs/readdir.c b/fs/readdir.c index 09e8ed7d4161..b4604517b047 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -307,6 +307,10 @@ struct getdents_callback64 { int error; }; +#ifdef CONFIG_KSU_SUSFS_SUS_PATH +extern int susfs_sus_ino_for_filldir64(unsigned long ino); +#endif + static int filldir64(struct dir_context *ctx, const char *name, int namlen, loff_t offset, u64 ino, unsigned int d_type) { @@ -317,6 +321,11 @@ static int filldir64(struct dir_context *ctx, const char *name, int namlen, sizeof(u64)); int prev_reclen; +#ifdef CONFIG_KSU_SUSFS_SUS_PATH + if (likely(current_cred()->user->android_kabi_reserved2 & 16777216) && susfs_sus_ino_for_filldir64(ino)) { + return 0; + } +#endif buf->error = verify_dirent_name(name, namlen); if (unlikely(buf->error)) return buf->error; diff --git a/fs/stat.c b/fs/stat.c index c6a2e10983f8..55afd4327883 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -24,6 +24,10 @@ #include "internal.h" #include "mount.h" +#ifdef CONFIG_KSU_SUSFS_SUS_KSTAT +extern void susfs_sus_ino_for_generic_fillattr(unsigned long ino, struct kstat *stat); +#endif + /** * generic_fillattr - Fill in the basic attributes from the inode struct * @inode: Inode to use as the source @@ -35,6 +39,16 @@ */ void generic_fillattr(struct inode *inode, struct kstat *stat) { +#ifdef CONFIG_KSU_SUSFS_SUS_KSTAT + if (unlikely(inode->i_state & 67108864)) { + susfs_sus_ino_for_generic_fillattr(inode->i_ino, stat); + stat->mode = inode->i_mode; + stat->rdev = inode->i_rdev; + stat->uid = inode->i_uid; + stat->gid = inode->i_gid; + return; + } +#endif stat->dev = inode->i_sb->s_dev; stat->ino = inode->i_ino; stat->mode = inode->i_mode; @@ -171,6 +185,12 @@ int vfs_fstat(int fd, struct kstat *stat) * * 0 will be returned on success, and a -ve error code if unsuccessful. 
*/ + +#ifdef CONFIG_KSU_SUSFS_SUS_SU +extern bool susfs_is_sus_su_hooks_enabled __read_mostly; +extern int ksu_handle_stat(int *dfd, const char __user **filename_user, int *flags); +#endif + static int vfs_statx(int dfd, const char __user *filename, int flags, struct kstat *stat, u32 request_mask) { @@ -178,6 +198,12 @@ static int vfs_statx(int dfd, const char __user *filename, int flags, unsigned lookup_flags = 0; int error; +#ifdef CONFIG_KSU_SUSFS_SUS_SU + if (susfs_is_sus_su_hooks_enabled) { + ksu_handle_stat(&dfd, &filename, &flags); + } +#endif + if (flags & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT | AT_EMPTY_PATH | AT_STATX_SYNC_TYPE)) return -EINVAL; diff --git a/fs/statfs.c b/fs/statfs.c index d42b44dc0e49..553f995a04a1 100644 --- a/fs/statfs.c +++ b/fs/statfs.c @@ -109,6 +109,22 @@ int user_statfs(const char __user *pathname, struct kstatfs *st) goto retry; } } +#ifdef CONFIG_KSU_SUSFS_SUS_OVERLAYFS + /* - When mounting overlay, the f_flags are set with 'ro' and 'relatime', + * but this is an abnormal status, as when we inspect the output from mountinfo, + * we will find that all partitions set with 'ro' will have 'noatime' set as well. + * - But what is strange here is that the vfsmnt f_flags of the lowest layer has correct f_flags set, + * and still it is always changed to 'relatime' instead of 'noatime' for the final result, + * I can't think of any other reason to explain about this, maybe the f_flags is set by its own + * filesystem implementation but not the one from overlayfs. + * - Anyway we just cannot use the retrieved f_flags from ovl_getattr() of overlayfs, + * we need to run one more check for user_statfs() and fd_statfs() by ourselves. 
+ */ + if (unlikely((st->f_flags & ST_RDONLY) && (st->f_flags & ST_RELATIME))) { + st->f_flags &= ~ST_RELATIME; + st->f_flags |= ST_NOATIME; + } +#endif return error; } @@ -120,6 +136,12 @@ int fd_statfs(int fd, struct kstatfs *st) error = vfs_statfs(&f.file->f_path, st); fdput(f); } +#ifdef CONFIG_KSU_SUSFS_SUS_OVERLAYFS + if (unlikely((st->f_flags & ST_RDONLY) && (st->f_flags & ST_RELATIME))) { + st->f_flags &= ~ST_RELATIME; + st->f_flags |= ST_NOATIME; + } +#endif return error; } diff --git a/fs/sus_su.c b/fs/sus_su.c new file mode 100644 index 000000000000..d140468d0714 --- /dev/null +++ b/fs/sus_su.c @@ -0,0 +1,140 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_KSU_SUSFS_ENABLE_LOG +extern bool susfs_is_log_enabled __read_mostly; +#define SUSFS_LOGI(fmt, ...) if (susfs_is_log_enabled) pr_info("susfs_sus_su:[%u][%u][%s] " fmt, current_uid().val, current->pid, __func__, ##__VA_ARGS__) +#define SUSFS_LOGE(fmt, ...) if (susfs_is_log_enabled) pr_err("susfs_sus_su:[%u][%u][%s]" fmt, current_uid().val, current->pid, __func__, ##__VA_ARGS__) +#else +#define SUSFS_LOGI(fmt, ...) +#define SUSFS_LOGE(fmt, ...) 
+#endif + +#define FIFO_SIZE 1024 +#define MAX_DRV_NAME 255 + +static int cur_maj_dev_num = -1; +static char fifo_buffer[FIFO_SIZE]; +static struct cdev sus_su_cdev; +static const char *sus_su_token = "!@#$SU_IS_SUS$#@!-pRE6W9BKXrJr1hEKyvDq0CvWziVKbatT8yzq06fhtrEGky2tVS7Q2QTjhtMfVMGV"; +static char rand_drv_path[MAX_DRV_NAME+1] = "/dev/"; +static bool is_sus_su_enabled_before = false; + +extern bool susfs_is_allow_su(void); +extern void ksu_escape_to_root(void); + +static void gen_rand_drv_name(char *buffer, size_t min_length, size_t max_length) { + const char *symbols = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_-+@#:="; + size_t symbols_length = strlen(symbols); + size_t length, i; + unsigned int rand_value; + + // Determine the random length of the string + get_random_bytes(&rand_value, sizeof(rand_value)); + length = min_length + (rand_value % (max_length - min_length + 1)); + + for (i = 0; i < length; ++i) { + get_random_bytes(&rand_value, sizeof(rand_value)); + buffer[i] = symbols[rand_value % symbols_length]; + } + buffer[length] = '\0'; // Null-terminate the string +} + +static int fifo_open(struct inode *inode, struct file *file) { + return 0; +} + +static int fifo_release(struct inode *inode, struct file *file) { + return 0; +} + +static ssize_t fifo_read(struct file *file, char __user *buf, size_t len, loff_t *offset) { + return 0; +} + +static ssize_t fifo_write(struct file *file, const char __user *buf, size_t len, loff_t *offset) { + int sus_su_token_len = strlen(sus_su_token); + + if (!susfs_is_allow_su()) { + SUSFS_LOGE("root is not allowed for uid: '%d', pid: '%d'\n", current_uid().val, current->pid); + return 0; + } + + if (copy_from_user(fifo_buffer, buf, sus_su_token_len+1)) { + SUSFS_LOGE("copy_from_user() failed, uid: '%d', pid: '%d'\n", current_uid().val, current->pid); + return 0; + } + + if (!memcmp(fifo_buffer, sus_su_token, sus_su_token_len+1)) { + SUSFS_LOGI("granting root access for uid: '%d', pid: '%d'\n", 
current_uid().val, current->pid); + ksu_escape_to_root(); + } else { + SUSFS_LOGI("wrong token! deny root access for uid: '%d', pid: '%d'\n", current_uid().val, current->pid); + } + memset(fifo_buffer, 0, FIFO_SIZE); + return 0; +} + +static struct file_operations fops = { + .owner = THIS_MODULE, + .open = fifo_open, + .release = fifo_release, + .read = fifo_read, + .write = fifo_write, +}; + +int sus_su_fifo_init(int *maj_dev_num, char *drv_path) { + if (cur_maj_dev_num > 0) { + SUSFS_LOGE("'%s' is already registered\n", rand_drv_path); + return -1; + } + + // generate a random driver name if it is executed for the first time + if (!is_sus_su_enabled_before) { + // min length 192, max length 248, just make sure max length doesn't exceeds 255 + gen_rand_drv_name(rand_drv_path+5, 192, 248); + } + + cur_maj_dev_num = register_chrdev(0, rand_drv_path+5, &fops); + if (cur_maj_dev_num < 0) { + SUSFS_LOGE("Failed to register character device\n"); + return -1; + } + + cdev_init(&sus_su_cdev, &fops); + if (cdev_add(&sus_su_cdev, MKDEV(cur_maj_dev_num, 0), 1) < 0) { + unregister_chrdev(cur_maj_dev_num, rand_drv_path+5); + SUSFS_LOGE("Failed to add cdev\n"); + return -1; + } + + strncpy(drv_path, rand_drv_path, strlen(rand_drv_path)); + *maj_dev_num = cur_maj_dev_num; + SUSFS_LOGI("'%s' registered with major device number %d\n", rand_drv_path, cur_maj_dev_num); + + if (!is_sus_su_enabled_before) + is_sus_su_enabled_before = true; + + return 0; +} + +int sus_su_fifo_exit(int *maj_dev_num, char *drv_path) { + if (cur_maj_dev_num < 0) { + SUSFS_LOGE("'%s' was already unregistered before\n", rand_drv_path); + return 0; + } + + cdev_del(&sus_su_cdev); + unregister_chrdev(cur_maj_dev_num, rand_drv_path+5); + cur_maj_dev_num = -1; + *maj_dev_num = cur_maj_dev_num; + strncpy(drv_path, rand_drv_path, strlen(rand_drv_path)); + SUSFS_LOGI("'%s' unregistered\n", rand_drv_path); + return 0; +} diff --git a/fs/susfs.c b/fs/susfs.c new file mode 100644 index 000000000000..5c46cffdcaa1 --- 
/dev/null +++ b/fs/susfs.c @@ -0,0 +1,819 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if LINUX_VERSION_CODE < KERNEL_VERSION(5,15,0) +#include "pnode.h" +#endif + +static spinlock_t susfs_spin_lock; + +extern bool susfs_is_current_ksu_domain(void); + +#ifdef CONFIG_KSU_SUSFS_ENABLE_LOG +bool susfs_is_log_enabled __read_mostly = true; +#define SUSFS_LOGI(fmt, ...) if (susfs_is_log_enabled) pr_info("susfs:[%u][%d][%s] " fmt, current_uid().val, current->pid, __func__, ##__VA_ARGS__) +#define SUSFS_LOGE(fmt, ...) if (susfs_is_log_enabled) pr_err("susfs:[%u][%d][%s]" fmt, current_uid().val, current->pid, __func__, ##__VA_ARGS__) +#else +#define SUSFS_LOGI(fmt, ...) +#define SUSFS_LOGE(fmt, ...) +#endif + +#ifdef CONFIG_KSU_SUSFS_SUS_MOUNT +extern void ksu_try_umount(const char *mnt, bool check_mnt, int flags); +#endif + +/* sus_path */ +#ifdef CONFIG_KSU_SUSFS_SUS_PATH +static DEFINE_HASHTABLE(SUS_PATH_HLIST, 10); +static int susfs_update_sus_path_inode(char *target_pathname) { + struct path p; + struct inode *inode = NULL; + + if (kern_path(target_pathname, LOOKUP_FOLLOW, &p)) { + SUSFS_LOGE("Failed opening file '%s'\n", target_pathname); + return 1; + } + + // We don't allow path of which filesystem type is "tmpfs", because its inode->i_ino is starting from 1 again, + // which will cause wrong comparison in function susfs_sus_ino_for_filldir64() + if (strcmp(p.mnt->mnt_sb->s_type->name, "tmpfs") == 0) { + SUSFS_LOGE("target_pathname: '%s' cannot be added since its filesystem is 'tmpfs'\n", target_pathname); + path_put(&p); + return 1; + } + + inode = d_inode(p.dentry); + if (!inode) { + SUSFS_LOGE("inode is NULL\n"); + path_put(&p); + return 1; + } + + spin_lock(&inode->i_lock); + inode->i_state |= INODE_STATE_SUS_PATH; + spin_unlock(&inode->i_lock); + + path_put(&p); + return 0; +} + +int susfs_add_sus_path(struct st_susfs_sus_path* __user 
user_info) { + struct st_susfs_sus_path info; + struct st_susfs_sus_path_hlist *new_entry, *tmp_entry; + struct hlist_node *tmp_node; + int bkt; + bool update_hlist = false; + + if (copy_from_user(&info, user_info, sizeof(info))) { + SUSFS_LOGE("failed copying from userspace\n"); + return 1; + } + + spin_lock(&susfs_spin_lock); + hash_for_each_safe(SUS_PATH_HLIST, bkt, tmp_node, tmp_entry, node) { + if (!strcmp(tmp_entry->target_pathname, info.target_pathname)) { + hash_del(&tmp_entry->node); + kfree(tmp_entry); + update_hlist = true; + break; + } + } + spin_unlock(&susfs_spin_lock); + + new_entry = kmalloc(sizeof(struct st_susfs_sus_path_hlist), GFP_KERNEL); + if (!new_entry) { + SUSFS_LOGE("no enough memory\n"); + return 1; + } + + new_entry->target_ino = info.target_ino; + strncpy(new_entry->target_pathname, info.target_pathname, SUSFS_MAX_LEN_PATHNAME-1); + if (susfs_update_sus_path_inode(new_entry->target_pathname)) { + kfree(new_entry); + return 1; + } + spin_lock(&susfs_spin_lock); + hash_add(SUS_PATH_HLIST, &new_entry->node, info.target_ino); + if (update_hlist) { + SUSFS_LOGI("target_ino: '%lu', target_pathname: '%s' is successfully updated to SUS_PATH_HLIST\n", + new_entry->target_ino, new_entry->target_pathname); + } else { + SUSFS_LOGI("target_ino: '%lu', target_pathname: '%s' is successfully added to SUS_PATH_HLIST\n", + new_entry->target_ino, new_entry->target_pathname); + } + spin_unlock(&susfs_spin_lock); + return 0; +} + +int susfs_sus_ino_for_filldir64(unsigned long ino) { + struct st_susfs_sus_path_hlist *entry; + + hash_for_each_possible(SUS_PATH_HLIST, entry, node, ino) { + if (entry->target_ino == ino) + return 1; + } + return 0; +} +#endif // #ifdef CONFIG_KSU_SUSFS_SUS_PATH + +/* sus_mount */ +#ifdef CONFIG_KSU_SUSFS_SUS_MOUNT +static LIST_HEAD(LH_SUS_MOUNT); +static void susfs_update_sus_mount_inode(char *target_pathname) { + struct path p; + struct inode *inode = NULL; + int err = 0; + + err = kern_path(target_pathname, LOOKUP_FOLLOW, &p); 
+ if (err) { + SUSFS_LOGE("Failed opening file '%s'\n", target_pathname); + return; + } + + inode = d_inode(p.dentry); + if (!inode) { + path_put(&p); + SUSFS_LOGE("inode is NULL\n"); + return; + } + + spin_lock(&inode->i_lock); + inode->i_state |= INODE_STATE_SUS_MOUNT; + spin_unlock(&inode->i_lock); + + path_put(&p); +} + +int susfs_add_sus_mount(struct st_susfs_sus_mount* __user user_info) { + struct st_susfs_sus_mount_list *cursor = NULL, *temp = NULL; + struct st_susfs_sus_mount_list *new_list = NULL; + struct st_susfs_sus_mount info; + + if (copy_from_user(&info, user_info, sizeof(info))) { + SUSFS_LOGE("failed copying from userspace\n"); + return 1; + } + +#if defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_COMPAT_STAT64) +#ifdef CONFIG_MIPS + info.target_dev = new_decode_dev(info.target_dev); +#else + info.target_dev = huge_decode_dev(info.target_dev); +#endif /* CONFIG_MIPS */ +#else + info.target_dev = old_decode_dev(info.target_dev); +#endif /* defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_COMPAT_STAT64) */ + + list_for_each_entry_safe(cursor, temp, &LH_SUS_MOUNT, list) { + if (unlikely(!strcmp(cursor->info.target_pathname, info.target_pathname))) { + spin_lock(&susfs_spin_lock); + memcpy(&cursor->info, &info, sizeof(info)); + susfs_update_sus_mount_inode(cursor->info.target_pathname); + SUSFS_LOGI("target_pathname: '%s', target_dev: '%lu', is successfully updated to LH_SUS_MOUNT\n", + cursor->info.target_pathname, cursor->info.target_dev); + spin_unlock(&susfs_spin_lock); + return 0; + } + } + + new_list = kmalloc(sizeof(struct st_susfs_sus_mount_list), GFP_KERNEL); + if (!new_list) { + SUSFS_LOGE("no enough memory\n"); + return 1; + } + + memcpy(&new_list->info, &info, sizeof(info)); + susfs_update_sus_mount_inode(new_list->info.target_pathname); + + INIT_LIST_HEAD(&new_list->list); + spin_lock(&susfs_spin_lock); + list_add_tail(&new_list->list, &LH_SUS_MOUNT); + SUSFS_LOGI("target_pathname: '%s', target_dev: '%lu', is successfully added to 
LH_SUS_MOUNT\n", + new_list->info.target_pathname, new_list->info.target_dev); + spin_unlock(&susfs_spin_lock); + return 0; +} + +#ifdef CONFIG_KSU_SUSFS_AUTO_ADD_SUS_BIND_MOUNT +int susfs_auto_add_sus_bind_mount(const char *pathname, struct path *path_target) { + struct inode *inode; + // Only source mount path starting with '/data/adb/' will be hidden + if (strncmp(pathname, "/data/adb/", 10)) { + SUSFS_LOGE("skip setting SUS_MOUNT inode state for source bind mount path '%s'\n", pathname); + return 1; + } + inode = path_target->dentry->d_inode; + if (!inode) return 1; + spin_lock(&inode->i_lock); + inode->i_state |= INODE_STATE_SUS_MOUNT; + SUSFS_LOGI("set SUS_MOUNT inode state for source bind mount path '%s'\n", pathname); + spin_unlock(&inode->i_lock); + return 0; +} +#endif // #ifdef CONFIG_KSU_SUSFS_AUTO_ADD_SUS_BIND_MOUNT + +#ifdef CONFIG_KSU_SUSFS_AUTO_ADD_SUS_KSU_DEFAULT_MOUNT +void susfs_auto_add_sus_ksu_default_mount(const char __user *to_pathname) { + char pathname[SUSFS_MAX_LEN_PATHNAME]; + struct path path; + struct inode *inode; + + // Here we need to re-retrieve the struct path as we want the new struct path, not the old one + if (strncpy_from_user(pathname, to_pathname, SUSFS_MAX_LEN_PATHNAME-1) < 0) + return; + //SUSFS_LOGI("pathname: '%s'\n", pathname); + if ((!strncmp(pathname, "/data/adb/modules", 17) || + !strncmp(pathname, "/debug_ramdisk", 14) || + !strncmp(pathname, "/system", 7) || + !strncmp(pathname, "/system_ext", 11) || + !strncmp(pathname, "/vendor", 7) || + !strncmp(pathname, "/product", 8)) && + !kern_path(pathname, LOOKUP_FOLLOW, &path)) { + goto set_inode_sus_mount; + } + return; +set_inode_sus_mount: + inode = path.dentry->d_inode; + if (!inode) return; + spin_lock(&inode->i_lock); + inode->i_state |= INODE_STATE_SUS_MOUNT; + SUSFS_LOGI("set SUS_MOUNT inode state for default KSU mount path '%s'\n", pathname); + spin_unlock(&inode->i_lock); + path_put(&path); +} +#endif // #ifdef CONFIG_KSU_SUSFS_AUTO_ADD_SUS_KSU_DEFAULT_MOUNT 
+#endif // #ifdef CONFIG_KSU_SUSFS_SUS_MOUNT + +/* sus_kstat */ +#ifdef CONFIG_KSU_SUSFS_SUS_KSTAT +static DEFINE_HASHTABLE(SUS_KSTAT_HLIST, 10); +static int susfs_update_sus_kstat_inode(char *target_pathname) { + struct path p; + struct inode *inode = NULL; + int err = 0; + + err = kern_path(target_pathname, LOOKUP_FOLLOW, &p); + if (err) { + SUSFS_LOGE("Failed opening file '%s'\n", target_pathname); + return 1; + } + + // We don't allow path of which filesystem type is "tmpfs", because its inode->i_ino is starting from 1 again, + // which will cause wrong comparison in function susfs_sus_ino_for_filldir64() + if (strcmp(p.mnt->mnt_sb->s_type->name, "tmpfs") == 0) { + SUSFS_LOGE("target_pathname: '%s' cannot be added since its filesystem is 'tmpfs'\n", target_pathname); + path_put(&p); + return 1; + } + + inode = d_inode(p.dentry); + if (!inode) { + path_put(&p); + SUSFS_LOGE("inode is NULL\n"); + return 1; + } + + spin_lock(&inode->i_lock); + inode->i_state |= INODE_STATE_SUS_KSTAT; + spin_unlock(&inode->i_lock); + + path_put(&p); + return 0; +} + +int susfs_add_sus_kstat(struct st_susfs_sus_kstat* __user user_info) { + struct st_susfs_sus_kstat info; + struct st_susfs_sus_kstat_hlist *new_entry, *tmp_entry; + struct hlist_node *tmp_node; + int bkt; + bool update_hlist = false; + + if (copy_from_user(&info, user_info, sizeof(info))) { + SUSFS_LOGE("failed copying from userspace\n"); + return 1; + } + + if (strlen(info.target_pathname) == 0) { + SUSFS_LOGE("target_pathname is an empty string\n"); + return 1; + } + + spin_lock(&susfs_spin_lock); + hash_for_each_safe(SUS_KSTAT_HLIST, bkt, tmp_node, tmp_entry, node) { + if (!strcmp(tmp_entry->info.target_pathname, info.target_pathname)) { + hash_del(&tmp_entry->node); + kfree(tmp_entry); + update_hlist = true; + break; + } + } + spin_unlock(&susfs_spin_lock); + + new_entry = kmalloc(sizeof(struct st_susfs_sus_kstat_hlist), GFP_KERNEL); + if (!new_entry) { + SUSFS_LOGE("no enough memory\n"); + return 1; + } + +#if 
defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_COMPAT_STAT64) +#ifdef CONFIG_MIPS + info.spoofed_dev = new_decode_dev(info.spoofed_dev); +#else + info.spoofed_dev = huge_decode_dev(info.spoofed_dev); +#endif /* CONFIG_MIPS */ +#else + info.spoofed_dev = old_decode_dev(info.spoofed_dev); +#endif /* defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_COMPAT_STAT64) */ + + new_entry->target_ino = info.target_ino; + memcpy(&new_entry->info, &info, sizeof(info)); + // only if the target is added statically needs to have flag INODE_STATE_SUS_KSTAT set here, + // otherwise the flag INODE_STATE_SUS_KSTAT should be set in function susfs_update_sus_kstat() + if (new_entry->info.is_statically && susfs_update_sus_kstat_inode(new_entry->info.target_pathname)) { + kfree(new_entry); + return 1; + } + + spin_lock(&susfs_spin_lock); + hash_add(SUS_KSTAT_HLIST, &new_entry->node, info.target_ino); + if (update_hlist) { + SUSFS_LOGI("is_statically: '%d', target_ino: '%lu', target_pathname: '%s', spoofed_ino: '%lu', spoofed_dev: '%lu', spoofed_nlink: '%u', spoofed_size: '%u', spoofed_atime_tv_sec: '%ld', spoofed_mtime_tv_sec: '%ld', spoofed_ctime_tv_sec: '%ld', spoofed_atime_tv_nsec: '%ld', spoofed_mtime_tv_nsec: '%ld', spoofed_ctime_tv_nsec: '%ld', spoofed_blksize: '%lu', spoofed_blocks: '%llu', is successfully added to SUS_KSTAT_HLIST\n", + new_entry->info.is_statically, new_entry->info.target_ino, new_entry->info.target_pathname, + new_entry->info.spoofed_ino, new_entry->info.spoofed_dev, + new_entry->info.spoofed_nlink, new_entry->info.spoofed_size, + new_entry->info.spoofed_atime_tv_sec, new_entry->info.spoofed_mtime_tv_sec, new_entry->info.spoofed_ctime_tv_sec, + new_entry->info.spoofed_atime_tv_nsec, new_entry->info.spoofed_mtime_tv_nsec, new_entry->info.spoofed_ctime_tv_nsec, + new_entry->info.spoofed_blksize, new_entry->info.spoofed_blocks); + } else { + SUSFS_LOGI("is_statically: '%d', target_ino: '%lu', target_pathname: '%s', spoofed_ino: '%lu', spoofed_dev: '%lu', 
spoofed_nlink: '%u', spoofed_size: '%u', spoofed_atime_tv_sec: '%ld', spoofed_mtime_tv_sec: '%ld', spoofed_ctime_tv_sec: '%ld', spoofed_atime_tv_nsec: '%ld', spoofed_mtime_tv_nsec: '%ld', spoofed_ctime_tv_nsec: '%ld', spoofed_blksize: '%lu', spoofed_blocks: '%llu', is successfully updated to SUS_KSTAT_HLIST\n", + new_entry->info.is_statically, new_entry->info.target_ino, new_entry->info.target_pathname, + new_entry->info.spoofed_ino, new_entry->info.spoofed_dev, + new_entry->info.spoofed_nlink, new_entry->info.spoofed_size, + new_entry->info.spoofed_atime_tv_sec, new_entry->info.spoofed_mtime_tv_sec, new_entry->info.spoofed_ctime_tv_sec, + new_entry->info.spoofed_atime_tv_nsec, new_entry->info.spoofed_mtime_tv_nsec, new_entry->info.spoofed_ctime_tv_nsec, + new_entry->info.spoofed_blksize, new_entry->info.spoofed_blocks); + } + spin_unlock(&susfs_spin_lock); + return 0; +} + +int susfs_update_sus_kstat(struct st_susfs_sus_kstat* __user user_info) { + struct st_susfs_sus_kstat info; + struct st_susfs_sus_kstat_hlist *new_entry, *tmp_entry; + struct hlist_node *tmp_node; + int bkt; + int err = 0; + + if (copy_from_user(&info, user_info, sizeof(info))) { + SUSFS_LOGE("failed copying from userspace\n"); + return 1; + } + + spin_lock(&susfs_spin_lock); + hash_for_each_safe(SUS_KSTAT_HLIST, bkt, tmp_node, tmp_entry, node) { + if (!strcmp(tmp_entry->info.target_pathname, info.target_pathname)) { + if (susfs_update_sus_kstat_inode(tmp_entry->info.target_pathname)) { + err = 1; + goto out_spin_unlock; + } + new_entry = kmalloc(sizeof(struct st_susfs_sus_kstat_hlist), GFP_KERNEL); + if (!new_entry) { + SUSFS_LOGE("no enough memory\n"); + err = 1; + goto out_spin_unlock; + } + memcpy(&new_entry->info, &tmp_entry->info, sizeof(tmp_entry->info)); + SUSFS_LOGI("updating target_ino from '%lu' to '%lu' for pathname: '%s' in SUS_KSTAT_HLIST\n", + new_entry->info.target_ino, info.target_ino, info.target_pathname); + new_entry->target_ino = info.target_ino; + 
new_entry->info.target_ino = info.target_ino; + if (info.spoofed_size > 0) { + SUSFS_LOGI("updating spoofed_size from '%lld' to '%lld' for pathname: '%s' in SUS_KSTAT_HLIST\n", + new_entry->info.spoofed_size, info.spoofed_size, info.target_pathname); + new_entry->info.spoofed_size = info.spoofed_size; + } + if (info.spoofed_blocks > 0) { + SUSFS_LOGI("updating spoofed_blocks from '%llu' to '%llu' for pathname: '%s' in SUS_KSTAT_HLIST\n", + new_entry->info.spoofed_blocks, info.spoofed_blocks, info.target_pathname); + new_entry->info.spoofed_blocks = info.spoofed_blocks; + } + hash_del(&tmp_entry->node); + kfree(tmp_entry); + hash_add(SUS_KSTAT_HLIST, &new_entry->node, info.target_ino); + goto out_spin_unlock; + } + } +out_spin_unlock: + spin_unlock(&susfs_spin_lock); + return err; +} + +void susfs_sus_ino_for_generic_fillattr(unsigned long ino, struct kstat *stat) { + struct st_susfs_sus_kstat_hlist *entry; + + hash_for_each_possible(SUS_KSTAT_HLIST, entry, node, ino) { + if (entry->target_ino == ino) { + stat->dev = entry->info.spoofed_dev; + stat->ino = entry->info.spoofed_ino; + stat->nlink = entry->info.spoofed_nlink; + stat->size = entry->info.spoofed_size; + stat->atime.tv_sec = entry->info.spoofed_atime_tv_sec; + stat->atime.tv_nsec = entry->info.spoofed_atime_tv_nsec; + stat->mtime.tv_sec = entry->info.spoofed_mtime_tv_sec; + stat->mtime.tv_nsec = entry->info.spoofed_mtime_tv_nsec; + stat->ctime.tv_sec = entry->info.spoofed_ctime_tv_sec; + stat->ctime.tv_nsec = entry->info.spoofed_ctime_tv_nsec; + stat->blocks = entry->info.spoofed_blocks; + stat->blksize = entry->info.spoofed_blksize; + return; + } + } +} + +void susfs_sus_ino_for_show_map_vma(unsigned long ino, dev_t *out_dev, unsigned long *out_ino) { + struct st_susfs_sus_kstat_hlist *entry; + + hash_for_each_possible(SUS_KSTAT_HLIST, entry, node, ino) { + if (entry->target_ino == ino) { + *out_dev = entry->info.spoofed_dev; + *out_ino = entry->info.spoofed_ino; + return; + } + } +} +#endif // #ifdef 
CONFIG_KSU_SUSFS_SUS_KSTAT + +/* try_umount */ +#ifdef CONFIG_KSU_SUSFS_TRY_UMOUNT +static LIST_HEAD(LH_TRY_UMOUNT_PATH); +int susfs_add_try_umount(struct st_susfs_try_umount* __user user_info) { + struct st_susfs_try_umount_list *cursor = NULL, *temp = NULL; + struct st_susfs_try_umount_list *new_list = NULL; + struct st_susfs_try_umount info; + + if (copy_from_user(&info, user_info, sizeof(info))) { + SUSFS_LOGE("failed copying from userspace\n"); + return 1; + } + + list_for_each_entry_safe(cursor, temp, &LH_TRY_UMOUNT_PATH, list) { + if (unlikely(!strcmp(info.target_pathname, cursor->info.target_pathname))) { + SUSFS_LOGE("target_pathname: '%s' is already created in LH_TRY_UMOUNT_PATH\n", info.target_pathname); + return 1; + } + } + + new_list = kmalloc(sizeof(struct st_susfs_try_umount_list), GFP_KERNEL); + if (!new_list) { + SUSFS_LOGE("no enough memory\n"); + return 1; + } + + memcpy(&new_list->info, &info, sizeof(info)); + + INIT_LIST_HEAD(&new_list->list); + spin_lock(&susfs_spin_lock); + list_add_tail(&new_list->list, &LH_TRY_UMOUNT_PATH); + spin_unlock(&susfs_spin_lock); + SUSFS_LOGI("target_pathname: '%s', mnt_mode: %d, is successfully added to LH_TRY_UMOUNT_PATH\n", new_list->info.target_pathname, new_list->info.mnt_mode); + return 0; +} + +void susfs_try_umount(uid_t target_uid) { + struct st_susfs_try_umount_list *cursor = NULL, *temp = NULL; + + list_for_each_entry_safe(cursor, temp, &LH_TRY_UMOUNT_PATH, list) { + SUSFS_LOGI("umounting '%s' for uid: %d\n", cursor->info.target_pathname, target_uid); + if (cursor->info.mnt_mode == TRY_UMOUNT_DEFAULT) { + ksu_try_umount(cursor->info.target_pathname, false, 0); + } else if (cursor->info.mnt_mode == TRY_UMOUNT_DETACH) { + ksu_try_umount(cursor->info.target_pathname, false, MNT_DETACH); + } else { + SUSFS_LOGE("failed umounting '%s' for uid: %d, mnt_mode '%d' not supported\n", + cursor->info.target_pathname, target_uid, cursor->info.mnt_mode); + } + } +} + +#ifdef 
CONFIG_KSU_SUSFS_AUTO_ADD_TRY_UMOUNT_FOR_BIND_MOUNT +void susfs_auto_add_try_umount_for_bind_mount(struct path *path) { + struct st_susfs_try_umount_list *cursor = NULL, *temp = NULL; + struct st_susfs_try_umount_list *new_list = NULL; + char *pathname = NULL, *dpath = NULL; + + pathname = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!pathname) { + SUSFS_LOGE("no enough memory\n"); + return; + } + + dpath = d_path(path, pathname, PAGE_SIZE); + if (!dpath) { + SUSFS_LOGE("dpath is NULL\n"); + goto out_free_pathname; + } + + list_for_each_entry_safe(cursor, temp, &LH_TRY_UMOUNT_PATH, list) { + if (unlikely(!strcmp(dpath, cursor->info.target_pathname))) { + SUSFS_LOGE("target_pathname: '%s' is already created in LH_TRY_UMOUNT_PATH\n", dpath); + goto out_free_pathname; + } + } + + new_list = kmalloc(sizeof(struct st_susfs_try_umount_list), GFP_KERNEL); + if (!new_list) { + SUSFS_LOGE("no enough memory\n"); + goto out_free_pathname; + } + + strncpy(new_list->info.target_pathname, dpath, SUSFS_MAX_LEN_PATHNAME-1); + new_list->info.mnt_mode = TRY_UMOUNT_DETACH; + + INIT_LIST_HEAD(&new_list->list); + spin_lock(&susfs_spin_lock); + list_add_tail(&new_list->list, &LH_TRY_UMOUNT_PATH); + spin_unlock(&susfs_spin_lock); + SUSFS_LOGI("target_pathname: '%s', mnt_mode: %d, is successfully added to LH_TRY_UMOUNT_PATH\n", new_list->info.target_pathname, new_list->info.mnt_mode); +out_free_pathname: + kfree(pathname); +} +#endif // #ifdef CONFIG_KSU_SUSFS_AUTO_ADD_TRY_UMOUNT_FOR_BIND_MOUNT +#endif // #ifdef CONFIG_KSU_SUSFS_TRY_UMOUNT + +/* spoof_uname */ +#ifdef CONFIG_KSU_SUSFS_SPOOF_UNAME +static spinlock_t susfs_uname_spin_lock; +static struct st_susfs_uname my_uname; +static void susfs_my_uname_init(void) { + memset(&my_uname, 0, sizeof(my_uname)); +} + +int susfs_set_uname(struct st_susfs_uname* __user user_info) { + struct st_susfs_uname info; + + if (copy_from_user(&info, user_info, sizeof(struct st_susfs_uname))) { + SUSFS_LOGE("failed copying from userspace.\n"); + return 1; + } + 
+ spin_lock(&susfs_uname_spin_lock); + if (!strcmp(info.release, "default")) { + strncpy(my_uname.release, utsname()->release, __NEW_UTS_LEN); + } else { + strncpy(my_uname.release, info.release, __NEW_UTS_LEN); + } + if (!strcmp(info.version, "default")) { + strncpy(my_uname.version, utsname()->version, __NEW_UTS_LEN); + } else { + strncpy(my_uname.version, info.version, __NEW_UTS_LEN); + } + spin_unlock(&susfs_uname_spin_lock); + SUSFS_LOGI("setting spoofed release: '%s', version: '%s'\n", + my_uname.release, my_uname.version); + return 0; +} + +int susfs_spoof_uname(struct new_utsname* tmp) { + if (unlikely(my_uname.release[0] == '\0' || spin_is_locked(&susfs_uname_spin_lock))) + return 1; + strncpy(tmp->sysname, utsname()->sysname, __NEW_UTS_LEN); + strncpy(tmp->nodename, utsname()->nodename, __NEW_UTS_LEN); + strncpy(tmp->release, my_uname.release, __NEW_UTS_LEN); + strncpy(tmp->version, my_uname.version, __NEW_UTS_LEN); + strncpy(tmp->machine, utsname()->machine, __NEW_UTS_LEN); + strncpy(tmp->domainname, utsname()->domainname, __NEW_UTS_LEN); + return 0; +} +#endif // #ifdef CONFIG_KSU_SUSFS_SPOOF_UNAME + +/* set_log */ +#ifdef CONFIG_KSU_SUSFS_ENABLE_LOG +void susfs_set_log(bool enabled) { + spin_lock(&susfs_spin_lock); + susfs_is_log_enabled = enabled; + spin_unlock(&susfs_spin_lock); + if (susfs_is_log_enabled) { + pr_info("susfs: enable logging to kernel"); + } else { + pr_info("susfs: disable logging to kernel"); + } +} +#endif // #ifdef CONFIG_KSU_SUSFS_ENABLE_LOG + +/* spoof_bootconfig */ +#ifdef CONFIG_KSU_SUSFS_SPOOF_BOOTCONFIG +static char *fake_boot_config = NULL; +int susfs_set_bootconfig(char* __user user_fake_boot_config) { + int res; + + if (!fake_boot_config) { + // 4096 is enough I guess + fake_boot_config = kmalloc(SUSFS_FAKE_BOOT_CONFIG_SIZE, GFP_KERNEL); + if (!fake_boot_config) { + SUSFS_LOGE("no enough memory\n"); + return -ENOMEM; + } + } + + spin_lock(&susfs_spin_lock); + memset(fake_boot_config, 0, SUSFS_FAKE_BOOT_CONFIG_SIZE); + res 
= strncpy_from_user(fake_boot_config, user_fake_boot_config, SUSFS_FAKE_BOOT_CONFIG_SIZE-1); + spin_unlock(&susfs_spin_lock); + + if (res > 0) { + SUSFS_LOGI("fake_boot_config is set, length of string: %u\n", strlen(fake_boot_config)); + return 0; + } + SUSFS_LOGI("failed setting fake_boot_config\n"); + return res; +} + +int susfs_spoof_bootconfig(struct seq_file *m) { + if (fake_boot_config != NULL) { + seq_puts(m, fake_boot_config); + return 0; + } + return 1; +} +#endif + +/* open_redirect */ +#ifdef CONFIG_KSU_SUSFS_OPEN_REDIRECT +static DEFINE_HASHTABLE(OPEN_REDIRECT_HLIST, 10); +static int susfs_update_open_redirect_inode(struct st_susfs_open_redirect_hlist *new_entry) { + struct path path_target; + struct inode *inode_target; + int err = 0; + + err = kern_path(new_entry->target_pathname, LOOKUP_FOLLOW, &path_target); + if (err) { + SUSFS_LOGE("Failed opening file '%s'\n", new_entry->target_pathname); + return err; + } + + inode_target = d_inode(path_target.dentry); + if (!inode_target) { + SUSFS_LOGE("inode_target is NULL\n"); + err = 1; + goto out_path_put_target; + } + + spin_lock(&inode_target->i_lock); + inode_target->i_state |= INODE_STATE_OPEN_REDIRECT; + spin_unlock(&inode_target->i_lock); + +out_path_put_target: + path_put(&path_target); + return err; +} + +int susfs_add_open_redirect(struct st_susfs_open_redirect* __user user_info) { + struct st_susfs_open_redirect info; + struct st_susfs_open_redirect_hlist *new_entry, *tmp_entry; + struct hlist_node *tmp_node; + int bkt; + bool update_hlist = false; + + if (copy_from_user(&info, user_info, sizeof(info))) { + SUSFS_LOGE("failed copying from userspace\n"); + return 1; + } + + spin_lock(&susfs_spin_lock); + hash_for_each_safe(OPEN_REDIRECT_HLIST, bkt, tmp_node, tmp_entry, node) { + if (!strcmp(tmp_entry->target_pathname, info.target_pathname)) { + hash_del(&tmp_entry->node); + kfree(tmp_entry); + update_hlist = true; + break; + } + } + spin_unlock(&susfs_spin_lock); + + new_entry = 
kmalloc(sizeof(struct st_susfs_open_redirect_hlist), GFP_KERNEL); + if (!new_entry) { + SUSFS_LOGE("no enough memory\n"); + return 1; + } + + new_entry->target_ino = info.target_ino; + strncpy(new_entry->target_pathname, info.target_pathname, SUSFS_MAX_LEN_PATHNAME-1); + strncpy(new_entry->redirected_pathname, info.redirected_pathname, SUSFS_MAX_LEN_PATHNAME-1); + if (susfs_update_open_redirect_inode(new_entry)) { + SUSFS_LOGE("failed adding path '%s' to OPEN_REDIRECT_HLIST\n", new_entry->target_pathname); + kfree(new_entry); + return 1; + } + + spin_lock(&susfs_spin_lock); + hash_add(OPEN_REDIRECT_HLIST, &new_entry->node, info.target_ino); + if (update_hlist) { + SUSFS_LOGI("target_ino: '%lu', target_pathname: '%s', redirected_pathname: '%s', is successfully updated to OPEN_REDIRECT_HLIST\n", + new_entry->target_ino, new_entry->target_pathname, new_entry->redirected_pathname); + } else { + SUSFS_LOGI("target_ino: '%lu', target_pathname: '%s' redirected_pathname: '%s', is successfully added to OPEN_REDIRECT_HLIST\n", + new_entry->target_ino, new_entry->target_pathname, new_entry->redirected_pathname); + } + spin_unlock(&susfs_spin_lock); + return 0; +} + +struct filename* susfs_get_redirected_path(unsigned long ino) { + struct st_susfs_open_redirect_hlist *entry; + + hash_for_each_possible(OPEN_REDIRECT_HLIST, entry, node, ino) { + if (entry->target_ino == ino) { + SUSFS_LOGI("Redirect for ino: %lu\n", ino); + return getname_kernel(entry->redirected_pathname); + } + } + return ERR_PTR(-ENOENT); +} +#endif // #ifdef CONFIG_KSU_SUSFS_OPEN_REDIRECT + +/* sus_su */ +#ifdef CONFIG_KSU_SUSFS_SUS_SU +bool susfs_is_sus_su_hooks_enabled __read_mostly = false; +static int susfs_sus_su_working_mode = 0; +extern void ksu_susfs_enable_sus_su(void); +extern void ksu_susfs_disable_sus_su(void); + +int susfs_get_sus_su_working_mode(void) { + return susfs_sus_su_working_mode; +} + +int susfs_sus_su(struct st_sus_su* __user user_info) { + struct st_sus_su info; + int 
last_working_mode = susfs_sus_su_working_mode; + + if (copy_from_user(&info, user_info, sizeof(struct st_sus_su))) { + SUSFS_LOGE("failed copying from userspace\n"); + return 1; + } + + if (info.mode == SUS_SU_WITH_HOOKS) { + if (last_working_mode == SUS_SU_WITH_HOOKS) { + SUSFS_LOGE("current sus_su mode is already %d\n", SUS_SU_WITH_HOOKS); + return 1; + } + if (last_working_mode != SUS_SU_DISABLED) { + SUSFS_LOGE("please make sure the current sus_su mode is %d first\n", SUS_SU_DISABLED); + return 2; + } + ksu_susfs_enable_sus_su(); + susfs_sus_su_working_mode = SUS_SU_WITH_HOOKS; + susfs_is_sus_su_hooks_enabled = true; + SUSFS_LOGI("core kprobe hooks for ksu are disabled!\n"); + SUSFS_LOGI("non-kprobe hook sus_su is enabled!\n"); + SUSFS_LOGI("sus_su mode: %d\n", SUS_SU_WITH_HOOKS); + return 0; + } else if (info.mode == SUS_SU_DISABLED) { + if (last_working_mode == SUS_SU_DISABLED) { + SUSFS_LOGE("current sus_su mode is already %d\n", SUS_SU_DISABLED); + return 1; + } + susfs_is_sus_su_hooks_enabled = false; + ksu_susfs_disable_sus_su(); + susfs_sus_su_working_mode = SUS_SU_DISABLED; + if (last_working_mode == SUS_SU_WITH_HOOKS) { + SUSFS_LOGI("core kprobe hooks for ksu are enabled!\n"); + goto out; + } +out: + if (copy_to_user(user_info, &info, sizeof(info))) + SUSFS_LOGE("copy_to_user() failed\n"); + return 0; + } else if (info.mode == SUS_SU_WITH_OVERLAY) { + SUSFS_LOGE("sus_su mode %d is deprecated\n", SUS_SU_WITH_OVERLAY); + return 1; + } + return 1; +} +#endif // #ifdef CONFIG_KSU_SUSFS_SUS_SU + +/* susfs_init */ +void susfs_init(void) { + spin_lock_init(&susfs_spin_lock); +#ifdef CONFIG_KSU_SUSFS_SPOOF_UNAME + spin_lock_init(&susfs_uname_spin_lock); + susfs_my_uname_init(); +#endif + SUSFS_LOGI("susfs is initialized! 
version: " SUSFS_VERSION " \n"); +} + +/* No module exit is needed because it should never be a loadable kernel module */ +//void __init susfs_exit(void) + diff --git a/include/linux/sus_su.h b/include/linux/sus_su.h new file mode 100644 index 000000000000..98e8f3b357ac --- /dev/null +++ b/include/linux/sus_su.h @@ -0,0 +1,9 @@ +#ifndef __KSU_H_SUS_SU +#define __KSU_H_SUS_SU + +#include "../../drivers/kernelsu/core_hook.h" + +int sus_su_fifo_init(int *maj_dev_num, char *drv_path); +int sus_su_fifo_exit(int *maj_dev_num, char *drv_path); + +#endif diff --git a/include/linux/susfs.h b/include/linux/susfs.h new file mode 100644 index 000000000000..a7a5b555b7a3 --- /dev/null +++ b/include/linux/susfs.h @@ -0,0 +1,231 @@ +#ifndef KSU_SUSFS_H +#define KSU_SUSFS_H + +#include <linux/version.h> +#include <linux/types.h> +#include <linux/utsname.h> +#include <linux/hashtable.h> +#include <linux/path.h> + +#define SUSFS_VERSION "v1.5.3" +#if LINUX_VERSION_CODE < KERNEL_VERSION(5,0,0) +#define SUSFS_VARIANT "NON-GKI" +#else +#define SUSFS_VARIANT "GKI" +#endif + +/********/ +/* ENUM */ +/********/ +/* shared with userspace ksu_susfs tool */ +#define CMD_SUSFS_ADD_SUS_PATH 0x55550 +#define CMD_SUSFS_ADD_SUS_MOUNT 0x55560 +#define CMD_SUSFS_ADD_SUS_KSTAT 0x55570 +#define CMD_SUSFS_UPDATE_SUS_KSTAT 0x55571 +#define CMD_SUSFS_ADD_SUS_KSTAT_STATICALLY 0x55572 +#define CMD_SUSFS_ADD_TRY_UMOUNT 0x55580 +#define CMD_SUSFS_SET_UNAME 0x55590 +#define CMD_SUSFS_ENABLE_LOG 0x555a0 +#define CMD_SUSFS_SET_BOOTCONFIG 0x555b0 +#define CMD_SUSFS_ADD_OPEN_REDIRECT 0x555c0 +#define CMD_SUSFS_RUN_UMOUNT_FOR_CURRENT_MNT_NS 0x555d0 +#define CMD_SUSFS_SHOW_VERSION 0x555e1 +#define CMD_SUSFS_SHOW_ENABLED_FEATURES 0x555e2 +#define CMD_SUSFS_SHOW_VARIANT 0x555e3 +#define CMD_SUSFS_SHOW_SUS_SU_WORKING_MODE 0x555e4 +#define CMD_SUSFS_IS_SUS_SU_READY 0x555f0 +#define CMD_SUSFS_SUS_SU 0x60000 + +#define SUSFS_MAX_LEN_PATHNAME 256 // 256 should address many paths already unless you are doing some strange experimental stuff, then set your own desired length +#define 
SUSFS_FAKE_BOOT_CONFIG_SIZE 4096 + +#define TRY_UMOUNT_DEFAULT 0 +#define TRY_UMOUNT_DETACH 1 + +#define SUS_SU_DISABLED 0 +#define SUS_SU_WITH_OVERLAY 1 /* deprecated */ +#define SUS_SU_WITH_HOOKS 2 + +/* + * inode->i_state => storing flag 'INODE_STATE_' + * mount->mnt.android_kabi_reserved4 => storing original mnt_id + * task_struct->android_kabi_reserved8 => storing last valid fake mnt_id + * user_struct->android_kabi_reserved2 => storing flag 'USER_STRUCT_KABI2_' + */ + +#define INODE_STATE_SUS_PATH 16777216 // 1 << 24 +#define INODE_STATE_SUS_MOUNT 33554432 // 1 << 25 +#define INODE_STATE_SUS_KSTAT 67108864 // 1 << 26 +#define INODE_STATE_OPEN_REDIRECT 134217728 // 1 << 27 + +#define USER_STRUCT_KABI2_NON_ROOT_USER_APP_PROFILE 16777216 // 1 << 24, for distinguishing root/no-root granted user app process + +/*********/ +/* MACRO */ +/*********/ +#define getname_safe(name) (name == NULL ? ERR_PTR(-EINVAL) : getname(name)) +#define putname_safe(name) (IS_ERR(name) ? NULL : putname(name)) + +/**********/ +/* STRUCT */ +/**********/ +/* sus_path */ +#ifdef CONFIG_KSU_SUSFS_SUS_PATH +struct st_susfs_sus_path { + unsigned long target_ino; + char target_pathname[SUSFS_MAX_LEN_PATHNAME]; +}; + +struct st_susfs_sus_path_hlist { + unsigned long target_ino; + char target_pathname[SUSFS_MAX_LEN_PATHNAME]; + struct hlist_node node; +}; +#endif + +/* sus_mount */ +#ifdef CONFIG_KSU_SUSFS_SUS_MOUNT +struct st_susfs_sus_mount { + char target_pathname[SUSFS_MAX_LEN_PATHNAME]; + unsigned long target_dev; +}; + +struct st_susfs_sus_mount_list { + struct list_head list; + struct st_susfs_sus_mount info; +}; +#endif + +/* sus_kstat */ +#ifdef CONFIG_KSU_SUSFS_SUS_KSTAT +struct st_susfs_sus_kstat { + int is_statically; + unsigned long target_ino; // the ino after bind mounted or overlayed + char target_pathname[SUSFS_MAX_LEN_PATHNAME]; + unsigned long spoofed_ino; + unsigned long spoofed_dev; + unsigned int spoofed_nlink; + long long spoofed_size; + long spoofed_atime_tv_sec; + long 
spoofed_mtime_tv_sec; + long spoofed_ctime_tv_sec; + long spoofed_atime_tv_nsec; + long spoofed_mtime_tv_nsec; + long spoofed_ctime_tv_nsec; + unsigned long spoofed_blksize; + unsigned long long spoofed_blocks; +}; + +struct st_susfs_sus_kstat_hlist { + unsigned long target_ino; + struct st_susfs_sus_kstat info; + struct hlist_node node; +}; +#endif + +/* try_umount */ +#ifdef CONFIG_KSU_SUSFS_TRY_UMOUNT +struct st_susfs_try_umount { + char target_pathname[SUSFS_MAX_LEN_PATHNAME]; + int mnt_mode; +}; + +struct st_susfs_try_umount_list { + struct list_head list; + struct st_susfs_try_umount info; +}; +#endif + +/* spoof_uname */ +#ifdef CONFIG_KSU_SUSFS_SPOOF_UNAME +struct st_susfs_uname { + char release[__NEW_UTS_LEN+1]; + char version[__NEW_UTS_LEN+1]; +}; +#endif + +/* open_redirect */ +#ifdef CONFIG_KSU_SUSFS_OPEN_REDIRECT +struct st_susfs_open_redirect { + unsigned long target_ino; + char target_pathname[SUSFS_MAX_LEN_PATHNAME]; + char redirected_pathname[SUSFS_MAX_LEN_PATHNAME]; +}; + +struct st_susfs_open_redirect_hlist { + unsigned long target_ino; + char target_pathname[SUSFS_MAX_LEN_PATHNAME]; + char redirected_pathname[SUSFS_MAX_LEN_PATHNAME]; + struct hlist_node node; +}; +#endif + +/* sus_su */ +#ifdef CONFIG_KSU_SUSFS_SUS_SU +struct st_sus_su { + int mode; +}; +#endif + +/***********************/ +/* FORWARD DECLARATION */ +/***********************/ +/* sus_path */ +#ifdef CONFIG_KSU_SUSFS_SUS_PATH +int susfs_add_sus_path(struct st_susfs_sus_path* __user user_info); +int susfs_sus_ino_for_filldir64(unsigned long ino); +#endif +/* sus_mount */ +#ifdef CONFIG_KSU_SUSFS_SUS_MOUNT +int susfs_add_sus_mount(struct st_susfs_sus_mount* __user user_info); +#ifdef CONFIG_KSU_SUSFS_AUTO_ADD_SUS_BIND_MOUNT +int susfs_auto_add_sus_bind_mount(const char *pathname, struct path *path_target); +#endif // #ifdef CONFIG_KSU_SUSFS_AUTO_ADD_SUS_BIND_MOUNT +#ifdef CONFIG_KSU_SUSFS_AUTO_ADD_SUS_KSU_DEFAULT_MOUNT +void susfs_auto_add_sus_ksu_default_mount(const char __user 
*to_pathname); +#endif // #ifdef CONFIG_KSU_SUSFS_AUTO_ADD_SUS_KSU_DEFAULT_MOUNT +#endif // #ifdef CONFIG_KSU_SUSFS_SUS_MOUNT + +/* sus_kstat */ +#ifdef CONFIG_KSU_SUSFS_SUS_KSTAT +int susfs_add_sus_kstat(struct st_susfs_sus_kstat* __user user_info); +int susfs_update_sus_kstat(struct st_susfs_sus_kstat* __user user_info); +void susfs_sus_ino_for_generic_fillattr(unsigned long ino, struct kstat *stat); +void susfs_sus_ino_for_show_map_vma(unsigned long ino, dev_t *out_dev, unsigned long *out_ino); +#endif +/* try_umount */ +#ifdef CONFIG_KSU_SUSFS_TRY_UMOUNT +int susfs_add_try_umount(struct st_susfs_try_umount* __user user_info); +void susfs_try_umount(uid_t target_uid); +#ifdef CONFIG_KSU_SUSFS_AUTO_ADD_TRY_UMOUNT_FOR_BIND_MOUNT +void susfs_auto_add_try_umount_for_bind_mount(struct path *path); +#endif // #ifdef CONFIG_KSU_SUSFS_AUTO_ADD_TRY_UMOUNT_FOR_BIND_MOUNT +#endif // #ifdef CONFIG_KSU_SUSFS_TRY_UMOUNT +/* spoof_uname */ +#ifdef CONFIG_KSU_SUSFS_SPOOF_UNAME +int susfs_set_uname(struct st_susfs_uname* __user user_info); +int susfs_spoof_uname(struct new_utsname* tmp); +#endif +/* set_log */ +#ifdef CONFIG_KSU_SUSFS_ENABLE_LOG +void susfs_set_log(bool enabled); +#endif +/* spoof_bootconfig */ +#ifdef CONFIG_KSU_SUSFS_SPOOF_BOOTCONFIG +int susfs_set_bootconfig(char* __user user_fake_boot_config); +int susfs_spoof_bootconfig(struct seq_file *m); +#endif +/* open_redirect */ +#ifdef CONFIG_KSU_SUSFS_OPEN_REDIRECT +int susfs_add_open_redirect(struct st_susfs_open_redirect* __user user_info); +struct filename* susfs_get_redirected_path(unsigned long ino); +#endif +/* sus_su */ +#ifdef CONFIG_KSU_SUSFS_SUS_SU +int susfs_get_sus_su_working_mode(void); +int susfs_sus_su(struct st_sus_su* __user user_info); +#endif +/* susfs_init */ +void susfs_init(void); + +#endif diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 96505113b907..de70a3b01324 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -692,8 +692,18 @@ static int s_show(struct seq_file *m, void *p) 
seq_printf(m, "%px %c %s\t[%s]\n", value, type, iter->name, iter->module_name); } else +#ifndef CONFIG_KSU_SUSFS_HIDE_KSU_SUSFS_SYMBOLS seq_printf(m, "%px %c %s\n", value, iter->type, iter->name); +#else + { + if (strstr(iter->name, "ksu_") || !strncmp(iter->name, "susfs_", 6) || !strncmp(iter->name, "ksud", 4)) { + return 0; + } + seq_printf(m, "%px %c %s\n", value, + iter->type, iter->name); + } +#endif return 0; } diff --git a/kernel/sys.c b/kernel/sys.c index 1de01fab5788..c0db32e08399 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1271,12 +1271,23 @@ static int override_release(char __user *release, size_t len) return ret; } +#ifdef CONFIG_KSU_SUSFS_SPOOF_UNAME +extern int susfs_spoof_uname(struct new_utsname* tmp); +#endif + SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name) { struct new_utsname tmp; down_read(&uts_sem); +#ifdef CONFIG_KSU_SUSFS_SPOOF_UNAME + if (likely(!susfs_spoof_uname(&tmp))) + goto bypass_orig_flow; +#endif memcpy(&tmp, utsname(), sizeof(tmp)); +#ifdef CONFIG_KSU_SUSFS_SPOOF_UNAME +bypass_orig_flow: +#endif up_read(&uts_sem); if (copy_to_user(name, &tmp, sizeof(tmp))) return -EFAULT;