Revision 472e5b056f000a778abb41f1e443de58eb259783 authored by Linus Torvalds on 02 October 2020, 02:14:36 UTC, committed by Linus Torvalds on 02 October 2020, 02:14:36 UTC
The pipe splice code still used the old model of waiting for pipe IO by
using a non-specific "pipe_wait()" that waited for any pipe event to
happen, which depended on all pipe IO being entirely serialized by the
pipe lock.  So by checking the state you were waiting for, and then
adding yourself to the wait queue before dropping the lock, you were
guaranteed to see all the wakeups.

Strictly speaking, the actual wakeups were not done under the lock, but
the pipe_wait() model still worked, because since the waiter held the
lock when checking whether it should sleep, it would always see the
current state, and the wakeup was always done after updating the state.
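
For illustration, that old pattern looked roughly like this (a simplified
sketch using the single pre-split wait queue, here called pipe->wait;
pipe_has_what_we_want() is a stand-in for whatever condition the caller
is waiting for, not a real kernel helper):

	pipe_lock(pipe);
	while (!pipe_has_what_we_want(pipe)) {	/* checked under the lock */
		DEFINE_WAIT(wait);

		/* Get on the wait queue before dropping the lock ... */
		prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE);
		pipe_unlock(pipe);
		/*
		 * ... so any waker must have taken the lock to change the
		 * pipe state, and by the time it issues its wakeup (even if
		 * that happens after it dropped the lock) we are already
		 * queued: the wakeup cannot be missed.
		 */
		schedule();
		finish_wait(&pipe->wait, &wait);
		pipe_lock(pipe);
	}

Because the condition is re-checked under the lock on every iteration, an
early or spurious wakeup is harmless.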

However, commit 0ddad21d3e99 ("pipe: use exclusive waits when reading or
writing") split the single wait-queue into two, and in the process also
made the "wait for event" code wait for _two_ wait queues, and that then
showed a race with the wakers that were not serialized by the pipe lock.

It's only splice that used that "pipe_wait()" model, so the problem
wasn't obvious, but Josef Bacik reports:

 "I hit a hang with fstest btrfs/187, which does a btrfs send into
  /dev/null. This works by creating a pipe, the write side is given to
  the kernel to write into, and the read side is handed to a thread that
  splices into a file, in this case /dev/null.

  The box that was hung had the write side stuck here [pipe_write] and
  the read side stuck here [splice_from_pipe_next -> pipe_wait].

  [ more details about pipe_wait() scenario ]

  The problem is we're doing the prepare_to_wait, which sets our state
  each time, however we can be woken up either with reads or writes. In
  the case above we race with the WRITER waking us up, and re-set our
  state to INTERRUPTIBLE, and thus never break out of schedule"
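
To make the race concrete, the two-queue pipe_wait() that splice ended up
using had roughly this shape (a simplified sketch, not the verbatim kernel
code), and the problem window sits around the two prepare_to_wait() calls:

	DEFINE_WAIT(rdwait);
	DEFINE_WAIT(wrwait);

	prepare_to_wait(&pipe->rd_wait, &rdwait, TASK_INTERRUPTIBLE);
	/*
	 * A wakeup on rd_wait can arrive right here - wakers are not
	 * serialized by the pipe lock - and set us back to TASK_RUNNING ...
	 */
	prepare_to_wait(&pipe->wr_wait, &wrwait, TASK_INTERRUPTIBLE);
	/*
	 * ... but this second prepare_to_wait() re-sets TASK_INTERRUPTIBLE,
	 * so that wakeup is effectively lost and the schedule() below can
	 * sleep forever even though the pipe state already changed.
	 */
	pipe_unlock(pipe);
	schedule();
	finish_wait(&pipe->rd_wait, &rdwait);
	finish_wait(&pipe->wr_wait, &wrwait);
	pipe_lock(pipe);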

Josef had a patch that avoided the issue in pipe_wait() by just making
it set the state only once, but the deeper problem is that pipe_wait()
depends on a level of synchronization by the pipe mutex that it really
shouldn't.  And the whole "wait for any pipe state change" model really
isn't very good to begin with.

So rather than trying to work around things in pipe_wait(), remove that
legacy model of "wait for arbitrary pipe event" entirely, and instead
create functions that wait for the pipe actually being readable or
writable, and can do so without depending on the pipe lock serializing
everything.
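
The replacement helpers key the sleep to one specific wait queue and to a
condition that can be re-checked without the pipe lock, so a racing wakeup
is never lost. A sketch of the shape such a helper can take (names and
details here are illustrative, not necessarily the exact code this commit
adds to fs/pipe.c):

	/*
	 * Can the pipe be read?  Evaluated without holding the pipe lock,
	 * hence the READ_ONCE() on every field we look at.
	 */
	static bool pipe_readable(const struct pipe_inode_info *pipe)
	{
		unsigned int head = READ_ONCE(pipe->head);
		unsigned int tail = READ_ONCE(pipe->tail);

		/* data available, or no writers left (EOF) */
		return !pipe_empty(head, tail) || !READ_ONCE(pipe->writers);
	}

	static void pipe_wait_readable(struct pipe_inode_info *pipe)
	{
		pipe_unlock(pipe);
		/*
		 * wait_event_interruptible() re-checks the condition after
		 * every wakeup, so it does not matter whether the waker
		 * holds the pipe lock or in what order the wakeups arrive.
		 */
		wait_event_interruptible(pipe->rd_wait, pipe_readable(pipe));
		pipe_lock(pipe);
	}

A matching "wait for writable" helper does the same thing on pipe->wr_wait
with a "pipe not full, or no readers left" predicate.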

Fixes: 0ddad21d3e99 ("pipe: use exclusive waits when reading or writing")
Link: https://lore.kernel.org/linux-fsdevel/bfa88b5ad6f069b2b679316b9e495a970130416c.1601567868.git.josef@toxicpanda.com/
Reported-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-and-tested-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
mount.h
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/mount.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/ns_common.h>
#include <linux/fs_pin.h>

struct mnt_namespace {
	atomic_t		count;
	struct ns_common	ns;
	struct mount *	root;
	/*
	 * Traversal and modification of .list is protected by either
	 * - taking namespace_sem for write, OR
	 * - taking namespace_sem for read AND taking .ns_lock.
	 */
	struct list_head	list;
	spinlock_t		ns_lock;
	struct user_namespace	*user_ns;
	struct ucounts		*ucounts;
	u64			seq;	/* Sequence number to prevent loops */
	wait_queue_head_t poll;
	u64 event;
	unsigned int		mounts; /* # of mounts in the namespace */
	unsigned int		pending_mounts;
} __randomize_layout;

struct mnt_pcp {
	int mnt_count;
	int mnt_writers;
};

struct mountpoint {
	struct hlist_node m_hash;
	struct dentry *m_dentry;
	struct hlist_head m_list;
	int m_count;
};

struct mount {
	struct hlist_node mnt_hash;
	struct mount *mnt_parent;
	struct dentry *mnt_mountpoint;
	struct vfsmount mnt;
	union {
		struct rcu_head mnt_rcu;
		struct llist_node mnt_llist;
	};
#ifdef CONFIG_SMP
	struct mnt_pcp __percpu *mnt_pcp;
#else
	int mnt_count;
	int mnt_writers;
#endif
	struct list_head mnt_mounts;	/* list of children, anchored here */
	struct list_head mnt_child;	/* and going through their mnt_child */
	struct list_head mnt_instance;	/* mount instance on sb->s_mounts */
	const char *mnt_devname;	/* Name of device e.g. /dev/dsk/hda1 */
	struct list_head mnt_list;
	struct list_head mnt_expire;	/* link in fs-specific expiry list */
	struct list_head mnt_share;	/* circular list of shared mounts */
	struct list_head mnt_slave_list;/* list of slave mounts */
	struct list_head mnt_slave;	/* slave list entry */
	struct mount *mnt_master;	/* slave is on master->mnt_slave_list */
	struct mnt_namespace *mnt_ns;	/* containing namespace */
	struct mountpoint *mnt_mp;	/* where is it mounted */
	union {
		struct hlist_node mnt_mp_list;	/* list mounts with the same mountpoint */
		struct hlist_node mnt_umount;
	};
	struct list_head mnt_umounting; /* list entry for umount propagation */
#ifdef CONFIG_FSNOTIFY
	struct fsnotify_mark_connector __rcu *mnt_fsnotify_marks;
	__u32 mnt_fsnotify_mask;
#endif
	int mnt_id;			/* mount identifier */
	int mnt_group_id;		/* peer group identifier */
	int mnt_expiry_mark;		/* true if marked for expiry */
	struct hlist_head mnt_pins;
	struct hlist_head mnt_stuck_children;
} __randomize_layout;

#define MNT_NS_INTERNAL ERR_PTR(-EINVAL) /* distinct from any mnt_namespace */

static inline struct mount *real_mount(struct vfsmount *mnt)
{
	return container_of(mnt, struct mount, mnt);
}

static inline int mnt_has_parent(struct mount *mnt)
{
	return mnt != mnt->mnt_parent;
}

static inline int is_mounted(struct vfsmount *mnt)
{
	/* neither detached nor internal? */
	return !IS_ERR_OR_NULL(real_mount(mnt)->mnt_ns);
}

extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *);

extern int __legitimize_mnt(struct vfsmount *, unsigned);
extern bool legitimize_mnt(struct vfsmount *, unsigned);

static inline bool __path_is_mountpoint(const struct path *path)
{
	struct mount *m = __lookup_mnt(path->mnt, path->dentry);
	return m && likely(!(m->mnt.mnt_flags & MNT_SYNC_UMOUNT));
}

extern void __detach_mounts(struct dentry *dentry);

static inline void detach_mounts(struct dentry *dentry)
{
	if (!d_mountpoint(dentry))
		return;
	__detach_mounts(dentry);
}

static inline void get_mnt_ns(struct mnt_namespace *ns)
{
	atomic_inc(&ns->count);
}

extern seqlock_t mount_lock;

static inline void lock_mount_hash(void)
{
	write_seqlock(&mount_lock);
}

static inline void unlock_mount_hash(void)
{
	write_sequnlock(&mount_lock);
}

struct proc_mounts {
	struct mnt_namespace *ns;
	struct path root;
	int (*show)(struct seq_file *, struct vfsmount *);
	struct mount cursor;
};

extern const struct seq_operations mounts_op;

extern bool __is_local_mountpoint(struct dentry *dentry);
static inline bool is_local_mountpoint(struct dentry *dentry)
{
	if (!d_mountpoint(dentry))
		return false;

	return __is_local_mountpoint(dentry);
}

static inline bool is_anon_ns(struct mnt_namespace *ns)
{
	return ns->seq == 0;
}

extern void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor);