Revision 8aef18845266f5c05904c610088f2d1ed58f6be3 authored by Al Viro on 16 June 2011, 14:10:06 UTC, committed by Al Viro on 16 June 2011, 15:28:16 UTC
[Kudos to dhowells for tracking that crap down]

If two processes attempt to cause automounting on the same mountpoint at the
same time, the vfsmount holding the mountpoint will be left with one too few
references on it, causing a BUG when the kernel tries to clean up.

The problem is that lock_mount() drops the caller's reference to the
mountpoint's vfsmount when it finds something already mounted on the
mountpoint: it transits to the mounted filesystem and replaces path->mnt with
that filesystem's vfsmount.

During a pathwalk, however, we don't take a reference on the vfsmount if it is
the same as the one in the nameidata struct, but do_add_mount() doesn't know
this.
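
For reference, that convention is visible in path_to_nameidata() - a
simplified sketch of the era's fs/namei.c (non-RCU case only; details may
differ):

	static inline void path_to_nameidata(const struct path *path,
					     struct nameidata *nd)
	{
		dput(nd->path.dentry);
		if (nd->path.mnt != path->mnt)
			mntput(nd->path.mnt);	/* no extra ref is held when they match */
		nd->path.mnt = path->mnt;
		nd->path.dentry = path->dentry;
	}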

The fix is to make sure we have a ref on the vfsmount of the mountpoint before
calling do_add_mount().  However, if lock_mount() doesn't transit, we're then
left with an extra ref on the mountpoint vfsmount which needs releasing.
We can handle that in follow_managed() by not making assumptions, as the
current code does, about what we can and cannot get from lookup_mnt().
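
In follow_automount(), the first half of the fix amounts to something like
this (a sketch, not the verbatim patch):

	/* Own a reference on path->mnt before lock_mount()/do_add_mount()
	 * can consume it on transit. */
	if (!*need_mntput) {
		mntget(path->mnt);
		*need_mntput = true;
	}
	err = finish_automount(mnt, path);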

The callers of follow_managed() expect that reference to path->mnt will be
grabbed iff path->mnt has been changed.  follow_managed() and follow_automount()
keep track of whether such reference has been grabbed and assume that it'll
happen in those and only those cases that'll have us return with changed
path->mnt.  That assumption is almost correct - it breaks in the case of
racing automounts, and in an even harder-to-hit race between following a
mountpoint and a couple of mount --move operations.  The thing is, we don't
need to make that assumption at all: after the end of the loop in
follow_managed() we can check whether path->mnt has ended up unchanged and do
mntput() if needed.
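
Schematically, the follow_managed() side of the fix looks like this (again a
sketch, not the verbatim patch):

	static int follow_managed(struct path *path, unsigned flags)
	{
		struct vfsmount *mnt = path->mnt;	/* the caller's reference */
		bool need_mntput = false;

		/* ... d_manage()/lookup_mnt()/follow_automount() loop as
		 * before, setting need_mntput once we hold a ref of our own ... */

		if (need_mntput && path->mnt == mnt)
			mntput(path->mnt);	/* loop left us where we started */
		return 0;
	}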

The BUG can be reproduced with the following test program:

	#include <stdio.h>
	#include <sys/types.h>
	#include <sys/stat.h>
	#include <unistd.h>
	#include <sys/wait.h>
	int main(int argc, char **argv)
	{
		int pid, ws;
		struct stat buf;
		pid = fork();
		/* parent and child both stat the same path, racing each other */
		stat(argv[1], &buf);
		if (pid > 0) wait(&ws);
		return 0;
	}

and the following procedure:

 (1) Mount an NFS volume that, on the server, has something else mounted on a
     subdirectory.  For instance, I can mount / from my server:

	mount warthog:/ /mnt -t nfs4 -r

     On the server /data has another filesystem mounted on it, so NFS will see
     a change in FSID as it walks down the path, and will mark /mnt/data as
     being a mountpoint.  This will cause the automount code to be triggered.

     !!! Do not look inside the mounted fs at this point !!!

 (2) Run the above program on a file within the submount to generate two
     simultaneous automount requests:

	/tmp/forkstat /mnt/data/testfile
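
     (The program is assumed to have been saved as forkstat.c and built with
     something like "gcc -o /tmp/forkstat forkstat.c".)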

 (3) Unmount the automounted submount:

	umount /mnt/data

 (4) Unmount the original mount:

	umount /mnt

     At this point the kernel should throw a BUG with something like the
     following:

	BUG: Dentry ffff880032e3c5c0{i=2,n=} still in use (1) [unmount of nfs4 0:12]

Note that the bug appears on the root dentry of the original mount, not on the
mountpoint and not on the submount, because sys_umount() hasn't got to its
final mntput_no_expire() yet; this isn't so obvious from the call trace:

 [<ffffffff8117cd82>] shrink_dcache_for_umount+0x69/0x82
 [<ffffffff8116160e>] generic_shutdown_super+0x37/0x15b
 [<ffffffffa00fae56>] ? nfs_super_return_all_delegations+0x2e/0x1b1 [nfs]
 [<ffffffff811617f3>] kill_anon_super+0x1d/0x7e
 [<ffffffffa00d0be1>] nfs4_kill_super+0x60/0xb6 [nfs]
 [<ffffffff81161c17>] deactivate_locked_super+0x34/0x83
 [<ffffffff811629ff>] deactivate_super+0x6f/0x7b
 [<ffffffff81186261>] mntput_no_expire+0x18d/0x199
 [<ffffffff811862a8>] mntput+0x3b/0x44
 [<ffffffff81186d87>] release_mounts+0xa2/0xbf
 [<ffffffff811876af>] sys_umount+0x47a/0x4ba
 [<ffffffff8109e1ca>] ? trace_hardirqs_on_caller+0x1fd/0x22f
 [<ffffffff816ea86b>] system_call_fastpath+0x16/0x1b

as do_umount() is inlined.  However, you can see release_mounts() in there.

Note also that multiple CPU cores may be necessary to trigger this bug.

Tested-by: Jeff Layton <jlayton@redhat.com>
Tested-by: Ian Kent <raven@themaw.net>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Parent: 50338b8

Raw file: nr_in.c
/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * Copyright Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
 * Copyright Darryl Miles G7LED (dlm@g7led.demon.co.uk)
 */
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/slab.h>
#include <net/ax25.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/tcp_states.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <net/netrom.h>

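/*
 * Queue a received data frame on the socket, stripping the NET/ROM
 * network and transport headers first.  Fragments (frames that arrived
 * with the MORE flag set) are collected on nr->frag_queue and glued back
 * together into a single skb once the final fragment arrives.
 */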
static int nr_queue_rx_frame(struct sock *sk, struct sk_buff *skb, int more)
{
	struct sk_buff *skbo, *skbn = skb;
	struct nr_sock *nr = nr_sk(sk);

	skb_pull(skb, NR_NETWORK_LEN + NR_TRANSPORT_LEN);

	nr_start_idletimer(sk);

	if (more) {
		nr->fraglen += skb->len;
		skb_queue_tail(&nr->frag_queue, skb);
		return 0;
	}

	if (nr->fraglen > 0) {	/* last fragment of a fragmented message */
		nr->fraglen += skb->len;
		skb_queue_tail(&nr->frag_queue, skb);

		if ((skbn = alloc_skb(nr->fraglen, GFP_ATOMIC)) == NULL)
			return 1;

		skb_reset_transport_header(skbn);

		while ((skbo = skb_dequeue(&nr->frag_queue)) != NULL) {
			skb_copy_from_linear_data(skbo,
						  skb_put(skbn, skbo->len),
						  skbo->len);
			kfree_skb(skbo);
		}

		nr->fraglen = 0;
	}

	return sock_queue_rcv_skb(sk, skbn);
}

/*
 * State machine for state 1, Awaiting Connection State.
 * The handling of the timer(s) is in file nr_timer.c.
 * Handling of state 0 and connection release is in netrom.c.
 */
static int nr_state1_machine(struct sock *sk, struct sk_buff *skb,
	int frametype)
{
	switch (frametype) {
	case NR_CONNACK: {
		struct nr_sock *nr = nr_sk(sk);

		nr_stop_t1timer(sk);
		nr_start_idletimer(sk);
		nr->your_index = skb->data[17];
		nr->your_id    = skb->data[18];
		nr->vs	       = 0;
		nr->va	       = 0;
		nr->vr	       = 0;
		nr->vl	       = 0;
		nr->state      = NR_STATE_3;
		nr->n2count    = 0;
		nr->window     = skb->data[20];
		sk->sk_state   = TCP_ESTABLISHED;
		if (!sock_flag(sk, SOCK_DEAD))
			sk->sk_state_change(sk);
		break;
	}

	case NR_CONNACK | NR_CHOKE_FLAG:
		nr_disconnect(sk, ECONNREFUSED);
		break;

	case NR_RESET:
		if (sysctl_netrom_reset_circuit)
			nr_disconnect(sk, ECONNRESET);
		break;

	default:
		break;
	}
	return 0;
}

/*
 * State machine for state 2, Awaiting Release State.
 * The handling of the timer(s) is in file nr_timer.c
 * Handling of state 0 and connection release is in netrom.c.
 */
static int nr_state2_machine(struct sock *sk, struct sk_buff *skb,
	int frametype)
{
	switch (frametype) {
	case NR_CONNACK | NR_CHOKE_FLAG:
		nr_disconnect(sk, ECONNRESET);
		break;

	case NR_DISCREQ:
		nr_write_internal(sk, NR_DISCACK);
		/* fall through */
	case NR_DISCACK:
		nr_disconnect(sk, 0);
		break;

	case NR_RESET:
		if (sysctl_netrom_reset_circuit)
			nr_disconnect(sk, ECONNRESET);
		break;

	default:
		break;
	}
	return 0;
}

/*
 * State machine for state 3, Connected State.
 * The handling of the timer(s) is in file nr_timer.c
 * Handling of state 0 and connection release is in netrom.c.
 */
static int nr_state3_machine(struct sock *sk, struct sk_buff *skb, int frametype)
{
	struct nr_sock *nrom = nr_sk(sk);
	struct sk_buff_head temp_queue;
	struct sk_buff *skbn;
	unsigned short save_vr;
	unsigned short nr, ns;
	int queued = 0;

	nr = skb->data[18];	/* N(R): acknowledges frames we have sent */
	ns = skb->data[17];	/* N(S): send sequence number of this frame */

	switch (frametype) {
	case NR_CONNREQ:
		nr_write_internal(sk, NR_CONNACK);
		break;

	case NR_DISCREQ:
		nr_write_internal(sk, NR_DISCACK);
		nr_disconnect(sk, 0);
		break;

	case NR_CONNACK | NR_CHOKE_FLAG:
	case NR_DISCACK:
		nr_disconnect(sk, ECONNRESET);
		break;

	case NR_INFOACK:
	case NR_INFOACK | NR_CHOKE_FLAG:
	case NR_INFOACK | NR_NAK_FLAG:
	case NR_INFOACK | NR_NAK_FLAG | NR_CHOKE_FLAG:
		if (frametype & NR_CHOKE_FLAG) {
			nrom->condition |= NR_COND_PEER_RX_BUSY;
			nr_start_t4timer(sk);
		} else {
			nrom->condition &= ~NR_COND_PEER_RX_BUSY;
			nr_stop_t4timer(sk);
		}
		if (!nr_validate_nr(sk, nr))
			break;
		if (frametype & NR_NAK_FLAG) {
			nr_frames_acked(sk, nr);
			nr_send_nak_frame(sk);
		} else {
			if (nrom->condition & NR_COND_PEER_RX_BUSY) {
				nr_frames_acked(sk, nr);
			} else {
				nr_check_iframes_acked(sk, nr);
			}
		}
		break;

	case NR_INFO:
	case NR_INFO | NR_NAK_FLAG:
	case NR_INFO | NR_CHOKE_FLAG:
	case NR_INFO | NR_MORE_FLAG:
	case NR_INFO | NR_NAK_FLAG | NR_CHOKE_FLAG:
	case NR_INFO | NR_CHOKE_FLAG | NR_MORE_FLAG:
	case NR_INFO | NR_NAK_FLAG | NR_MORE_FLAG:
	case NR_INFO | NR_NAK_FLAG | NR_CHOKE_FLAG | NR_MORE_FLAG:
		if (frametype & NR_CHOKE_FLAG) {
			nrom->condition |= NR_COND_PEER_RX_BUSY;
			nr_start_t4timer(sk);
		} else {
			nrom->condition &= ~NR_COND_PEER_RX_BUSY;
			nr_stop_t4timer(sk);
		}
		if (nr_validate_nr(sk, nr)) {
			if (frametype & NR_NAK_FLAG) {
				nr_frames_acked(sk, nr);
				nr_send_nak_frame(sk);
			} else {
				if (nrom->condition & NR_COND_PEER_RX_BUSY) {
					nr_frames_acked(sk, nr);
				} else {
					nr_check_iframes_acked(sk, nr);
				}
			}
		}
		queued = 1;
		skb_queue_head(&nrom->reseq_queue, skb);
		if (nrom->condition & NR_COND_OWN_RX_BUSY)
			break;
		skb_queue_head_init(&temp_queue);
		do {
			save_vr = nrom->vr;
			while ((skbn = skb_dequeue(&nrom->reseq_queue)) != NULL) {
				ns = skbn->data[17];
				if (ns == nrom->vr) {
					if (nr_queue_rx_frame(sk, skbn, frametype & NR_MORE_FLAG) == 0) {
						nrom->vr = (nrom->vr + 1) % NR_MODULUS;
					} else {
						nrom->condition |= NR_COND_OWN_RX_BUSY;
						skb_queue_tail(&temp_queue, skbn);
					}
				} else if (nr_in_rx_window(sk, ns)) {
					skb_queue_tail(&temp_queue, skbn);
				} else {
					kfree_skb(skbn);
				}
			}
			while ((skbn = skb_dequeue(&temp_queue)) != NULL) {
				skb_queue_tail(&nrom->reseq_queue, skbn);
			}
		} while (save_vr != nrom->vr);
		/*
		 * Window is full, ack it immediately.
		 */
		if (((nrom->vl + nrom->window) % NR_MODULUS) == nrom->vr) {
			nr_enquiry_response(sk);
		} else {
			if (!(nrom->condition & NR_COND_ACK_PENDING)) {
				nrom->condition |= NR_COND_ACK_PENDING;
				nr_start_t2timer(sk);
			}
		}
		break;

	case NR_RESET:
		if (sysctl_netrom_reset_circuit)
			nr_disconnect(sk, ECONNRESET);
		break;

	default:
		break;
	}
	return queued;
}

/* Higher level upcall for a received NET/ROM frame - called with sk locked */
int nr_process_rx_frame(struct sock *sk, struct sk_buff *skb)
{
	struct nr_sock *nr = nr_sk(sk);
	int queued = 0, frametype;

	if (nr->state == NR_STATE_0)
		return 0;

	frametype = skb->data[19];	/* opcode and flags byte */

	switch (nr->state) {
	case NR_STATE_1:
		queued = nr_state1_machine(sk, skb, frametype);
		break;
	case NR_STATE_2:
		queued = nr_state2_machine(sk, skb, frametype);
		break;
	case NR_STATE_3:
		queued = nr_state3_machine(sk, skb, frametype);
		break;
	}

	nr_kick(sk);

	return queued;
}