Revision 8aef18845266f5c05904c610088f2d1ed58f6be3 authored by Al Viro on 16 June 2011, 14:10:06 UTC, committed by Al Viro on 16 June 2011, 15:28:16 UTC
[Kudos to dhowells for tracking that crap down] If two processes attempt to cause automounting on the same mountpoint at the same time, the vfsmount holding the mountpoint will be left with one too few references on it, causing a BUG when the kernel tries to clean up. The problem is that lock_mount() drops the caller's reference to the mountpoint's vfsmount in the case where it finds something already mounted on the mountpoint as it transits to the mounted filesystem and replaces path->mnt with the new mountpoint vfsmount. During a pathwalk, however, we don't take a reference on the vfsmount if it is the same as the one in the nameidata struct, but do_add_mount() doesn't know this. The fix is to make sure we have a ref on the vfsmount of the mountpoint before calling do_add_mount(). However, if lock_mount() doesn't transit, we're then left with an extra ref on the mountpoint vfsmount which needs releasing. We can handle that in follow_managed() by not making assumptions about what we can and what we cannot get from lookup_mnt() as the current code does. The callers of follow_managed() expect that reference to path->mnt will be grabbed iff path->mnt has been changed. follow_managed() and follow_automount() keep track of whether such reference has been grabbed and assume that it'll happen in those and only those cases that'll have us return with changed path->mnt. That assumption is almost correct - it breaks in case of racing automounts and in even harder to hit race between following a mountpoint and a couple of mount --move. The thing is, we don't need to make that assumption at all - after the end of loop in follow_manage() we can check if path->mnt has ended up unchanged and do mntput() if needed. The BUG can be reproduced with the following test program: #include <stdio.h> #include <sys/types.h> #include <sys/stat.h> #include <unistd.h> #include <sys/wait.h> int main(int argc, char **argv) { int pid, ws; struct stat buf; pid = fork(); stat(argv[1], &buf); if (pid > 0) wait(&ws); return 0; } and the following procedure: (1) Mount an NFS volume that on the server has something else mounted on a subdirectory. For instance, I can mount / from my server: mount warthog:/ /mnt -t nfs4 -r On the server /data has another filesystem mounted on it, so NFS will see a change in FSID as it walks down the path, and will mark /mnt/data as being a mountpoint. This will cause the automount code to be triggered. !!! Do not look inside the mounted fs at this point !!! (2) Run the above program on a file within the submount to generate two simultaneous automount requests: /tmp/forkstat /mnt/data/testfile (3) Unmount the automounted submount: umount /mnt/data (4) Unmount the original mount: umount /mnt At this point the kernel should throw a BUG with something like the following: BUG: Dentry ffff880032e3c5c0{i=2,n=} still in use (1) [unmount of nfs4 0:12] Note that the bug appears on the root dentry of the original mount, not the mountpoint and not the submount because sys_umount() hasn't got to its final mntput_no_expire() yet, but this isn't so obvious from the call trace: [<ffffffff8117cd82>] shrink_dcache_for_umount+0x69/0x82 [<ffffffff8116160e>] generic_shutdown_super+0x37/0x15b [<ffffffffa00fae56>] ? nfs_super_return_all_delegations+0x2e/0x1b1 [nfs] [<ffffffff811617f3>] kill_anon_super+0x1d/0x7e [<ffffffffa00d0be1>] nfs4_kill_super+0x60/0xb6 [nfs] [<ffffffff81161c17>] deactivate_locked_super+0x34/0x83 [<ffffffff811629ff>] deactivate_super+0x6f/0x7b [<ffffffff81186261>] mntput_no_expire+0x18d/0x199 [<ffffffff811862a8>] mntput+0x3b/0x44 [<ffffffff81186d87>] release_mounts+0xa2/0xbf [<ffffffff811876af>] sys_umount+0x47a/0x4ba [<ffffffff8109e1ca>] ? trace_hardirqs_on_caller+0x1fd/0x22f [<ffffffff816ea86b>] system_call_fastpath+0x16/0x1b as do_umount() is inlined. However, you can see release_mounts() in there. Note also that it may be necessary to have multiple CPU cores to be able to trigger this bug. Tested-by: Jeff Layton <jlayton@redhat.com> Tested-by: Ian Kent <raven@themaw.net> Signed-off-by: David Howells <dhowells@redhat.com> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
1 parent 50338b8
nr_in.c
/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* Copyright Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
* Copyright Darryl Miles G7LED (dlm@g7led.demon.co.uk)
*/
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/slab.h>
#include <net/ax25.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/tcp_states.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <net/netrom.h>
static int nr_queue_rx_frame(struct sock *sk, struct sk_buff *skb, int more)
{
struct sk_buff *skbo, *skbn = skb;
struct nr_sock *nr = nr_sk(sk);
skb_pull(skb, NR_NETWORK_LEN + NR_TRANSPORT_LEN);
nr_start_idletimer(sk);
if (more) {
nr->fraglen += skb->len;
skb_queue_tail(&nr->frag_queue, skb);
return 0;
}
if (!more && nr->fraglen > 0) { /* End of fragment */
nr->fraglen += skb->len;
skb_queue_tail(&nr->frag_queue, skb);
if ((skbn = alloc_skb(nr->fraglen, GFP_ATOMIC)) == NULL)
return 1;
skb_reset_transport_header(skbn);
while ((skbo = skb_dequeue(&nr->frag_queue)) != NULL) {
skb_copy_from_linear_data(skbo,
skb_put(skbn, skbo->len),
skbo->len);
kfree_skb(skbo);
}
nr->fraglen = 0;
}
return sock_queue_rcv_skb(sk, skbn);
}
/*
* State machine for state 1, Awaiting Connection State.
* The handling of the timer(s) is in file nr_timer.c.
* Handling of state 0 and connection release is in netrom.c.
*/
static int nr_state1_machine(struct sock *sk, struct sk_buff *skb,
int frametype)
{
switch (frametype) {
case NR_CONNACK: {
struct nr_sock *nr = nr_sk(sk);
nr_stop_t1timer(sk);
nr_start_idletimer(sk);
nr->your_index = skb->data[17];
nr->your_id = skb->data[18];
nr->vs = 0;
nr->va = 0;
nr->vr = 0;
nr->vl = 0;
nr->state = NR_STATE_3;
nr->n2count = 0;
nr->window = skb->data[20];
sk->sk_state = TCP_ESTABLISHED;
if (!sock_flag(sk, SOCK_DEAD))
sk->sk_state_change(sk);
break;
}
case NR_CONNACK | NR_CHOKE_FLAG:
nr_disconnect(sk, ECONNREFUSED);
break;
case NR_RESET:
if (sysctl_netrom_reset_circuit)
nr_disconnect(sk, ECONNRESET);
break;
default:
break;
}
return 0;
}
/*
* State machine for state 2, Awaiting Release State.
* The handling of the timer(s) is in file nr_timer.c
* Handling of state 0 and connection release is in netrom.c.
*/
static int nr_state2_machine(struct sock *sk, struct sk_buff *skb,
int frametype)
{
switch (frametype) {
case NR_CONNACK | NR_CHOKE_FLAG:
nr_disconnect(sk, ECONNRESET);
break;
case NR_DISCREQ:
nr_write_internal(sk, NR_DISCACK);
case NR_DISCACK:
nr_disconnect(sk, 0);
break;
case NR_RESET:
if (sysctl_netrom_reset_circuit)
nr_disconnect(sk, ECONNRESET);
break;
default:
break;
}
return 0;
}
/*
* State machine for state 3, Connected State.
* The handling of the timer(s) is in file nr_timer.c
* Handling of state 0 and connection release is in netrom.c.
*/
static int nr_state3_machine(struct sock *sk, struct sk_buff *skb, int frametype)
{
struct nr_sock *nrom = nr_sk(sk);
struct sk_buff_head temp_queue;
struct sk_buff *skbn;
unsigned short save_vr;
unsigned short nr, ns;
int queued = 0;
nr = skb->data[18];
ns = skb->data[17];
switch (frametype) {
case NR_CONNREQ:
nr_write_internal(sk, NR_CONNACK);
break;
case NR_DISCREQ:
nr_write_internal(sk, NR_DISCACK);
nr_disconnect(sk, 0);
break;
case NR_CONNACK | NR_CHOKE_FLAG:
case NR_DISCACK:
nr_disconnect(sk, ECONNRESET);
break;
case NR_INFOACK:
case NR_INFOACK | NR_CHOKE_FLAG:
case NR_INFOACK | NR_NAK_FLAG:
case NR_INFOACK | NR_NAK_FLAG | NR_CHOKE_FLAG:
if (frametype & NR_CHOKE_FLAG) {
nrom->condition |= NR_COND_PEER_RX_BUSY;
nr_start_t4timer(sk);
} else {
nrom->condition &= ~NR_COND_PEER_RX_BUSY;
nr_stop_t4timer(sk);
}
if (!nr_validate_nr(sk, nr)) {
break;
}
if (frametype & NR_NAK_FLAG) {
nr_frames_acked(sk, nr);
nr_send_nak_frame(sk);
} else {
if (nrom->condition & NR_COND_PEER_RX_BUSY) {
nr_frames_acked(sk, nr);
} else {
nr_check_iframes_acked(sk, nr);
}
}
break;
case NR_INFO:
case NR_INFO | NR_NAK_FLAG:
case NR_INFO | NR_CHOKE_FLAG:
case NR_INFO | NR_MORE_FLAG:
case NR_INFO | NR_NAK_FLAG | NR_CHOKE_FLAG:
case NR_INFO | NR_CHOKE_FLAG | NR_MORE_FLAG:
case NR_INFO | NR_NAK_FLAG | NR_MORE_FLAG:
case NR_INFO | NR_NAK_FLAG | NR_CHOKE_FLAG | NR_MORE_FLAG:
if (frametype & NR_CHOKE_FLAG) {
nrom->condition |= NR_COND_PEER_RX_BUSY;
nr_start_t4timer(sk);
} else {
nrom->condition &= ~NR_COND_PEER_RX_BUSY;
nr_stop_t4timer(sk);
}
if (nr_validate_nr(sk, nr)) {
if (frametype & NR_NAK_FLAG) {
nr_frames_acked(sk, nr);
nr_send_nak_frame(sk);
} else {
if (nrom->condition & NR_COND_PEER_RX_BUSY) {
nr_frames_acked(sk, nr);
} else {
nr_check_iframes_acked(sk, nr);
}
}
}
queued = 1;
skb_queue_head(&nrom->reseq_queue, skb);
if (nrom->condition & NR_COND_OWN_RX_BUSY)
break;
skb_queue_head_init(&temp_queue);
do {
save_vr = nrom->vr;
while ((skbn = skb_dequeue(&nrom->reseq_queue)) != NULL) {
ns = skbn->data[17];
if (ns == nrom->vr) {
if (nr_queue_rx_frame(sk, skbn, frametype & NR_MORE_FLAG) == 0) {
nrom->vr = (nrom->vr + 1) % NR_MODULUS;
} else {
nrom->condition |= NR_COND_OWN_RX_BUSY;
skb_queue_tail(&temp_queue, skbn);
}
} else if (nr_in_rx_window(sk, ns)) {
skb_queue_tail(&temp_queue, skbn);
} else {
kfree_skb(skbn);
}
}
while ((skbn = skb_dequeue(&temp_queue)) != NULL) {
skb_queue_tail(&nrom->reseq_queue, skbn);
}
} while (save_vr != nrom->vr);
/*
* Window is full, ack it immediately.
*/
if (((nrom->vl + nrom->window) % NR_MODULUS) == nrom->vr) {
nr_enquiry_response(sk);
} else {
if (!(nrom->condition & NR_COND_ACK_PENDING)) {
nrom->condition |= NR_COND_ACK_PENDING;
nr_start_t2timer(sk);
}
}
break;
case NR_RESET:
if (sysctl_netrom_reset_circuit)
nr_disconnect(sk, ECONNRESET);
break;
default:
break;
}
return queued;
}
/* Higher level upcall for a LAPB frame - called with sk locked */
int nr_process_rx_frame(struct sock *sk, struct sk_buff *skb)
{
struct nr_sock *nr = nr_sk(sk);
int queued = 0, frametype;
if (nr->state == NR_STATE_0)
return 0;
frametype = skb->data[19];
switch (nr->state) {
case NR_STATE_1:
queued = nr_state1_machine(sk, skb, frametype);
break;
case NR_STATE_2:
queued = nr_state2_machine(sk, skb, frametype);
break;
case NR_STATE_3:
queued = nr_state3_machine(sk, skb, frametype);
break;
}
nr_kick(sk);
return queued;
}
Computing file changes ...