Revision 8aef18845266f5c05904c610088f2d1ed58f6be3 authored by Al Viro on 16 June 2011, 14:10:06 UTC, committed by Al Viro on 16 June 2011, 15:28:16 UTC
[Kudos to dhowells for tracking that crap down]

If two processes attempt to cause automounting on the same mountpoint at the
same time, the vfsmount holding the mountpoint will be left with one too few
references on it, causing a BUG when the kernel tries to clean up.

The problem is that lock_mount() drops the caller's reference to the
mountpoint's vfsmount in the case where it finds something already mounted on
the mountpoint as it transits to the mounted filesystem and replaces path->mnt
with the new mountpoint vfsmount.

During a pathwalk, however, we don't take a reference on the vfsmount if it is
the same as the one in the nameidata struct, but do_add_mount() doesn't know
this.

The fix is to make sure we have a ref on the vfsmount of the mountpoint before
calling do_add_mount().  However, if lock_mount() doesn't transit, we're then
left with an extra ref on the mountpoint vfsmount which needs releasing.
We can handle that in follow_managed() by not making assumptions about what
we can and what we cannot get from lookup_mnt() as the current code does.

The callers of follow_managed() expect that reference to path->mnt will be
grabbed iff path->mnt has been changed.  follow_managed() and follow_automount()
keep track of whether such reference has been grabbed and assume that it'll
happen in those and only those cases that'll have us return with changed
path->mnt.  That assumption is almost correct - it breaks in case of
racing automounts and in even harder to hit race between following a mountpoint
and a couple of mount --move.  The thing is, we don't need to make that
assumption at all - after the end of loop in follow_manage() we can check
if path->mnt has ended up unchanged and do mntput() if needed.

The BUG can be reproduced with the following test program:

	#include <stdio.h>
	#include <sys/types.h>
	#include <sys/stat.h>
	#include <unistd.h>
	#include <sys/wait.h>
	int main(int argc, char **argv)
	{
		int pid, ws;
		struct stat buf;
		pid = fork();
		stat(argv[1], &buf);
		if (pid > 0) wait(&ws);
		return 0;
	}

and the following procedure:

 (1) Mount an NFS volume that on the server has something else mounted on a
     subdirectory.  For instance, I can mount / from my server:

	mount warthog:/ /mnt -t nfs4 -r

     On the server /data has another filesystem mounted on it, so NFS will see
     a change in FSID as it walks down the path, and will mark /mnt/data as
     being a mountpoint.  This will cause the automount code to be triggered.

     !!! Do not look inside the mounted fs at this point !!!

 (2) Run the above program on a file within the submount to generate two
     simultaneous automount requests:

	/tmp/forkstat /mnt/data/testfile

 (3) Unmount the automounted submount:

	umount /mnt/data

 (4) Unmount the original mount:

	umount /mnt

     At this point the kernel should throw a BUG with something like the
     following:

	BUG: Dentry ffff880032e3c5c0{i=2,n=} still in use (1) [unmount of nfs4 0:12]

Note that the bug appears on the root dentry of the original mount, not the
mountpoint and not the submount because sys_umount() hasn't got to its final
mntput_no_expire() yet, but this isn't so obvious from the call trace:

 [<ffffffff8117cd82>] shrink_dcache_for_umount+0x69/0x82
 [<ffffffff8116160e>] generic_shutdown_super+0x37/0x15b
 [<ffffffffa00fae56>] ? nfs_super_return_all_delegations+0x2e/0x1b1 [nfs]
 [<ffffffff811617f3>] kill_anon_super+0x1d/0x7e
 [<ffffffffa00d0be1>] nfs4_kill_super+0x60/0xb6 [nfs]
 [<ffffffff81161c17>] deactivate_locked_super+0x34/0x83
 [<ffffffff811629ff>] deactivate_super+0x6f/0x7b
 [<ffffffff81186261>] mntput_no_expire+0x18d/0x199
 [<ffffffff811862a8>] mntput+0x3b/0x44
 [<ffffffff81186d87>] release_mounts+0xa2/0xbf
 [<ffffffff811876af>] sys_umount+0x47a/0x4ba
 [<ffffffff8109e1ca>] ? trace_hardirqs_on_caller+0x1fd/0x22f
 [<ffffffff816ea86b>] system_call_fastpath+0x16/0x1b

as do_umount() is inlined.  However, you can see release_mounts() in there.

Note also that it may be necessary to have multiple CPU cores to be able to
trigger this bug.

Tested-by: Jeff Layton <jlayton@redhat.com>
Tested-by: Ian Kent <raven@themaw.net>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
1 parent 50338b8
Raw File
icmp_socket.c
/*
 * Copyright (C) 2007-2011 B.A.T.M.A.N. contributors:
 *
 * Marek Lindner
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA
 *
 */

#include "main.h"
#include <linux/debugfs.h>
#include <linux/slab.h>
#include "icmp_socket.h"
#include "send.h"
#include "hash.h"
#include "originator.h"
#include "hard-interface.h"

static struct socket_client *socket_client_hash[256];

static void bat_socket_add_packet(struct socket_client *socket_client,
				  struct icmp_packet_rr *icmp_packet,
				  size_t icmp_len);

void bat_socket_init(void)
{
	memset(socket_client_hash, 0, sizeof(socket_client_hash));
}

static int bat_socket_open(struct inode *inode, struct file *file)
{
	unsigned int i;
	struct socket_client *socket_client;

	nonseekable_open(inode, file);

	socket_client = kmalloc(sizeof(struct socket_client), GFP_KERNEL);

	if (!socket_client)
		return -ENOMEM;

	for (i = 0; i < ARRAY_SIZE(socket_client_hash); i++) {
		if (!socket_client_hash[i]) {
			socket_client_hash[i] = socket_client;
			break;
		}
	}

	if (i == ARRAY_SIZE(socket_client_hash)) {
		pr_err("Error - can't add another packet client: "
		       "maximum number of clients reached\n");
		kfree(socket_client);
		return -EXFULL;
	}

	INIT_LIST_HEAD(&socket_client->queue_list);
	socket_client->queue_len = 0;
	socket_client->index = i;
	socket_client->bat_priv = inode->i_private;
	spin_lock_init(&socket_client->lock);
	init_waitqueue_head(&socket_client->queue_wait);

	file->private_data = socket_client;

	inc_module_count();
	return 0;
}

static int bat_socket_release(struct inode *inode, struct file *file)
{
	struct socket_client *socket_client = file->private_data;
	struct socket_packet *socket_packet;
	struct list_head *list_pos, *list_pos_tmp;

	spin_lock_bh(&socket_client->lock);

	/* for all packets in the queue ... */
	list_for_each_safe(list_pos, list_pos_tmp, &socket_client->queue_list) {
		socket_packet = list_entry(list_pos,
					   struct socket_packet, list);

		list_del(list_pos);
		kfree(socket_packet);
	}

	socket_client_hash[socket_client->index] = NULL;
	spin_unlock_bh(&socket_client->lock);

	kfree(socket_client);
	dec_module_count();

	return 0;
}

static ssize_t bat_socket_read(struct file *file, char __user *buf,
			       size_t count, loff_t *ppos)
{
	struct socket_client *socket_client = file->private_data;
	struct socket_packet *socket_packet;
	size_t packet_len;
	int error;

	if ((file->f_flags & O_NONBLOCK) && (socket_client->queue_len == 0))
		return -EAGAIN;

	if ((!buf) || (count < sizeof(struct icmp_packet)))
		return -EINVAL;

	if (!access_ok(VERIFY_WRITE, buf, count))
		return -EFAULT;

	error = wait_event_interruptible(socket_client->queue_wait,
					 socket_client->queue_len);

	if (error)
		return error;

	spin_lock_bh(&socket_client->lock);

	socket_packet = list_first_entry(&socket_client->queue_list,
					 struct socket_packet, list);
	list_del(&socket_packet->list);
	socket_client->queue_len--;

	spin_unlock_bh(&socket_client->lock);

	error = __copy_to_user(buf, &socket_packet->icmp_packet,
			       socket_packet->icmp_len);

	packet_len = socket_packet->icmp_len;
	kfree(socket_packet);

	if (error)
		return -EFAULT;

	return packet_len;
}

static ssize_t bat_socket_write(struct file *file, const char __user *buff,
				size_t len, loff_t *off)
{
	struct socket_client *socket_client = file->private_data;
	struct bat_priv *bat_priv = socket_client->bat_priv;
	struct hard_iface *primary_if = NULL;
	struct sk_buff *skb;
	struct icmp_packet_rr *icmp_packet;

	struct orig_node *orig_node = NULL;
	struct neigh_node *neigh_node = NULL;
	size_t packet_len = sizeof(struct icmp_packet);

	if (len < sizeof(struct icmp_packet)) {
		bat_dbg(DBG_BATMAN, bat_priv,
			"Error - can't send packet from char device: "
			"invalid packet size\n");
		return -EINVAL;
	}

	primary_if = primary_if_get_selected(bat_priv);

	if (!primary_if) {
		len = -EFAULT;
		goto out;
	}

	if (len >= sizeof(struct icmp_packet_rr))
		packet_len = sizeof(struct icmp_packet_rr);

	skb = dev_alloc_skb(packet_len + sizeof(struct ethhdr));
	if (!skb) {
		len = -ENOMEM;
		goto out;
	}

	skb_reserve(skb, sizeof(struct ethhdr));
	icmp_packet = (struct icmp_packet_rr *)skb_put(skb, packet_len);

	if (!access_ok(VERIFY_READ, buff, packet_len)) {
		len = -EFAULT;
		goto free_skb;
	}

	if (__copy_from_user(icmp_packet, buff, packet_len)) {
		len = -EFAULT;
		goto free_skb;
	}

	if (icmp_packet->packet_type != BAT_ICMP) {
		bat_dbg(DBG_BATMAN, bat_priv,
			"Error - can't send packet from char device: "
			"got bogus packet type (expected: BAT_ICMP)\n");
		len = -EINVAL;
		goto free_skb;
	}

	if (icmp_packet->msg_type != ECHO_REQUEST) {
		bat_dbg(DBG_BATMAN, bat_priv,
			"Error - can't send packet from char device: "
			"got bogus message type (expected: ECHO_REQUEST)\n");
		len = -EINVAL;
		goto free_skb;
	}

	icmp_packet->uid = socket_client->index;

	if (icmp_packet->version != COMPAT_VERSION) {
		icmp_packet->msg_type = PARAMETER_PROBLEM;
		icmp_packet->ttl = COMPAT_VERSION;
		bat_socket_add_packet(socket_client, icmp_packet, packet_len);
		goto free_skb;
	}

	if (atomic_read(&bat_priv->mesh_state) != MESH_ACTIVE)
		goto dst_unreach;

	orig_node = orig_hash_find(bat_priv, icmp_packet->dst);
	if (!orig_node)
		goto dst_unreach;

	neigh_node = orig_node_get_router(orig_node);
	if (!neigh_node)
		goto dst_unreach;

	if (!neigh_node->if_incoming)
		goto dst_unreach;

	if (neigh_node->if_incoming->if_status != IF_ACTIVE)
		goto dst_unreach;

	memcpy(icmp_packet->orig,
	       primary_if->net_dev->dev_addr, ETH_ALEN);

	if (packet_len == sizeof(struct icmp_packet_rr))
		memcpy(icmp_packet->rr,
		       neigh_node->if_incoming->net_dev->dev_addr, ETH_ALEN);

	send_skb_packet(skb, neigh_node->if_incoming, neigh_node->addr);
	goto out;

dst_unreach:
	icmp_packet->msg_type = DESTINATION_UNREACHABLE;
	bat_socket_add_packet(socket_client, icmp_packet, packet_len);
free_skb:
	kfree_skb(skb);
out:
	if (primary_if)
		hardif_free_ref(primary_if);
	if (neigh_node)
		neigh_node_free_ref(neigh_node);
	if (orig_node)
		orig_node_free_ref(orig_node);
	return len;
}

static unsigned int bat_socket_poll(struct file *file, poll_table *wait)
{
	struct socket_client *socket_client = file->private_data;

	poll_wait(file, &socket_client->queue_wait, wait);

	if (socket_client->queue_len > 0)
		return POLLIN | POLLRDNORM;

	return 0;
}

static const struct file_operations fops = {
	.owner = THIS_MODULE,
	.open = bat_socket_open,
	.release = bat_socket_release,
	.read = bat_socket_read,
	.write = bat_socket_write,
	.poll = bat_socket_poll,
	.llseek = no_llseek,
};

int bat_socket_setup(struct bat_priv *bat_priv)
{
	struct dentry *d;

	if (!bat_priv->debug_dir)
		goto err;

	d = debugfs_create_file(ICMP_SOCKET, S_IFREG | S_IWUSR | S_IRUSR,
				bat_priv->debug_dir, bat_priv, &fops);
	if (d)
		goto err;

	return 0;

err:
	return 1;
}

static void bat_socket_add_packet(struct socket_client *socket_client,
				  struct icmp_packet_rr *icmp_packet,
				  size_t icmp_len)
{
	struct socket_packet *socket_packet;

	socket_packet = kmalloc(sizeof(struct socket_packet), GFP_ATOMIC);

	if (!socket_packet)
		return;

	INIT_LIST_HEAD(&socket_packet->list);
	memcpy(&socket_packet->icmp_packet, icmp_packet, icmp_len);
	socket_packet->icmp_len = icmp_len;

	spin_lock_bh(&socket_client->lock);

	/* while waiting for the lock the socket_client could have been
	 * deleted */
	if (!socket_client_hash[icmp_packet->uid]) {
		spin_unlock_bh(&socket_client->lock);
		kfree(socket_packet);
		return;
	}

	list_add_tail(&socket_packet->list, &socket_client->queue_list);
	socket_client->queue_len++;

	if (socket_client->queue_len > 100) {
		socket_packet = list_first_entry(&socket_client->queue_list,
						 struct socket_packet, list);

		list_del(&socket_packet->list);
		kfree(socket_packet);
		socket_client->queue_len--;
	}

	spin_unlock_bh(&socket_client->lock);

	wake_up(&socket_client->queue_wait);
}

void bat_socket_receive_packet(struct icmp_packet_rr *icmp_packet,
			       size_t icmp_len)
{
	struct socket_client *hash = socket_client_hash[icmp_packet->uid];

	if (hash)
		bat_socket_add_packet(hash, icmp_packet, icmp_len);
}
back to top