Revision 8aef18845266f5c05904c610088f2d1ed58f6be3 authored by Al Viro on 16 June 2011, 14:10:06 UTC, committed by Al Viro on 16 June 2011, 15:28:16 UTC
[Kudos to dhowells for tracking that crap down]

If two processes attempt to cause automounting on the same mountpoint at the
same time, the vfsmount holding the mountpoint will be left with one too few
references on it, causing a BUG when the kernel tries to clean up.

The problem is that lock_mount() drops the caller's reference to the
mountpoint's vfsmount in the case where it finds something already mounted on
the mountpoint as it transits to the mounted filesystem and replaces path->mnt
with the new mountpoint vfsmount.

During a pathwalk, however, we don't take a reference on the vfsmount if it is
the same as the one in the nameidata struct, but do_add_mount() doesn't know
this.

The fix is to make sure we have a ref on the vfsmount of the mountpoint before
calling do_add_mount().  However, if lock_mount() doesn't transit, we're then
left with an extra ref on the mountpoint vfsmount which needs releasing.
We can handle that in follow_managed() by not making assumptions about what
we can and what we cannot get from lookup_mnt() as the current code does.

The callers of follow_managed() expect that reference to path->mnt will be
grabbed iff path->mnt has been changed.  follow_managed() and follow_automount()
keep track of whether such reference has been grabbed and assume that it'll
happen in those and only those cases that'll have us return with changed
path->mnt.  That assumption is almost correct - it breaks in case of
racing automounts and in even harder to hit race between following a mountpoint
and a couple of mount --move.  The thing is, we don't need to make that
assumption at all - after the end of loop in follow_manage() we can check
if path->mnt has ended up unchanged and do mntput() if needed.

The BUG can be reproduced with the following test program:

	#include <stdio.h>
	#include <sys/types.h>
	#include <sys/stat.h>
	#include <unistd.h>
	#include <sys/wait.h>
	int main(int argc, char **argv)
	{
		int pid, ws;
		struct stat buf;
		pid = fork();
		stat(argv[1], &buf);
		if (pid > 0) wait(&ws);
		return 0;
	}

and the following procedure:

 (1) Mount an NFS volume that on the server has something else mounted on a
     subdirectory.  For instance, I can mount / from my server:

	mount warthog:/ /mnt -t nfs4 -r

     On the server /data has another filesystem mounted on it, so NFS will see
     a change in FSID as it walks down the path, and will mark /mnt/data as
     being a mountpoint.  This will cause the automount code to be triggered.

     !!! Do not look inside the mounted fs at this point !!!

 (2) Run the above program on a file within the submount to generate two
     simultaneous automount requests:

	/tmp/forkstat /mnt/data/testfile

 (3) Unmount the automounted submount:

	umount /mnt/data

 (4) Unmount the original mount:

	umount /mnt

     At this point the kernel should throw a BUG with something like the
     following:

	BUG: Dentry ffff880032e3c5c0{i=2,n=} still in use (1) [unmount of nfs4 0:12]

Note that the bug appears on the root dentry of the original mount, not the
mountpoint and not the submount because sys_umount() hasn't got to its final
mntput_no_expire() yet, but this isn't so obvious from the call trace:

 [<ffffffff8117cd82>] shrink_dcache_for_umount+0x69/0x82
 [<ffffffff8116160e>] generic_shutdown_super+0x37/0x15b
 [<ffffffffa00fae56>] ? nfs_super_return_all_delegations+0x2e/0x1b1 [nfs]
 [<ffffffff811617f3>] kill_anon_super+0x1d/0x7e
 [<ffffffffa00d0be1>] nfs4_kill_super+0x60/0xb6 [nfs]
 [<ffffffff81161c17>] deactivate_locked_super+0x34/0x83
 [<ffffffff811629ff>] deactivate_super+0x6f/0x7b
 [<ffffffff81186261>] mntput_no_expire+0x18d/0x199
 [<ffffffff811862a8>] mntput+0x3b/0x44
 [<ffffffff81186d87>] release_mounts+0xa2/0xbf
 [<ffffffff811876af>] sys_umount+0x47a/0x4ba
 [<ffffffff8109e1ca>] ? trace_hardirqs_on_caller+0x1fd/0x22f
 [<ffffffff816ea86b>] system_call_fastpath+0x16/0x1b

as do_umount() is inlined.  However, you can see release_mounts() in there.

Note also that it may be necessary to have multiple CPU cores to be able to
trigger this bug.

Tested-by: Jeff Layton <jlayton@redhat.com>
Tested-by: Ian Kent <raven@themaw.net>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
1 parent 50338b8
Raw File
sunkbd.c
/*
 *  Copyright (c) 1999-2001 Vojtech Pavlik
 */

/*
 * Sun keyboard driver for Linux
 */

/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * Should you need to contact me, the author, you can do so either by
 * e-mail - mail your message to <vojtech@ucw.cz>, or by paper mail:
 * Vojtech Pavlik, Simunkova 1594, Prague 8, 182 00 Czech Republic
 */

#include <linux/delay.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/input.h>
#include <linux/serio.h>
#include <linux/workqueue.h>

#define DRIVER_DESC	"Sun keyboard driver"

MODULE_AUTHOR("Vojtech Pavlik <vojtech@ucw.cz>");
MODULE_DESCRIPTION(DRIVER_DESC);
MODULE_LICENSE("GPL");

static unsigned char sunkbd_keycode[128] = {
	  0,128,114,129,115, 59, 60, 68, 61, 87, 62, 88, 63,100, 64,112,
	 65, 66, 67, 56,103,119, 99, 70,105,130,131,108,106,  1,  2,  3,
	  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 41, 14,110,113, 98, 55,
	116,132, 83,133,102, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
	 26, 27,111,127, 71, 72, 73, 74,134,135,107,  0, 29, 30, 31, 32,
	 33, 34, 35, 36, 37, 38, 39, 40, 43, 28, 96, 75, 76, 77, 82,136,
	104,137, 69, 42, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54,101,
	 79, 80, 81,  0,  0,  0,138, 58,125, 57,126,109, 86, 78
};

#define SUNKBD_CMD_RESET	0x1
#define SUNKBD_CMD_BELLON	0x2
#define SUNKBD_CMD_BELLOFF	0x3
#define SUNKBD_CMD_CLICK	0xa
#define SUNKBD_CMD_NOCLICK	0xb
#define SUNKBD_CMD_SETLED	0xe
#define SUNKBD_CMD_LAYOUT	0xf

#define SUNKBD_RET_RESET	0xff
#define SUNKBD_RET_ALLUP	0x7f
#define SUNKBD_RET_LAYOUT	0xfe

#define SUNKBD_LAYOUT_5_MASK	0x20
#define SUNKBD_RELEASE		0x80
#define SUNKBD_KEY		0x7f

/*
 * Per-keyboard data.
 */

struct sunkbd {
	unsigned char keycode[ARRAY_SIZE(sunkbd_keycode)];
	struct input_dev *dev;
	struct serio *serio;
	struct work_struct tq;
	wait_queue_head_t wait;
	char name[64];
	char phys[32];
	char type;
	bool enabled;
	volatile s8 reset;
	volatile s8 layout;
};

/*
 * sunkbd_interrupt() is called by the low level driver when a character
 * is received.
 */

static irqreturn_t sunkbd_interrupt(struct serio *serio,
		unsigned char data, unsigned int flags)
{
	struct sunkbd *sunkbd = serio_get_drvdata(serio);

	if (sunkbd->reset <= -1) {
		/*
		 * If cp[i] is 0xff, sunkbd->reset will stay -1.
		 * The keyboard sends 0xff 0xff 0xID on powerup.
		 */
		sunkbd->reset = data;
		wake_up_interruptible(&sunkbd->wait);
		goto out;
	}

	if (sunkbd->layout == -1) {
		sunkbd->layout = data;
		wake_up_interruptible(&sunkbd->wait);
		goto out;
	}

	switch (data) {

	case SUNKBD_RET_RESET:
		schedule_work(&sunkbd->tq);
		sunkbd->reset = -1;
		break;

	case SUNKBD_RET_LAYOUT:
		sunkbd->layout = -1;
		break;

	case SUNKBD_RET_ALLUP: /* All keys released */
		break;

	default:
		if (!sunkbd->enabled)
			break;

		if (sunkbd->keycode[data & SUNKBD_KEY]) {
			input_report_key(sunkbd->dev,
					 sunkbd->keycode[data & SUNKBD_KEY],
					 !(data & SUNKBD_RELEASE));
			input_sync(sunkbd->dev);
		} else {
			printk(KERN_WARNING
				"sunkbd.c: Unknown key (scancode %#x) %s.\n",
				data & SUNKBD_KEY,
				data & SUNKBD_RELEASE ? "released" : "pressed");
		}
	}
out:
	return IRQ_HANDLED;
}

/*
 * sunkbd_event() handles events from the input module.
 */

static int sunkbd_event(struct input_dev *dev,
			unsigned int type, unsigned int code, int value)
{
	struct sunkbd *sunkbd = input_get_drvdata(dev);

	switch (type) {

	case EV_LED:

		serio_write(sunkbd->serio, SUNKBD_CMD_SETLED);
		serio_write(sunkbd->serio,
			(!!test_bit(LED_CAPSL,   dev->led) << 3) |
			(!!test_bit(LED_SCROLLL, dev->led) << 2) |
			(!!test_bit(LED_COMPOSE, dev->led) << 1) |
			 !!test_bit(LED_NUML,    dev->led));
		return 0;

	case EV_SND:

		switch (code) {

		case SND_CLICK:
			serio_write(sunkbd->serio, SUNKBD_CMD_NOCLICK - value);
			return 0;

		case SND_BELL:
			serio_write(sunkbd->serio, SUNKBD_CMD_BELLOFF - value);
			return 0;
		}

		break;
	}

	return -1;
}

/*
 * sunkbd_initialize() checks for a Sun keyboard attached, and determines
 * its type.
 */

static int sunkbd_initialize(struct sunkbd *sunkbd)
{
	sunkbd->reset = -2;
	serio_write(sunkbd->serio, SUNKBD_CMD_RESET);
	wait_event_interruptible_timeout(sunkbd->wait, sunkbd->reset >= 0, HZ);
	if (sunkbd->reset < 0)
		return -1;

	sunkbd->type = sunkbd->reset;

	if (sunkbd->type == 4) {	/* Type 4 keyboard */
		sunkbd->layout = -2;
		serio_write(sunkbd->serio, SUNKBD_CMD_LAYOUT);
		wait_event_interruptible_timeout(sunkbd->wait,
						 sunkbd->layout >= 0, HZ / 4);
		if (sunkbd->layout < 0)
			return -1;
		if (sunkbd->layout & SUNKBD_LAYOUT_5_MASK)
			sunkbd->type = 5;
	}

	return 0;
}

/*
 * sunkbd_reinit() sets leds and beeps to a state the computer remembers they
 * were in.
 */

static void sunkbd_reinit(struct work_struct *work)
{
	struct sunkbd *sunkbd = container_of(work, struct sunkbd, tq);

	wait_event_interruptible_timeout(sunkbd->wait, sunkbd->reset >= 0, HZ);

	serio_write(sunkbd->serio, SUNKBD_CMD_SETLED);
	serio_write(sunkbd->serio,
		(!!test_bit(LED_CAPSL,   sunkbd->dev->led) << 3) |
		(!!test_bit(LED_SCROLLL, sunkbd->dev->led) << 2) |
		(!!test_bit(LED_COMPOSE, sunkbd->dev->led) << 1) |
		 !!test_bit(LED_NUML,    sunkbd->dev->led));
	serio_write(sunkbd->serio,
		SUNKBD_CMD_NOCLICK - !!test_bit(SND_CLICK, sunkbd->dev->snd));
	serio_write(sunkbd->serio,
		SUNKBD_CMD_BELLOFF - !!test_bit(SND_BELL, sunkbd->dev->snd));
}

static void sunkbd_enable(struct sunkbd *sunkbd, bool enable)
{
	serio_pause_rx(sunkbd->serio);
	sunkbd->enabled = enable;
	serio_continue_rx(sunkbd->serio);
}

/*
 * sunkbd_connect() probes for a Sun keyboard and fills the necessary
 * structures.
 */

static int sunkbd_connect(struct serio *serio, struct serio_driver *drv)
{
	struct sunkbd *sunkbd;
	struct input_dev *input_dev;
	int err = -ENOMEM;
	int i;

	sunkbd = kzalloc(sizeof(struct sunkbd), GFP_KERNEL);
	input_dev = input_allocate_device();
	if (!sunkbd || !input_dev)
		goto fail1;

	sunkbd->serio = serio;
	sunkbd->dev = input_dev;
	init_waitqueue_head(&sunkbd->wait);
	INIT_WORK(&sunkbd->tq, sunkbd_reinit);
	snprintf(sunkbd->phys, sizeof(sunkbd->phys), "%s/input0", serio->phys);

	serio_set_drvdata(serio, sunkbd);

	err = serio_open(serio, drv);
	if (err)
		goto fail2;

	if (sunkbd_initialize(sunkbd) < 0) {
		err = -ENODEV;
		goto fail3;
	}

	snprintf(sunkbd->name, sizeof(sunkbd->name),
		 "Sun Type %d keyboard", sunkbd->type);
	memcpy(sunkbd->keycode, sunkbd_keycode, sizeof(sunkbd->keycode));

	input_dev->name = sunkbd->name;
	input_dev->phys = sunkbd->phys;
	input_dev->id.bustype = BUS_RS232;
	input_dev->id.vendor  = SERIO_SUNKBD;
	input_dev->id.product = sunkbd->type;
	input_dev->id.version = 0x0100;
	input_dev->dev.parent = &serio->dev;

	input_set_drvdata(input_dev, sunkbd);

	input_dev->event = sunkbd_event;

	input_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_LED) |
		BIT_MASK(EV_SND) | BIT_MASK(EV_REP);
	input_dev->ledbit[0] = BIT_MASK(LED_CAPSL) | BIT_MASK(LED_COMPOSE) |
		BIT_MASK(LED_SCROLLL) | BIT_MASK(LED_NUML);
	input_dev->sndbit[0] = BIT_MASK(SND_CLICK) | BIT_MASK(SND_BELL);

	input_dev->keycode = sunkbd->keycode;
	input_dev->keycodesize = sizeof(unsigned char);
	input_dev->keycodemax = ARRAY_SIZE(sunkbd_keycode);
	for (i = 0; i < ARRAY_SIZE(sunkbd_keycode); i++)
		__set_bit(sunkbd->keycode[i], input_dev->keybit);
	__clear_bit(KEY_RESERVED, input_dev->keybit);

	sunkbd_enable(sunkbd, true);

	err = input_register_device(sunkbd->dev);
	if (err)
		goto fail4;

	return 0;

 fail4:	sunkbd_enable(sunkbd, false);
 fail3:	serio_close(serio);
 fail2:	serio_set_drvdata(serio, NULL);
 fail1:	input_free_device(input_dev);
	kfree(sunkbd);
	return err;
}

/*
 * sunkbd_disconnect() unregisters and closes behind us.
 */

static void sunkbd_disconnect(struct serio *serio)
{
	struct sunkbd *sunkbd = serio_get_drvdata(serio);

	sunkbd_enable(sunkbd, false);
	input_unregister_device(sunkbd->dev);
	serio_close(serio);
	serio_set_drvdata(serio, NULL);
	kfree(sunkbd);
}

static struct serio_device_id sunkbd_serio_ids[] = {
	{
		.type	= SERIO_RS232,
		.proto	= SERIO_SUNKBD,
		.id	= SERIO_ANY,
		.extra	= SERIO_ANY,
	},
	{
		.type	= SERIO_RS232,
		.proto	= SERIO_UNKNOWN, /* sunkbd does probe */
		.id	= SERIO_ANY,
		.extra	= SERIO_ANY,
	},
	{ 0 }
};

MODULE_DEVICE_TABLE(serio, sunkbd_serio_ids);

static struct serio_driver sunkbd_drv = {
	.driver		= {
		.name	= "sunkbd",
	},
	.description	= DRIVER_DESC,
	.id_table	= sunkbd_serio_ids,
	.interrupt	= sunkbd_interrupt,
	.connect	= sunkbd_connect,
	.disconnect	= sunkbd_disconnect,
};

/*
 * The functions for insering/removing us as a module.
 */

static int __init sunkbd_init(void)
{
	return serio_register_driver(&sunkbd_drv);
}

static void __exit sunkbd_exit(void)
{
	serio_unregister_driver(&sunkbd_drv);
}

module_init(sunkbd_init);
module_exit(sunkbd_exit);
back to top