Revision - 8aef188 - VFS: Fix vfsmount overput on simultaneous automount

Revision 8aef18845266f5c05904c610088f2d1ed58f6be3 authored by Al Viro on 16 June 2011, 14:10:06 UTC, committed by Al Viro on 16 June 2011, 15:28:16 UTC

VFS: Fix vfsmount overput on simultaneous automount

[Kudos to dhowells for tracking that crap down]

If two processes attempt to cause automounting on the same mountpoint at the
same time, the vfsmount holding the mountpoint will be left with one too few
references on it, causing a BUG when the kernel tries to clean up.

The problem is that lock_mount() drops the caller's reference to the
mountpoint's vfsmount in the case where it finds something already mounted on
the mountpoint as it transits to the mounted filesystem and replaces path->mnt
with the new mountpoint vfsmount.

During a pathwalk, however, we don't take a reference on the vfsmount if it is
the same as the one in the nameidata struct, but do_add_mount() doesn't know
this.

The fix is to make sure we have a ref on the vfsmount of the mountpoint before
calling do_add_mount().  However, if lock_mount() doesn't transit, we're then
left with an extra ref on the mountpoint vfsmount which needs releasing.
We can handle that in follow_managed() by not making assumptions about what
we can and what we cannot get from lookup_mnt() as the current code does.

The callers of follow_managed() expect that reference to path->mnt will be
grabbed iff path->mnt has been changed.  follow_managed() and follow_automount()
keep track of whether such reference has been grabbed and assume that it'll
happen in those and only those cases that'll have us return with changed
path->mnt.  That assumption is almost correct - it breaks in case of
racing automounts and in even harder to hit race between following a mountpoint
and a couple of mount --move.  The thing is, we don't need to make that
assumption at all - after the end of loop in follow_manage() we can check
if path->mnt has ended up unchanged and do mntput() if needed.

The BUG can be reproduced with the following test program:

	#include <stdio.h>
	#include <sys/types.h>
	#include <sys/stat.h>
	#include <unistd.h>
	#include <sys/wait.h>
	int main(int argc, char **argv)
	{
		int pid, ws;
		struct stat buf;
		pid = fork();
		stat(argv[1], &buf);
		if (pid > 0) wait(&ws);
		return 0;
	}

and the following procedure:

 (1) Mount an NFS volume that on the server has something else mounted on a
     subdirectory.  For instance, I can mount / from my server:

	mount warthog:/ /mnt -t nfs4 -r

     On the server /data has another filesystem mounted on it, so NFS will see
     a change in FSID as it walks down the path, and will mark /mnt/data as
     being a mountpoint.  This will cause the automount code to be triggered.

     !!! Do not look inside the mounted fs at this point !!!

 (2) Run the above program on a file within the submount to generate two
     simultaneous automount requests:

	/tmp/forkstat /mnt/data/testfile

 (3) Unmount the automounted submount:

	umount /mnt/data

 (4) Unmount the original mount:

	umount /mnt

     At this point the kernel should throw a BUG with something like the
     following:

	BUG: Dentry ffff880032e3c5c0{i=2,n=} still in use (1) [unmount of nfs4 0:12]

Note that the bug appears on the root dentry of the original mount, not the
mountpoint and not the submount because sys_umount() hasn't got to its final
mntput_no_expire() yet, but this isn't so obvious from the call trace:

 [<ffffffff8117cd82>] shrink_dcache_for_umount+0x69/0x82
 [<ffffffff8116160e>] generic_shutdown_super+0x37/0x15b
 [<ffffffffa00fae56>] ? nfs_super_return_all_delegations+0x2e/0x1b1 [nfs]
 [<ffffffff811617f3>] kill_anon_super+0x1d/0x7e
 [<ffffffffa00d0be1>] nfs4_kill_super+0x60/0xb6 [nfs]
 [<ffffffff81161c17>] deactivate_locked_super+0x34/0x83
 [<ffffffff811629ff>] deactivate_super+0x6f/0x7b
 [<ffffffff81186261>] mntput_no_expire+0x18d/0x199
 [<ffffffff811862a8>] mntput+0x3b/0x44
 [<ffffffff81186d87>] release_mounts+0xa2/0xbf
 [<ffffffff811876af>] sys_umount+0x47a/0x4ba
 [<ffffffff8109e1ca>] ? trace_hardirqs_on_caller+0x1fd/0x22f
 [<ffffffff816ea86b>] system_call_fastpath+0x16/0x1b

as do_umount() is inlined.  However, you can see release_mounts() in there.

Note also that it may be necessary to have multiple CPU cores to be able to
trigger this bug.

Tested-by: Jeff Layton <jlayton@redhat.com>
Tested-by: Ian Kent <raven@themaw.net>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

1 parent 50338b8

Files
Changes

Permalinks

twl4030-pwrbutton.c

/**
 * twl4030-pwrbutton.c - TWL4030 Power Button Input Driver
 *
 * Copyright (C) 2008-2009 Nokia Corporation
 *
 * Written by Peter De Schrijver <peter.de-schrijver@nokia.com>
 * Several fixes by Felipe Balbi <felipe.balbi@nokia.com>
 *
 * This file is subject to the terms and conditions of the GNU General
 * Public License. See the file "COPYING" in the main directory of this
 * archive for more details.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/input.h>
#include <linux/interrupt.h>
#include <linux/platform_device.h>
#include <linux/i2c/twl.h>

#define PWR_PWRON_IRQ (1 << 0)

#define STS_HW_CONDITIONS 0xf

static irqreturn_t powerbutton_irq(int irq, void *_pwr)
{
	struct input_dev *pwr = _pwr;
	int err;
	u8 value;

	err = twl_i2c_read_u8(TWL4030_MODULE_PM_MASTER, &value,
				STS_HW_CONDITIONS);
	if (!err)  {
		input_report_key(pwr, KEY_POWER, value & PWR_PWRON_IRQ);
		input_sync(pwr);
	} else {
		dev_err(pwr->dev.parent, "twl4030: i2c error %d while reading"
			" TWL4030 PM_MASTER STS_HW_CONDITIONS register\n", err);
	}

	return IRQ_HANDLED;
}

static int __init twl4030_pwrbutton_probe(struct platform_device *pdev)
{
	struct input_dev *pwr;
	int irq = platform_get_irq(pdev, 0);
	int err;

	pwr = input_allocate_device();
	if (!pwr) {
		dev_dbg(&pdev->dev, "Can't allocate power button\n");
		return -ENOMEM;
	}

	pwr->evbit[0] = BIT_MASK(EV_KEY);
	pwr->keybit[BIT_WORD(KEY_POWER)] = BIT_MASK(KEY_POWER);
	pwr->name = "twl4030_pwrbutton";
	pwr->phys = "twl4030_pwrbutton/input0";
	pwr->dev.parent = &pdev->dev;

	err = request_threaded_irq(irq, NULL, powerbutton_irq,
			IRQF_TRIGGER_FALLING | IRQF_TRIGGER_RISING,
			"twl4030_pwrbutton", pwr);
	if (err < 0) {
		dev_dbg(&pdev->dev, "Can't get IRQ for pwrbutton: %d\n", err);
		goto free_input_dev;
	}

	err = input_register_device(pwr);
	if (err) {
		dev_dbg(&pdev->dev, "Can't register power button: %d\n", err);
		goto free_irq;
	}

	platform_set_drvdata(pdev, pwr);

	return 0;

free_irq:
	free_irq(irq, pwr);
free_input_dev:
	input_free_device(pwr);
	return err;
}

static int __exit twl4030_pwrbutton_remove(struct platform_device *pdev)
{
	struct input_dev *pwr = platform_get_drvdata(pdev);
	int irq = platform_get_irq(pdev, 0);

	free_irq(irq, pwr);
	input_unregister_device(pwr);

	return 0;
}

static struct platform_driver twl4030_pwrbutton_driver = {
	.remove		= __exit_p(twl4030_pwrbutton_remove),
	.driver		= {
		.name	= "twl4030_pwrbutton",
		.owner	= THIS_MODULE,
	},
};

static int __init twl4030_pwrbutton_init(void)
{
	return platform_driver_probe(&twl4030_pwrbutton_driver,
			twl4030_pwrbutton_probe);
}
module_init(twl4030_pwrbutton_init);

static void __exit twl4030_pwrbutton_exit(void)
{
	platform_driver_unregister(&twl4030_pwrbutton_driver);
}
module_exit(twl4030_pwrbutton_exit);

MODULE_ALIAS("platform:twl4030_pwrbutton");
MODULE_DESCRIPTION("Triton2 Power Button");
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Peter De Schrijver <peter.de-schrijver@nokia.com>");
MODULE_AUTHOR("Felipe Balbi <felipe.balbi@nokia.com>");

Showing with 0 additions and 0 deletions (0 / 0 diffs computed)

Computing file changes ...