Revision 8aef18845266f5c05904c610088f2d1ed58f6be3 authored by Al Viro on 16 June 2011, 14:10:06 UTC, committed by Al Viro on 16 June 2011, 15:28:16 UTC
[Kudos to dhowells for tracking that crap down] If two processes attempt to cause automounting on the same mountpoint at the same time, the vfsmount holding the mountpoint will be left with one too few references on it, causing a BUG when the kernel tries to clean up. The problem is that lock_mount() drops the caller's reference to the mountpoint's vfsmount in the case where it finds something already mounted on the mountpoint as it transits to the mounted filesystem and replaces path->mnt with the new mountpoint vfsmount. During a pathwalk, however, we don't take a reference on the vfsmount if it is the same as the one in the nameidata struct, but do_add_mount() doesn't know this. The fix is to make sure we have a ref on the vfsmount of the mountpoint before calling do_add_mount(). However, if lock_mount() doesn't transit, we're then left with an extra ref on the mountpoint vfsmount which needs releasing. We can handle that in follow_managed() by not making assumptions about what we can and what we cannot get from lookup_mnt() as the current code does. The callers of follow_managed() expect that reference to path->mnt will be grabbed iff path->mnt has been changed. follow_managed() and follow_automount() keep track of whether such reference has been grabbed and assume that it'll happen in those and only those cases that'll have us return with changed path->mnt. That assumption is almost correct - it breaks in case of racing automounts and in even harder to hit race between following a mountpoint and a couple of mount --move. The thing is, we don't need to make that assumption at all - after the end of loop in follow_manage() we can check if path->mnt has ended up unchanged and do mntput() if needed. The BUG can be reproduced with the following test program: #include <stdio.h> #include <sys/types.h> #include <sys/stat.h> #include <unistd.h> #include <sys/wait.h> int main(int argc, char **argv) { int pid, ws; struct stat buf; pid = fork(); stat(argv[1], &buf); if (pid > 0) wait(&ws); return 0; } and the following procedure: (1) Mount an NFS volume that on the server has something else mounted on a subdirectory. For instance, I can mount / from my server: mount warthog:/ /mnt -t nfs4 -r On the server /data has another filesystem mounted on it, so NFS will see a change in FSID as it walks down the path, and will mark /mnt/data as being a mountpoint. This will cause the automount code to be triggered. !!! Do not look inside the mounted fs at this point !!! (2) Run the above program on a file within the submount to generate two simultaneous automount requests: /tmp/forkstat /mnt/data/testfile (3) Unmount the automounted submount: umount /mnt/data (4) Unmount the original mount: umount /mnt At this point the kernel should throw a BUG with something like the following: BUG: Dentry ffff880032e3c5c0{i=2,n=} still in use (1) [unmount of nfs4 0:12] Note that the bug appears on the root dentry of the original mount, not the mountpoint and not the submount because sys_umount() hasn't got to its final mntput_no_expire() yet, but this isn't so obvious from the call trace: [<ffffffff8117cd82>] shrink_dcache_for_umount+0x69/0x82 [<ffffffff8116160e>] generic_shutdown_super+0x37/0x15b [<ffffffffa00fae56>] ? nfs_super_return_all_delegations+0x2e/0x1b1 [nfs] [<ffffffff811617f3>] kill_anon_super+0x1d/0x7e [<ffffffffa00d0be1>] nfs4_kill_super+0x60/0xb6 [nfs] [<ffffffff81161c17>] deactivate_locked_super+0x34/0x83 [<ffffffff811629ff>] deactivate_super+0x6f/0x7b [<ffffffff81186261>] mntput_no_expire+0x18d/0x199 [<ffffffff811862a8>] mntput+0x3b/0x44 [<ffffffff81186d87>] release_mounts+0xa2/0xbf [<ffffffff811876af>] sys_umount+0x47a/0x4ba [<ffffffff8109e1ca>] ? trace_hardirqs_on_caller+0x1fd/0x22f [<ffffffff816ea86b>] system_call_fastpath+0x16/0x1b as do_umount() is inlined. However, you can see release_mounts() in there. Note also that it may be necessary to have multiple CPU cores to be able to trigger this bug. Tested-by: Jeff Layton <jlayton@redhat.com> Tested-by: Ian Kent <raven@themaw.net> Signed-off-by: David Howells <dhowells@redhat.com> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
1 parent 50338b8
input-polldev.c
/*
* Generic implementation of a polled input device
* Copyright (c) 2007 Dmitry Torokhov
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/jiffies.h>
#include <linux/slab.h>
#include <linux/mutex.h>
#include <linux/workqueue.h>
#include <linux/input-polldev.h>
MODULE_AUTHOR("Dmitry Torokhov <dtor@mail.ru>");
MODULE_DESCRIPTION("Generic implementation of a polled input device");
MODULE_LICENSE("GPL v2");
MODULE_VERSION("0.1");
static void input_polldev_queue_work(struct input_polled_dev *dev)
{
unsigned long delay;
delay = msecs_to_jiffies(dev->poll_interval);
if (delay >= HZ)
delay = round_jiffies_relative(delay);
queue_delayed_work(system_freezable_wq, &dev->work, delay);
}
static void input_polled_device_work(struct work_struct *work)
{
struct input_polled_dev *dev =
container_of(work, struct input_polled_dev, work.work);
dev->poll(dev);
input_polldev_queue_work(dev);
}
static int input_open_polled_device(struct input_dev *input)
{
struct input_polled_dev *dev = input_get_drvdata(input);
if (dev->open)
dev->open(dev);
/* Only start polling if polling is enabled */
if (dev->poll_interval > 0)
queue_delayed_work(system_freezable_wq, &dev->work, 0);
return 0;
}
static void input_close_polled_device(struct input_dev *input)
{
struct input_polled_dev *dev = input_get_drvdata(input);
cancel_delayed_work_sync(&dev->work);
if (dev->close)
dev->close(dev);
}
/* SYSFS interface */
static ssize_t input_polldev_get_poll(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct input_polled_dev *polldev = dev_get_drvdata(dev);
return sprintf(buf, "%d\n", polldev->poll_interval);
}
static ssize_t input_polldev_set_poll(struct device *dev,
struct device_attribute *attr, const char *buf,
size_t count)
{
struct input_polled_dev *polldev = dev_get_drvdata(dev);
struct input_dev *input = polldev->input;
unsigned long interval;
if (strict_strtoul(buf, 0, &interval))
return -EINVAL;
if (interval < polldev->poll_interval_min)
return -EINVAL;
if (interval > polldev->poll_interval_max)
return -EINVAL;
mutex_lock(&input->mutex);
polldev->poll_interval = interval;
if (input->users) {
cancel_delayed_work_sync(&polldev->work);
if (polldev->poll_interval > 0)
input_polldev_queue_work(polldev);
}
mutex_unlock(&input->mutex);
return count;
}
static DEVICE_ATTR(poll, S_IRUGO | S_IWUSR, input_polldev_get_poll,
input_polldev_set_poll);
static ssize_t input_polldev_get_max(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct input_polled_dev *polldev = dev_get_drvdata(dev);
return sprintf(buf, "%d\n", polldev->poll_interval_max);
}
static DEVICE_ATTR(max, S_IRUGO, input_polldev_get_max, NULL);
static ssize_t input_polldev_get_min(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct input_polled_dev *polldev = dev_get_drvdata(dev);
return sprintf(buf, "%d\n", polldev->poll_interval_min);
}
static DEVICE_ATTR(min, S_IRUGO, input_polldev_get_min, NULL);
static struct attribute *sysfs_attrs[] = {
&dev_attr_poll.attr,
&dev_attr_max.attr,
&dev_attr_min.attr,
NULL
};
static struct attribute_group input_polldev_attribute_group = {
.attrs = sysfs_attrs
};
/**
* input_allocate_polled_device - allocate memory for polled device
*
* The function allocates memory for a polled device and also
* for an input device associated with this polled device.
*/
struct input_polled_dev *input_allocate_polled_device(void)
{
struct input_polled_dev *dev;
dev = kzalloc(sizeof(struct input_polled_dev), GFP_KERNEL);
if (!dev)
return NULL;
dev->input = input_allocate_device();
if (!dev->input) {
kfree(dev);
return NULL;
}
return dev;
}
EXPORT_SYMBOL(input_allocate_polled_device);
/**
* input_free_polled_device - free memory allocated for polled device
* @dev: device to free
*
* The function frees memory allocated for polling device and drops
* reference to the associated input device.
*/
void input_free_polled_device(struct input_polled_dev *dev)
{
if (dev) {
input_free_device(dev->input);
kfree(dev);
}
}
EXPORT_SYMBOL(input_free_polled_device);
/**
* input_register_polled_device - register polled device
* @dev: device to register
*
* The function registers previously initialized polled input device
* with input layer. The device should be allocated with call to
* input_allocate_polled_device(). Callers should also set up poll()
* method and set up capabilities (id, name, phys, bits) of the
* corresponding input_dev structure.
*/
int input_register_polled_device(struct input_polled_dev *dev)
{
struct input_dev *input = dev->input;
int error;
input_set_drvdata(input, dev);
INIT_DELAYED_WORK(&dev->work, input_polled_device_work);
if (!dev->poll_interval)
dev->poll_interval = 500;
if (!dev->poll_interval_max)
dev->poll_interval_max = dev->poll_interval;
input->open = input_open_polled_device;
input->close = input_close_polled_device;
error = input_register_device(input);
if (error)
return error;
error = sysfs_create_group(&input->dev.kobj,
&input_polldev_attribute_group);
if (error) {
input_unregister_device(input);
return error;
}
/*
* Take extra reference to the underlying input device so
* that it survives call to input_unregister_polled_device()
* and is deleted only after input_free_polled_device()
* has been invoked. This is needed to ease task of freeing
* sparse keymaps.
*/
input_get_device(input);
return 0;
}
EXPORT_SYMBOL(input_register_polled_device);
/**
* input_unregister_polled_device - unregister polled device
* @dev: device to unregister
*
* The function unregisters previously registered polled input
* device from input layer. Polling is stopped and device is
* ready to be freed with call to input_free_polled_device().
*/
void input_unregister_polled_device(struct input_polled_dev *dev)
{
sysfs_remove_group(&dev->input->dev.kobj,
&input_polldev_attribute_group);
input_unregister_device(dev->input);
}
EXPORT_SYMBOL(input_unregister_polled_device);
Computing file changes ...