Revision fb59e9f1e9786635ea12e12bf6adbb132e10f979 authored by Hugh Dickins on 04 March 2008, 22:29:16 UTC, committed by Linus Torvalds on 05 March 2008, 00:35:15 UTC
While testing force_empty, during an exit_mmap, __mem_cgroup_remove_list
called from mem_cgroup_uncharge_page oopsed on a NULL pointer in the lru list.
 I couldn't see what racing tasks on other cpus were doing, but surmise that
another must have been in mem_cgroup_charge_common on the same page, between
its unlock_page_cgroup and spin_lock_irqsave near done (thanks to that kzalloc
which I'd almost changed to a kmalloc).

Normally such a race cannot happen, the ref_cnt prevents it, the final
uncharge cannot race with the initial charge.  But force_empty buggers the
ref_cnt, that's what it's all about; and thereafter forced pages are
vulnerable to races such as this (just think of a shared page also mapped into
an mm of another mem_cgroup than that just emptied).  And remain vulnerable
until they're freed indefinitely later.

This patch just fixes the oops by moving the unlock_page_cgroups down below
adding to and removing from the list (only possible given the previous patch);
and while we're at it, we might as well make it an invariant that
page->page_cgroup is always set while pc is on lru.

But this behaviour of force_empty seems highly unsatisfactory to me: why have
a ref_cnt if we always have to cope with it being violated (as in the earlier
page migration patch).  We may prefer force_empty to move pages to an orphan
mem_cgroup (could be the root, but better not), from which other cgroups could
recover them; we might need to reverse the locking again; but no time now for
such concerns.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
1 parent 9b3c0a0
Raw File
utsname_sysctl.c
/*
 *  Copyright (C) 2007
 *
 *  Author: Eric Biederman <ebiederm@xmision.com>
 *
 *  This program is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU General Public License as
 *  published by the Free Software Foundation, version 2 of the
 *  License.
 */

#include <linux/module.h>
#include <linux/uts.h>
#include <linux/utsname.h>
#include <linux/version.h>
#include <linux/sysctl.h>

static void *get_uts(ctl_table *table, int write)
{
	char *which = table->data;
	struct uts_namespace *uts_ns;

	uts_ns = current->nsproxy->uts_ns;
	which = (which - (char *)&init_uts_ns) + (char *)uts_ns;

	if (!write)
		down_read(&uts_sem);
	else
		down_write(&uts_sem);
	return which;
}

static void put_uts(ctl_table *table, int write, void *which)
{
	if (!write)
		up_read(&uts_sem);
	else
		up_write(&uts_sem);
}

#ifdef CONFIG_PROC_FS
/*
 *	Special case of dostring for the UTS structure. This has locks
 *	to observe. Should this be in kernel/sys.c ????
 */
static int proc_do_uts_string(ctl_table *table, int write, struct file *filp,
		  void __user *buffer, size_t *lenp, loff_t *ppos)
{
	struct ctl_table uts_table;
	int r;
	memcpy(&uts_table, table, sizeof(uts_table));
	uts_table.data = get_uts(table, write);
	r = proc_dostring(&uts_table,write,filp,buffer,lenp, ppos);
	put_uts(table, write, uts_table.data);
	return r;
}
#else
#define proc_do_uts_string NULL
#endif


#ifdef CONFIG_SYSCTL_SYSCALL
/* The generic string strategy routine: */
static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen,
		  void __user *oldval, size_t __user *oldlenp,
		  void __user *newval, size_t newlen)
{
	struct ctl_table uts_table;
	int r, write;
	write = newval && newlen;
	memcpy(&uts_table, table, sizeof(uts_table));
	uts_table.data = get_uts(table, write);
	r = sysctl_string(&uts_table, name, nlen,
		oldval, oldlenp, newval, newlen);
	put_uts(table, write, uts_table.data);
	return r;
}
#else
#define sysctl_uts_string NULL
#endif

static struct ctl_table uts_kern_table[] = {
	{
		.ctl_name	= KERN_OSTYPE,
		.procname	= "ostype",
		.data		= init_uts_ns.name.sysname,
		.maxlen		= sizeof(init_uts_ns.name.sysname),
		.mode		= 0444,
		.proc_handler	= proc_do_uts_string,
		.strategy	= sysctl_uts_string,
	},
	{
		.ctl_name	= KERN_OSRELEASE,
		.procname	= "osrelease",
		.data		= init_uts_ns.name.release,
		.maxlen		= sizeof(init_uts_ns.name.release),
		.mode		= 0444,
		.proc_handler	= proc_do_uts_string,
		.strategy	= sysctl_uts_string,
	},
	{
		.ctl_name	= KERN_VERSION,
		.procname	= "version",
		.data		= init_uts_ns.name.version,
		.maxlen		= sizeof(init_uts_ns.name.version),
		.mode		= 0444,
		.proc_handler	= proc_do_uts_string,
		.strategy	= sysctl_uts_string,
	},
	{
		.ctl_name	= KERN_NODENAME,
		.procname	= "hostname",
		.data		= init_uts_ns.name.nodename,
		.maxlen		= sizeof(init_uts_ns.name.nodename),
		.mode		= 0644,
		.proc_handler	= proc_do_uts_string,
		.strategy	= sysctl_uts_string,
	},
	{
		.ctl_name	= KERN_DOMAINNAME,
		.procname	= "domainname",
		.data		= init_uts_ns.name.domainname,
		.maxlen		= sizeof(init_uts_ns.name.domainname),
		.mode		= 0644,
		.proc_handler	= proc_do_uts_string,
		.strategy	= sysctl_uts_string,
	},
	{}
};

static struct ctl_table uts_root_table[] = {
	{
		.ctl_name	= CTL_KERN,
		.procname	= "kernel",
		.mode		= 0555,
		.child		= uts_kern_table,
	},
	{}
};

static int __init utsname_sysctl_init(void)
{
	register_sysctl_table(uts_root_table);
	return 0;
}

__initcall(utsname_sysctl_init);
back to top