Revision 6f6acb00514c10be35529402f36ad7a288f08c2e authored by Michal Hocko on 22 May 2014, 18:54:19 UTC, committed by Linus Torvalds on 23 May 2014, 16:37:29 UTC
Commit 284f39afeaa4 ("mm: memcg: push !mm handling out to page cache
charge function") explicitly checks for page cache charges without any
mm context (from kernel thread context[1]).

This seemed to be the only possible case where memory could be charged
without mm context so commit 03583f1a631c ("memcg: remove unnecessary
!mm check from try_get_mem_cgroup_from_mm()") removed the mm check from
get_mem_cgroup_from_mm().  This however caused another NULL ptr
dereference during early boot when loopback kernel thread splices to
tmpfs as reported by Stephan Kulow:

  BUG: unable to handle kernel NULL pointer dereference at 0000000000000360
  IP: get_mem_cgroup_from_mm.isra.42+0x2b/0x60
  Oops: 0000 [#1] SMP
  Modules linked in: btrfs dm_multipath dm_mod scsi_dh multipath raid10 raid456 async_raid6_recov async_memcpy async_pq raid6_pq async_xor xor async_tx raid1 raid0 md_mod parport_pc parport nls_utf8 isofs usb_storage iscsi_ibft iscsi_boot_sysfs arc4 ecb fan thermal nfs lockd fscache nls_iso8859_1 nls_cp437 sg st hid_generic usbhid af_packet sunrpc sr_mod cdrom ata_generic uhci_hcd virtio_net virtio_blk ehci_hcd usbcore ata_piix floppy processor button usb_common virtio_pci virtio_ring virtio edd squashfs loop ppa]
  CPU: 0 PID: 97 Comm: loop1 Not tainted 3.15.0-rc5-5-default #1
  Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
  Call Trace:
    __mem_cgroup_try_charge_swapin+0x40/0xe0
    mem_cgroup_charge_file+0x8b/0xd0
    shmem_getpage_gfp+0x66b/0x7b0
    shmem_file_splice_read+0x18f/0x430
    splice_direct_to_actor+0xa2/0x1c0
    do_lo_receive+0x5a/0x60 [loop]
    loop_thread+0x298/0x720 [loop]
    kthread+0xc6/0xe0
    ret_from_fork+0x7c/0xb0

Also Branimir Maksimovic reported the following oops which is tiggered
for the swapcache charge path from the accounting code for kernel threads:

  CPU: 1 PID: 160 Comm: kworker/u8:5 Tainted: P           OE 3.15.0-rc5-core2-custom #159
  Hardware name: System manufacturer System Product Name/MAXIMUSV GENE, BIOS 1903 08/19/2013
  task: ffff880404e349b0 ti: ffff88040486a000 task.ti: ffff88040486a000
  RIP: get_mem_cgroup_from_mm.isra.42+0x2b/0x60
  Call Trace:
    __mem_cgroup_try_charge_swapin+0x45/0xf0
    mem_cgroup_charge_file+0x9c/0xe0
    shmem_getpage_gfp+0x62c/0x770
    shmem_write_begin+0x38/0x40
    generic_perform_write+0xc5/0x1c0
    __generic_file_aio_write+0x1d1/0x3f0
    generic_file_aio_write+0x4f/0xc0
    do_sync_write+0x5a/0x90
    do_acct_process+0x4b1/0x550
    acct_process+0x6d/0xa0
    do_exit+0x827/0xa70
    kthread+0xc3/0xf0

This patch fixes the issue by reintroducing mm check into
get_mem_cgroup_from_mm.  We could do the same trick in
__mem_cgroup_try_charge_swapin as we do for the regular page cache path
but it is not worth troubles.  The check is not that expensive and it is
better to have get_mem_cgroup_from_mm more robust.

[1] - http://marc.info/?l=linux-mm&m=139463617808941&w=2

Fixes: 03583f1a631c ("memcg: remove unnecessary !mm check from try_get_mem_cgroup_from_mm()")
Reported-and-tested-by: Stephan Kulow <coolo@suse.com>
Reported-by: Branimir Maksimovic <branimir.maksimovic@gmail.com>
Signed-off-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
1 parent 55231e5
Raw File
exec_domain.c
/*
 * Handling of different ABIs (personalities).
 *
 * We group personalities into execution domains which have their
 * own handlers for kernel entry points, signal mapping, etc...
 *
 * 2001-05-06	Complete rewrite,  Christoph Hellwig (hch@infradead.org)
 */

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/module.h>
#include <linux/personality.h>
#include <linux/proc_fs.h>
#include <linux/sched.h>
#include <linux/seq_file.h>
#include <linux/syscalls.h>
#include <linux/sysctl.h>
#include <linux/types.h>
#include <linux/fs_struct.h>


static void default_handler(int, struct pt_regs *);

static struct exec_domain *exec_domains = &default_exec_domain;
static DEFINE_RWLOCK(exec_domains_lock);


static unsigned long ident_map[32] = {
	0,	1,	2,	3,	4,	5,	6,	7,
	8,	9,	10,	11,	12,	13,	14,	15,
	16,	17,	18,	19,	20,	21,	22,	23,
	24,	25,	26,	27,	28,	29,	30,	31
};

struct exec_domain default_exec_domain = {
	.name		= "Linux",		/* name */
	.handler	= default_handler,	/* lcall7 causes a seg fault. */
	.pers_low	= 0, 			/* PER_LINUX personality. */
	.pers_high	= 0,			/* PER_LINUX personality. */
	.signal_map	= ident_map,		/* Identity map signals. */
	.signal_invmap	= ident_map,		/*  - both ways. */
};


static void
default_handler(int segment, struct pt_regs *regp)
{
	set_personality(0);

	if (current_thread_info()->exec_domain->handler != default_handler)
		current_thread_info()->exec_domain->handler(segment, regp);
	else
		send_sig(SIGSEGV, current, 1);
}

static struct exec_domain *
lookup_exec_domain(unsigned int personality)
{
	unsigned int pers = personality(personality);
	struct exec_domain *ep;

	read_lock(&exec_domains_lock);
	for (ep = exec_domains; ep; ep = ep->next) {
		if (pers >= ep->pers_low && pers <= ep->pers_high)
			if (try_module_get(ep->module))
				goto out;
	}

#ifdef CONFIG_MODULES
	read_unlock(&exec_domains_lock);
	request_module("personality-%d", pers);
	read_lock(&exec_domains_lock);

	for (ep = exec_domains; ep; ep = ep->next) {
		if (pers >= ep->pers_low && pers <= ep->pers_high)
			if (try_module_get(ep->module))
				goto out;
	}
#endif

	ep = &default_exec_domain;
out:
	read_unlock(&exec_domains_lock);
	return (ep);
}

int
register_exec_domain(struct exec_domain *ep)
{
	struct exec_domain	*tmp;
	int			err = -EBUSY;

	if (ep == NULL)
		return -EINVAL;

	if (ep->next != NULL)
		return -EBUSY;

	write_lock(&exec_domains_lock);
	for (tmp = exec_domains; tmp; tmp = tmp->next) {
		if (tmp == ep)
			goto out;
	}

	ep->next = exec_domains;
	exec_domains = ep;
	err = 0;

out:
	write_unlock(&exec_domains_lock);
	return (err);
}

int
unregister_exec_domain(struct exec_domain *ep)
{
	struct exec_domain	**epp;

	epp = &exec_domains;
	write_lock(&exec_domains_lock);
	for (epp = &exec_domains; *epp; epp = &(*epp)->next) {
		if (ep == *epp)
			goto unregister;
	}
	write_unlock(&exec_domains_lock);
	return -EINVAL;

unregister:
	*epp = ep->next;
	ep->next = NULL;
	write_unlock(&exec_domains_lock);
	return 0;
}

int __set_personality(unsigned int personality)
{
	struct exec_domain *oep = current_thread_info()->exec_domain;

	current_thread_info()->exec_domain = lookup_exec_domain(personality);
	current->personality = personality;
	module_put(oep->module);

	return 0;
}

#ifdef CONFIG_PROC_FS
static int execdomains_proc_show(struct seq_file *m, void *v)
{
	struct exec_domain	*ep;

	read_lock(&exec_domains_lock);
	for (ep = exec_domains; ep; ep = ep->next)
		seq_printf(m, "%d-%d\t%-16s\t[%s]\n",
			       ep->pers_low, ep->pers_high, ep->name,
			       module_name(ep->module));
	read_unlock(&exec_domains_lock);
	return 0;
}

static int execdomains_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, execdomains_proc_show, NULL);
}

static const struct file_operations execdomains_proc_fops = {
	.open		= execdomains_proc_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};

static int __init proc_execdomains_init(void)
{
	proc_create("execdomains", 0, NULL, &execdomains_proc_fops);
	return 0;
}
module_init(proc_execdomains_init);
#endif

SYSCALL_DEFINE1(personality, unsigned int, personality)
{
	unsigned int old = current->personality;

	if (personality != 0xffffffff)
		set_personality(personality);

	return old;
}


EXPORT_SYMBOL(register_exec_domain);
EXPORT_SYMBOL(unregister_exec_domain);
EXPORT_SYMBOL(__set_personality);
back to top