Revision 6f6acb00514c10be35529402f36ad7a288f08c2e authored by Michal Hocko on 22 May 2014, 18:54:19 UTC, committed by Linus Torvalds on 23 May 2014, 16:37:29 UTC
Commit 284f39afeaa4 ("mm: memcg: push !mm handling out to page cache
charge function") explicitly checks for page cache charges without any
mm context (from kernel thread context[1]).

This seemed to be the only possible case where memory could be charged
without mm context so commit 03583f1a631c ("memcg: remove unnecessary
!mm check from try_get_mem_cgroup_from_mm()") removed the mm check from
get_mem_cgroup_from_mm().  This however caused another NULL ptr
dereference during early boot when loopback kernel thread splices to
tmpfs as reported by Stephan Kulow:

  BUG: unable to handle kernel NULL pointer dereference at 0000000000000360
  IP: get_mem_cgroup_from_mm.isra.42+0x2b/0x60
  Oops: 0000 [#1] SMP
  Modules linked in: btrfs dm_multipath dm_mod scsi_dh multipath raid10 raid456 async_raid6_recov async_memcpy async_pq raid6_pq async_xor xor async_tx raid1 raid0 md_mod parport_pc parport nls_utf8 isofs usb_storage iscsi_ibft iscsi_boot_sysfs arc4 ecb fan thermal nfs lockd fscache nls_iso8859_1 nls_cp437 sg st hid_generic usbhid af_packet sunrpc sr_mod cdrom ata_generic uhci_hcd virtio_net virtio_blk ehci_hcd usbcore ata_piix floppy processor button usb_common virtio_pci virtio_ring virtio edd squashfs loop ppa]
  CPU: 0 PID: 97 Comm: loop1 Not tainted 3.15.0-rc5-5-default #1
  Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
  Call Trace:
    __mem_cgroup_try_charge_swapin+0x40/0xe0
    mem_cgroup_charge_file+0x8b/0xd0
    shmem_getpage_gfp+0x66b/0x7b0
    shmem_file_splice_read+0x18f/0x430
    splice_direct_to_actor+0xa2/0x1c0
    do_lo_receive+0x5a/0x60 [loop]
    loop_thread+0x298/0x720 [loop]
    kthread+0xc6/0xe0
    ret_from_fork+0x7c/0xb0

Also Branimir Maksimovic reported the following oops which is triggered
for the swapcache charge path from the accounting code for kernel threads:

  CPU: 1 PID: 160 Comm: kworker/u8:5 Tainted: P           OE 3.15.0-rc5-core2-custom #159
  Hardware name: System manufacturer System Product Name/MAXIMUSV GENE, BIOS 1903 08/19/2013
  task: ffff880404e349b0 ti: ffff88040486a000 task.ti: ffff88040486a000
  RIP: get_mem_cgroup_from_mm.isra.42+0x2b/0x60
  Call Trace:
    __mem_cgroup_try_charge_swapin+0x45/0xf0
    mem_cgroup_charge_file+0x9c/0xe0
    shmem_getpage_gfp+0x62c/0x770
    shmem_write_begin+0x38/0x40
    generic_perform_write+0xc5/0x1c0
    __generic_file_aio_write+0x1d1/0x3f0
    generic_file_aio_write+0x4f/0xc0
    do_sync_write+0x5a/0x90
    do_acct_process+0x4b1/0x550
    acct_process+0x6d/0xa0
    do_exit+0x827/0xa70
    kthread+0xc3/0xf0

This patch fixes the issue by reintroducing the mm check into
get_mem_cgroup_from_mm.  We could do the same trick in
__mem_cgroup_try_charge_swapin as we do for the regular page cache path
but it is not worth the trouble.  The check is not that expensive and it is
better to have get_mem_cgroup_from_mm more robust.

[1] - http://marc.info/?l=linux-mm&m=139463617808941&w=2

Fixes: 03583f1a631c ("memcg: remove unnecessary !mm check from try_get_mem_cgroup_from_mm()")
Reported-and-tested-by: Stephan Kulow <coolo@suse.com>
Reported-by: Branimir Maksimovic <branimir.maksimovic@gmail.com>
Signed-off-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
1 parent 55231e5
Raw File
timeconst.bc
/* With scale 0, "/" truncates to an integer and "%" yields the integer
   remainder; every computation in this script is integer arithmetic. */
scale=0

/* Greatest common divisor of a and b by the Euclidean algorithm.
   Returns a unchanged when b is 0. */
define gcd(a,b) {
	auto r;
	while (b != 0) {
		/* Replace (a, b) with (b, a mod b) until the remainder is 0. */
		r = a % b;
		a = b;
		b = r;
	}
	return (a);
}

/* Division by reciprocal multiplication: returns 2^b * n / d rounded
   up to the next integer (for positive d). */
define fmul(b,n,d) {
	auto p;
	p = 2^b * n;
	/* Adding d-1 before the truncating division rounds the quotient up. */
	return ((p + d - 1) / d);
}

/* Adjustment factor when a ceiling value is used.  Use as:
   (imul * n) + ((fmulxx * n + fadjxx) >> xx)
   The denominator is first reduced by gcd(n,d); the result is
   2^b * (r-1) / r, truncated, for that reduced denominator r. */
define fadj(b,n,d) {
	auto r;
	r = d / gcd(n,d);
	return ((2^b * (r - 1)) / r);
}

/* Compute the shift count to use with the mul/adj values: the smallest
   s for which fmul(s,n,d) reaches 2^(b-1), which brings the mul value
   into the range 2^(b-1) <= x < 2^b.  Such a shift value will be
   correct in the signed integer range and off by at most one in the
   upper half of the unsigned range. */
define fmuls(b,n,d) {
	auto shift, lim;
	lim = 2^(b-1);
	shift = 0;
	/* Try successive shift counts until the multiplier is large enough. */
	while (fmul(shift,n,d) < lim) {
		shift = shift + 1;
	}
	return (shift);
}

/* Print the text of a C header holding the time conversion constants
   for the tick rate hz.  The output consists of an include guard, a
   preprocessor check that HZ equals hz, and, for hz >= 2, the
   MUL32/ADJ32/SHR32 values (from fmul, fadj and fmuls) plus the
   gcd-reduced NUM/DEN fractions for HZ<->msec and HZ<->usec.
   s and cd are not declared auto, so they are global variables.
   The function does not return: its last statement is halt. */
define timeconst(hz) {
	print "/* Automatically generated by kernel/timeconst.bc */\n"
	print "/* Time conversion constants for HZ == ", hz, " */\n"
	print "\n"

	print "#ifndef KERNEL_TIMECONST_H\n"
	print "#define KERNEL_TIMECONST_H\n\n"

	print "#include <linux/param.h>\n"
	print "#include <linux/types.h>\n\n"

	/* Make the generated header refuse to build when HZ differs from
	   the hz it was generated for. */
	print "#if HZ != ", hz, "\n"
	print "#error \qkernel/timeconst.h has the wrong HZ value!\q\n"
	print "#endif\n\n"

	if (hz < 2) {
		/* NOTE(review): this branch never prints the closing #endif
		   of the include guard; only the else branch does. */
		print "#error Totally bogus HZ value!\n"
	} else {
		/* HZ -> msec.  obase is switched to 16 so the numbers that
		   follow the "0x" prefix are printed in hex, then back to 10
		   for the shift count. */
		s=fmuls(32,1000,hz)
		obase=16
		print "#define HZ_TO_MSEC_MUL32\tU64_C(0x", fmul(s,1000,hz), ")\n"
		print "#define HZ_TO_MSEC_ADJ32\tU64_C(0x", fadj(s,1000,hz), ")\n"
		obase=10
		print "#define HZ_TO_MSEC_SHR32\t", s, "\n"

		/* msec -> HZ, same scheme with numerator and denominator swapped. */
		s=fmuls(32,hz,1000)
		obase=16
		print "#define MSEC_TO_HZ_MUL32\tU64_C(0x", fmul(s,hz,1000), ")\n"
		print "#define MSEC_TO_HZ_ADJ32\tU64_C(0x", fadj(s,hz,1000), ")\n"
		obase=10
		print "#define MSEC_TO_HZ_SHR32\t", s, "\n"

		/* Fractions 1000/hz and hz/1000 reduced to lowest terms.
		   obase is already 10 here, so this assignment is redundant. */
		obase=10
		cd=gcd(hz,1000)
		print "#define HZ_TO_MSEC_NUM\t\t", 1000/cd, "\n"
		print "#define HZ_TO_MSEC_DEN\t\t", hz/cd, "\n"
		print "#define MSEC_TO_HZ_NUM\t\t", hz/cd, "\n"
		print "#define MSEC_TO_HZ_DEN\t\t", 1000/cd, "\n"
		print "\n"

		/* HZ -> usec. */
		s=fmuls(32,1000000,hz)
		obase=16
		print "#define HZ_TO_USEC_MUL32\tU64_C(0x", fmul(s,1000000,hz), ")\n"
		print "#define HZ_TO_USEC_ADJ32\tU64_C(0x", fadj(s,1000000,hz), ")\n"
		obase=10
		print "#define HZ_TO_USEC_SHR32\t", s, "\n"

		/* usec -> HZ. */
		s=fmuls(32,hz,1000000)
		obase=16
		print "#define USEC_TO_HZ_MUL32\tU64_C(0x", fmul(s,hz,1000000), ")\n"
		print "#define USEC_TO_HZ_ADJ32\tU64_C(0x", fadj(s,hz,1000000), ")\n"
		obase=10
		print "#define USEC_TO_HZ_SHR32\t", s, "\n"

		/* Fractions 1000000/hz and hz/1000000 reduced to lowest terms.
		   obase is already 10 here, so this assignment is redundant. */
		obase=10
		cd=gcd(hz,1000000)
		print "#define HZ_TO_USEC_NUM\t\t", 1000000/cd, "\n"
		print "#define HZ_TO_USEC_DEN\t\t", hz/cd, "\n"
		print "#define USEC_TO_HZ_NUM\t\t", hz/cd, "\n"
		print "#define USEC_TO_HZ_DEN\t\t", 1000000/cd, "\n"
		print "\n"

		print "#endif /* KERNEL_TIMECONST_H */\n"
	}
	/* Stop bc here so the call never returns to the caller. */
	halt
}

/* hz is never assigned in this file, so it must already be defined when
   bc reaches this line (presumably supplied by the build; confirm). */
timeconst(hz)
back to top