Revision 6f6acb00514c10be35529402f36ad7a288f08c2e authored by Michal Hocko on 22 May 2014, 18:54:19 UTC, committed by Linus Torvalds on 23 May 2014, 16:37:29 UTC
Commit 284f39afeaa4 ("mm: memcg: push !mm handling out to page cache charge function") explicitly checks for page cache charges without any mm context (from kernel thread context[1]). This seemed to be the only possible case where memory could be charged without mm context so commit 03583f1a631c ("memcg: remove unnecessary !mm check from try_get_mem_cgroup_from_mm()") removed the mm check from get_mem_cgroup_from_mm(). This however caused another NULL ptr dereference during early boot when loopback kernel thread splices to tmpfs as reported by Stephan Kulow: BUG: unable to handle kernel NULL pointer dereference at 0000000000000360 IP: get_mem_cgroup_from_mm.isra.42+0x2b/0x60 Oops: 0000 [#1] SMP Modules linked in: btrfs dm_multipath dm_mod scsi_dh multipath raid10 raid456 async_raid6_recov async_memcpy async_pq raid6_pq async_xor xor async_tx raid1 raid0 md_mod parport_pc parport nls_utf8 isofs usb_storage iscsi_ibft iscsi_boot_sysfs arc4 ecb fan thermal nfs lockd fscache nls_iso8859_1 nls_cp437 sg st hid_generic usbhid af_packet sunrpc sr_mod cdrom ata_generic uhci_hcd virtio_net virtio_blk ehci_hcd usbcore ata_piix floppy processor button usb_common virtio_pci virtio_ring virtio edd squashfs loop ppa] CPU: 0 PID: 97 Comm: loop1 Not tainted 3.15.0-rc5-5-default #1 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 Call Trace: __mem_cgroup_try_charge_swapin+0x40/0xe0 mem_cgroup_charge_file+0x8b/0xd0 shmem_getpage_gfp+0x66b/0x7b0 shmem_file_splice_read+0x18f/0x430 splice_direct_to_actor+0xa2/0x1c0 do_lo_receive+0x5a/0x60 [loop] loop_thread+0x298/0x720 [loop] kthread+0xc6/0xe0 ret_from_fork+0x7c/0xb0 Also Branimir Maksimovic reported the following oops which is triggered for the swapcache charge path from the accounting code for kernel threads: CPU: 1 PID: 160 Comm: kworker/u8:5 Tainted: P OE 3.15.0-rc5-core2-custom #159 Hardware name: System manufacturer System Product Name/MAXIMUSV GENE, BIOS 1903 08/19/2013 task: ffff880404e349b0 ti: ffff88040486a000 task.ti: 
ffff88040486a000 RIP: get_mem_cgroup_from_mm.isra.42+0x2b/0x60 Call Trace: __mem_cgroup_try_charge_swapin+0x45/0xf0 mem_cgroup_charge_file+0x9c/0xe0 shmem_getpage_gfp+0x62c/0x770 shmem_write_begin+0x38/0x40 generic_perform_write+0xc5/0x1c0 __generic_file_aio_write+0x1d1/0x3f0 generic_file_aio_write+0x4f/0xc0 do_sync_write+0x5a/0x90 do_acct_process+0x4b1/0x550 acct_process+0x6d/0xa0 do_exit+0x827/0xa70 kthread+0xc3/0xf0 This patch fixes the issue by reintroducing mm check into get_mem_cgroup_from_mm. We could do the same trick in __mem_cgroup_try_charge_swapin as we do for the regular page cache path but it is not worth the trouble. The check is not that expensive and it is better to have get_mem_cgroup_from_mm more robust. [1] - http://marc.info/?l=linux-mm&m=139463617808941&w=2 Fixes: 03583f1a631c ("memcg: remove unnecessary !mm check from try_get_mem_cgroup_from_mm()") Reported-and-tested-by: Stephan Kulow <coolo@suse.com> Reported-by: Branimir Maksimovic <branimir.maksimovic@gmail.com> Signed-off-by: Michal Hocko <mhocko@suse.cz> Acked-by: Johannes Weiner <hannes@cmpxchg.org> Cc: Hugh Dickins <hughd@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
1 parent 55231e5
latencytop.c
/*
* latencytop.c: Latency display infrastructure
*
* (C) Copyright 2008 Intel Corporation
* Author: Arjan van de Ven <arjan@linux.intel.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; version 2
* of the License.
*/
/*
* CONFIG_LATENCYTOP enables a kernel latency tracking infrastructure that is
* used by the "latencytop" userspace tool. The latency that is tracked is not
* the 'traditional' interrupt latency (which is primarily caused by something
* else consuming CPU), but instead, it is the latency an application encounters
* because the kernel sleeps on its behalf for various reasons.
*
* This code tracks 2 levels of statistics:
* 1) System level latency
* 2) Per process latency
*
* The latency is stored in fixed sized data structures in an accumulated form;
* if the "same" latency cause is hit twice, this will be tracked as one entry
* in the data structure. Both the count, total accumulated latency and maximum
* latency are tracked in this data structure. When the fixed size structure is
* full, no new causes are tracked until the buffer is flushed by writing to
* the /proc file; the userspace tool does this on a regular basis.
*
* A latency cause is identified by a stringified backtrace at the point that
* the scheduler gets invoked. The userland tool will use this string to
* identify the cause of the latency in human readable form.
*
* The information is exported via /proc/latency_stats and /proc/<pid>/latency.
* These files look like this:
*
* Latency Top version : v0.1
* 70 59433 4897 i915_irq_wait drm_ioctl vfs_ioctl do_vfs_ioctl sys_ioctl
* | | | |
* | | | +----> the stringified backtrace
* | | +---------> The maximum latency for this entry in microseconds
* | +--------------> The accumulated latency for this entry (microseconds)
* +-------------------> The number of times this entry is hit
*
* (note: the average latency is the accumulated latency divided by the number
* of times)
*/
#include <linux/latencytop.h>
#include <linux/kallsyms.h>
#include <linux/seq_file.h>
#include <linux/notifier.h>
#include <linux/spinlock.h>
#include <linux/proc_fs.h>
#include <linux/export.h>
#include <linux/sched.h>
#include <linux/list.h>
#include <linux/stacktrace.h>
/*
 * Taken (with interrupts disabled) by clear_all_latency_tracing(),
 * clear_global_latency_tracing() and __account_scheduler_latency() around
 * their updates of the global table and of the per-task latency records.
 */
static DEFINE_RAW_SPINLOCK(latency_lock);
/* Number of slots in the system-wide latency table below. */
#define MAXLR 128
/*
 * System-wide latency table. A slot whose backtrace[0] is zero is treated
 * as unused by both the accounting code and the /proc output code.
 */
static struct latency_record latency_record[MAXLR];
/*
 * When zero, clear_all_latency_tracing() and
 * account_global_scheduler_latency() return without doing anything.
 * Not static; the code that sets it is not part of this file.
 */
int latencytop_enabled;
/*
 * Throw away every latency record stored for task @p and reset its record
 * count to zero. Nothing is touched while latencytop is disabled.
 */
void clear_all_latency_tracing(struct task_struct *p)
{
	unsigned long irqflags;

	if (latencytop_enabled) {
		raw_spin_lock_irqsave(&latency_lock, irqflags);
		p->latency_record_count = 0;
		memset(&p->latency_record, 0, sizeof(p->latency_record));
		raw_spin_unlock_irqrestore(&latency_lock, irqflags);
	}
}
/*
 * Zero the whole system-wide latency table under latency_lock, which marks
 * every slot as unused again.
 */
static void clear_global_latency_tracing(void)
{
	unsigned long irqflags;

	raw_spin_lock_irqsave(&latency_lock, irqflags);
	memset(latency_record, 0, sizeof(latency_record));
	raw_spin_unlock_irqrestore(&latency_lock, irqflags);
}
/*
 * Fold one latency sample into the system-wide latency_record[] table.
 *
 * @tsk: task that hit the latency; tasks without an mm are skipped
 * @lat: populated sample (count, time, max and backtrace)
 *
 * If a used slot already holds the same backtrace, its hit count, accumulated
 * time and maximum are updated. Otherwise the sample is copied into the
 * lowest unused slot. When every slot is in use the sample is dropped.
 *
 * The caller visible in this file, __account_scheduler_latency(), invokes
 * this with latency_lock held.
 */
static void __sched
account_global_scheduler_latency(struct task_struct *tsk, struct latency_record *lat)
{
	/* MAXLR is the "no unused slot seen" marker; valid slots are 0..MAXLR-1 */
	int first_free = MAXLR;
	int i;

	if (!latencytop_enabled)
		return;

	/* skip kernel threads for now */
	if (!tsk->mm)
		return;

	for (i = 0; i < MAXLR; i++) {
		int q, same = 1;

		/* Nothing stored: remember the lowest unused slot */
		if (!latency_record[i].backtrace[0]) {
			if (first_free > i)
				first_free = i;
			continue;
		}

		for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
			unsigned long record = lat->backtrace[q];

			if (latency_record[i].backtrace[q] != record) {
				same = 0;
				break;
			}

			/* 0 and ULONG_MAX entries mean end of backtrace: */
			if (record == 0 || record == ULONG_MAX)
				break;
		}

		if (same) {
			latency_record[i].count++;
			latency_record[i].time += lat->time;
			if (lat->time > latency_record[i].max)
				latency_record[i].max = lat->time;
			return;
		}
	}

	/*
	 * Table full: drop the sample. The bound is MAXLR (not MAXLR - 1) so
	 * that the last slot of the table can be allocated as well.
	 */
	if (first_free >= MAXLR)
		return;

	/* Allocate a new one: */
	memcpy(&latency_record[first_free], lat, sizeof(struct latency_record));
}
/*
* Iterator to store a backtrace into a latency record entry
*/
static inline void store_stacktrace(struct task_struct *tsk,
struct latency_record *lat)
{
struct stack_trace trace;
memset(&trace, 0, sizeof(trace));
trace.max_entries = LT_BACKTRACEDEPTH;
trace.entries = &lat->backtrace[0];
save_stack_trace_tsk(tsk, &trace);
}
/**
 * __account_scheduler_latency - record an occurred latency
 * @tsk: the task struct of the task hitting the latency
 * @usecs: the duration of the latency in microseconds
 * @inter: 1 if the sleep was interruptible, 0 if uninterruptible
 *
 * Main entry point for recording latency entries, as called by the
 * scheduler.
 *
 * Samples that are not real latencies are filtered out first:
 * interruptible sleeps longer than 5 msec are skipped, since these are
 * usually caused by waiting for events via select() and co, and so are
 * zero or negative durations (the latter caused by time going backwards).
 *
 * A remaining sample is accounted in the system-wide table and then in the
 * per-task records of @tsk: it is merged into the record with the same
 * backtrace if there is one, otherwise appended as a new record unless
 * @tsk already holds LT_SAVECOUNT records.
 */
void __sched
__account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
{
	struct latency_record lat;
	unsigned long flags;
	int slot;

	/* Negative sleeps are time going backwards, zero ones are boring */
	if (usecs <= 0)
		return;

	/* Long interruptible waits are generally user requested... */
	if (inter && usecs > 5000)
		return;

	memset(&lat, 0, sizeof(lat));
	lat.count = 1;
	lat.time = usecs;
	lat.max = usecs;
	store_stacktrace(tsk, &lat);

	raw_spin_lock_irqsave(&latency_lock, flags);

	account_global_scheduler_latency(tsk, &lat);

	for (slot = 0; slot < tsk->latency_record_count; slot++) {
		struct latency_record *entry = &tsk->latency_record[slot];
		int differs = 0;
		int depth;

		for (depth = 0; depth < LT_BACKTRACEDEPTH; depth++) {
			unsigned long addr = lat.backtrace[depth];

			if (entry->backtrace[depth] != addr) {
				differs = 1;
				break;
			}

			/* 0 and ULONG_MAX entries mean end of backtrace: */
			if (addr == 0 || addr == ULONG_MAX)
				break;
		}

		if (differs)
			continue;

		/* Same cause seen before: fold the sample into it */
		entry->count++;
		entry->time += lat.time;
		if (entry->max < lat.time)
			entry->max = lat.time;
		goto out_unlock;
	}

	/*
	 * short term hack; once LT_SAVECOUNT records exist we stop adding
	 * new ones; future we recycle:
	 */
	if (tsk->latency_record_count < LT_SAVECOUNT) {
		/* Allocated a new one: */
		slot = tsk->latency_record_count++;
		memcpy(&tsk->latency_record[slot], &lat,
		       sizeof(struct latency_record));
	}

out_unlock:
	raw_spin_unlock_irqrestore(&latency_lock, flags);
}
/*
 * seq_file show callback: print the version banner, then one line per used
 * slot of the system-wide table in the form
 * "<count> <total time> <max time> <symbol> <symbol> ...".
 * Always returns 0.
 */
static int lstats_show(struct seq_file *m, void *v)
{
	int slot;

	seq_puts(m, "Latency Top version : v0.1\n");

	for (slot = 0; slot < MAXLR; slot++) {
		struct latency_record *rec = &latency_record[slot];
		int depth;

		/* Unused slot */
		if (!rec->backtrace[0])
			continue;

		seq_printf(m, "%i %lu %lu", rec->count, rec->time, rec->max);

		for (depth = 0; depth < LT_BACKTRACEDEPTH; depth++) {
			unsigned long addr = rec->backtrace[depth];

			/* 0 and ULONG_MAX both end the backtrace */
			if (addr == 0 || addr == ULONG_MAX)
				break;
			seq_printf(m, " %ps", (void *)addr);
		}

		seq_puts(m, "\n");
	}

	return 0;
}
/*
 * Write handler installed in lstats_fops. The written data is never looked
 * at: any write clears the system-wide latency table, and the full @count is
 * reported back as consumed.
 */
static ssize_t lstats_write(struct file *file, const char __user *buf,
			    size_t count, loff_t *offs)
{
	clear_global_latency_tracing();

	return count;
}
/*
 * Open handler installed in lstats_fops: sets the file up through
 * single_open() with lstats_show() as the show callback and no private data,
 * and passes single_open()'s result back to the caller.
 */
static int lstats_open(struct inode *inode, struct file *filp)
{
	return single_open(filp, lstats_show, NULL);
}
/*
 * File operations registered for the "latency_stats" proc entry: reading goes
 * through the seq_file helpers, writing resets the global table.
 */
static const struct file_operations lstats_fops = {
	.open		= lstats_open,
	.release	= single_release,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.write		= lstats_write,
};
/*
 * Register the "latency_stats" proc entry (mode 0644, no parent directory)
 * backed by lstats_fops. The result of proc_create() is not checked, so this
 * initcall always reports success.
 */
static int __init init_lstats_procfs(void)
{
	proc_create("latency_stats", 0644, NULL, &lstats_fops);

	return 0;
}
device_initcall(init_lstats_procfs);
![swh spinner](/static/img/swh-spinner.gif)
Computing file changes ...