Revision f17a5194859a82afe4164e938b92035b86c55794 authored by Steven Rostedt (Red Hat) on 31 May 2013, 01:10:37 UTC, committed by Steven Rostedt on 06 June 2013, 16:35:30 UTC
The irqsoff tracer records the max time that interrupts are disabled.
There are hooks in the assembly code that calls back into the tracer when
interrupts are disabled or enabled.

When they are enabled, the tracer checks if the amount of time they
were disabled is larger than the previous recorded max interrupts off
time. If it is, it creates a snapshot of the currently running trace
to store where the last largest interrupts off time was held and how
it happened.

During testing, this RCU lockdep dump appeared:

[ 1257.829021] ===============================
[ 1257.829021] [ INFO: suspicious RCU usage. ]
[ 1257.829021] 3.10.0-rc1-test+ #171 Tainted: G        W
[ 1257.829021] -------------------------------
[ 1257.829021] /home/rostedt/work/git/linux-trace.git/include/linux/rcupdate.h:780 rcu_read_lock() used illegally while idle!
[ 1257.829021]
[ 1257.829021] other info that might help us debug this:
[ 1257.829021]
[ 1257.829021]
[ 1257.829021] RCU used illegally from idle CPU!
[ 1257.829021] rcu_scheduler_active = 1, debug_locks = 0
[ 1257.829021] RCU used illegally from extended quiescent state!
[ 1257.829021] 2 locks held by trace-cmd/4831:
[ 1257.829021]  #0:  (max_trace_lock){......}, at: [<ffffffff810e2b77>] stop_critical_timing+0x1a3/0x209
[ 1257.829021]  #1:  (rcu_read_lock){.+.+..}, at: [<ffffffff810dae5a>] __update_max_tr+0x88/0x1ee
[ 1257.829021]
[ 1257.829021] stack backtrace:
[ 1257.829021] CPU: 3 PID: 4831 Comm: trace-cmd Tainted: G        W    3.10.0-rc1-test+ #171
[ 1257.829021] Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./To be filled by O.E.M., BIOS SDBLI944.86P 05/08/2007
[ 1257.829021]  0000000000000001 ffff880065f49da8 ffffffff8153dd2b ffff880065f49dd8
[ 1257.829021]  ffffffff81092a00 ffff88006bd78680 ffff88007add7500 0000000000000003
[ 1257.829021]  ffff88006bd78680 ffff880065f49e18 ffffffff810daebf ffffffff810dae5a
[ 1257.829021] Call Trace:
[ 1257.829021]  [<ffffffff8153dd2b>] dump_stack+0x19/0x1b
[ 1257.829021]  [<ffffffff81092a00>] lockdep_rcu_suspicious+0x109/0x112
[ 1257.829021]  [<ffffffff810daebf>] __update_max_tr+0xed/0x1ee
[ 1257.829021]  [<ffffffff810dae5a>] ? __update_max_tr+0x88/0x1ee
[ 1257.829021]  [<ffffffff811002b9>] ? user_enter+0xfd/0x107
[ 1257.829021]  [<ffffffff810dbf85>] update_max_tr_single+0x11d/0x12d
[ 1257.829021]  [<ffffffff811002b9>] ? user_enter+0xfd/0x107
[ 1257.829021]  [<ffffffff810e2b15>] stop_critical_timing+0x141/0x209
[ 1257.829021]  [<ffffffff8109569a>] ? trace_hardirqs_on+0xd/0xf
[ 1257.829021]  [<ffffffff811002b9>] ? user_enter+0xfd/0x107
[ 1257.829021]  [<ffffffff810e3057>] time_hardirqs_on+0x2a/0x2f
[ 1257.829021]  [<ffffffff811002b9>] ? user_enter+0xfd/0x107
[ 1257.829021]  [<ffffffff8109550c>] trace_hardirqs_on_caller+0x16/0x197
[ 1257.829021]  [<ffffffff8109569a>] trace_hardirqs_on+0xd/0xf
[ 1257.829021]  [<ffffffff811002b9>] user_enter+0xfd/0x107
[ 1257.829021]  [<ffffffff810029b4>] do_notify_resume+0x92/0x97
[ 1257.829021]  [<ffffffff8154bdca>] int_signal+0x12/0x17

What happened was entering into the user code, the interrupts were enabled
and a max interrupts off was recorded. The trace buffer was saved along with
various information about the task: comm, pid, uid, priority, etc.

The uid is recorded with task_uid(tsk). But this is a macro that uses rcu_read_lock()
to retrieve the data, and this happened to happen where RCU is blind (user_enter).

As only the preempt and irqs off tracers can have this happen, and they both
only have the tsk == current, if tsk == current, use current_uid() instead of
task_uid(), as current_uid() does not use RCU as only current can change its uid.

This fixes the RCU suspicious splat.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
1 parent 0184d50
Raw File
sortextable.c
/*
 * sortextable.c: Sort the kernel's exception table
 *
 * Copyright 2011 - 2012 Cavium, Inc.
 *
 * Based on code taken from recortmcount.c which is:
 *
 * Copyright 2009 John F. Reiser <jreiser@BitWagon.com>.  All rights reserved.
 * Licensed under the GNU General Public License, version 2 (GPLv2).
 *
 * Restructured to fit Linux format, as well as other updates:
 *  Copyright 2010 Steven Rostedt <srostedt@redhat.com>, Red Hat Inc.
 */

/*
 * Strategy: alter the vmlinux file in-place.
 */

#include <sys/types.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <getopt.h>
#include <elf.h>
#include <fcntl.h>
#include <setjmp.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include <tools/be_byteshift.h>
#include <tools/le_byteshift.h>

static int fd_map;	/* File descriptor for file being modified. */
static int mmap_failed; /* Boolean flag. */
static void *ehdr_curr; /* current ElfXX_Ehdr *  for resource cleanup */
static struct stat sb;	/* Remember .st_size, etc. */
static jmp_buf jmpenv;	/* setjmp/longjmp per-file error escape */

/* setjmp() return values */
enum {
	SJ_SETJMP = 0,  /* hardwired first return */
	SJ_FAIL,
	SJ_SUCCEED
};

/* Per-file resource cleanup when multiple files. */
static void
cleanup(void)
{
	if (!mmap_failed)
		munmap(ehdr_curr, sb.st_size);
	close(fd_map);
}

static void __attribute__((noreturn))
fail_file(void)
{
	cleanup();
	longjmp(jmpenv, SJ_FAIL);
}

static void __attribute__((noreturn))
succeed_file(void)
{
	cleanup();
	longjmp(jmpenv, SJ_SUCCEED);
}


/*
 * Get the whole file as a programming convenience in order to avoid
 * malloc+lseek+read+free of many pieces.  If successful, then mmap
 * avoids copying unused pieces; else just read the whole file.
 * Open for both read and write.
 */
static void *mmap_file(char const *fname)
{
	void *addr;

	fd_map = open(fname, O_RDWR);
	if (fd_map < 0 || fstat(fd_map, &sb) < 0) {
		perror(fname);
		fail_file();
	}
	if (!S_ISREG(sb.st_mode)) {
		fprintf(stderr, "not a regular file: %s\n", fname);
		fail_file();
	}
	addr = mmap(0, sb.st_size, PROT_READ|PROT_WRITE, MAP_SHARED,
		    fd_map, 0);
	if (addr == MAP_FAILED) {
		mmap_failed = 1;
		fprintf(stderr, "Could not mmap file: %s\n", fname);
		fail_file();
	}
	return addr;
}

static uint64_t r8be(const uint64_t *x)
{
	return get_unaligned_be64(x);
}
static uint32_t rbe(const uint32_t *x)
{
	return get_unaligned_be32(x);
}
static uint16_t r2be(const uint16_t *x)
{
	return get_unaligned_be16(x);
}
static uint64_t r8le(const uint64_t *x)
{
	return get_unaligned_le64(x);
}
static uint32_t rle(const uint32_t *x)
{
	return get_unaligned_le32(x);
}
static uint16_t r2le(const uint16_t *x)
{
	return get_unaligned_le16(x);
}

static void w8be(uint64_t val, uint64_t *x)
{
	put_unaligned_be64(val, x);
}
static void wbe(uint32_t val, uint32_t *x)
{
	put_unaligned_be32(val, x);
}
static void w2be(uint16_t val, uint16_t *x)
{
	put_unaligned_be16(val, x);
}
static void w8le(uint64_t val, uint64_t *x)
{
	put_unaligned_le64(val, x);
}
static void wle(uint32_t val, uint32_t *x)
{
	put_unaligned_le32(val, x);
}
static void w2le(uint16_t val, uint16_t *x)
{
	put_unaligned_le16(val, x);
}

static uint64_t (*r8)(const uint64_t *);
static uint32_t (*r)(const uint32_t *);
static uint16_t (*r2)(const uint16_t *);
static void (*w8)(uint64_t, uint64_t *);
static void (*w)(uint32_t, uint32_t *);
static void (*w2)(uint16_t, uint16_t *);

typedef void (*table_sort_t)(char *, int);

/* 32 bit and 64 bit are very similar */
#include "sortextable.h"
#define SORTEXTABLE_64
#include "sortextable.h"

static int compare_relative_table(const void *a, const void *b)
{
	int32_t av = (int32_t)r(a);
	int32_t bv = (int32_t)r(b);

	if (av < bv)
		return -1;
	if (av > bv)
		return 1;
	return 0;
}

static void sort_relative_table(char *extab_image, int image_size)
{
	int i;

	/*
	 * Do the same thing the runtime sort does, first normalize to
	 * being relative to the start of the section.
	 */
	i = 0;
	while (i < image_size) {
		uint32_t *loc = (uint32_t *)(extab_image + i);
		w(r(loc) + i, loc);
		i += 4;
	}

	qsort(extab_image, image_size / 8, 8, compare_relative_table);

	/* Now denormalize. */
	i = 0;
	while (i < image_size) {
		uint32_t *loc = (uint32_t *)(extab_image + i);
		w(r(loc) - i, loc);
		i += 4;
	}
}

static void
do_file(char const *const fname)
{
	table_sort_t custom_sort;
	Elf32_Ehdr *ehdr = mmap_file(fname);

	ehdr_curr = ehdr;
	switch (ehdr->e_ident[EI_DATA]) {
	default:
		fprintf(stderr, "unrecognized ELF data encoding %d: %s\n",
			ehdr->e_ident[EI_DATA], fname);
		fail_file();
		break;
	case ELFDATA2LSB:
		r = rle;
		r2 = r2le;
		r8 = r8le;
		w = wle;
		w2 = w2le;
		w8 = w8le;
		break;
	case ELFDATA2MSB:
		r = rbe;
		r2 = r2be;
		r8 = r8be;
		w = wbe;
		w2 = w2be;
		w8 = w8be;
		break;
	}  /* end switch */
	if (memcmp(ELFMAG, ehdr->e_ident, SELFMAG) != 0
	||  r2(&ehdr->e_type) != ET_EXEC
	||  ehdr->e_ident[EI_VERSION] != EV_CURRENT) {
		fprintf(stderr, "unrecognized ET_EXEC file %s\n", fname);
		fail_file();
	}

	custom_sort = NULL;
	switch (r2(&ehdr->e_machine)) {
	default:
		fprintf(stderr, "unrecognized e_machine %d %s\n",
			r2(&ehdr->e_machine), fname);
		fail_file();
		break;
	case EM_386:
	case EM_X86_64:
	case EM_S390:
		custom_sort = sort_relative_table;
		break;
	case EM_ARM:
	case EM_MIPS:
		break;
	}  /* end switch */

	switch (ehdr->e_ident[EI_CLASS]) {
	default:
		fprintf(stderr, "unrecognized ELF class %d %s\n",
			ehdr->e_ident[EI_CLASS], fname);
		fail_file();
		break;
	case ELFCLASS32:
		if (r2(&ehdr->e_ehsize) != sizeof(Elf32_Ehdr)
		||  r2(&ehdr->e_shentsize) != sizeof(Elf32_Shdr)) {
			fprintf(stderr,
				"unrecognized ET_EXEC file: %s\n", fname);
			fail_file();
		}
		do32(ehdr, fname, custom_sort);
		break;
	case ELFCLASS64: {
		Elf64_Ehdr *const ghdr = (Elf64_Ehdr *)ehdr;
		if (r2(&ghdr->e_ehsize) != sizeof(Elf64_Ehdr)
		||  r2(&ghdr->e_shentsize) != sizeof(Elf64_Shdr)) {
			fprintf(stderr,
				"unrecognized ET_EXEC file: %s\n", fname);
			fail_file();
		}
		do64(ghdr, fname, custom_sort);
		break;
	}
	}  /* end switch */

	cleanup();
}

int
main(int argc, char *argv[])
{
	int n_error = 0;  /* gcc-4.3.0 false positive complaint */
	int i;

	if (argc < 2) {
		fprintf(stderr, "usage: sortextable vmlinux...\n");
		return 0;
	}

	/* Process each file in turn, allowing deep failure. */
	for (i = 1; i < argc; i++) {
		char *file = argv[i];
		int const sjval = setjmp(jmpenv);

		switch (sjval) {
		default:
			fprintf(stderr, "internal error: %s\n", file);
			exit(1);
			break;
		case SJ_SETJMP:    /* normal sequence */
			/* Avoid problems if early cleanup() */
			fd_map = -1;
			ehdr_curr = NULL;
			mmap_failed = 1;
			do_file(file);
			break;
		case SJ_FAIL:    /* error in do_file or below */
			++n_error;
			break;
		case SJ_SUCCEED:    /* premature success */
			/* do nothing */
			break;
		}  /* end switch */
	}
	return !!n_error;
}
back to top