Revision e91467ecd1ef381377fd327c0ded922835ec52ab authored by Christian Borntraeger on 05 August 2006, 19:13:52 UTC, committed by Linus Torvalds on 06 August 2006, 15:57:46 UTC
This patch adds a barrier() in futex unqueue_me to avoid aliasing of two
pointers.

On my s390x system I saw the following oops:

Unable to handle kernel pointer dereference at virtual kernel address
0000000000000000
Oops: 0004 [#1]
CPU:    0    Not tainted
Process mytool (pid: 13613, task: 000000003ecb6ac0, ksp: 00000000366bdbd8)
Krnl PSW : 0704d00180000000 00000000003c9ac2 (_spin_lock+0xe/0x30)
Krnl GPRS: 00000000ffffffff 000000003ecb6ac0 0000000000000000 0700000000000000
           0000000000000000 0000000000000000 000001fe00002028 00000000000c091f
           000001fe00002054 000001fe00002054 0000000000000000 00000000366bddc0
           00000000005ef8c0 00000000003d00e8 0000000000144f91 00000000366bdcb8
Krnl Code: ba 4e 20 00 12 44 b9 16 00 3e a7 84 00 08 e3 e0 f0 88 00 04
Call Trace:
([<0000000000144f90>] unqueue_me+0x40/0xe4)
 [<0000000000145a0c>] do_futex+0x33c/0xc40
 [<000000000014643e>] sys_futex+0x12e/0x144
 [<000000000010bb00>] sysc_noemu+0x10/0x16
 [<000002000003741c>] 0x2000003741c

The code in question is:

static int unqueue_me(struct futex_q *q)
{
        int ret = 0;
        spinlock_t *lock_ptr;

        /* In the common case we don't take the spinlock, which is nice. */
 retry:
        lock_ptr = q->lock_ptr;
        if (lock_ptr != 0) {
                spin_lock(lock_ptr);
		/*
                 * q->lock_ptr can change between reading it and
                 * spin_lock(), causing us to take the wrong lock.  This
                 * corrects the race condition.
[...]

and my compiler (gcc 4.1.0) makes the following out of it:

00000000000003c8 <unqueue_me>:
     3c8:       eb bf f0 70 00 24       stmg    %r11,%r15,112(%r15)
     3ce:       c0 d0 00 00 00 00       larl    %r13,3ce <unqueue_me+0x6>
                        3d0: R_390_PC32DBL      .rodata+0x2a
     3d4:       a7 f1 1e 00             tml     %r15,7680
     3d8:       a7 84 00 01             je      3da <unqueue_me+0x12>
     3dc:       b9 04 00 ef             lgr     %r14,%r15
     3e0:       a7 fb ff d0             aghi    %r15,-48
     3e4:       b9 04 00 b2             lgr     %r11,%r2
     3e8:       e3 e0 f0 98 00 24       stg     %r14,152(%r15)
     3ee:       e3 c0 b0 28 00 04       lg      %r12,40(%r11)
		/* write q->lock_ptr in r12 */
     3f4:       b9 02 00 cc             ltgr    %r12,%r12
     3f8:       a7 84 00 4b             je      48e <unqueue_me+0xc6>
		/* if r12 is zero then jump over the code.... */
     3fc:       e3 20 b0 28 00 04       lg      %r2,40(%r11)
		/* write q->lock_ptr in r2 */
     402:       c0 e5 00 00 00 00       brasl   %r14,402 <unqueue_me+0x3a>
                        404: R_390_PC32DBL      _spin_lock+0x2
		/* use r2 as parameter for spin_lock */

So the code becomes more or less:
if (q->lock_ptr != 0) spin_lock(q->lock_ptr)
instead of
if (lock_ptr != 0) spin_lock(lock_ptr)

Which caused the oops from above.
After adding a barrier gcc creates code without this problem:
[...] (the same)
     3ee:       e3 c0 b0 28 00 04       lg      %r12,40(%r11)
     3f4:       b9 02 00 cc             ltgr    %r12,%r12
     3f8:       b9 04 00 2c             lgr     %r2,%r12
     3fc:       a7 84 00 48             je      48c <unqueue_me+0xc4>
     400:       c0 e5 00 00 00 00       brasl   %r14,400 <unqueue_me+0x38>
                        402: R_390_PC32DBL      _spin_lock+0x2

As a general note, this code of unqueue_me seems a bit fishy. The retry logic
of unqueue_me only works if we can guarantee, that the original value of
q->lock_ptr is always a spinlock (Otherwise we overwrite kernel memory). We
know that q->lock_ptr can change. I dont know what happens with the original
spinlock, as I am not an expert with the futex code.

Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Acked-by: Ingo Molnar <mingo@redhat.com>
Cc: Thomas Gleixner <tglx@timesys.com>
Signed-off-by: Christian Borntraeger <borntrae@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
1 parent 72f0b4e
Raw File
user.h
#ifndef _H8300_USER_H
#define _H8300_USER_H

#include <asm/page.h>

/* Core file format: The core file is written in such a way that gdb
   can understand it and provide useful information to the user (under
   linux we use the 'trad-core' bfd).  There are quite a number of
   obstacles to being able to view the contents of the floating point
   registers, and until these are solved you will not be able to view the
   contents of them.  Actually, you can read in the core file and look at
   the contents of the user struct to find out what the floating point
   registers contain.
   The actual file contents are as follows:
   UPAGE: 1 page consisting of a user struct that tells gdb what is present
   in the file.  Directly after this is a copy of the task_struct, which
   is currently not used by gdb, but it may come in useful at some point.
   All of the registers are stored as part of the upage.  The upage should
   always be only one page.
   DATA: The data area is stored.  We use current->end_text to
   current->brk to pick up all of the user variables, plus any memory
   that may have been malloced.  No attempt is made to determine if a page
   is demand-zero or if a page is totally unused, we just cover the entire
   range.  All of the addresses are rounded in such a way that an integral
   number of pages is written.
   STACK: We need the stack information in order to get a meaningful
   backtrace.  We need to write the data from (esp) to
   current->start_stack, so we round each of these off in order to be able
   to write an integer number of pages.
   The minimum core file size is 3 pages, or 12288 bytes.
*/

/* This is the old layout of "struct pt_regs" as of Linux 1.x, and
   is still the layout used by user (the new pt_regs doesn't have
   all registers). */
struct user_regs_struct {
	long er1,er2,er3,er4,er5,er6;
	long er0;
	long usp;
	long orig_er0;
	short ccr;
	long pc;
};

	
/* When the kernel dumps core, it starts by dumping the user struct -
   this will be used by gdb to figure out where the data and stack segments
   are within the file, and what virtual addresses to use. */
struct user{
/* We start with the registers, to mimic the way that "memory" is returned
   from the ptrace(3,...) function.  */
  struct user_regs_struct regs;	/* Where the registers are actually stored */
/* ptrace does not yet supply these.  Someday.... */
/* The rest of this junk is to help gdb figure out what goes where */
  unsigned long int u_tsize;	/* Text segment size (pages). */
  unsigned long int u_dsize;	/* Data segment size (pages). */
  unsigned long int u_ssize;	/* Stack segment size (pages). */
  unsigned long start_code;     /* Starting virtual address of text. */
  unsigned long start_stack;	/* Starting virtual address of stack area.
				   This is actually the bottom of the stack,
				   the top of the stack is always found in the
				   esp register.  */
  long int signal;     		/* Signal that caused the core dump. */
  int reserved;			/* No longer used */
  struct user_regs_struct *u_ar0;
				/* Used by gdb to help find the values for */
				/* the registers. */
  unsigned long magic;		/* To uniquely identify a core file */
  char u_comm[32];		/* User command that was responsible */
};
#define NBPG PAGE_SIZE
#define UPAGES 1
#define HOST_TEXT_START_ADDR (u.start_code)
#define HOST_STACK_END_ADDR (u.start_stack + u.u_ssize * NBPG)

#endif
back to top