Revision e91467ecd1ef381377fd327c0ded922835ec52ab authored by Christian Borntraeger on 05 August 2006, 19:13:52 UTC, committed by Linus Torvalds on 06 August 2006, 15:57:46 UTC
This patch adds a barrier() in futex unqueue_me to avoid aliasing of two
pointers.

On my s390x system I saw the following oops:

Unable to handle kernel pointer dereference at virtual kernel address
0000000000000000
Oops: 0004 [#1]
CPU:    0    Not tainted
Process mytool (pid: 13613, task: 000000003ecb6ac0, ksp: 00000000366bdbd8)
Krnl PSW : 0704d00180000000 00000000003c9ac2 (_spin_lock+0xe/0x30)
Krnl GPRS: 00000000ffffffff 000000003ecb6ac0 0000000000000000 0700000000000000
           0000000000000000 0000000000000000 000001fe00002028 00000000000c091f
           000001fe00002054 000001fe00002054 0000000000000000 00000000366bddc0
           00000000005ef8c0 00000000003d00e8 0000000000144f91 00000000366bdcb8
Krnl Code: ba 4e 20 00 12 44 b9 16 00 3e a7 84 00 08 e3 e0 f0 88 00 04
Call Trace:
([<0000000000144f90>] unqueue_me+0x40/0xe4)
 [<0000000000145a0c>] do_futex+0x33c/0xc40
 [<000000000014643e>] sys_futex+0x12e/0x144
 [<000000000010bb00>] sysc_noemu+0x10/0x16
 [<000002000003741c>] 0x2000003741c

The code in question is:

static int unqueue_me(struct futex_q *q)
{
        int ret = 0;
        spinlock_t *lock_ptr;

        /* In the common case we don't take the spinlock, which is nice. */
 retry:
        lock_ptr = q->lock_ptr;
        if (lock_ptr != 0) {
                spin_lock(lock_ptr);
		/*
                 * q->lock_ptr can change between reading it and
                 * spin_lock(), causing us to take the wrong lock.  This
                 * corrects the race condition.
[...]

and my compiler (gcc 4.1.0) makes the following out of it:

00000000000003c8 <unqueue_me>:
     3c8:       eb bf f0 70 00 24       stmg    %r11,%r15,112(%r15)
     3ce:       c0 d0 00 00 00 00       larl    %r13,3ce <unqueue_me+0x6>
                        3d0: R_390_PC32DBL      .rodata+0x2a
     3d4:       a7 f1 1e 00             tml     %r15,7680
     3d8:       a7 84 00 01             je      3da <unqueue_me+0x12>
     3dc:       b9 04 00 ef             lgr     %r14,%r15
     3e0:       a7 fb ff d0             aghi    %r15,-48
     3e4:       b9 04 00 b2             lgr     %r11,%r2
     3e8:       e3 e0 f0 98 00 24       stg     %r14,152(%r15)
     3ee:       e3 c0 b0 28 00 04       lg      %r12,40(%r11)
		/* load q->lock_ptr into r12 */
     3f4:       b9 02 00 cc             ltgr    %r12,%r12
     3f8:       a7 84 00 4b             je      48e <unqueue_me+0xc6>
		/* if r12 is zero then jump over the code.... */
     3fc:       e3 20 b0 28 00 04       lg      %r2,40(%r11)
		/* load q->lock_ptr from memory again, this time into r2 */
     402:       c0 e5 00 00 00 00       brasl   %r14,402 <unqueue_me+0x3a>
                        404: R_390_PC32DBL      _spin_lock+0x2
		/* use r2 as parameter for spin_lock */

So the code becomes more or less:
if (q->lock_ptr != 0) spin_lock(q->lock_ptr)
instead of
if (lock_ptr != 0) spin_lock(lock_ptr)

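Because q->lock_ptr is loaded from memory a second time, it can change to
NULL between the check and the call to spin_lock(), which caused the oops
shown above.
After adding a barrier, gcc creates code without this problem: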
[...] (the same)
     3ee:       e3 c0 b0 28 00 04       lg      %r12,40(%r11)
     3f4:       b9 02 00 cc             ltgr    %r12,%r12
     3f8:       b9 04 00 2c             lgr     %r2,%r12
     3fc:       a7 84 00 48             je      48c <unqueue_me+0xc4>
     400:       c0 e5 00 00 00 00       brasl   %r14,400 <unqueue_me+0x38>
                        402: R_390_PC32DBL      _spin_lock+0x2

As a general note, the code of unqueue_me seems a bit fishy. The retry logic
of unqueue_me only works if we can guarantee that the original value of
q->lock_ptr always points to a spinlock (otherwise we overwrite kernel
memory). We know that q->lock_ptr can change. I don't know what happens with
the original spinlock, as I am not an expert in the futex code.

Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Acked-by: Ingo Molnar <mingo@redhat.com>
Cc: Thomas Gleixner <tglx@timesys.com>
Signed-off-by: Christian Borntraeger <borntrae@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
1 parent 72f0b4e
History
File Mode Size
9p
adfs
affs
afs
autofs
autofs4
befs
bfs
cifs
coda
configfs
cramfs
debugfs
devpts
efs
exportfs
ext2
ext3
fat
freevxfs
fuse
hfs
hfsplus
hostfs
hpfs
hppfs
hugetlbfs
isofs
jbd
jffs
jffs2
jfs
lockd
minix
msdos
ncpfs
nfs
nfs_common
nfsd
nls
ntfs
ocfs2
openpromfs
partitions
proc
qnx4
ramfs
reiserfs
romfs
smbfs
sysfs
sysv
udf
ufs
vfat
xfs
Kconfig -rw-r--r-- 69.8 KB
Kconfig.binfmt -rw-r--r-- 5.3 KB
Makefile -rw-r--r-- 3.3 KB
aio.c -rw-r--r-- 43.8 KB
attr.c -rw-r--r-- 4.2 KB
bad_inode.c -rw-r--r-- 2.8 KB
binfmt_aout.c -rw-r--r-- 14.8 KB
binfmt_elf.c -rw-r--r-- 45.5 KB
binfmt_elf_fdpic.c -rw-r--r-- 46.5 KB
binfmt_em86.c -rw-r--r-- 2.8 KB
binfmt_flat.c -rw-r--r-- 26.0 KB
binfmt_misc.c -rw-r--r-- 15.7 KB
binfmt_script.c -rw-r--r-- 2.7 KB
binfmt_som.c -rw-r--r-- 7.6 KB
bio.c -rw-r--r-- 30.0 KB
block_dev.c -rw-r--r-- 31.3 KB
buffer.c -rw-r--r-- 83.2 KB
char_dev.c -rw-r--r-- 9.9 KB
compat.c -rw-r--r-- 52.6 KB
compat_ioctl.c -rw-r--r-- 82.9 KB
dcache.c -rw-r--r-- 45.5 KB
dcookies.c -rw-r--r-- 6.4 KB
direct-io.c -rw-r--r-- 35.0 KB
dnotify.c -rw-r--r-- 4.3 KB
dquot.c -rw-r--r-- 52.5 KB
drop_caches.c -rw-r--r-- 1.4 KB
eventpoll.c -rw-r--r-- 44.3 KB
exec.c -rw-r--r-- 34.9 KB
fcntl.c -rw-r--r-- 13.7 KB
fifo.c -rw-r--r-- 3.1 KB
file.c -rw-r--r-- 8.6 KB
file_table.c -rw-r--r-- 6.7 KB
filesystems.c -rw-r--r-- 5.2 KB
fs-writeback.c -rw-r--r-- 19.8 KB
inode.c -rw-r--r-- 36.5 KB
inotify.c -rw-r--r-- 19.2 KB
inotify_user.c -rw-r--r-- 17.3 KB
ioctl.c -rw-r--r-- 3.9 KB
ioprio.c -rw-r--r-- 4.0 KB
libfs.c -rw-r--r-- 15.4 KB
locks.c -rw-r--r-- 55.1 KB
mbcache.c -rw-r--r-- 18.2 KB
mpage.c -rw-r--r-- 22.8 KB
namei.c -rw-r--r-- 67.0 KB
namespace.c -rw-r--r-- 46.9 KB
nfsctl.c -rw-r--r-- 2.4 KB
open.c -rw-r--r-- 27.5 KB
pipe.c -rw-r--r-- 21.4 KB
pnode.c -rw-r--r-- 7.6 KB
pnode.h -rw-r--r-- 1013 bytes
posix_acl.c -rw-r--r-- 8.5 KB
quota.c -rw-r--r-- 8.9 KB
quota_v1.c -rw-r--r-- 5.7 KB
quota_v2.c -rw-r--r-- 20.1 KB
read_write.c -rw-r--r-- 16.2 KB
readdir.c -rw-r--r-- 6.7 KB
select.c -rw-r--r-- 20.6 KB
seq_file.c -rw-r--r-- 9.6 KB
splice.c -rw-r--r-- 34.4 KB
stat.c -rw-r--r-- 10.5 KB
super.c -rw-r--r-- 20.5 KB
sync.c -rw-r--r-- 4.4 KB
xattr.c -rw-r--r-- 13.3 KB
xattr_acl.c -rw-r--r-- 2.3 KB

back to top