Revision - e91467e - [PATCH] bug in futex unqueue_me

Revision e91467ecd1ef381377fd327c0ded922835ec52ab authored by Christian Borntraeger on 05 August 2006, 19:13:52 UTC, committed by Linus Torvalds on 06 August 2006, 15:57:46 UTC

[PATCH] bug in futex unqueue_me

This patch adds a barrier() in futex unqueue_me to avoid aliasing of two
pointers.

On my s390x system I saw the following oops:

Unable to handle kernel pointer dereference at virtual kernel address
0000000000000000
Oops: 0004 [#1]
CPU:    0    Not tainted
Process mytool (pid: 13613, task: 000000003ecb6ac0, ksp: 00000000366bdbd8)
Krnl PSW : 0704d00180000000 00000000003c9ac2 (_spin_lock+0xe/0x30)
Krnl GPRS: 00000000ffffffff 000000003ecb6ac0 0000000000000000 0700000000000000
           0000000000000000 0000000000000000 000001fe00002028 00000000000c091f
           000001fe00002054 000001fe00002054 0000000000000000 00000000366bddc0
           00000000005ef8c0 00000000003d00e8 0000000000144f91 00000000366bdcb8
Krnl Code: ba 4e 20 00 12 44 b9 16 00 3e a7 84 00 08 e3 e0 f0 88 00 04
Call Trace:
([<0000000000144f90>] unqueue_me+0x40/0xe4)
 [<0000000000145a0c>] do_futex+0x33c/0xc40
 [<000000000014643e>] sys_futex+0x12e/0x144
 [<000000000010bb00>] sysc_noemu+0x10/0x16
 [<000002000003741c>] 0x2000003741c

The code in question is:

static int unqueue_me(struct futex_q *q)
{
        int ret = 0;
        spinlock_t *lock_ptr;

        /* In the common case we don't take the spinlock, which is nice. */
 retry:
        lock_ptr = q->lock_ptr;
        if (lock_ptr != 0) {
                spin_lock(lock_ptr);
		/*
                 * q->lock_ptr can change between reading it and
                 * spin_lock(), causing us to take the wrong lock.  This
                 * corrects the race condition.
[...]

and my compiler (gcc 4.1.0) makes the following out of it:

00000000000003c8 <unqueue_me>:
     3c8:       eb bf f0 70 00 24       stmg    %r11,%r15,112(%r15)
     3ce:       c0 d0 00 00 00 00       larl    %r13,3ce <unqueue_me+0x6>
                        3d0: R_390_PC32DBL      .rodata+0x2a
     3d4:       a7 f1 1e 00             tml     %r15,7680
     3d8:       a7 84 00 01             je      3da <unqueue_me+0x12>
     3dc:       b9 04 00 ef             lgr     %r14,%r15
     3e0:       a7 fb ff d0             aghi    %r15,-48
     3e4:       b9 04 00 b2             lgr     %r11,%r2
     3e8:       e3 e0 f0 98 00 24       stg     %r14,152(%r15)
     3ee:       e3 c0 b0 28 00 04       lg      %r12,40(%r11)
		/* write q->lock_ptr in r12 */
     3f4:       b9 02 00 cc             ltgr    %r12,%r12
     3f8:       a7 84 00 4b             je      48e <unqueue_me+0xc6>
		/* if r12 is zero then jump over the code.... */
     3fc:       e3 20 b0 28 00 04       lg      %r2,40(%r11)
		/* write q->lock_ptr in r2 */
     402:       c0 e5 00 00 00 00       brasl   %r14,402 <unqueue_me+0x3a>
                        404: R_390_PC32DBL      _spin_lock+0x2
		/* use r2 as parameter for spin_lock */

So the code becomes more or less:
if (q->lock_ptr != 0) spin_lock(q->lock_ptr)
instead of
if (lock_ptr != 0) spin_lock(lock_ptr)

Which caused the oops from above.
After adding a barrier gcc creates code without this problem:
[...] (the same)
     3ee:       e3 c0 b0 28 00 04       lg      %r12,40(%r11)
     3f4:       b9 02 00 cc             ltgr    %r12,%r12
     3f8:       b9 04 00 2c             lgr     %r2,%r12
     3fc:       a7 84 00 48             je      48c <unqueue_me+0xc4>
     400:       c0 e5 00 00 00 00       brasl   %r14,400 <unqueue_me+0x38>
                        402: R_390_PC32DBL      _spin_lock+0x2

As a general note, this code of unqueue_me seems a bit fishy. The retry logic
of unqueue_me only works if we can guarantee, that the original value of
q->lock_ptr is always a spinlock (Otherwise we overwrite kernel memory). We
know that q->lock_ptr can change. I dont know what happens with the original
spinlock, as I am not an expert with the futex code.

Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Acked-by: Ingo Molnar <mingo@redhat.com>
Cc: Thomas Gleixner <tglx@timesys.com>
Signed-off-by: Christian Borntraeger <borntrae@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

1 parent 72f0b4e

Files
Changes

Permalinks

objectid.c

/*
 * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
 */

#include <linux/string.h>
#include <linux/random.h>
#include <linux/time.h>
#include <linux/reiserfs_fs.h>
#include <linux/reiserfs_fs_sb.h>

// find where objectid map starts
#define objectid_map(s,rs) (old_format_only (s) ? \
                         (__le32 *)((struct reiserfs_super_block_v1 *)(rs) + 1) :\
			 (__le32 *)((rs) + 1))

#ifdef CONFIG_REISERFS_CHECK

static void check_objectid_map(struct super_block *s, __le32 * map)
{
	if (le32_to_cpu(map[0]) != 1)
		reiserfs_panic(s,
			       "vs-15010: check_objectid_map: map corrupted: %lx",
			       (long unsigned int)le32_to_cpu(map[0]));

	// FIXME: add something else here
}

#else
static void check_objectid_map(struct super_block *s, __le32 * map)
{;
}
#endif

/* When we allocate objectids we allocate the first unused objectid.
   Each sequence of objectids in use (the odd sequences) is followed
   by a sequence of objectids not in use (the even sequences).  We
   only need to record the last objectid in each of these sequences
   (both the odd and even sequences) in order to fully define the
   boundaries of the sequences.  A consequence of allocating the first
   objectid not in use is that under most conditions this scheme is
   extremely compact.  The exception is immediately after a sequence
   of operations which deletes a large number of objects of
   non-sequential objectids, and even then it will become compact
   again as soon as more objects are created.  Note that many
   interesting optimizations of layout could result from complicating
   objectid assignment, but we have deferred making them for now. */

/* get unique object identifier */
__u32 reiserfs_get_unused_objectid(struct reiserfs_transaction_handle *th)
{
	struct super_block *s = th->t_super;
	struct reiserfs_super_block *rs = SB_DISK_SUPER_BLOCK(s);
	__le32 *map = objectid_map(s, rs);
	__u32 unused_objectid;

	BUG_ON(!th->t_trans_id);

	check_objectid_map(s, map);

	reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
	/* comment needed -Hans */
	unused_objectid = le32_to_cpu(map[1]);
	if (unused_objectid == U32_MAX) {
		reiserfs_warning(s, "%s: no more object ids", __FUNCTION__);
		reiserfs_restore_prepared_buffer(s, SB_BUFFER_WITH_SB(s));
		return 0;
	}

	/* This incrementation allocates the first unused objectid. That
	   is to say, the first entry on the objectid map is the first
	   unused objectid, and by incrementing it we use it.  See below
	   where we check to see if we eliminated a sequence of unused
	   objectids.... */
	map[1] = cpu_to_le32(unused_objectid + 1);

	/* Now we check to see if we eliminated the last remaining member of
	   the first even sequence (and can eliminate the sequence by
	   eliminating its last objectid from oids), and can collapse the
	   first two odd sequences into one sequence.  If so, then the net
	   result is to eliminate a pair of objectids from oids.  We do this
	   by shifting the entire map to the left. */
	if (sb_oid_cursize(rs) > 2 && map[1] == map[2]) {
		memmove(map + 1, map + 3,
			(sb_oid_cursize(rs) - 3) * sizeof(__u32));
		set_sb_oid_cursize(rs, sb_oid_cursize(rs) - 2);
	}

	journal_mark_dirty(th, s, SB_BUFFER_WITH_SB(s));
	return unused_objectid;
}

/* makes object identifier unused */
void reiserfs_release_objectid(struct reiserfs_transaction_handle *th,
			       __u32 objectid_to_release)
{
	struct super_block *s = th->t_super;
	struct reiserfs_super_block *rs = SB_DISK_SUPER_BLOCK(s);
	__le32 *map = objectid_map(s, rs);
	int i = 0;

	BUG_ON(!th->t_trans_id);
	//return;
	check_objectid_map(s, map);

	reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
	journal_mark_dirty(th, s, SB_BUFFER_WITH_SB(s));

	/* start at the beginning of the objectid map (i = 0) and go to
	   the end of it (i = disk_sb->s_oid_cursize).  Linear search is
	   what we use, though it is possible that binary search would be
	   more efficient after performing lots of deletions (which is
	   when oids is large.)  We only check even i's. */
	while (i < sb_oid_cursize(rs)) {
		if (objectid_to_release == le32_to_cpu(map[i])) {
			/* This incrementation unallocates the objectid. */
			//map[i]++;
			map[i] = cpu_to_le32(le32_to_cpu(map[i]) + 1);

			/* Did we unallocate the last member of an odd sequence, and can shrink oids? */
			if (map[i] == map[i + 1]) {
				/* shrink objectid map */
				memmove(map + i, map + i + 2,
					(sb_oid_cursize(rs) - i -
					 2) * sizeof(__u32));
				//disk_sb->s_oid_cursize -= 2;
				set_sb_oid_cursize(rs, sb_oid_cursize(rs) - 2);

				RFALSE(sb_oid_cursize(rs) < 2 ||
				       sb_oid_cursize(rs) > sb_oid_maxsize(rs),
				       "vs-15005: objectid map corrupted cur_size == %d (max == %d)",
				       sb_oid_cursize(rs), sb_oid_maxsize(rs));
			}
			return;
		}

		if (objectid_to_release > le32_to_cpu(map[i]) &&
		    objectid_to_release < le32_to_cpu(map[i + 1])) {
			/* size of objectid map is not changed */
			if (objectid_to_release + 1 == le32_to_cpu(map[i + 1])) {
				//objectid_map[i+1]--;
				map[i + 1] =
				    cpu_to_le32(le32_to_cpu(map[i + 1]) - 1);
				return;
			}

			/* JDM comparing two little-endian values for equality -- safe */
			if (sb_oid_cursize(rs) == sb_oid_maxsize(rs)) {
				/* objectid map must be expanded, but there is no space */
				PROC_INFO_INC(s, leaked_oid);
				return;
			}

			/* expand the objectid map */
			memmove(map + i + 3, map + i + 1,
				(sb_oid_cursize(rs) - i - 1) * sizeof(__u32));
			map[i + 1] = cpu_to_le32(objectid_to_release);
			map[i + 2] = cpu_to_le32(objectid_to_release + 1);
			set_sb_oid_cursize(rs, sb_oid_cursize(rs) + 2);
			return;
		}
		i += 2;
	}

	reiserfs_warning(s,
			 "vs-15011: reiserfs_release_objectid: tried to free free object id (%lu)",
			 (long unsigned)objectid_to_release);
}

int reiserfs_convert_objectid_map_v1(struct super_block *s)
{
	struct reiserfs_super_block *disk_sb = SB_DISK_SUPER_BLOCK(s);
	int cur_size = sb_oid_cursize(disk_sb);
	int new_size = (s->s_blocksize - SB_SIZE) / sizeof(__u32) / 2 * 2;
	int old_max = sb_oid_maxsize(disk_sb);
	struct reiserfs_super_block_v1 *disk_sb_v1;
	__le32 *objectid_map, *new_objectid_map;
	int i;

	disk_sb_v1 =
	    (struct reiserfs_super_block_v1 *)(SB_BUFFER_WITH_SB(s)->b_data);
	objectid_map = (__le32 *) (disk_sb_v1 + 1);
	new_objectid_map = (__le32 *) (disk_sb + 1);

	if (cur_size > new_size) {
		/* mark everyone used that was listed as free at the end of the objectid
		 ** map 
		 */
		objectid_map[new_size - 1] = objectid_map[cur_size - 1];
		set_sb_oid_cursize(disk_sb, new_size);
	}
	/* move the smaller objectid map past the end of the new super */
	for (i = new_size - 1; i >= 0; i--) {
		objectid_map[i + (old_max - new_size)] = objectid_map[i];
	}

	/* set the max size so we don't overflow later */
	set_sb_oid_maxsize(disk_sb, new_size);

	/* Zero out label and generate random UUID */
	memset(disk_sb->s_label, 0, sizeof(disk_sb->s_label));
	generate_random_uuid(disk_sb->s_uuid);

	/* finally, zero out the unused chunk of the new super */
	memset(disk_sb->s_unused, 0, sizeof(disk_sb->s_unused));
	return 0;
}

Showing with 0 additions and 0 deletions (0 / 0 diffs computed)

Computing file changes ...