Revision 85bd839983778fcd0c1c043327b14a046e979b39 authored by Gu Zheng on 10 June 2015, 18:14:43 UTC, committed by Linus Torvalds on 10 June 2015, 23:43:43 UTC
Izumi found the following oops when hot re-adding a node:

    BUG: unable to handle kernel paging request at ffffc90008963690
    IP: __wake_up_bit+0x20/0x70
    Oops: 0000 [#1] SMP
    CPU: 68 PID: 1237 Comm: rs:main Q:Reg Not tainted 4.1.0-rc5 #80
    Hardware name: FUJITSU PRIMEQUEST2800E/SB, BIOS PRIMEQUEST 2000 Series BIOS Version 1.87 04/28/2015
    task: ffff880838df8000 ti: ffff880017b94000 task.ti: ffff880017b94000
    RIP: 0010:[<ffffffff810dff80>]  [<ffffffff810dff80>] __wake_up_bit+0x20/0x70
    RSP: 0018:ffff880017b97be8  EFLAGS: 00010246
    RAX: ffffc90008963690 RBX: 00000000003c0000 RCX: 000000000000a4c9
    RDX: 0000000000000000 RSI: ffffea101bffd500 RDI: ffffc90008963648
    RBP: ffff880017b97c08 R08: 0000000002000020 R09: 0000000000000000
    R10: 0000000000000000 R11: 0000000000000000 R12: ffff8a0797c73800
    R13: ffffea101bffd500 R14: 0000000000000001 R15: 00000000003c0000
    FS:  00007fcc7ffff700(0000) GS:ffff880874800000(0000) knlGS:0000000000000000
    CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
    CR2: ffffc90008963690 CR3: 0000000836761000 CR4: 00000000001407e0
    Call Trace:
      unlock_page+0x6d/0x70
      generic_write_end+0x53/0xb0
      xfs_vm_write_end+0x29/0x80 [xfs]
      generic_perform_write+0x10a/0x1e0
      xfs_file_buffered_aio_write+0x14d/0x3e0 [xfs]
      xfs_file_write_iter+0x79/0x120 [xfs]
      __vfs_write+0xd4/0x110
      vfs_write+0xac/0x1c0
      SyS_write+0x58/0xd0
      system_call_fastpath+0x12/0x76
    Code: 5d c3 66 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 55 48 89 e5 48 83 ec 20 65 48 8b 04 25 28 00 00 00 48 89 45 f8 31 c0 48 8d 47 48 <48> 39 47 48 48 c7 45 e8 00 00 00 00 48 c7 45 f0 00 00 00 00 48
    RIP  [<ffffffff810dff80>] __wake_up_bit+0x20/0x70
     RSP <ffff880017b97be8>
    CR2: ffffc90008963690

Reproduce method (re-add a node)::
  Hot-add nodeA --> remove nodeA --> hot-add nodeA (panic)

This seems an use-after-free problem, and the root cause is
zone->wait_table was not set to *NULL* after free it in
try_offline_node.

When hot re-add a node, we will reuse the pgdat of it, so does the zone
struct, and when add pages to the target zone, it will init the zone
first (including the wait_table) if the zone is not initialized.  The
judgement of zone initialized is based on zone->wait_table:

	static inline bool zone_is_initialized(struct zone *zone)
	{
		return !!zone->wait_table;
	}

so if we do not set the zone->wait_table to *NULL* after free it, the
memory hotplug routine will skip the init of new zone when hot re-add
the node, and the wait_table still points to the freed memory, then we
will access the invalid address when trying to wake up the waiting
people after the i/o operation with the page is done, such as mentioned
above.

Signed-off-by: Gu Zheng <guz.fnst@cn.fujitsu.com>
Reported-by: Taku Izumi <izumi.taku@jp.fujitsu.com>
Reviewed by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Tang Chen <tangchen@cn.fujitsu.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
1 parent 5879ae5
Raw File
ansi_cprng.c
/*
 * PRNG: Pseudo Random Number Generator
 *       Based on NIST Recommended PRNG From ANSI X9.31 Appendix A.2.4 using
 *       AES 128 cipher
 *
 *  (C) Neil Horman <nhorman@tuxdriver.com>
 *
 *  This program is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU General Public License as published by the
 *  Free Software Foundation; either version 2 of the License, or (at your
 *  any later version.
 *
 *
 */

#include <crypto/internal/rng.h>
#include <linux/err.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/string.h>

#include "internal.h"

#define DEFAULT_PRNG_KEY "0123456789abcdef"
#define DEFAULT_PRNG_KSZ 16
#define DEFAULT_BLK_SZ 16
#define DEFAULT_V_SEED "zaybxcwdveuftgsh"

/*
 * Flags for the prng_context flags field
 */

#define PRNG_FIXED_SIZE 0x1
#define PRNG_NEED_RESET 0x2

/*
 * Note: DT is our counter value
 *	 I is our intermediate value
 *	 V is our seed vector
 * See http://csrc.nist.gov/groups/STM/cavp/documents/rng/931rngext.pdf
 * for implementation details
 */


struct prng_context {
	spinlock_t prng_lock;
	unsigned char rand_data[DEFAULT_BLK_SZ];
	unsigned char last_rand_data[DEFAULT_BLK_SZ];
	unsigned char DT[DEFAULT_BLK_SZ];
	unsigned char I[DEFAULT_BLK_SZ];
	unsigned char V[DEFAULT_BLK_SZ];
	u32 rand_data_valid;
	struct crypto_cipher *tfm;
	u32 flags;
};

static int dbg;

static void hexdump(char *note, unsigned char *buf, unsigned int len)
{
	if (dbg) {
		printk(KERN_CRIT "%s", note);
		print_hex_dump(KERN_CONT, "", DUMP_PREFIX_OFFSET,
				16, 1,
				buf, len, false);
	}
}

#define dbgprint(format, args...) do {\
if (dbg)\
	printk(format, ##args);\
} while (0)

static void xor_vectors(unsigned char *in1, unsigned char *in2,
			unsigned char *out, unsigned int size)
{
	int i;

	for (i = 0; i < size; i++)
		out[i] = in1[i] ^ in2[i];

}
/*
 * Returns DEFAULT_BLK_SZ bytes of random data per call
 * returns 0 if generation succeeded, <0 if something went wrong
 */
static int _get_more_prng_bytes(struct prng_context *ctx, int cont_test)
{
	int i;
	unsigned char tmp[DEFAULT_BLK_SZ];
	unsigned char *output = NULL;


	dbgprint(KERN_CRIT "Calling _get_more_prng_bytes for context %p\n",
		ctx);

	hexdump("Input DT: ", ctx->DT, DEFAULT_BLK_SZ);
	hexdump("Input I: ", ctx->I, DEFAULT_BLK_SZ);
	hexdump("Input V: ", ctx->V, DEFAULT_BLK_SZ);

	/*
	 * This algorithm is a 3 stage state machine
	 */
	for (i = 0; i < 3; i++) {

		switch (i) {
		case 0:
			/*
			 * Start by encrypting the counter value
			 * This gives us an intermediate value I
			 */
			memcpy(tmp, ctx->DT, DEFAULT_BLK_SZ);
			output = ctx->I;
			hexdump("tmp stage 0: ", tmp, DEFAULT_BLK_SZ);
			break;
		case 1:

			/*
			 * Next xor I with our secret vector V
			 * encrypt that result to obtain our
			 * pseudo random data which we output
			 */
			xor_vectors(ctx->I, ctx->V, tmp, DEFAULT_BLK_SZ);
			hexdump("tmp stage 1: ", tmp, DEFAULT_BLK_SZ);
			output = ctx->rand_data;
			break;
		case 2:
			/*
			 * First check that we didn't produce the same
			 * random data that we did last time around through this
			 */
			if (!memcmp(ctx->rand_data, ctx->last_rand_data,
					DEFAULT_BLK_SZ)) {
				if (cont_test) {
					panic("cprng %p Failed repetition check!\n",
						ctx);
				}

				printk(KERN_ERR
					"ctx %p Failed repetition check!\n",
					ctx);

				ctx->flags |= PRNG_NEED_RESET;
				return -EINVAL;
			}
			memcpy(ctx->last_rand_data, ctx->rand_data,
				DEFAULT_BLK_SZ);

			/*
			 * Lastly xor the random data with I
			 * and encrypt that to obtain a new secret vector V
			 */
			xor_vectors(ctx->rand_data, ctx->I, tmp,
				DEFAULT_BLK_SZ);
			output = ctx->V;
			hexdump("tmp stage 2: ", tmp, DEFAULT_BLK_SZ);
			break;
		}


		/* do the encryption */
		crypto_cipher_encrypt_one(ctx->tfm, output, tmp);

	}

	/*
	 * Now update our DT value
	 */
	for (i = DEFAULT_BLK_SZ - 1; i >= 0; i--) {
		ctx->DT[i] += 1;
		if (ctx->DT[i] != 0)
			break;
	}

	dbgprint("Returning new block for context %p\n", ctx);
	ctx->rand_data_valid = 0;

	hexdump("Output DT: ", ctx->DT, DEFAULT_BLK_SZ);
	hexdump("Output I: ", ctx->I, DEFAULT_BLK_SZ);
	hexdump("Output V: ", ctx->V, DEFAULT_BLK_SZ);
	hexdump("New Random Data: ", ctx->rand_data, DEFAULT_BLK_SZ);

	return 0;
}

/* Our exported functions */
static int get_prng_bytes(char *buf, size_t nbytes, struct prng_context *ctx,
				int do_cont_test)
{
	unsigned char *ptr = buf;
	unsigned int byte_count = (unsigned int)nbytes;
	int err;


	spin_lock_bh(&ctx->prng_lock);

	err = -EINVAL;
	if (ctx->flags & PRNG_NEED_RESET)
		goto done;

	/*
	 * If the FIXED_SIZE flag is on, only return whole blocks of
	 * pseudo random data
	 */
	err = -EINVAL;
	if (ctx->flags & PRNG_FIXED_SIZE) {
		if (nbytes < DEFAULT_BLK_SZ)
			goto done;
		byte_count = DEFAULT_BLK_SZ;
	}

	/*
	 * Return 0 in case of success as mandated by the kernel
	 * crypto API interface definition.
	 */
	err = 0;

	dbgprint(KERN_CRIT "getting %d random bytes for context %p\n",
		byte_count, ctx);


remainder:
	if (ctx->rand_data_valid == DEFAULT_BLK_SZ) {
		if (_get_more_prng_bytes(ctx, do_cont_test) < 0) {
			memset(buf, 0, nbytes);
			err = -EINVAL;
			goto done;
		}
	}

	/*
	 * Copy any data less than an entire block
	 */
	if (byte_count < DEFAULT_BLK_SZ) {
empty_rbuf:
		while (ctx->rand_data_valid < DEFAULT_BLK_SZ) {
			*ptr = ctx->rand_data[ctx->rand_data_valid];
			ptr++;
			byte_count--;
			ctx->rand_data_valid++;
			if (byte_count == 0)
				goto done;
		}
	}

	/*
	 * Now copy whole blocks
	 */
	for (; byte_count >= DEFAULT_BLK_SZ; byte_count -= DEFAULT_BLK_SZ) {
		if (ctx->rand_data_valid == DEFAULT_BLK_SZ) {
			if (_get_more_prng_bytes(ctx, do_cont_test) < 0) {
				memset(buf, 0, nbytes);
				err = -EINVAL;
				goto done;
			}
		}
		if (ctx->rand_data_valid > 0)
			goto empty_rbuf;
		memcpy(ptr, ctx->rand_data, DEFAULT_BLK_SZ);
		ctx->rand_data_valid += DEFAULT_BLK_SZ;
		ptr += DEFAULT_BLK_SZ;
	}

	/*
	 * Now go back and get any remaining partial block
	 */
	if (byte_count)
		goto remainder;

done:
	spin_unlock_bh(&ctx->prng_lock);
	dbgprint(KERN_CRIT "returning %d from get_prng_bytes in context %p\n",
		err, ctx);
	return err;
}

static void free_prng_context(struct prng_context *ctx)
{
	crypto_free_cipher(ctx->tfm);
}

static int reset_prng_context(struct prng_context *ctx,
			      unsigned char *key, size_t klen,
			      unsigned char *V, unsigned char *DT)
{
	int ret;
	unsigned char *prng_key;

	spin_lock_bh(&ctx->prng_lock);
	ctx->flags |= PRNG_NEED_RESET;

	prng_key = (key != NULL) ? key : (unsigned char *)DEFAULT_PRNG_KEY;

	if (!key)
		klen = DEFAULT_PRNG_KSZ;

	if (V)
		memcpy(ctx->V, V, DEFAULT_BLK_SZ);
	else
		memcpy(ctx->V, DEFAULT_V_SEED, DEFAULT_BLK_SZ);

	if (DT)
		memcpy(ctx->DT, DT, DEFAULT_BLK_SZ);
	else
		memset(ctx->DT, 0, DEFAULT_BLK_SZ);

	memset(ctx->rand_data, 0, DEFAULT_BLK_SZ);
	memset(ctx->last_rand_data, 0, DEFAULT_BLK_SZ);

	ctx->rand_data_valid = DEFAULT_BLK_SZ;

	ret = crypto_cipher_setkey(ctx->tfm, prng_key, klen);
	if (ret) {
		dbgprint(KERN_CRIT "PRNG: setkey() failed flags=%x\n",
			crypto_cipher_get_flags(ctx->tfm));
		goto out;
	}

	ret = 0;
	ctx->flags &= ~PRNG_NEED_RESET;
out:
	spin_unlock_bh(&ctx->prng_lock);
	return ret;
}

static int cprng_init(struct crypto_tfm *tfm)
{
	struct prng_context *ctx = crypto_tfm_ctx(tfm);

	spin_lock_init(&ctx->prng_lock);
	ctx->tfm = crypto_alloc_cipher("aes", 0, 0);
	if (IS_ERR(ctx->tfm)) {
		dbgprint(KERN_CRIT "Failed to alloc tfm for context %p\n",
				ctx);
		return PTR_ERR(ctx->tfm);
	}

	if (reset_prng_context(ctx, NULL, DEFAULT_PRNG_KSZ, NULL, NULL) < 0)
		return -EINVAL;

	/*
	 * after allocation, we should always force the user to reset
	 * so they don't inadvertently use the insecure default values
	 * without specifying them intentially
	 */
	ctx->flags |= PRNG_NEED_RESET;
	return 0;
}

static void cprng_exit(struct crypto_tfm *tfm)
{
	free_prng_context(crypto_tfm_ctx(tfm));
}

static int cprng_get_random(struct crypto_rng *tfm, u8 *rdata,
			    unsigned int dlen)
{
	struct prng_context *prng = crypto_rng_ctx(tfm);

	return get_prng_bytes(rdata, dlen, prng, 0);
}

/*
 *  This is the cprng_registered reset method the seed value is
 *  interpreted as the tuple { V KEY DT}
 *  V and KEY are required during reset, and DT is optional, detected
 *  as being present by testing the length of the seed
 */
static int cprng_reset(struct crypto_rng *tfm, u8 *seed, unsigned int slen)
{
	struct prng_context *prng = crypto_rng_ctx(tfm);
	u8 *key = seed + DEFAULT_BLK_SZ;
	u8 *dt = NULL;

	if (slen < DEFAULT_PRNG_KSZ + DEFAULT_BLK_SZ)
		return -EINVAL;

	if (slen >= (2 * DEFAULT_BLK_SZ + DEFAULT_PRNG_KSZ))
		dt = key + DEFAULT_PRNG_KSZ;

	reset_prng_context(prng, key, DEFAULT_PRNG_KSZ, seed, dt);

	if (prng->flags & PRNG_NEED_RESET)
		return -EINVAL;
	return 0;
}

#ifdef CONFIG_CRYPTO_FIPS
static int fips_cprng_get_random(struct crypto_rng *tfm, u8 *rdata,
			    unsigned int dlen)
{
	struct prng_context *prng = crypto_rng_ctx(tfm);

	return get_prng_bytes(rdata, dlen, prng, 1);
}

static int fips_cprng_reset(struct crypto_rng *tfm, u8 *seed, unsigned int slen)
{
	u8 rdata[DEFAULT_BLK_SZ];
	u8 *key = seed + DEFAULT_BLK_SZ;
	int rc;

	struct prng_context *prng = crypto_rng_ctx(tfm);

	if (slen < DEFAULT_PRNG_KSZ + DEFAULT_BLK_SZ)
		return -EINVAL;

	/* fips strictly requires seed != key */
	if (!memcmp(seed, key, DEFAULT_PRNG_KSZ))
		return -EINVAL;

	rc = cprng_reset(tfm, seed, slen);

	if (!rc)
		goto out;

	/* this primes our continuity test */
	rc = get_prng_bytes(rdata, DEFAULT_BLK_SZ, prng, 0);
	prng->rand_data_valid = DEFAULT_BLK_SZ;

out:
	return rc;
}
#endif

static struct crypto_alg rng_algs[] = { {
	.cra_name		= "stdrng",
	.cra_driver_name	= "ansi_cprng",
	.cra_priority		= 100,
	.cra_flags		= CRYPTO_ALG_TYPE_RNG,
	.cra_ctxsize		= sizeof(struct prng_context),
	.cra_type		= &crypto_rng_type,
	.cra_module		= THIS_MODULE,
	.cra_init		= cprng_init,
	.cra_exit		= cprng_exit,
	.cra_u			= {
		.rng = {
			.rng_make_random	= cprng_get_random,
			.rng_reset		= cprng_reset,
			.seedsize = DEFAULT_PRNG_KSZ + 2*DEFAULT_BLK_SZ,
		}
	}
#ifdef CONFIG_CRYPTO_FIPS
}, {
	.cra_name		= "fips(ansi_cprng)",
	.cra_driver_name	= "fips_ansi_cprng",
	.cra_priority		= 300,
	.cra_flags		= CRYPTO_ALG_TYPE_RNG,
	.cra_ctxsize		= sizeof(struct prng_context),
	.cra_type		= &crypto_rng_type,
	.cra_module		= THIS_MODULE,
	.cra_init		= cprng_init,
	.cra_exit		= cprng_exit,
	.cra_u			= {
		.rng = {
			.rng_make_random	= fips_cprng_get_random,
			.rng_reset		= fips_cprng_reset,
			.seedsize = DEFAULT_PRNG_KSZ + 2*DEFAULT_BLK_SZ,
		}
	}
#endif
} };

/* Module initalization */
static int __init prng_mod_init(void)
{
	return crypto_register_algs(rng_algs, ARRAY_SIZE(rng_algs));
}

static void __exit prng_mod_fini(void)
{
	crypto_unregister_algs(rng_algs, ARRAY_SIZE(rng_algs));
}

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Software Pseudo Random Number Generator");
MODULE_AUTHOR("Neil Horman <nhorman@tuxdriver.com>");
module_param(dbg, int, 0);
MODULE_PARM_DESC(dbg, "Boolean to enable debugging (0/1 == off/on)");
module_init(prng_mod_init);
module_exit(prng_mod_fini);
MODULE_ALIAS_CRYPTO("stdrng");
MODULE_ALIAS_CRYPTO("ansi_cprng");
back to top