Revision c9d398fa237882ea07167e23bcfc5e6847066518 authored by Naoya Horiguchi on 31 March 2017, 22:11:55 UTC, committed by Linus Torvalds on 01 April 2017, 00:13:30 UTC
I found the race condition which triggers the following bug when
move_pages() and soft offline are called on a single hugetlb page
concurrently.

    Soft offlining page 0x119400 at 0x700000000000
    BUG: unable to handle kernel paging request at ffffea0011943820
    IP: follow_huge_pmd+0x143/0x190
    PGD 7ffd2067
    PUD 7ffd1067
    PMD 0
        [61163.582052] Oops: 0000 [#1] SMP
    Modules linked in: binfmt_misc ppdev virtio_balloon parport_pc pcspkr i2c_piix4 parport i2c_core acpi_cpufreq ip_tables xfs libcrc32c ata_generic pata_acpi virtio_blk 8139too crc32c_intel ata_piix serio_raw libata virtio_pci 8139cp virtio_ring virtio mii floppy dm_mirror dm_region_hash dm_log dm_mod [last unloaded: cap_check]
    CPU: 0 PID: 22573 Comm: iterate_numa_mo Tainted: P           OE   4.11.0-rc2-mm1+ #2
    Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011
    RIP: 0010:follow_huge_pmd+0x143/0x190
    RSP: 0018:ffffc90004bdbcd0 EFLAGS: 00010202
    RAX: 0000000465003e80 RBX: ffffea0004e34d30 RCX: 00003ffffffff000
    RDX: 0000000011943800 RSI: 0000000000080001 RDI: 0000000465003e80
    RBP: ffffc90004bdbd18 R08: 0000000000000000 R09: ffff880138d34000
    R10: ffffea0004650000 R11: 0000000000c363b0 R12: ffffea0011943800
    R13: ffff8801b8d34000 R14: ffffea0000000000 R15: 000077ff80000000
    FS:  00007fc977710740(0000) GS:ffff88007dc00000(0000) knlGS:0000000000000000
    CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
    CR2: ffffea0011943820 CR3: 000000007a746000 CR4: 00000000001406f0
    Call Trace:
     follow_page_mask+0x270/0x550
     SYSC_move_pages+0x4ea/0x8f0
     SyS_move_pages+0xe/0x10
     do_syscall_64+0x67/0x180
     entry_SYSCALL64_slow_path+0x25/0x25
    RIP: 0033:0x7fc976e03949
    RSP: 002b:00007ffe72221d88 EFLAGS: 00000246 ORIG_RAX: 0000000000000117
    RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007fc976e03949
    RDX: 0000000000c22390 RSI: 0000000000001400 RDI: 0000000000005827
    RBP: 00007ffe72221e00 R08: 0000000000c2c3a0 R09: 0000000000000004
    R10: 0000000000c363b0 R11: 0000000000000246 R12: 0000000000400650
    R13: 00007ffe72221ee0 R14: 0000000000000000 R15: 0000000000000000
    Code: 81 e4 ff ff 1f 00 48 21 c2 49 c1 ec 0c 48 c1 ea 0c 4c 01 e2 49 bc 00 00 00 00 00 ea ff ff 48 c1 e2 06 49 01 d4 f6 45 bc 04 74 90 <49> 8b 7c 24 20 40 f6 c7 01 75 2b 4c 89 e7 8b 47 1c 85 c0 7e 2a
    RIP: follow_huge_pmd+0x143/0x190 RSP: ffffc90004bdbcd0
    CR2: ffffea0011943820
    ---[ end trace e4f81353a2d23232 ]---
    Kernel panic - not syncing: Fatal exception
    Kernel Offset: disabled

This bug is triggered when pmd_present() returns true for non-present
hugetlb, so fixing the present check in follow_huge_pmd() prevents it.
Using pmd_present() to determine present/non-present for hugetlb is not
correct, because pmd_present() checks multiple bits (not only
_PAGE_PRESENT) for historical reason and it can misjudge hugetlb state.

Fixes: e66f17ff7177 ("mm/hugetlb: take page table lock in follow_huge_pmd()")
Link: http://lkml.kernel.org/r/1490149898-20231-1-git-send-email-n-horiguchi@ah.jp.nec.com
Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Cc: <stable@vger.kernel.org>        [4.0+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
1 parent 0cefabd
Raw File
kcov.c
#define pr_fmt(fmt) "kcov: " fmt

#define DISABLE_BRANCH_PROFILING
#include <linux/atomic.h>
#include <linux/compiler.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/preempt.h>
#include <linux/printk.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/vmalloc.h>
#include <linux/debugfs.h>
#include <linux/uaccess.h>
#include <linux/kcov.h>
#include <asm/setup.h>

/*
 * kcov descriptor (one per opened debugfs file).
 * State transitions of the descriptor:
 *  - initial state after open()
 *  - then there must be a single ioctl(KCOV_INIT_TRACE) call
 *  - then, mmap() call (several calls are allowed but not useful)
 *  - then, repeated enable/disable for a task (only one task a time allowed)
 */
struct kcov {
	/*
	 * Reference counter. We keep one for:
	 *  - opened file descriptor
	 *  - task with enabled coverage (we can't unwire it from another task)
	 */
	atomic_t		refcount;
	/* The lock protects mode, size, area and t. */
	spinlock_t		lock;
	enum kcov_mode		mode;
	/* Size of arena (in long's for KCOV_MODE_TRACE). */
	unsigned		size;
	/* Coverage buffer shared with user space. */
	void			*area;
	/* Task for which we collect coverage, or NULL. */
	struct task_struct	*t;
};

/*
 * Entry point from instrumented code.
 * This is called once per basic-block/edge.
 */
void notrace __sanitizer_cov_trace_pc(void)
{
	struct task_struct *t;
	enum kcov_mode mode;

	t = current;
	/*
	 * We are interested in code coverage as a function of a syscall inputs,
	 * so we ignore code executed in interrupts.
	 * The checks for whether we are in an interrupt are open-coded, because
	 * 1. We can't use in_interrupt() here, since it also returns true
	 *    when we are inside local_bh_disable() section.
	 * 2. We don't want to use (in_irq() | in_serving_softirq() | in_nmi()),
	 *    since that leads to slower generated code (three separate tests,
	 *    one for each of the flags).
	 */
	if (!t || (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_OFFSET
							| NMI_MASK)))
		return;
	mode = READ_ONCE(t->kcov_mode);
	if (mode == KCOV_MODE_TRACE) {
		unsigned long *area;
		unsigned long pos;
		unsigned long ip = _RET_IP_;

#ifdef CONFIG_RANDOMIZE_BASE
		ip -= kaslr_offset();
#endif

		/*
		 * There is some code that runs in interrupts but for which
		 * in_interrupt() returns false (e.g. preempt_schedule_irq()).
		 * READ_ONCE()/barrier() effectively provides load-acquire wrt
		 * interrupts, there are paired barrier()/WRITE_ONCE() in
		 * kcov_ioctl_locked().
		 */
		barrier();
		area = t->kcov_area;
		/* The first word is number of subsequent PCs. */
		pos = READ_ONCE(area[0]) + 1;
		if (likely(pos < t->kcov_size)) {
			area[pos] = ip;
			WRITE_ONCE(area[0], pos);
		}
	}
}
EXPORT_SYMBOL(__sanitizer_cov_trace_pc);

static void kcov_get(struct kcov *kcov)
{
	atomic_inc(&kcov->refcount);
}

static void kcov_put(struct kcov *kcov)
{
	if (atomic_dec_and_test(&kcov->refcount)) {
		vfree(kcov->area);
		kfree(kcov);
	}
}

void kcov_task_init(struct task_struct *t)
{
	t->kcov_mode = KCOV_MODE_DISABLED;
	t->kcov_size = 0;
	t->kcov_area = NULL;
	t->kcov = NULL;
}

void kcov_task_exit(struct task_struct *t)
{
	struct kcov *kcov;

	kcov = t->kcov;
	if (kcov == NULL)
		return;
	spin_lock(&kcov->lock);
	if (WARN_ON(kcov->t != t)) {
		spin_unlock(&kcov->lock);
		return;
	}
	/* Just to not leave dangling references behind. */
	kcov_task_init(t);
	kcov->t = NULL;
	spin_unlock(&kcov->lock);
	kcov_put(kcov);
}

static int kcov_mmap(struct file *filep, struct vm_area_struct *vma)
{
	int res = 0;
	void *area;
	struct kcov *kcov = vma->vm_file->private_data;
	unsigned long size, off;
	struct page *page;

	area = vmalloc_user(vma->vm_end - vma->vm_start);
	if (!area)
		return -ENOMEM;

	spin_lock(&kcov->lock);
	size = kcov->size * sizeof(unsigned long);
	if (kcov->mode == KCOV_MODE_DISABLED || vma->vm_pgoff != 0 ||
	    vma->vm_end - vma->vm_start != size) {
		res = -EINVAL;
		goto exit;
	}
	if (!kcov->area) {
		kcov->area = area;
		vma->vm_flags |= VM_DONTEXPAND;
		spin_unlock(&kcov->lock);
		for (off = 0; off < size; off += PAGE_SIZE) {
			page = vmalloc_to_page(kcov->area + off);
			if (vm_insert_page(vma, vma->vm_start + off, page))
				WARN_ONCE(1, "vm_insert_page() failed");
		}
		return 0;
	}
exit:
	spin_unlock(&kcov->lock);
	vfree(area);
	return res;
}

static int kcov_open(struct inode *inode, struct file *filep)
{
	struct kcov *kcov;

	kcov = kzalloc(sizeof(*kcov), GFP_KERNEL);
	if (!kcov)
		return -ENOMEM;
	atomic_set(&kcov->refcount, 1);
	spin_lock_init(&kcov->lock);
	filep->private_data = kcov;
	return nonseekable_open(inode, filep);
}

static int kcov_close(struct inode *inode, struct file *filep)
{
	kcov_put(filep->private_data);
	return 0;
}

static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd,
			     unsigned long arg)
{
	struct task_struct *t;
	unsigned long size, unused;

	switch (cmd) {
	case KCOV_INIT_TRACE:
		/*
		 * Enable kcov in trace mode and setup buffer size.
		 * Must happen before anything else.
		 */
		if (kcov->mode != KCOV_MODE_DISABLED)
			return -EBUSY;
		/*
		 * Size must be at least 2 to hold current position and one PC.
		 * Later we allocate size * sizeof(unsigned long) memory,
		 * that must not overflow.
		 */
		size = arg;
		if (size < 2 || size > INT_MAX / sizeof(unsigned long))
			return -EINVAL;
		kcov->size = size;
		kcov->mode = KCOV_MODE_TRACE;
		return 0;
	case KCOV_ENABLE:
		/*
		 * Enable coverage for the current task.
		 * At this point user must have been enabled trace mode,
		 * and mmapped the file. Coverage collection is disabled only
		 * at task exit or voluntary by KCOV_DISABLE. After that it can
		 * be enabled for another task.
		 */
		unused = arg;
		if (unused != 0 || kcov->mode == KCOV_MODE_DISABLED ||
		    kcov->area == NULL)
			return -EINVAL;
		if (kcov->t != NULL)
			return -EBUSY;
		t = current;
		/* Cache in task struct for performance. */
		t->kcov_size = kcov->size;
		t->kcov_area = kcov->area;
		/* See comment in __sanitizer_cov_trace_pc(). */
		barrier();
		WRITE_ONCE(t->kcov_mode, kcov->mode);
		t->kcov = kcov;
		kcov->t = t;
		/* This is put either in kcov_task_exit() or in KCOV_DISABLE. */
		kcov_get(kcov);
		return 0;
	case KCOV_DISABLE:
		/* Disable coverage for the current task. */
		unused = arg;
		if (unused != 0 || current->kcov != kcov)
			return -EINVAL;
		t = current;
		if (WARN_ON(kcov->t != t))
			return -EINVAL;
		kcov_task_init(t);
		kcov->t = NULL;
		kcov_put(kcov);
		return 0;
	default:
		return -ENOTTY;
	}
}

static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
{
	struct kcov *kcov;
	int res;

	kcov = filep->private_data;
	spin_lock(&kcov->lock);
	res = kcov_ioctl_locked(kcov, cmd, arg);
	spin_unlock(&kcov->lock);
	return res;
}

static const struct file_operations kcov_fops = {
	.open		= kcov_open,
	.unlocked_ioctl	= kcov_ioctl,
	.mmap		= kcov_mmap,
	.release        = kcov_close,
};

static int __init kcov_init(void)
{
	/*
	 * The kcov debugfs file won't ever get removed and thus,
	 * there is no need to protect it against removal races. The
	 * use of debugfs_create_file_unsafe() is actually safe here.
	 */
	if (!debugfs_create_file_unsafe("kcov", 0600, NULL, NULL, &kcov_fops)) {
		pr_err("failed to create kcov in debugfs\n");
		return -ENOMEM;
	}
	return 0;
}

device_initcall(kcov_init);
back to top