Revision 4837fe37adff1d159904f0c013471b1ecbcb455e authored by Michal Hocko on 14 December 2017, 23:33:15 UTC, committed by Linus Torvalds on 15 December 2017, 00:00:49 UTC
David Rientjes has reported the following memory corruption while the
oom reaper tries to unmap the victims address space

  BUG: Bad page map in process oom_reaper  pte:6353826300000000 pmd:00000000
  addr:00007f50cab1d000 vm_flags:08100073 anon_vma:ffff9eea335603f0 mapping:          (null) index:7f50cab1d
  file:          (null) fault:          (null) mmap:          (null) readpage:          (null)
  CPU: 2 PID: 1001 Comm: oom_reaper
  Call Trace:
     unmap_page_range+0x1068/0x1130
     __oom_reap_task_mm+0xd5/0x16b
     oom_reaper+0xff/0x14c
     kthread+0xc1/0xe0

Tetsuo Handa has noticed that the synchronization inside exit_mmap is
insufficient.  We only synchronize with the oom reaper if
tsk_is_oom_victim which is not true if the final __mmput is called from
a different context than the oom victim exit path.  This can trivially
happen from context of any task which has grabbed mm reference (e.g.  to
read /proc/<pid>/ file which requires mm etc.).

The race would look like this

  oom_reaper		oom_victim		task
						mmget_not_zero
			do_exit
			  mmput
  __oom_reap_task_mm				mmput
  						  __mmput
						    exit_mmap
						      remove_vma
    unmap_page_range

Fix this issue by providing a new mm_is_oom_victim() helper which
operates on the mm struct rather than a task.  Any context which
operates on a remote mm struct should use this helper in place of
tsk_is_oom_victim.  The flag is set in mark_oom_victim and never cleared
so it is stable in the exit_mmap path.

Debugged by Tetsuo Handa.

Link: http://lkml.kernel.org/r/20171210095130.17110-1-mhocko@kernel.org
Fixes: 212925802454 ("mm: oom: let oom_reap_task and exit_mmap run concurrently")
Signed-off-by: Michal Hocko <mhocko@suse.com>
Reported-by: David Rientjes <rientjes@google.com>
Acked-by: David Rientjes <rientjes@google.com>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Andrea Argangeli <andrea@kernel.org>
Cc: <stable@vger.kernel.org>	[4.14]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
1 parent bdcf0a4
Raw File
iomap_32.c
/*
 * Copyright © 2008 Ingo Molnar
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
 */

#include <asm/iomap.h>
#include <asm/pat.h>
#include <linux/export.h>
#include <linux/highmem.h>

static int is_io_mapping_possible(resource_size_t base, unsigned long size)
{
#if !defined(CONFIG_X86_PAE) && defined(CONFIG_PHYS_ADDR_T_64BIT)
	/* There is no way to map greater than 1 << 32 address without PAE */
	if (base + size > 0x100000000ULL)
		return 0;
#endif
	return 1;
}

int iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot)
{
	enum page_cache_mode pcm = _PAGE_CACHE_MODE_WC;
	int ret;

	if (!is_io_mapping_possible(base, size))
		return -EINVAL;

	ret = io_reserve_memtype(base, base + size, &pcm);
	if (ret)
		return ret;

	*prot = __pgprot(__PAGE_KERNEL | cachemode2protval(pcm));
	return 0;
}
EXPORT_SYMBOL_GPL(iomap_create_wc);

void iomap_free(resource_size_t base, unsigned long size)
{
	io_free_memtype(base, base + size);
}
EXPORT_SYMBOL_GPL(iomap_free);

void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
{
	unsigned long vaddr;
	int idx, type;

	preempt_disable();
	pagefault_disable();

	type = kmap_atomic_idx_push();
	idx = type + KM_TYPE_NR * smp_processor_id();
	vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
	set_pte(kmap_pte - idx, pfn_pte(pfn, prot));
	arch_flush_lazy_mmu_mode();

	return (void *)vaddr;
}

/*
 * Map 'pfn' using protections 'prot'
 */
void __iomem *
iomap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
{
	/*
	 * For non-PAT systems, translate non-WB request to UC- just in
	 * case the caller set the PWT bit to prot directly without using
	 * pgprot_writecombine(). UC- translates to uncached if the MTRR
	 * is UC or WC. UC- gets the real intention, of the user, which is
	 * "WC if the MTRR is WC, UC if you can't do that."
	 */
	if (!pat_enabled() && pgprot2cachemode(prot) != _PAGE_CACHE_MODE_WB)
		prot = __pgprot(__PAGE_KERNEL |
				cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS));

	return (void __force __iomem *) kmap_atomic_prot_pfn(pfn, prot);
}
EXPORT_SYMBOL_GPL(iomap_atomic_prot_pfn);

void
iounmap_atomic(void __iomem *kvaddr)
{
	unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;

	if (vaddr >= __fix_to_virt(FIX_KMAP_END) &&
	    vaddr <= __fix_to_virt(FIX_KMAP_BEGIN)) {
		int idx, type;

		type = kmap_atomic_idx();
		idx = type + KM_TYPE_NR * smp_processor_id();

#ifdef CONFIG_DEBUG_HIGHMEM
		WARN_ON_ONCE(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx));
#endif
		/*
		 * Force other mappings to Oops if they'll try to access this
		 * pte without first remap it.  Keeping stale mappings around
		 * is a bad idea also, in case the page changes cacheability
		 * attributes or becomes a protected page in a hypervisor.
		 */
		kpte_clear_flush(kmap_pte-idx, vaddr);
		kmap_atomic_idx_pop();
	}

	pagefault_enable();
	preempt_enable();
}
EXPORT_SYMBOL_GPL(iounmap_atomic);
back to top