Revision 3ad33b2436b545cbe8b28e53f3710432cad457ab authored by Lee Schermerhorn on 15 November 2007, 00:59:10 UTC, committed by Linus Torvalds on 15 November 2007, 02:45:38 UTC
We hit the BUG_ON() in mm/rmap.c:vma_address() when trying to migrate via
mbind(MPOL_MF_MOVE) a non-anon region that spans multiple vmas.  For
anon regions, we just fail to migrate any pages beyond the first vma in
the range.

This occurs because do_mbind() collects a list of pages to migrate by
calling check_range().  check_range() walks the task's mm, spanning vmas as
necessary, to collect the migratable pages into a list.  Then, do_mbind()
calls migrate_pages() passing the list of pages, a function to allocate new
pages based on vma policy [new_vma_page()], and a pointer to the first vma
of the range.
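
In outline, the flow is roughly this (a simplified sketch of the code
paths named above, not a verbatim quote; error handling omitted):

	LIST_HEAD(pagelist);

	/* walk [start, end), possibly spanning vmas, collecting pages */
	vma = check_range(mm, start, end, nmask,
			  flags | MPOL_MF_INVERT, &pagelist);

	if (!IS_ERR(vma) && !list_empty(&pagelist))
		nr_failed = migrate_pages(&pagelist, new_vma_page,
					  (unsigned long)vma); /* first vma only */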

For each page in the list, new_vma_page() calls page_address_in_vma(),
passing the page and the vma [first in range], to obtain the address to
pass to alloc_page_vma().  The page address is needed to compute
interleave policy correctly.  If the pages in the list come from multiple
vmas, new_vma_page() will eventually pass a page to page_address_in_vma()
along with the wrong vma.  For !PageAnon pages, this trips the bug check
in rmap.c:vma_address().  For anon pages, vma_address() will just return
-EFAULT and fail the migration of that page.
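
For reference, the check in question looks roughly like this
(reconstructed from rmap.c of this era; treat as illustrative):

	static inline unsigned long
	vma_address(struct page *page, struct vm_area_struct *vma)
	{
		pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
		unsigned long address;

		address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
		if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
			BUG_ON(!PageAnon(page));	/* fires for file-backed pages */
			return -EFAULT;			/* anon pages fail quietly */
		}
		return address;
	}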

This patch modifies new_vma_page() to check the return value from
page_address_in_vma().  If the return value is -EFAULT, new_vma_page()
searches forward via vm_next for the vma that actually maps the page,
i.e., the one for which page_address_in_vma() does not return -EFAULT.
This assumes that the pages in the list handed to migrate_pages() are in
address order, which is currently the case.  The patch documents this
assumption in a new comment block for new_vma_page().

If new_vma_page() cannot locate the vma mapping the page in a forward
search of the mm, it will pass a NULL vma to alloc_page_vma().  The
allocation will then fall back to the task policy, if any, else the
system default policy.  This situation is unlikely, but the patch
documents the behavior with a comment.
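
The resulting allocator callback is essentially the following (a sketch
of the patched function; the gfp mask shown is an assumption based on the
mainline code of this period):

	static struct page *new_vma_page(struct page *page,
					 unsigned long private, int **x)
	{
		struct vm_area_struct *vma = (struct vm_area_struct *)private;
		unsigned long address = 0;

		/*
		 * Depends on the pages in the migration list being in
		 * address order: search forward from the first vma.
		 */
		while (vma) {
			address = page_address_in_vma(page, vma);
			if (address != -EFAULT)
				break;
			vma = vma->vm_next;
		}

		/* if !vma, alloc_page_vma() uses task or system default policy */
		return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
	}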

Note that this patch results in restarting from the first vma in a
multi-vma range each time new_vma_page() is called.  If this is not
acceptable, we can make the vma argument a pointer, both in
new_vma_page() and its caller unmap_and_move(), so that the loop in
migrate_pages() always passes down the last vma in which a page was
found.  This would require changes to all new_page_t functions passed to
migrate_pages().  Is this necessary?

For this patch to work, vma_address() can no longer bug-check pages that
fall outside the argument vma.  This patch removes the BUG_ON().  All
other callers [besides new_vma_page()] already check the return status.
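
With the BUG_ON() gone, the out-of-range case simply reports failure to
the caller (sketch of the resulting check):

	if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
		/* page may legitimately lie outside @vma's mapping range */
		return -EFAULT;
	}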

Tested on an x86_64, 4-node NUMA platform.

Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Acked-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
sfp-util.h
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/types.h>
#include <asm/byteorder.h>

#define add_ssaaaa(sh, sl, ah, al, bh, bl) ({		\
	unsigned int __sh = (ah);			\
	unsigned int __sl = (al);			\
	asm volatile(					\
		"	alr	%1,%3\n"		\
		"	brc	12,0f\n"		\
		"	ahi	%0,1\n"			\
		"0:	alr  %0,%2"			\
		: "+&d" (__sh), "+d" (__sl)		\
		: "d" (bh), "d" (bl) : "cc");		\
	(sh) = __sh;					\
	(sl) = __sl;					\
})
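
/*
 * Illustrative use (an editorial example, not part of the original
 * header): adding two 64-bit values via their 32-bit halves; the
 * brc/ahi pair above propagates the carry out of the low word.
 *
 *	unsigned int sh, sl;
 *	add_ssaaaa(sh, sl, 0x00000001, 0xffffffff, 0x00000000, 0x00000001);
 *	=> sh == 0x00000002, sl == 0x00000000
 */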

#define sub_ddmmss(sh, sl, ah, al, bh, bl) ({		\
	unsigned int __sh = (ah);			\
	unsigned int __sl = (al);			\
	asm volatile(					\
		"	slr	%1,%3\n"		\
		"	brc	3,0f\n"			\
		"	ahi	%0,-1\n"		\
		"0:	slr	%0,%2"			\
		: "+&d" (__sh), "+d" (__sl)		\
		: "d" (bh), "d" (bl) : "cc");		\
	(sh) = __sh;					\
	(sl) = __sl;					\
})
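
/*
 * Illustrative use (an editorial example, not part of the original
 * header): subtracting 64-bit values via their 32-bit halves; here the
 * brc/ahi pair propagates the borrow out of the low word.
 *
 *	unsigned int dh, dl;
 *	sub_ddmmss(dh, dl, 0x00000001, 0x00000000, 0x00000000, 0x00000001);
 *	=> dh == 0x00000000, dl == 0xffffffff
 */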

/* a umul b = a smul b + (a >= 2^31 ? b<<32 : 0) + (b >= 2^31 ? a<<32 : 0) */
#define umul_ppmm(wh, wl, u, v) ({			\
	unsigned int __wh = u;				\
	unsigned int __wl = v;				\
	asm volatile(					\
		"	ltr	1,%0\n"			\
		"	mr	0,%1\n"			\
		"	jnm	0f\n"				\
		"	alr	0,%1\n"			\
		"0:	ltr	%1,%1\n"			\
		"	jnm	1f\n"				\
		"	alr	0,%0\n"			\
		"1:	lr	%0,0\n"			\
		"	lr	%1,1\n"			\
		: "+d" (__wh), "+d" (__wl)		\
		: : "0", "1", "cc");			\
	wh = __wh;					\
	wl = __wl;					\
})
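
/*
 * Worked example of the correction above (editorial note): "mr" is a
 * signed 32x32->64 multiply.  If u has its sign bit set, the signed
 * product is u*v - (v << 32), so v << 32 must be added back; likewise
 * u << 32 when v is negative.  E.g. u = 0x80000000 (2^31), v = 2:
 * the signed multiply yields -2^32, and adding v << 32 = 2^33 gives
 * 2^32, the correct unsigned product.
 */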

#ifdef __s390x__
#define udiv_qrnnd(q, r, n1, n0, d)			\
  do { unsigned long __n;				\
       unsigned int __r, __d;				\
    __n = ((unsigned long)(n1) << 32) + n0;		\
    __d = (d);						\
    (q) = __n / __d;					\
    (r) = __n % __d;					\
  } while (0)
#else
#define udiv_qrnnd(q, r, n1, n0, d)			\
  do { unsigned int __r;				\
    (q) = __udiv_qrnnd (&__r, (n1), (n0), (d));		\
    (r) = __r;						\
  } while (0)
extern unsigned long __udiv_qrnnd (unsigned int *, unsigned int,
				   unsigned int, unsigned int);
#endif

#define UDIV_NEEDS_NORMALIZATION 0

#define abort() return 0

#define __BYTE_ORDER __BIG_ENDIAN