Revision 3ad33b2436b545cbe8b28e53f3710432cad457ab authored by Lee Schermerhorn on 15 November 2007, 00:59:10 UTC, committed by Linus Torvalds on 15 November 2007, 02:45:38 UTC
We hit the BUG_ON() in mm/rmap.c:vma_address() when trying to migrate via mbind(MPOL_MF_MOVE) a non-anon region that spans multiple vmas. For anon-regions, we just fail to migrate any pages beyond the 1st vma in the range. This occurs because do_mbind() collects a list of pages to migrate by calling check_range(). check_range() walks the task's mm, spanning vmas as necessary, to collect the migratable pages into a list. Then, do_mbind() calls migrate_pages() passing the list of pages, a function to allocate new pages based on vma policy [new_vma_page()], and a pointer to the first vma of the range. For each page in the list, new_vma_page() calls page_address_in_vma() passing the page and the vma [first in range] to obtain the address to get for alloc_page_vma(). The page address is needed to get interleaving policy correct. If the pages in the list come from multiple vmas, eventually, new_page_address() will pass that page to page_address_in_vma() with the incorrect vma. For !PageAnon pages, this will result in a bug check in rmap.c:vma_address(). For anon pages, vma_address() will just return EFAULT and fail the migration. This patch modifies new_vma_page() to check the return value from page_address_in_vma(). If the return value is EFAULT, new_vma_page() searchs forward via vm_next for the vma that maps the page--i.e., that does not return EFAULT. This assumes that the pages in the list handed to migrate_pages() is in address order. This is currently case. The patch documents this assumption in a new comment block for new_vma_page(). If new_vma_page() cannot locate the vma mapping the page in a forward search in the mm, it will pass a NULL vma to alloc_page_vma(). This will result in the allocation using the task policy, if any, else system default policy. This situation is unlikely, but the patch documents this behavior with a comment. Note, this patch results in restarting from the first vma in a multi-vma range each time new_vma_page() is called. If this is not acceptable, we can make the vma argument a pointer, both in new_vma_page() and it's caller unmap_and_move() so that the value held by the loop in migrate_pages() always passes down the last vma in which a page was found. This will require changes to all new_page_t functions passed to migrate_pages(). Is this necessary? For this patch to work, we can't bug check in vma_address() for pages outside the argument vma. This patch removes the BUG_ON(). All other callers [besides new_vma_page()] already check the return status. Tested on x86_64, 4 node NUMA platform. Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Acked-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
1 parent e1a1c99
posix_types.h
/*
* include/asm-s390/posix_types.h
*
* S390 version
*
* Derived from "include/asm-i386/posix_types.h"
*/
#ifndef __ARCH_S390_POSIX_TYPES_H
#define __ARCH_S390_POSIX_TYPES_H
/*
* This file is generally used by user-level software, so you need to
* be a little careful about namespace pollution etc. Also, we cannot
* assume GCC is being used.
*/
typedef long __kernel_off_t;
typedef int __kernel_pid_t;
typedef unsigned long __kernel_size_t;
typedef long __kernel_time_t;
typedef long __kernel_suseconds_t;
typedef long __kernel_clock_t;
typedef int __kernel_timer_t;
typedef int __kernel_clockid_t;
typedef int __kernel_daddr_t;
typedef char * __kernel_caddr_t;
typedef unsigned short __kernel_uid16_t;
typedef unsigned short __kernel_gid16_t;
#ifdef __GNUC__
typedef long long __kernel_loff_t;
#endif
#ifndef __s390x__
typedef unsigned long __kernel_ino_t;
typedef unsigned short __kernel_mode_t;
typedef unsigned short __kernel_nlink_t;
typedef unsigned short __kernel_ipc_pid_t;
typedef unsigned short __kernel_uid_t;
typedef unsigned short __kernel_gid_t;
typedef int __kernel_ssize_t;
typedef int __kernel_ptrdiff_t;
typedef unsigned int __kernel_uid32_t;
typedef unsigned int __kernel_gid32_t;
typedef unsigned short __kernel_old_uid_t;
typedef unsigned short __kernel_old_gid_t;
typedef unsigned short __kernel_old_dev_t;
#else /* __s390x__ */
typedef unsigned int __kernel_ino_t;
typedef unsigned int __kernel_mode_t;
typedef unsigned int __kernel_nlink_t;
typedef int __kernel_ipc_pid_t;
typedef unsigned int __kernel_uid_t;
typedef unsigned int __kernel_gid_t;
typedef long __kernel_ssize_t;
typedef long __kernel_ptrdiff_t;
typedef unsigned long __kernel_sigset_t; /* at least 32 bits */
typedef __kernel_uid_t __kernel_old_uid_t;
typedef __kernel_gid_t __kernel_old_gid_t;
typedef __kernel_uid_t __kernel_uid32_t;
typedef __kernel_gid_t __kernel_gid32_t;
typedef unsigned short __kernel_old_dev_t;
#endif /* __s390x__ */
typedef struct {
#if defined(__KERNEL__) || defined(__USE_ALL)
int val[2];
#else /* !defined(__KERNEL__) && !defined(__USE_ALL)*/
int __val[2];
#endif /* !defined(__KERNEL__) && !defined(__USE_ALL)*/
} __kernel_fsid_t;
#ifdef __KERNEL__
#undef __FD_SET
static inline void __FD_SET(unsigned long fd, __kernel_fd_set *fdsetp)
{
unsigned long _tmp = fd / __NFDBITS;
unsigned long _rem = fd % __NFDBITS;
fdsetp->fds_bits[_tmp] |= (1UL<<_rem);
}
#undef __FD_CLR
static inline void __FD_CLR(unsigned long fd, __kernel_fd_set *fdsetp)
{
unsigned long _tmp = fd / __NFDBITS;
unsigned long _rem = fd % __NFDBITS;
fdsetp->fds_bits[_tmp] &= ~(1UL<<_rem);
}
#undef __FD_ISSET
static inline int __FD_ISSET(unsigned long fd, const __kernel_fd_set *fdsetp)
{
unsigned long _tmp = fd / __NFDBITS;
unsigned long _rem = fd % __NFDBITS;
return (fdsetp->fds_bits[_tmp] & (1UL<<_rem)) != 0;
}
#undef __FD_ZERO
#define __FD_ZERO(fdsetp) \
((void) memset ((void *) (fdsetp), 0, sizeof (__kernel_fd_set)))
#endif /* __KERNEL__ */
#endif
![swh spinner](/static/img/swh-spinner.gif)
Computing file changes ...