Revision 3ad33b2436b545cbe8b28e53f3710432cad457ab authored by Lee Schermerhorn on 15 November 2007, 00:59:10 UTC, committed by Linus Torvalds on 15 November 2007, 02:45:38 UTC
We hit the BUG_ON() in mm/rmap.c:vma_address() when trying to migrate via
mbind(MPOL_MF_MOVE) a non-anon region that spans multiple vmas.  For
anon-regions, we just fail to migrate any pages beyond the 1st vma in the
range.

This occurs because do_mbind() collects a list of pages to migrate by
calling check_range().  check_range() walks the task's mm, spanning vmas as
necessary, to collect the migratable pages into a list.  Then, do_mbind()
calls migrate_pages() passing the list of pages, a function to allocate new
pages based on vma policy [new_vma_page()], and a pointer to the first vma
of the range.

For each page in the list, new_vma_page() calls page_address_in_vma()
passing the page and the vma [first in range] to obtain the address to get
for alloc_page_vma().  The page address is needed to get interleaving
policy correct.  If the pages in the list come from multiple vmas,
eventually, new_page_address() will pass that page to page_address_in_vma()
with the incorrect vma.  For !PageAnon pages, this will result in a bug
check in rmap.c:vma_address().  For anon pages, vma_address() will just
return EFAULT and fail the migration.

This patch modifies new_vma_page() to check the return value from
page_address_in_vma().  If the return value is EFAULT, new_vma_page()
searchs forward via vm_next for the vma that maps the page--i.e., that does
not return EFAULT.  This assumes that the pages in the list handed to
migrate_pages() is in address order.  This is currently case.  The patch
documents this assumption in a new comment block for new_vma_page().

If new_vma_page() cannot locate the vma mapping the page in a forward
search in the mm, it will pass a NULL vma to alloc_page_vma().  This will
result in the allocation using the task policy, if any, else system default
policy.  This situation is unlikely, but the patch documents this behavior
with a comment.

Note, this patch results in restarting from the first vma in a multi-vma
range each time new_vma_page() is called.  If this is not acceptable, we
can make the vma argument a pointer, both in new_vma_page() and it's caller
unmap_and_move() so that the value held by the loop in migrate_pages()
always passes down the last vma in which a page was found.  This will
require changes to all new_page_t functions passed to migrate_pages().  Is
this necessary?

For this patch to work, we can't bug check in vma_address() for pages
outside the argument vma.  This patch removes the BUG_ON().  All other
callers [besides new_vma_page()] already check the return status.

Tested on x86_64, 4 node NUMA platform.

Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Acked-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
1 parent e1a1c99
Raw File
qeth.h
/*
 * include/asm-s390/qeth.h
 *
 * ioctl definitions for qeth driver
 *
 * Copyright (C) 2004 IBM Corporation
 *
 * Author(s):	Thomas Spatzier <tspat@de.ibm.com>
 *
 */
#ifndef __ASM_S390_QETH_IOCTL_H__
#define __ASM_S390_QETH_IOCTL_H__
#include <linux/ioctl.h>

#define SIOC_QETH_ARP_SET_NO_ENTRIES    (SIOCDEVPRIVATE)
#define SIOC_QETH_ARP_QUERY_INFO        (SIOCDEVPRIVATE + 1)
#define SIOC_QETH_ARP_ADD_ENTRY         (SIOCDEVPRIVATE + 2)
#define SIOC_QETH_ARP_REMOVE_ENTRY      (SIOCDEVPRIVATE + 3)
#define SIOC_QETH_ARP_FLUSH_CACHE       (SIOCDEVPRIVATE + 4)
#define SIOC_QETH_ADP_SET_SNMP_CONTROL  (SIOCDEVPRIVATE + 5)
#define SIOC_QETH_GET_CARD_TYPE         (SIOCDEVPRIVATE + 6)

struct qeth_arp_cache_entry {
	__u8  macaddr[6];
	__u8  reserved1[2];
	__u8  ipaddr[16]; /* for both  IPv4 and IPv6 */
	__u8  reserved2[32];
} __attribute__ ((packed));

struct qeth_arp_qi_entry7 {
	__u8 media_specific[32];
	__u8 macaddr_type;
	__u8 ipaddr_type;
	__u8 macaddr[6];
	__u8 ipaddr[4];
} __attribute__((packed));

struct qeth_arp_qi_entry7_short {
	__u8 macaddr_type;
	__u8 ipaddr_type;
	__u8 macaddr[6];
	__u8 ipaddr[4];
} __attribute__((packed));

struct qeth_arp_qi_entry5 {
	__u8 media_specific[32];
	__u8 macaddr_type;
	__u8 ipaddr_type;
	__u8 ipaddr[4];
} __attribute__((packed));

struct qeth_arp_qi_entry5_short {
	__u8 macaddr_type;
	__u8 ipaddr_type;
	__u8 ipaddr[4];
} __attribute__((packed));

/*
 * can be set by user if no "media specific information" is wanted
 * -> saves a lot of space in user space buffer
 */
#define QETH_QARP_STRIP_ENTRIES  0x8000
#define QETH_QARP_REQUEST_MASK   0x00ff

/* data sent to user space as result of query arp ioctl */
#define QETH_QARP_USER_DATA_SIZE 20000
#define QETH_QARP_MASK_OFFSET    4
#define QETH_QARP_ENTRIES_OFFSET 6
struct qeth_arp_query_user_data {
	union {
		__u32 data_len;		/* set by user space program */
		__u32 no_entries;	/* set by kernel */
	} u;
	__u16 mask_bits;
	char *entries;
} __attribute__((packed));

#endif /* __ASM_S390_QETH_IOCTL_H__ */
back to top