Revision 9a291a7c9428155e8e623e4a3989f8be47134df5 authored by James Morse on 02 June 2017, 21:46:46 UTC, committed by Linus Torvalds on 02 June 2017, 22:07:38 UTC
KVM uses get_user_pages() to resolve its stage2 faults.  KVM sets the
FOLL_HWPOISON flag causing faultin_page() to return -EHWPOISON when it
finds a VM_FAULT_HWPOISON.  KVM handles these hwpoison pages as a
special case.  (check_user_page_hwpoison())
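
For reference, the special case amounts to retrying the lookup with
FOLL_HWPOISON and checking for -EHWPOISON.  A rough sketch of the
helper in virt/kvm/kvm_main.c (body approximate, not the authoritative
definition):

	static bool check_user_page_hwpoison(unsigned long addr)
	{
		int rc, flags = FOLL_HWPOISON | FOLL_WRITE;

		/* Retry the fault; -EHWPOISON identifies a poisoned page */
		rc = get_user_pages(addr, 1, flags, NULL, NULL);
		return rc == -EHWPOISON;
	}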

When huge pages are involved, this doesn't work so well.
get_user_pages() calls follow_hugetlb_page(), which stops early if it
receives VM_FAULT_HWPOISON from hugetlb_fault(), eventually returning
-EFAULT to the caller.  The step to map this to -EHWPOISON based on the
FOLL_ flags is missing.  The hwpoison special case is skipped, and
-EFAULT is returned to user-space, causing Qemu or kvmtool to exit.

Instead, move this VM_FAULT_ to errno mapping code into a header file
and use it from faultin_page() and follow_hugetlb_page().
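
A minimal sketch of the helper, assuming it lands in include/linux/mm.h
as described (exact body may differ from what was merged):

	static inline int vm_fault_to_errno(int vm_fault, int foll_flags)
	{
		if (vm_fault & VM_FAULT_OOM)
			return -ENOMEM;
		/* Only report -EHWPOISON when the caller asked for it */
		if (vm_fault & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
			return (foll_flags & FOLL_HWPOISON) ? -EHWPOISON : -EFAULT;
		if (vm_fault & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV))
			return -EFAULT;
		return 0;
	}

follow_hugetlb_page() can then translate hugetlb_fault()'s result
instead of falling through to -EFAULT, along these lines:

	if (ret & VM_FAULT_ERROR) {
		int err = vm_fault_to_errno(ret, flags);

		if (err)
			return err;
		remainder = 0;
		break;
	}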

With this, KVM works as expected.

This isn't a problem for arm64 today as we haven't enabled
MEMORY_FAILURE, but I can't see any reason this doesn't happen on x86
too, so I think this should be a fix.  This doesn't apply earlier than
stable's v4.11.1 due to all sorts of cleanup.

[james.morse@arm.com: add vm_fault_to_errno() call to faultin_page()]
  Link: http://lkml.kernel.org/r/20170525171035.16359-1-james.morse@arm.com
[akpm@linux-foundation.org: coding-style fixes]
Link: http://lkml.kernel.org/r/20170524160900.28786-1-james.morse@arm.com
Signed-off-by: James Morse <james.morse@arm.com>
Acked-by: Punit Agrawal <punit.agrawal@arm.com>
Acked-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>
Cc: <stable@vger.kernel.org>	[4.11.1+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
cache_lib.c
/*
 * linux/fs/nfs/cache_lib.c
 *
 * Helper routines for the NFS client caches
 *
 * Copyright (c) 2009 Trond Myklebust <Trond.Myklebust@netapp.com>
 */
#include <linux/kmod.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/slab.h>
#include <linux/sunrpc/cache.h>
#include <linux/sunrpc/rpc_pipe_fs.h>
#include <net/net_namespace.h>

#include "cache_lib.h"

#define NFS_CACHE_UPCALL_PATHLEN 256
#define NFS_CACHE_UPCALL_TIMEOUT 15

static char nfs_cache_getent_prog[NFS_CACHE_UPCALL_PATHLEN] =
				"/sbin/nfs_cache_getent";
static unsigned long nfs_cache_getent_timeout = NFS_CACHE_UPCALL_TIMEOUT;

module_param_string(cache_getent, nfs_cache_getent_prog,
		sizeof(nfs_cache_getent_prog), 0600);
MODULE_PARM_DESC(cache_getent, "Path to the client cache upcall program");
module_param_named(cache_getent_timeout, nfs_cache_getent_timeout, ulong, 0600);
MODULE_PARM_DESC(cache_getent_timeout, "Timeout (in seconds) after which "
		"the cache upcall is assumed to have failed");

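/*
 * Run the userspace helper (nfs_cache_getent_prog) to fill a cache
 * entry.  Returns 0 if the helper was started, or a negative errno.
 */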
int nfs_cache_upcall(struct cache_detail *cd, char *entry_name)
{
	static char *envp[] = { "HOME=/",
		"TERM=linux",
		"PATH=/sbin:/usr/sbin:/bin:/usr/bin",
		NULL
	};
	char *argv[] = {
		nfs_cache_getent_prog,
		cd->name,
		entry_name,
		NULL
	};
	int ret = -EACCES;

	if (nfs_cache_getent_prog[0] == '\0')
		goto out;
	ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
	/*
	 * Disable the upcall mechanism if we're getting an ENOENT or
	 * EACCES error. The admin can re-enable it on the fly by using
	 * sysfs to set the 'cache_getent' parameter once the problem
	 * has been fixed.
	 */
	if (ret == -ENOENT || ret == -EACCES)
		nfs_cache_getent_prog[0] = '\0';
out:
	return ret > 0 ? 0 : ret;
}

/*
 * Deferred request handling
 */
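/* Drop a reference to @dreq; the final put frees it. */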
void nfs_cache_defer_req_put(struct nfs_cache_defer_req *dreq)
{
	if (atomic_dec_and_test(&dreq->count))
		kfree(dreq);
}

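/*
 * Revisit callback, run by the sunrpc cache layer once the deferred
 * request can make progress (or is being discarded): wake the waiter
 * and drop the reference taken in nfs_dns_cache_defer().
 */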
static void nfs_dns_cache_revisit(struct cache_deferred_req *d, int toomany)
{
	struct nfs_cache_defer_req *dreq;

	dreq = container_of(d, struct nfs_cache_defer_req, deferred_req);

	complete(&dreq->completion);
	nfs_cache_defer_req_put(dreq);
}

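/*
 * Defer callback: take a reference for the cache layer and hand back
 * the embedded cache_deferred_req so the request can be revisited.
 */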
static struct cache_deferred_req *nfs_dns_cache_defer(struct cache_req *req)
{
	struct nfs_cache_defer_req *dreq;

	dreq = container_of(req, struct nfs_cache_defer_req, req);
	dreq->deferred_req.revisit = nfs_dns_cache_revisit;
	atomic_inc(&dreq->count);

	return &dreq->deferred_req;
}

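/* Allocate a deferred request with one reference held by the caller. */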
struct nfs_cache_defer_req *nfs_cache_defer_req_alloc(void)
{
	struct nfs_cache_defer_req *dreq;

	dreq = kzalloc(sizeof(*dreq), GFP_KERNEL);
	if (dreq) {
		init_completion(&dreq->completion);
		atomic_set(&dreq->count, 1);
		dreq->req.defer = nfs_dns_cache_defer;
	}
	return dreq;
}

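/*
 * Wait for the upcall to complete the deferred request, giving up
 * after nfs_cache_getent_timeout seconds.
 */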
int nfs_cache_wait_for_upcall(struct nfs_cache_defer_req *dreq)
{
	if (wait_for_completion_timeout(&dreq->completion,
			nfs_cache_getent_timeout * HZ) == 0)
		return -ETIMEDOUT;
	return 0;
}

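/* Create the cache's pipefs entry under @sb's "cache" directory. */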
int nfs_cache_register_sb(struct super_block *sb, struct cache_detail *cd)
{
	int ret;
	struct dentry *dir;

	dir = rpc_d_lookup_sb(sb, "cache");
	ret = sunrpc_cache_register_pipefs(dir, cd->name, 0600, cd);
	dput(dir);
	return ret;
}

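/*
 * Register @cd for @net, creating its pipefs entry if rpc_pipefs is
 * currently mounted in this namespace.
 */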
int nfs_cache_register_net(struct net *net, struct cache_detail *cd)
{
	struct super_block *pipefs_sb;
	int ret = 0;

	sunrpc_init_cache_detail(cd);
	pipefs_sb = rpc_get_sb_net(net);
	if (pipefs_sb) {
		ret = nfs_cache_register_sb(pipefs_sb, cd);
		rpc_put_sb_net(net);
		if (ret)
			sunrpc_destroy_cache_detail(cd);
	}
	return ret;
}

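/* Remove the cache's pipefs entry; @sb is unused here. */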
void nfs_cache_unregister_sb(struct super_block *sb, struct cache_detail *cd)
{
	sunrpc_cache_unregister_pipefs(cd);
}

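/*
 * Undo nfs_cache_register_net(): remove the pipefs entry (if the
 * pipefs superblock is still around) and destroy the cache detail.
 */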
void nfs_cache_unregister_net(struct net *net, struct cache_detail *cd)
{
	struct super_block *pipefs_sb;

	pipefs_sb = rpc_get_sb_net(net);
	if (pipefs_sb) {
		nfs_cache_unregister_sb(pipefs_sb, cd);
		rpc_put_sb_net(net);
	}
	sunrpc_destroy_cache_detail(cd);
}