Revision e2093926a098a8ccf0f1d10f6df8dad452cb28d3 authored by Ross Zwisler on 02 June 2017, 21:46:37 UTC, committed by Linus Torvalds on 02 June 2017, 22:07:37 UTC
We currently have two related PMD vs PTE races in the DAX code.  These
can both be easily triggered by having two threads reading and writing
simultaneously to the same private mapping, with the key being that
private mapping reads can be handled with PMDs but private mapping
writes are always handled with PTEs so that we can COW.

Here is the first race:

  CPU 0					CPU 1

  (private mapping write)
  __handle_mm_fault()
    create_huge_pmd() - FALLBACK
    handle_pte_fault()
      passes check for pmd_devmap()

					(private mapping read)
					__handle_mm_fault()
					  create_huge_pmd()
					    dax_iomap_pmd_fault() inserts PMD

      dax_iomap_pte_fault() does a PTE fault, but we already have a DAX PMD
      			  installed in our page tables at this spot.

Here's the second race:

  CPU 0					CPU 1

  (private mapping read)
  __handle_mm_fault()
    passes check for pmd_none()
    create_huge_pmd()
      dax_iomap_pmd_fault() inserts PMD

  (private mapping write)
  __handle_mm_fault()
    create_huge_pmd() - FALLBACK
					(private mapping read)
					__handle_mm_fault()
					  passes check for pmd_none()
					  create_huge_pmd()

    handle_pte_fault()
      dax_iomap_pte_fault() inserts PTE
					    dax_iomap_pmd_fault() inserts PMD,
					       but we already have a PTE at
					       this spot.

The core of the issue is that while there is isolation between faults to
the same range in the DAX fault handlers via our DAX entry locking,
there is no isolation between faults in the code in mm/memory.c.  This
means for instance that this code in __handle_mm_fault() can run:

	if (pmd_none(*vmf.pmd) && transparent_hugepage_enabled(vma)) {
		ret = create_huge_pmd(&vmf);

But by the time we actually get to run the fault handler called by
create_huge_pmd(), the PMD is no longer pmd_none() because a racing PTE
fault has installed a normal PMD here as a parent.  This is the cause of
the 2nd race.  The first race is similar - there is the following check
in handle_pte_fault():

	} else {
		/* See comment in pte_alloc_one_map() */
		if (pmd_devmap(*vmf->pmd) || pmd_trans_unstable(vmf->pmd))
			return 0;

So if a pmd_devmap() PMD (a DAX PMD) has been installed at vmf->pmd, we
will bail and retry the fault.  This is correct, but there is nothing
preventing the PMD from being installed after this check but before we
actually get to the DAX PTE fault handlers.

In my testing these races result in the following types of errors:

  BUG: Bad rss-counter state mm:ffff8800a817d280 idx:1 val:1
  BUG: non-zero nr_ptes on freeing mm: 15

Fix this issue by having the DAX fault handlers verify that it is safe
to continue their fault after they have taken an entry lock to block
other racing faults.

[ross.zwisler@linux.intel.com: improve fix for colliding PMD & PTE entries]
  Link: http://lkml.kernel.org/r/20170526195932.32178-1-ross.zwisler@linux.intel.com
Link: http://lkml.kernel.org/r/20170522215749.23516-2-ross.zwisler@linux.intel.com
Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Reported-by: Pawel Lebioda <pawel.lebioda@intel.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: "Darrick J. Wong" <darrick.wong@oracle.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Matthew Wilcox <mawilcox@microsoft.com>
Cc: "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Pawel Lebioda <pawel.lebioda@intel.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Xiong Zhou <xzhou@redhat.com>
Cc: Eryu Guan <eguan@redhat.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
1 parent d0f0931
Raw File
util.h
#ifndef _UTIL_H
#define _UTIL_H

#include <stdarg.h>
#include <stdbool.h>
#include <getopt.h>

/*
 * Copyright 2011 The Chromium Authors, All Rights Reserved.
 * Copyright 2008 Jon Loeliger, Freescale Semiconductor, Inc.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
 *                                                                   USA
 */

#ifdef __GNUC__
#define PRINTF(i, j)	__attribute__((format (printf, i, j)))
#define NORETURN	__attribute__((noreturn))
#else
#define PRINTF(i, j)
#define NORETURN
#endif

#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))

static inline void NORETURN PRINTF(1, 2) die(const char *str, ...)
{
	va_list ap;

	va_start(ap, str);
	fprintf(stderr, "FATAL ERROR: ");
	vfprintf(stderr, str, ap);
	va_end(ap);
	exit(1);
}

static inline void *xmalloc(size_t len)
{
	void *new = malloc(len);

	if (!new)
		die("malloc() failed\n");

	return new;
}

static inline void *xrealloc(void *p, size_t len)
{
	void *new = realloc(p, len);

	if (!new)
		die("realloc() failed (len=%zd)\n", len);

	return new;
}

extern char *xstrdup(const char *s);

extern int PRINTF(2, 3) xasprintf(char **strp, const char *fmt, ...);
extern char *join_path(const char *path, const char *name);

/**
 * Check a property of a given length to see if it is all printable and
 * has a valid terminator. The property can contain either a single string,
 * or multiple strings each of non-zero length.
 *
 * @param data	The string to check
 * @param len	The string length including terminator
 * @return 1 if a valid printable string, 0 if not
 */
bool util_is_printable_string(const void *data, int len);

/*
 * Parse an escaped character starting at index i in string s.  The resulting
 * character will be returned and the index i will be updated to point at the
 * character directly after the end of the encoding, this may be the '\0'
 * terminator of the string.
 */
char get_escape_char(const char *s, int *i);

/**
 * Read a device tree file into a buffer. This will report any errors on
 * stderr.
 *
 * @param filename	The filename to read, or - for stdin
 * @return Pointer to allocated buffer containing fdt, or NULL on error
 */
char *utilfdt_read(const char *filename);

/**
 * Like utilfdt_read(), but also passes back the size of the file read.
 *
 * @param len		If non-NULL, the amount of data we managed to read
 */
char *utilfdt_read_len(const char *filename, off_t *len);

/**
 * Read a device tree file into a buffer. Does not report errors, but only
 * returns them. The value returned can be passed to strerror() to obtain
 * an error message for the user.
 *
 * @param filename	The filename to read, or - for stdin
 * @param buffp		Returns pointer to buffer containing fdt
 * @return 0 if ok, else an errno value representing the error
 */
int utilfdt_read_err(const char *filename, char **buffp);

/**
 * Like utilfdt_read_err(), but also passes back the size of the file read.
 *
 * @param len		If non-NULL, the amount of data we managed to read
 */
int utilfdt_read_err_len(const char *filename, char **buffp, off_t *len);

/**
 * Write a device tree buffer to a file. This will report any errors on
 * stderr.
 *
 * @param filename	The filename to write, or - for stdout
 * @param blob		Poiner to buffer containing fdt
 * @return 0 if ok, -1 on error
 */
int utilfdt_write(const char *filename, const void *blob);

/**
 * Write a device tree buffer to a file. Does not report errors, but only
 * returns them. The value returned can be passed to strerror() to obtain
 * an error message for the user.
 *
 * @param filename	The filename to write, or - for stdout
 * @param blob		Poiner to buffer containing fdt
 * @return 0 if ok, else an errno value representing the error
 */
int utilfdt_write_err(const char *filename, const void *blob);

/**
 * Decode a data type string. The purpose of this string
 *
 * The string consists of an optional character followed by the type:
 *	Modifier characters:
 *		hh or b	1 byte
 *		h	2 byte
 *		l	4 byte, default
 *
 *	Type character:
 *		s	string
 *		i	signed integer
 *		u	unsigned integer
 *		x	hex
 *
 * TODO: Implement ll modifier (8 bytes)
 * TODO: Implement o type (octal)
 *
 * @param fmt		Format string to process
 * @param type		Returns type found(s/d/u/x), or 0 if none
 * @param size		Returns size found(1,2,4,8) or 4 if none
 * @return 0 if ok, -1 on error (no type given, or other invalid format)
 */
int utilfdt_decode_type(const char *fmt, int *type, int *size);

/*
 * This is a usage message fragment for the -t option. It is the format
 * supported by utilfdt_decode_type.
 */

#define USAGE_TYPE_MSG \
	"<type>\ts=string, i=int, u=unsigned, x=hex\n" \
	"\tOptional modifier prefix:\n" \
	"\t\thh or b=byte, h=2 byte, l=4 byte (default)";

/**
 * Print property data in a readable format to stdout
 *
 * Properties that look like strings will be printed as strings. Otherwise
 * the data will be displayed either as cells (if len is a multiple of 4
 * bytes) or bytes.
 *
 * If len is 0 then this function does nothing.
 *
 * @param data	Pointers to property data
 * @param len	Length of property data
 */
void utilfdt_print_data(const char *data, int len);

/**
 * Show source version and exit
 */
void NORETURN util_version(void);

/**
 * Show usage and exit
 *
 * This helps standardize the output of various utils.  You most likely want
 * to use the usage() helper below rather than call this.
 *
 * @param errmsg	If non-NULL, an error message to display
 * @param synopsis	The initial example usage text (and possible examples)
 * @param short_opts	The string of short options
 * @param long_opts	The structure of long options
 * @param opts_help	An array of help strings (should align with long_opts)
 */
void NORETURN util_usage(const char *errmsg, const char *synopsis,
			 const char *short_opts,
			 struct option const long_opts[],
			 const char * const opts_help[]);

/**
 * Show usage and exit
 *
 * If you name all your usage variables with usage_xxx, then you can call this
 * help macro rather than expanding all arguments yourself.
 *
 * @param errmsg	If non-NULL, an error message to display
 */
#define usage(errmsg) \
	util_usage(errmsg, usage_synopsis, usage_short_opts, \
		   usage_long_opts, usage_opts_help)

/**
 * Call getopt_long() with standard options
 *
 * Since all util code runs getopt in the same way, provide a helper.
 */
#define util_getopt_long() getopt_long(argc, argv, usage_short_opts, \
				       usage_long_opts, NULL)

/* Helper for aligning long_opts array */
#define a_argument required_argument

/* Helper for usage_short_opts string constant */
#define USAGE_COMMON_SHORT_OPTS "hV"

/* Helper for usage_long_opts option array */
#define USAGE_COMMON_LONG_OPTS \
	{"help",      no_argument, NULL, 'h'}, \
	{"version",   no_argument, NULL, 'V'}, \
	{NULL,        no_argument, NULL, 0x0}

/* Helper for usage_opts_help array */
#define USAGE_COMMON_OPTS_HELP \
	"Print this help and exit", \
	"Print version and exit", \
	NULL

/* Helper for getopt case statements */
#define case_USAGE_COMMON_FLAGS \
	case 'h': usage(NULL); \
	case 'V': util_version(); \
	case '?': usage("unknown option");

#endif /* _UTIL_H */
back to top