Revision e2093926a098a8ccf0f1d10f6df8dad452cb28d3 authored by Ross Zwisler on 02 June 2017, 21:46:37 UTC, committed by Linus Torvalds on 02 June 2017, 22:07:37 UTC
We currently have two related PMD vs PTE races in the DAX code.  These
can both be easily triggered by having two threads reading and writing
simultaneously to the same private mapping, with the key being that
private mapping reads can be handled with PMDs but private mapping
writes are always handled with PTEs so that we can COW.

Here is the first race:

  CPU 0					CPU 1

  (private mapping write)
  __handle_mm_fault()
    create_huge_pmd() - FALLBACK
    handle_pte_fault()
      passes check for pmd_devmap()

					(private mapping read)
					__handle_mm_fault()
					  create_huge_pmd()
					    dax_iomap_pmd_fault() inserts PMD

      dax_iomap_pte_fault() does a PTE fault, but we already have a DAX PMD
      			  installed in our page tables at this spot.

Here's the second race:

  CPU 0					CPU 1

  (private mapping read)
  __handle_mm_fault()
    passes check for pmd_none()
    create_huge_pmd()
      dax_iomap_pmd_fault() inserts PMD

  (private mapping write)
  __handle_mm_fault()
    create_huge_pmd() - FALLBACK
					(private mapping read)
					__handle_mm_fault()
					  passes check for pmd_none()
					  create_huge_pmd()

    handle_pte_fault()
      dax_iomap_pte_fault() inserts PTE
					    dax_iomap_pmd_fault() inserts PMD,
					       but we already have a PTE at
					       this spot.

The core of the issue is that while there is isolation between faults to
the same range in the DAX fault handlers via our DAX entry locking,
there is no isolation between faults in the code in mm/memory.c.  This
means for instance that this code in __handle_mm_fault() can run:

	if (pmd_none(*vmf.pmd) && transparent_hugepage_enabled(vma)) {
		ret = create_huge_pmd(&vmf);

But by the time we actually get to run the fault handler called by
create_huge_pmd(), the PMD is no longer pmd_none() because a racing PTE
fault has installed a normal PMD here as a parent.  This is the cause of
the 2nd race.  The first race is similar - there is the following check
in handle_pte_fault():

	} else {
		/* See comment in pte_alloc_one_map() */
		if (pmd_devmap(*vmf->pmd) || pmd_trans_unstable(vmf->pmd))
			return 0;

So if a pmd_devmap() PMD (a DAX PMD) has been installed at vmf->pmd, we
will bail and retry the fault.  This is correct, but there is nothing
preventing the PMD from being installed after this check but before we
actually get to the DAX PTE fault handlers.

In my testing these races result in the following types of errors:

  BUG: Bad rss-counter state mm:ffff8800a817d280 idx:1 val:1
  BUG: non-zero nr_ptes on freeing mm: 15

Fix this issue by having the DAX fault handlers verify that it is safe
to continue their fault after they have taken an entry lock to block
other racing faults.

[ross.zwisler@linux.intel.com: improve fix for colliding PMD & PTE entries]
  Link: http://lkml.kernel.org/r/20170526195932.32178-1-ross.zwisler@linux.intel.com
Link: http://lkml.kernel.org/r/20170522215749.23516-2-ross.zwisler@linux.intel.com
Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Reported-by: Pawel Lebioda <pawel.lebioda@intel.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: "Darrick J. Wong" <darrick.wong@oracle.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Matthew Wilcox <mawilcox@microsoft.com>
Cc: "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Pawel Lebioda <pawel.lebioda@intel.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Xiong Zhou <xzhou@redhat.com>
Cc: Eryu Guan <eguan@redhat.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
1 parent d0f0931
Raw File
matrix-keymap.c
/*
 * Helpers for matrix keyboard bindings
 *
 * Copyright (C) 2012 Google, Inc
 *
 * Author:
 *	Olof Johansson <olof@lixom.net>
 *
 * This software is licensed under the terms of the GNU General Public
 * License version 2, as published by the Free Software Foundation, and
 * may be copied, distributed, and modified under those terms.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 */

#include <linux/device.h>
#include <linux/export.h>
#include <linux/gfp.h>
#include <linux/input.h>
#include <linux/input/matrix_keypad.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/property.h>
#include <linux/slab.h>
#include <linux/types.h>

static bool matrix_keypad_map_key(struct input_dev *input_dev,
				  unsigned int rows, unsigned int cols,
				  unsigned int row_shift, unsigned int key)
{
	unsigned short *keymap = input_dev->keycode;
	unsigned int row = KEY_ROW(key);
	unsigned int col = KEY_COL(key);
	unsigned short code = KEY_VAL(key);

	if (row >= rows || col >= cols) {
		dev_err(input_dev->dev.parent,
			"%s: invalid keymap entry 0x%x (row: %d, col: %d, rows: %d, cols: %d)\n",
			__func__, key, row, col, rows, cols);
		return false;
	}

	keymap[MATRIX_SCAN_CODE(row, col, row_shift)] = code;
	__set_bit(code, input_dev->keybit);

	return true;
}

/**
 * matrix_keypad_parse_properties() - Read properties of matrix keypad
 *
 * @dev: Device containing properties
 * @rows: Returns number of matrix rows
 * @cols: Returns number of matrix columns
 * @return 0 if OK, <0 on error
 */
int matrix_keypad_parse_properties(struct device *dev,
				   unsigned int *rows, unsigned int *cols)
{
	*rows = *cols = 0;

	device_property_read_u32(dev, "keypad,num-rows", rows);
	device_property_read_u32(dev, "keypad,num-columns", cols);

	if (!*rows || !*cols) {
		dev_err(dev, "number of keypad rows/columns not specified\n");
		return -EINVAL;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(matrix_keypad_parse_properties);

static int matrix_keypad_parse_keymap(const char *propname,
				      unsigned int rows, unsigned int cols,
				      struct input_dev *input_dev)
{
	struct device *dev = input_dev->dev.parent;
	unsigned int row_shift = get_count_order(cols);
	unsigned int max_keys = rows << row_shift;
	u32 *keys;
	int i;
	int size;
	int retval;

	if (!propname)
		propname = "linux,keymap";

	size = device_property_read_u32_array(dev, propname, NULL, 0);
	if (size <= 0) {
		dev_err(dev, "missing or malformed property %s: %d\n",
			propname, size);
		return size < 0 ? size : -EINVAL;
	}

	if (size > max_keys) {
		dev_err(dev, "%s size overflow (%d vs max %u)\n",
			propname, size, max_keys);
		return -EINVAL;
	}

	keys = kmalloc_array(size, sizeof(u32), GFP_KERNEL);
	if (!keys)
		return -ENOMEM;

	retval = device_property_read_u32_array(dev, propname, keys, size);
	if (retval) {
		dev_err(dev, "failed to read %s property: %d\n",
			propname, retval);
		goto out;
	}

	for (i = 0; i < size; i++) {
		if (!matrix_keypad_map_key(input_dev, rows, cols,
					   row_shift, keys[i])) {
			retval = -EINVAL;
			goto out;
		}
	}

	retval = 0;

out:
	kfree(keys);
	return retval;
}

/**
 * matrix_keypad_build_keymap - convert platform keymap into matrix keymap
 * @keymap_data: keymap supplied by the platform code
 * @keymap_name: name of device tree property containing keymap (if device
 *	tree support is enabled).
 * @rows: number of rows in target keymap array
 * @cols: number of cols in target keymap array
 * @keymap: expanded version of keymap that is suitable for use by
 * matrix keyboard driver
 * @input_dev: input devices for which we are setting up the keymap
 *
 * This function converts platform keymap (encoded with KEY() macro) into
 * an array of keycodes that is suitable for using in a standard matrix
 * keyboard driver that uses row and col as indices.
 *
 * If @keymap_data is not supplied and device tree support is enabled
 * it will attempt load the keymap from property specified by @keymap_name
 * argument (or "linux,keymap" if @keymap_name is %NULL).
 *
 * If @keymap is %NULL the function will automatically allocate managed
 * block of memory to store the keymap. This memory will be associated with
 * the parent device and automatically freed when device unbinds from the
 * driver.
 *
 * Callers are expected to set up input_dev->dev.parent before calling this
 * function.
 */
int matrix_keypad_build_keymap(const struct matrix_keymap_data *keymap_data,
			       const char *keymap_name,
			       unsigned int rows, unsigned int cols,
			       unsigned short *keymap,
			       struct input_dev *input_dev)
{
	unsigned int row_shift = get_count_order(cols);
	size_t max_keys = rows << row_shift;
	int i;
	int error;

	if (WARN_ON(!input_dev->dev.parent))
		return -EINVAL;

	if (!keymap) {
		keymap = devm_kzalloc(input_dev->dev.parent,
				      max_keys * sizeof(*keymap),
				      GFP_KERNEL);
		if (!keymap) {
			dev_err(input_dev->dev.parent,
				"Unable to allocate memory for keymap");
			return -ENOMEM;
		}
	}

	input_dev->keycode = keymap;
	input_dev->keycodesize = sizeof(*keymap);
	input_dev->keycodemax = max_keys;

	__set_bit(EV_KEY, input_dev->evbit);

	if (keymap_data) {
		for (i = 0; i < keymap_data->keymap_size; i++) {
			unsigned int key = keymap_data->keymap[i];

			if (!matrix_keypad_map_key(input_dev, rows, cols,
						   row_shift, key))
				return -EINVAL;
		}
	} else {
		error = matrix_keypad_parse_keymap(keymap_name, rows, cols,
						   input_dev);
		if (error)
			return error;
	}

	__clear_bit(KEY_RESERVED, input_dev->keybit);

	return 0;
}
EXPORT_SYMBOL(matrix_keypad_build_keymap);

MODULE_LICENSE("GPL");
back to top