Revision e2093926a098a8ccf0f1d10f6df8dad452cb28d3 authored by Ross Zwisler on 02 June 2017, 21:46:37 UTC, committed by Linus Torvalds on 02 June 2017, 22:07:37 UTC
We currently have two related PMD vs PTE races in the DAX code.  These
can both be easily triggered by having two threads reading and writing
simultaneously to the same private mapping, with the key being that
private mapping reads can be handled with PMDs but private mapping
writes are always handled with PTEs so that we can COW.

Here is the first race:

  CPU 0					CPU 1

  (private mapping write)
  __handle_mm_fault()
    create_huge_pmd() - FALLBACK
    handle_pte_fault()
      passes check for pmd_devmap()

					(private mapping read)
					__handle_mm_fault()
					  create_huge_pmd()
					    dax_iomap_pmd_fault() inserts PMD

      dax_iomap_pte_fault() does a PTE fault, but we already have a DAX PMD
      			  installed in our page tables at this spot.

Here's the second race:

  CPU 0					CPU 1

  (private mapping read)
  __handle_mm_fault()
    passes check for pmd_none()
    create_huge_pmd()
      dax_iomap_pmd_fault() inserts PMD

  (private mapping write)
  __handle_mm_fault()
    create_huge_pmd() - FALLBACK
					(private mapping read)
					__handle_mm_fault()
					  passes check for pmd_none()
					  create_huge_pmd()

    handle_pte_fault()
      dax_iomap_pte_fault() inserts PTE
					    dax_iomap_pmd_fault() inserts PMD,
					       but we already have a PTE at
					       this spot.

The core of the issue is that while there is isolation between faults to
the same range in the DAX fault handlers via our DAX entry locking,
there is no isolation between faults in the code in mm/memory.c.  This
means for instance that this code in __handle_mm_fault() can run:

	if (pmd_none(*vmf.pmd) && transparent_hugepage_enabled(vma)) {
		ret = create_huge_pmd(&vmf);

But by the time we actually get to run the fault handler called by
create_huge_pmd(), the PMD is no longer pmd_none() because a racing PTE
fault has installed a normal PMD here as a parent.  This is the cause of
the 2nd race.  The first race is similar - there is the following check
in handle_pte_fault():

	} else {
		/* See comment in pte_alloc_one_map() */
		if (pmd_devmap(*vmf->pmd) || pmd_trans_unstable(vmf->pmd))
			return 0;

So if a pmd_devmap() PMD (a DAX PMD) has been installed at vmf->pmd, we
will bail and retry the fault.  This is correct, but there is nothing
preventing the PMD from being installed after this check but before we
actually get to the DAX PTE fault handlers.

In my testing these races result in the following types of errors:

  BUG: Bad rss-counter state mm:ffff8800a817d280 idx:1 val:1
  BUG: non-zero nr_ptes on freeing mm: 15

Fix this issue by having the DAX fault handlers verify that it is safe
to continue their fault after they have taken an entry lock to block
other racing faults.

[ross.zwisler@linux.intel.com: improve fix for colliding PMD & PTE entries]
  Link: http://lkml.kernel.org/r/20170526195932.32178-1-ross.zwisler@linux.intel.com
Link: http://lkml.kernel.org/r/20170522215749.23516-2-ross.zwisler@linux.intel.com
Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Reported-by: Pawel Lebioda <pawel.lebioda@intel.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: "Darrick J. Wong" <darrick.wong@oracle.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Matthew Wilcox <mawilcox@microsoft.com>
Cc: "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Pawel Lebioda <pawel.lebioda@intel.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Xiong Zhou <xzhou@redhat.com>
Cc: Eryu Guan <eguan@redhat.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
1 parent d0f0931
Raw File
spmi.h
/* Copyright (c) 2012-2013, The Linux Foundation. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 and
 * only version 2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 */
#ifndef _LINUX_SPMI_H
#define _LINUX_SPMI_H

#include <linux/types.h>
#include <linux/device.h>
#include <linux/mod_devicetable.h>

/* Maximum slave identifier */
#define SPMI_MAX_SLAVE_ID		16

/* SPMI Commands */
#define SPMI_CMD_EXT_WRITE		0x00
#define SPMI_CMD_RESET			0x10
#define SPMI_CMD_SLEEP			0x11
#define SPMI_CMD_SHUTDOWN		0x12
#define SPMI_CMD_WAKEUP			0x13
#define SPMI_CMD_AUTHENTICATE		0x14
#define SPMI_CMD_MSTR_READ		0x15
#define SPMI_CMD_MSTR_WRITE		0x16
#define SPMI_CMD_TRANSFER_BUS_OWNERSHIP	0x1A
#define SPMI_CMD_DDB_MASTER_READ	0x1B
#define SPMI_CMD_DDB_SLAVE_READ		0x1C
#define SPMI_CMD_EXT_READ		0x20
#define SPMI_CMD_EXT_WRITEL		0x30
#define SPMI_CMD_EXT_READL		0x38
#define SPMI_CMD_WRITE			0x40
#define SPMI_CMD_READ			0x60
#define SPMI_CMD_ZERO_WRITE		0x80

/**
 * struct spmi_device - Basic representation of an SPMI device
 * @dev:	Driver model representation of the device.
 * @ctrl:	SPMI controller managing the bus hosting this device.
 * @usid:	This devices' Unique Slave IDentifier.
 */
struct spmi_device {
	struct device		dev;
	struct spmi_controller	*ctrl;
	u8			usid;
};

static inline struct spmi_device *to_spmi_device(struct device *d)
{
	return container_of(d, struct spmi_device, dev);
}

static inline void *spmi_device_get_drvdata(const struct spmi_device *sdev)
{
	return dev_get_drvdata(&sdev->dev);
}

static inline void spmi_device_set_drvdata(struct spmi_device *sdev, void *data)
{
	dev_set_drvdata(&sdev->dev, data);
}

struct spmi_device *spmi_device_alloc(struct spmi_controller *ctrl);

static inline void spmi_device_put(struct spmi_device *sdev)
{
	if (sdev)
		put_device(&sdev->dev);
}

int spmi_device_add(struct spmi_device *sdev);

void spmi_device_remove(struct spmi_device *sdev);

/**
 * struct spmi_controller - interface to the SPMI master controller
 * @dev:	Driver model representation of the device.
 * @nr:		board-specific number identifier for this controller/bus
 * @cmd:	sends a non-data command sequence on the SPMI bus.
 * @read_cmd:	sends a register read command sequence on the SPMI bus.
 * @write_cmd:	sends a register write command sequence on the SPMI bus.
 */
struct spmi_controller {
	struct device		dev;
	unsigned int		nr;
	int	(*cmd)(struct spmi_controller *ctrl, u8 opcode, u8 sid);
	int	(*read_cmd)(struct spmi_controller *ctrl, u8 opcode,
			    u8 sid, u16 addr, u8 *buf, size_t len);
	int	(*write_cmd)(struct spmi_controller *ctrl, u8 opcode,
			     u8 sid, u16 addr, const u8 *buf, size_t len);
};

static inline struct spmi_controller *to_spmi_controller(struct device *d)
{
	return container_of(d, struct spmi_controller, dev);
}

static inline
void *spmi_controller_get_drvdata(const struct spmi_controller *ctrl)
{
	return dev_get_drvdata(&ctrl->dev);
}

static inline void spmi_controller_set_drvdata(struct spmi_controller *ctrl,
					       void *data)
{
	dev_set_drvdata(&ctrl->dev, data);
}

struct spmi_controller *spmi_controller_alloc(struct device *parent,
					      size_t size);

/**
 * spmi_controller_put() - decrement controller refcount
 * @ctrl	SPMI controller.
 */
static inline void spmi_controller_put(struct spmi_controller *ctrl)
{
	if (ctrl)
		put_device(&ctrl->dev);
}

int spmi_controller_add(struct spmi_controller *ctrl);
void spmi_controller_remove(struct spmi_controller *ctrl);

/**
 * struct spmi_driver - SPMI slave device driver
 * @driver:	SPMI device drivers should initialize name and owner field of
 *		this structure.
 * @probe:	binds this driver to a SPMI device.
 * @remove:	unbinds this driver from the SPMI device.
 *
 * If PM runtime support is desired for a slave, a device driver can call
 * pm_runtime_put() from their probe() routine (and a balancing
 * pm_runtime_get() in remove()).  PM runtime support for a slave is
 * implemented by issuing a SLEEP command to the slave on runtime_suspend(),
 * transitioning the slave into the SLEEP state.  On runtime_resume(), a WAKEUP
 * command is sent to the slave to bring it back to ACTIVE.
 */
struct spmi_driver {
	struct device_driver driver;
	int	(*probe)(struct spmi_device *sdev);
	void	(*remove)(struct spmi_device *sdev);
};

static inline struct spmi_driver *to_spmi_driver(struct device_driver *d)
{
	return container_of(d, struct spmi_driver, driver);
}

#define spmi_driver_register(sdrv) \
	__spmi_driver_register(sdrv, THIS_MODULE)
int __spmi_driver_register(struct spmi_driver *sdrv, struct module *owner);

/**
 * spmi_driver_unregister() - unregister an SPMI client driver
 * @sdrv:	the driver to unregister
 */
static inline void spmi_driver_unregister(struct spmi_driver *sdrv)
{
	if (sdrv)
		driver_unregister(&sdrv->driver);
}

#define module_spmi_driver(__spmi_driver) \
	module_driver(__spmi_driver, spmi_driver_register, \
			spmi_driver_unregister)

int spmi_register_read(struct spmi_device *sdev, u8 addr, u8 *buf);
int spmi_ext_register_read(struct spmi_device *sdev, u8 addr, u8 *buf,
			   size_t len);
int spmi_ext_register_readl(struct spmi_device *sdev, u16 addr, u8 *buf,
			    size_t len);
int spmi_register_write(struct spmi_device *sdev, u8 addr, u8 data);
int spmi_register_zero_write(struct spmi_device *sdev, u8 data);
int spmi_ext_register_write(struct spmi_device *sdev, u8 addr,
			    const u8 *buf, size_t len);
int spmi_ext_register_writel(struct spmi_device *sdev, u16 addr,
			     const u8 *buf, size_t len);
int spmi_command_reset(struct spmi_device *sdev);
int spmi_command_sleep(struct spmi_device *sdev);
int spmi_command_wakeup(struct spmi_device *sdev);
int spmi_command_shutdown(struct spmi_device *sdev);

#endif
back to top