Revision 6bc9b56433b76e40d11099338d27fbc5cd2935ca authored by Naoya Horiguchi on 24 August 2018, 00:00:38 UTC, committed by Linus Torvalds on 24 August 2018, 01:48:43 UTC
Patch series "mm: soft-offline: fix race against page allocation".

Xishi recently reported a race on the reuse of pages targeted by soft
offlining.  Discussion and analysis showed that we need to make sure
that PG_hwpoison is set in the right place, under zone->lock, for soft
offline.  1/2 handles the free hugepage case, and 2/2 handles the free
buddy page case.

This patch (of 2):

There's a race condition between soft offline and hugetlb_fault which
causes unexpected process killing and/or hugetlb allocation failure.

The process killing is caused by the following flow:

  CPU 0               CPU 1              CPU 2

  soft offline
    get_any_page
    // find the hugetlb is free
                      mmap a hugetlb file
                      page fault
                        ...
                          hugetlb_fault
                            hugetlb_no_page
                              alloc_huge_page
                              // succeed
      soft_offline_free_page
      // set hwpoison flag
                                         mmap the hugetlb file
                                         page fault
                                           ...
                                             hugetlb_fault
                                               hugetlb_no_page
                                                 find_lock_page
                                                   return VM_FAULT_HWPOISON
                                           mm_fault_error
                                             do_sigbus
                                             // kill the process

The hugetlb allocation failure comes from the following flow:

  CPU 0                          CPU 1

                                 mmap a hugetlb file
                                 // reserve all free pages but don't fault in
  soft offline
    get_any_page
    // find the hugetlb is free
      soft_offline_free_page
      // set hwpoison flag
        dissolve_free_huge_page
        // fail because all free hugepages are reserved
                                 page fault
                                   ...
                                     hugetlb_fault
                                       hugetlb_no_page
                                         alloc_huge_page
                                           ...
                                             dequeue_huge_page_node_exact
                                             // ignore hwpoisoned hugepage
                                             // and finally fail due to no-mem

The root cause of this is that the current soft-offline code is written
based on the assumption that the PageHWPoison flag should be set first to
avoid accessing the corrupted data.  This makes sense for memory_failure()
or hard offline, but not for soft offline, because soft offline deals with
corrected (not uncorrected) errors and is safe from data loss.  This patch
changes the soft offline semantics so that the PageHWPoison flag is set
only after containment of the error page completes successfully.

Link: http://lkml.kernel.org/r/1531452366-11661-2-git-send-email-n-horiguchi@ah.jp.nec.com
Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Reported-by: Xishi Qiu <xishi.qiuxishi@alibaba-inc.com>
Suggested-by: Xishi Qiu <xishi.qiuxishi@alibaba-inc.com>
Tested-by: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: <zy.zhengyi@alibaba-inc.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
1 parent 30aba66
Raw File
Kconfig
# SPDX-License-Identifier: GPL-2.0
#
# Partition configuration
#
config PARTITION_ADVANCED
	bool "Advanced partition selection"
	help
	  Say Y here if you would like to use hard disks under Linux which
	  were partitioned under an operating system running on a different
	  architecture than your Linux system.

	  Note that the answer to this question won't directly affect the
	  kernel: saying N will just cause the configurator to skip all
	  the questions about foreign partitioning schemes.

	  If unsure, say N.

config ACORN_PARTITION
	bool "Acorn partition support" if PARTITION_ADVANCED
	default y if ARCH_ACORN
	help
	  Support hard disks partitioned under Acorn operating systems.

config ACORN_PARTITION_CUMANA
	bool "Cumana partition support" if PARTITION_ADVANCED
	default y if ARCH_ACORN
	depends on ACORN_PARTITION
	help
	  Say Y here if you would like to use hard disks under Linux which
	  were partitioned using the Cumana interface on Acorn machines.

config ACORN_PARTITION_EESOX
	bool "EESOX partition support" if PARTITION_ADVANCED
	default y if ARCH_ACORN
	depends on ACORN_PARTITION
	help
	  Say Y here if you would like to use hard disks under Linux which
	  were partitioned using the EESOX interface on Acorn machines.

config ACORN_PARTITION_ICS
	bool "ICS partition support" if PARTITION_ADVANCED
	default y if ARCH_ACORN
	depends on ACORN_PARTITION
	help
	  Say Y here if you would like to use hard disks under Linux which
	  were partitioned using the ICS interface on Acorn machines.

config ACORN_PARTITION_ADFS
	bool "Native filecore partition support" if PARTITION_ADVANCED
	default y if ARCH_ACORN
	depends on ACORN_PARTITION
	help
	  The Acorn Disc Filing System is the standard file system of the
	  RiscOS operating system which runs on Acorn's ARM-based Risc PC
	  systems and the Acorn Archimedes range of machines.  If you say
	  `Y' here, Linux will support disk partitions created under ADFS.

config ACORN_PARTITION_POWERTEC
	bool "PowerTec partition support" if PARTITION_ADVANCED
	default y if ARCH_ACORN
	depends on ACORN_PARTITION
	help
	  Support reading partition tables created on Acorn machines using
	  the PowerTec SCSI drive.

config ACORN_PARTITION_RISCIX
	bool "RISCiX partition support" if PARTITION_ADVANCED
	default y if ARCH_ACORN
	depends on ACORN_PARTITION
	help
	  Once upon a time, there was a native Unix port for the Acorn series
	  of machines called RISCiX.  If you say 'Y' here, Linux will be able
	  to read disks partitioned under RISCiX.

config AIX_PARTITION
	bool "AIX basic partition table support" if PARTITION_ADVANCED
	help
	  Say Y here if you would like to be able to read the hard disk
	  partition table format used by IBM or Motorola PowerPC machines
	  running AIX.  AIX actually uses a Logical Volume Manager, where
	  "logical volumes" can be spread across one or multiple disks,
	  but this driver works only for the simple case of partitions which
	  are contiguous.
	  Otherwise, say N.

config OSF_PARTITION
	bool "Alpha OSF partition support" if PARTITION_ADVANCED
	default y if ALPHA
	help
	  Say Y here if you would like to use hard disks under Linux which
	  were partitioned on an Alpha machine.

config AMIGA_PARTITION
	bool "Amiga partition table support" if PARTITION_ADVANCED
	default y if (AMIGA || AFFS_FS=y)
	help
	  Say Y here if you would like to use hard disks under Linux which
	  were partitioned under AmigaOS.

config ATARI_PARTITION
	bool "Atari partition table support" if PARTITION_ADVANCED
	default y if ATARI
	help
	  Say Y here if you would like to use hard disks under Linux which
	  were partitioned under the Atari OS.

config IBM_PARTITION
	bool "IBM disk label and partition support"
	depends on PARTITION_ADVANCED && S390
	help
	  Say Y here if you would like to be able to read the hard disk
	  partition table format used by IBM DASD disks operating under CMS.
	  Otherwise, say N.

config MAC_PARTITION
	bool "Macintosh partition map support" if PARTITION_ADVANCED
	default y if (MAC || PPC_PMAC)
	help
	  Say Y here if you would like to use hard disks under Linux which
	  were partitioned on a Macintosh.

config MSDOS_PARTITION
	bool "PC BIOS (MSDOS partition tables) support" if PARTITION_ADVANCED
	default y
	help
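	  Say Y here if you would like to use hard disks under Linux which
	  were partitioned using PC BIOS (MSDOS) partition tables.

	  If unsure, say Y.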

config BSD_DISKLABEL
	bool "BSD disklabel (FreeBSD partition tables) support"
	depends on PARTITION_ADVANCED && MSDOS_PARTITION
	help
	  FreeBSD uses its own hard disk partition scheme on your PC. It
	  requires only one entry in the primary partition table of your disk
	  and manages it similarly to DOS extended partitions, putting in its
	  first sector a new partition table in BSD disklabel format. Saying Y
	  here allows you to read these disklabels and further mount FreeBSD
	  partitions from within Linux if you have also said Y to "UFS
	  file system support", above. If you don't know what all this is
	  about, say N.

config MINIX_SUBPARTITION
	bool "Minix subpartition support"
	depends on PARTITION_ADVANCED && MSDOS_PARTITION
	help
	  Minix 2.0.0/2.0.2 subpartition table support for Linux.
	  Say Y here if you want to mount and use Minix 2.0.0/2.0.2
	  subpartitions.

config SOLARIS_X86_PARTITION
	bool "Solaris (x86) partition table support"
	depends on PARTITION_ADVANCED && MSDOS_PARTITION
	help
	  Like most systems, Solaris x86 uses its own hard disk partition
	  table format, incompatible with all others. Saying Y here allows you
	  to read these partition tables and further mount Solaris x86
	  partitions from within Linux if you have also said Y to "UFS
	  file system support", above.

config UNIXWARE_DISKLABEL
	bool "Unixware slices support"
	depends on PARTITION_ADVANCED && MSDOS_PARTITION
	help
	  Like some systems, UnixWare uses its own slice table inside a
	  partition (VTOC - Virtual Table of Contents). Its format is
	  incompatible with all other OSes. Saying Y here allows you to read
	  VTOC and further mount UnixWare partitions read-only from within
	  Linux if you have also said Y to "UFS file system support" or
	  "System V and Coherent file system support", above.

	  This is mainly used to carry data from a UnixWare box to your
	  Linux box via a removable medium like magneto-optical, ZIP or
	  removable IDE drives. Note, however, that a good portable way to
	  transport files and directories between unixes (and even other
	  operating systems) is given by the tar program ("man tar" or
	  preferably "info tar").

	  If you don't know what all this is about, say N.

config LDM_PARTITION
	bool "Windows Logical Disk Manager (Dynamic Disk) support"
	depends on PARTITION_ADVANCED
	help
	  Say Y here if you would like to use hard disks under Linux which
	  were partitioned using Windows 2000's/XP's or Vista's Logical Disk
	  Manager.  They are also known as "Dynamic Disks".

	  Note this driver only supports Dynamic Disks with a protective MBR
	  label, i.e. DOS partition table.  It does not support GPT labelled
	  Dynamic Disks yet as can be created with Vista.

	  Windows 2000 introduced the concept of Dynamic Disks to get around
	  the limitations of the PC's partitioning scheme.  The Logical Disk
	  Manager allows the user to repartition a disk and create spanned,
	  mirrored, striped or RAID volumes, all without the need for
	  rebooting.

	  Normal partitions are now called Basic Disks under Windows 2000, XP,
	  and Vista.

	  For a fuller description read <file:Documentation/admin-guide/ldm.rst>.

	  If unsure, say N.

config LDM_DEBUG
	bool "Windows LDM extra logging"
	depends on LDM_PARTITION
	help
	  Say Y here if you would like LDM to log verbosely.  This could be
	  helpful if the driver doesn't work as expected and you'd like to
	  report a bug.

	  If unsure, say N.

config SGI_PARTITION
	bool "SGI partition support" if PARTITION_ADVANCED
	default y if DEFAULT_SGI_PARTITION
	help
	  Say Y here if you would like to be able to read the hard disk
	  partition table format used by SGI machines.

config ULTRIX_PARTITION
	bool "Ultrix partition table support" if PARTITION_ADVANCED
	default y if MACH_DECSTATION
	help
	  Say Y here if you would like to be able to read the hard disk
	  partition table format used by DEC (now Compaq) Ultrix machines.
	  Otherwise, say N.

config SUN_PARTITION
	bool "Sun partition tables support" if PARTITION_ADVANCED
	default y if (SPARC || SUN3 || SUN3X)
	help
	  Like most systems, SunOS uses its own hard disk partition table
	  format, incompatible with all others. Saying Y here allows you to
	  read these partition tables and further mount SunOS partitions from
	  within Linux if you have also said Y to "UFS file system support",
	  above. This is mainly used to carry data from a SPARC under SunOS to
	  your Linux box via a removable medium like magneto-optical or ZIP
	  drives; note however that a good portable way to transport files and
	  directories between unixes (and even other operating systems) is
	  given by the tar program ("man tar" or preferably "info tar"). If
	  you don't know what all this is about, say N.

config KARMA_PARTITION
	bool "Karma Partition support"
	depends on PARTITION_ADVANCED
	help
	  Say Y here if you would like to mount the Rio Karma MP3 player, as it
	  uses a proprietary partition table.

config EFI_PARTITION
	bool "EFI GUID Partition support" if PARTITION_ADVANCED
	default y
	select CRC32
	help
	  Say Y here if you would like to use hard disks under Linux which
	  were partitioned using EFI GPT.

config SYSV68_PARTITION
	bool "SYSV68 partition table support" if PARTITION_ADVANCED
	default y if VME
	help
	  Say Y here if you would like to be able to read the hard disk
	  partition table format used by Motorola Delta machines (using
	  sysv68).
	  Otherwise, say N.

config CMDLINE_PARTITION
	bool "Command line partition support" if PARTITION_ADVANCED
	select BLK_CMDLINE_PARSER
	help
	  Say Y here if you want to read the partition table from bootargs.
	  The format for the command line is just like mtdparts.
back to top