Revision aad5f69bc161af489dbb5934868bd347282f0764 authored by David Hildenbrand on 19 October 2019, 03:19:20 UTC, committed by Linus Torvalds on 19 October 2019, 10:32:31 UTC
There are three places where we access uninitialized memmaps, namely: - /proc/kpagecount - /proc/kpageflags - /proc/kpagecgroup We have initialized memmaps either when the section is online or when the page was initialized to the ZONE_DEVICE. Uninitialized memmaps contain garbage and in the worst case trigger kernel BUGs, especially with CONFIG_PAGE_POISONING. For example, not onlining a DIMM during boot and calling /proc/kpagecount with CONFIG_PAGE_POISONING: :/# cat /proc/kpagecount > tmp.test BUG: unable to handle page fault for address: fffffffffffffffe #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 114616067 P4D 114616067 PUD 114618067 PMD 0 Oops: 0000 [#1] SMP NOPTI CPU: 0 PID: 469 Comm: cat Not tainted 5.4.0-rc1-next-20191004+ #11 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.1-0-ga5cab58e9a3f-prebuilt.qemu.4 RIP: 0010:kpagecount_read+0xce/0x1e0 Code: e8 09 83 e0 3f 48 0f a3 02 73 2d 4c 89 e7 48 c1 e7 06 48 03 3d ab 51 01 01 74 1d 48 8b 57 08 480 RSP: 0018:ffffa14e409b7e78 EFLAGS: 00010202 RAX: fffffffffffffffe RBX: 0000000000020000 RCX: 0000000000000000 RDX: 0000000000000001 RSI: 00007f76b5595000 RDI: fffff35645000000 RBP: 00007f76b5595000 R08: 0000000000000001 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000140000 R13: 0000000000020000 R14: 00007f76b5595000 R15: ffffa14e409b7f08 FS: 00007f76b577d580(0000) GS:ffff8f41bd400000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: fffffffffffffffe CR3: 0000000078960000 CR4: 00000000000006f0 Call Trace: proc_reg_read+0x3c/0x60 vfs_read+0xc5/0x180 ksys_read+0x68/0xe0 do_syscall_64+0x5c/0xa0 entry_SYSCALL_64_after_hwframe+0x49/0xbe For now, let's drop support for ZONE_DEVICE from the three pseudo files in order to fix this. 
To distinguish offline memory (with garbage memmap) from ZONE_DEVICE memory with properly initialized memmaps, we would have to check get_dev_pagemap() and pfn_zone_device_reserved() right now. The usage of both (especially, special casing devmem) is frowned upon and needs to be reworked.

The fundamental issue we have is:

    if (pfn_to_online_page(pfn)) {
        /* memmap initialized */
    } else if (pfn_valid(pfn)) {
        /*
         * ???
         * a) offline memory. memmap garbage.
         * b) devmem: memmap initialized to ZONE_DEVICE.
         * c) devmem: reserved for driver. memmap garbage.
         * (d) devmem: memmap currently initializing - garbage)
         */
    }

We'll leave the pfn_zone_device_reserved() check in stable_page_flags() in place as that function is also used from memory failure. We now no longer dump information about pages that are no longer in use, i.e. offline pages.

Link: http://lkml.kernel.org/r/20191009142435.3975-2-david@redhat.com
Fixes: f1dd2cd13c4b ("mm, memory_hotplug: do not associate hotadded memory to zones until online") [visible after d0dc12e86b319]
Signed-off-by: David Hildenbrand <david@redhat.com>
Reported-by: Qian Cai <cai@lca.pw>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Toshiki Fukasawa <t-fukasawa@vx.jp.nec.com>
Cc: Pankaj gupta <pagupta@redhat.com>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Anthony Yznaga <anthony.yznaga@oracle.com>
Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com>
Cc: <stable@vger.kernel.org> [4.13+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
1 parent 641fe2e
File | Mode | Size |
---|---|---|
9p | ||
adfs | ||
affs | ||
afs | ||
autofs | ||
befs | ||
bfs | ||
btrfs | ||
cachefiles | ||
ceph | ||
cifs | ||
coda | ||
configfs | ||
cramfs | ||
crypto | ||
debugfs | ||
devpts | ||
dlm | ||
ecryptfs | ||
efivarfs | ||
efs | ||
erofs | ||
exportfs | ||
ext2 | ||
ext4 | ||
f2fs | ||
fat | ||
freevxfs | ||
fscache | ||
fuse | ||
gfs2 | ||
hfs | ||
hfsplus | ||
hostfs | ||
hpfs | ||
hugetlbfs | ||
iomap | ||
isofs | ||
jbd2 | ||
jffs2 | ||
jfs | ||
kernfs | ||
lockd | ||
minix | ||
nfs | ||
nfs_common | ||
nfsd | ||
nilfs2 | ||
nls | ||
notify | ||
ntfs | ||
ocfs2 | ||
omfs | ||
openpromfs | ||
orangefs | ||
overlayfs | ||
proc | ||
pstore | ||
qnx4 | ||
qnx6 | ||
quota | ||
ramfs | ||
reiserfs | ||
romfs | ||
squashfs | ||
sysfs | ||
sysv | ||
tracefs | ||
ubifs | ||
udf | ||
ufs | ||
unicode | ||
verity | ||
xfs | ||
Kconfig | -rw-r--r-- | 7.6 KB |
Kconfig.binfmt | -rw-r--r-- | 7.6 KB |
Makefile | -rw-r--r-- | 4.4 KB |
aio.c | -rw-r--r-- | 56.0 KB |
anon_inodes.c | -rw-r--r-- | 4.6 KB |
attr.c | -rw-r--r-- | 9.6 KB |
bad_inode.c | -rw-r--r-- | 5.3 KB |
binfmt_aout.c | -rw-r--r-- | 8.3 KB |
binfmt_elf.c | -rw-r--r-- | 63.0 KB |
binfmt_elf_fdpic.c | -rw-r--r-- | 47.1 KB |
binfmt_em86.c | -rw-r--r-- | 2.8 KB |
binfmt_flat.c | -rw-r--r-- | 28.0 KB |
binfmt_misc.c | -rw-r--r-- | 18.5 KB |
binfmt_script.c | -rw-r--r-- | 4.4 KB |
block_dev.c | -rw-r--r-- | 55.9 KB |
buffer.c | -rw-r--r-- | 90.2 KB |
char_dev.c | -rw-r--r-- | 16.5 KB |
compat.c | -rw-r--r-- | 3.2 KB |
compat_binfmt_elf.c | -rw-r--r-- | 3.2 KB |
compat_ioctl.c | -rw-r--r-- | 31.0 KB |
coredump.c | -rw-r--r-- | 22.1 KB |
d_path.c | -rw-r--r-- | 11.3 KB |
dax.c | -rw-r--r-- | 45.8 KB |
dcache.c | -rw-r--r-- | 83.9 KB |
dcookies.c | -rw-r--r-- | 7.1 KB |
direct-io.c | -rw-r--r-- | 40.8 KB |
drop_caches.c | -rw-r--r-- | 1.8 KB |
eventfd.c | -rw-r--r-- | 11.1 KB |
eventpoll.c | -rw-r--r-- | 64.5 KB |
exec.c | -rw-r--r-- | 46.9 KB |
fcntl.c | -rw-r--r-- | 23.3 KB |
fhandle.c | -rw-r--r-- | 6.8 KB |
file.c | -rw-r--r-- | 24.2 KB |
file_table.c | -rw-r--r-- | 10.2 KB |
filesystems.c | -rw-r--r-- | 6.4 KB |
fs-writeback.c | -rw-r--r-- | 74.4 KB |
fs_context.c | -rw-r--r-- | 18.1 KB |
fs_parser.c | -rw-r--r-- | 11.0 KB |
fs_pin.c | -rw-r--r-- | 1.9 KB |
fs_struct.c | -rw-r--r-- | 3.3 KB |
fs_types.c | -rw-r--r-- | 2.5 KB |
fsopen.c | -rw-r--r-- | 11.2 KB |
inode.c | -rw-r--r-- | 60.7 KB |
internal.h | -rw-r--r-- | 5.1 KB |
io_uring.c | -rw-r--r-- | 94.1 KB |
ioctl.c | -rw-r--r-- | 17.7 KB |
libfs.c | -rw-r--r-- | 32.9 KB |
locks.c | -rw-r--r-- | 78.9 KB |
mbcache.c | -rw-r--r-- | 12.0 KB |
mount.h | -rw-r--r-- | 4.0 KB |
mpage.c | -rw-r--r-- | 21.1 KB |
namei.c | -rw-r--r-- | 122.8 KB |
namespace.c | -rw-r--r-- | 97.0 KB |
no-block.c | -rw-r--r-- | 478 bytes |
nsfs.c | -rw-r--r-- | 6.1 KB |
open.c | -rw-r--r-- | 30.2 KB |
pipe.c | -rw-r--r-- | 27.7 KB |
pnode.c | -rw-r--r-- | 15.1 KB |
pnode.h | -rw-r--r-- | 1.9 KB |
posix_acl.c | -rw-r--r-- | 21.5 KB |
proc_namespace.c | -rw-r--r-- | 7.8 KB |
read_write.c | -rw-r--r-- | 51.6 KB |
readdir.c | -rw-r--r-- | 13.3 KB |
select.c | -rw-r--r-- | 34.2 KB |
seq_file.c | -rw-r--r-- | 24.7 KB |
signalfd.c | -rw-r--r-- | 9.0 KB |
splice.c | -rw-r--r-- | 40.2 KB |
stack.c | -rw-r--r-- | 2.5 KB |
stat.c | -rw-r--r-- | 19.4 KB |
statfs.c | -rw-r--r-- | 9.6 KB |
super.c | -rw-r--r-- | 47.9 KB |
sync.c | -rw-r--r-- | 10.4 KB |
timerfd.c | -rw-r--r-- | 13.5 KB |
userfaultfd.c | -rw-r--r-- | 51.2 KB |
utimes.c | -rw-r--r-- | 7.3 KB |
xattr.c | -rw-r--r-- | 23.5 KB |
Computing file changes ...