Revision 18e19f195cd888f65643a77a0c6aee8f5be6439a authored by Wei Yang on 21 February 2020, 04:04:27 UTC, committed by Linus Torvalds on 21 February 2020, 19:22:15 UTC
When we use SPARSEMEM instead of SPARSEMEM_VMEMMAP, pfn_to_page()
doesn't work before sparse_init_one_section() is called.

This leads to a crash when hotplug memory:

    BUG: unable to handle page fault for address: 0000000006400000
    #PF: supervisor write access in kernel mode
    #PF: error_code(0x0002) - not-present page
    PGD 0 P4D 0
    Oops: 0002 [#1] SMP PTI
    CPU: 3 PID: 221 Comm: kworker/u16:1 Tainted: G        W         5.5.0-next-20200205+ #343
    Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 0.0.0 02/06/2015
    Workqueue: kacpi_hotplug acpi_hotplug_work_fn
    RIP: 0010:__memset+0x24/0x30
    Code: cc cc cc cc cc cc 0f 1f 44 00 00 49 89 f9 48 89 d1 83 e2 07 48 c1 e9 03 40 0f b6 f6 48 b8 01 01 01 01 01 01 01 01 48 0f af c6 <f3> 48 ab 89 d1 f3 aa 4c 89 c8 c3 90 49 89 f9 40 88 f0 48 89 d1 f3
    RSP: 0018:ffffb43ac0373c80 EFLAGS: 00010a87
    RAX: ffffffffffffffff RBX: ffff8a1518800000 RCX: 0000000000050000
    RDX: 0000000000000000 RSI: 00000000000000ff RDI: 0000000006400000
    RBP: 0000000000140000 R08: 0000000000100000 R09: 0000000006400000
    R10: 0000000000000000 R11: 0000000000000002 R12: 0000000000000000
    R13: 0000000000000028 R14: 0000000000000000 R15: ffff8a153ffd9280
    FS:  0000000000000000(0000) GS:ffff8a153ab00000(0000) knlGS:0000000000000000
    CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
    CR2: 0000000006400000 CR3: 0000000136fca000 CR4: 00000000000006e0
    DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
    DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
    Call Trace:
     sparse_add_section+0x1c9/0x26a
     __add_pages+0xbf/0x150
     add_pages+0x12/0x60
     add_memory_resource+0xc8/0x210
     __add_memory+0x62/0xb0
     acpi_memory_device_add+0x13f/0x300
     acpi_bus_attach+0xf6/0x200
     acpi_bus_scan+0x43/0x90
     acpi_device_hotplug+0x275/0x3d0
     acpi_hotplug_work_fn+0x1a/0x30
     process_one_work+0x1a7/0x370
     worker_thread+0x30/0x380
     kthread+0x112/0x130
     ret_from_fork+0x35/0x40

We should use memmap as it did.

On x86 the impact is limited to x86_32 builds, or x86_64 configurations
that override the default setting for SPARSEMEM_VMEMMAP.

Other memory hotplug archs (arm64, ia64, and ppc) also default to
SPARSEMEM_VMEMMAP=y.

[dan.j.williams@intel.com: changelog update]
{rppt@linux.ibm.com: changelog update]
Link: http://lkml.kernel.org/r/20200219030454.4844-1-bhe@redhat.com
Fixes: ba72b4c8cf60 ("mm/sparsemem: support sub-section hotplug")
Signed-off-by: Wei Yang <richardw.yang@linux.intel.com>
Signed-off-by: Baoquan He <bhe@redhat.com>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Baoquan He <bhe@redhat.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
1 parent 76073c6
Raw File
offwaketime_kern.c
/* Copyright (c) 2016 Facebook
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 */
#include <uapi/linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <uapi/linux/ptrace.h>
#include <uapi/linux/perf_event.h>
#include <linux/version.h>
#include <linux/sched.h>

#define _(P) ({typeof(P) val; bpf_probe_read(&val, sizeof(val), &P); val;})

#define MINBLOCK_US	1

struct key_t {
	char waker[TASK_COMM_LEN];
	char target[TASK_COMM_LEN];
	u32 wret;
	u32 tret;
};

struct bpf_map_def SEC("maps") counts = {
	.type = BPF_MAP_TYPE_HASH,
	.key_size = sizeof(struct key_t),
	.value_size = sizeof(u64),
	.max_entries = 10000,
};

struct bpf_map_def SEC("maps") start = {
	.type = BPF_MAP_TYPE_HASH,
	.key_size = sizeof(u32),
	.value_size = sizeof(u64),
	.max_entries = 10000,
};

struct wokeby_t {
	char name[TASK_COMM_LEN];
	u32 ret;
};

struct bpf_map_def SEC("maps") wokeby = {
	.type = BPF_MAP_TYPE_HASH,
	.key_size = sizeof(u32),
	.value_size = sizeof(struct wokeby_t),
	.max_entries = 10000,
};

struct bpf_map_def SEC("maps") stackmap = {
	.type = BPF_MAP_TYPE_STACK_TRACE,
	.key_size = sizeof(u32),
	.value_size = PERF_MAX_STACK_DEPTH * sizeof(u64),
	.max_entries = 10000,
};

#define STACKID_FLAGS (0 | BPF_F_FAST_STACK_CMP)

SEC("kprobe/try_to_wake_up")
int waker(struct pt_regs *ctx)
{
	struct task_struct *p = (void *) PT_REGS_PARM1(ctx);
	struct wokeby_t woke;
	u32 pid;

	pid = _(p->pid);

	bpf_get_current_comm(&woke.name, sizeof(woke.name));
	woke.ret = bpf_get_stackid(ctx, &stackmap, STACKID_FLAGS);

	bpf_map_update_elem(&wokeby, &pid, &woke, BPF_ANY);
	return 0;
}

static inline int update_counts(void *ctx, u32 pid, u64 delta)
{
	struct wokeby_t *woke;
	u64 zero = 0, *val;
	struct key_t key;

	__builtin_memset(&key.waker, 0, sizeof(key.waker));
	bpf_get_current_comm(&key.target, sizeof(key.target));
	key.tret = bpf_get_stackid(ctx, &stackmap, STACKID_FLAGS);
	key.wret = 0;

	woke = bpf_map_lookup_elem(&wokeby, &pid);
	if (woke) {
		key.wret = woke->ret;
		__builtin_memcpy(&key.waker, woke->name, sizeof(key.waker));
		bpf_map_delete_elem(&wokeby, &pid);
	}

	val = bpf_map_lookup_elem(&counts, &key);
	if (!val) {
		bpf_map_update_elem(&counts, &key, &zero, BPF_NOEXIST);
		val = bpf_map_lookup_elem(&counts, &key);
		if (!val)
			return 0;
	}
	(*val) += delta;
	return 0;
}

#if 1
/* taken from /sys/kernel/debug/tracing/events/sched/sched_switch/format */
struct sched_switch_args {
	unsigned long long pad;
	char prev_comm[16];
	int prev_pid;
	int prev_prio;
	long long prev_state;
	char next_comm[16];
	int next_pid;
	int next_prio;
};
SEC("tracepoint/sched/sched_switch")
int oncpu(struct sched_switch_args *ctx)
{
	/* record previous thread sleep time */
	u32 pid = ctx->prev_pid;
#else
SEC("kprobe/finish_task_switch")
int oncpu(struct pt_regs *ctx)
{
	struct task_struct *p = (void *) PT_REGS_PARM1(ctx);
	/* record previous thread sleep time */
	u32 pid = _(p->pid);
#endif
	u64 delta, ts, *tsp;

	ts = bpf_ktime_get_ns();
	bpf_map_update_elem(&start, &pid, &ts, BPF_ANY);

	/* calculate current thread's delta time */
	pid = bpf_get_current_pid_tgid();
	tsp = bpf_map_lookup_elem(&start, &pid);
	if (!tsp)
		/* missed start or filtered */
		return 0;

	delta = bpf_ktime_get_ns() - *tsp;
	bpf_map_delete_elem(&start, &pid);
	delta = delta / 1000;
	if (delta < MINBLOCK_US)
		return 0;

	return update_counts(ctx, pid, delta);
}
char _license[] SEC("license") = "GPL";
u32 _version SEC("version") = LINUX_VERSION_CODE;
back to top