Revision 595d153dd1022392083ac93a1550382cbee127e0 authored by Michael Ellerman on 26 May 2020, 06:18:08 UTC, committed by Michael Ellerman on 26 May 2020, 07:32:37 UTC
Commit 702f09805222 ("powerpc/64s/exception: Remove lite interrupt
return") changed the interrupt return path to not restore non-volatile
registers by default, and explicitly restore them in paths where it is
required.

But it missed that the facility unavailable exception can sometimes
modify user registers, ie. when it does emulation of move from DSCR.

This is seen as a failure of the dscr_sysfs_thread_test:
  test: dscr_sysfs_thread_test
  [cpu 0] User DSCR should be 1 but is 0
  failure: dscr_sysfs_thread_test

So restore non-volatile GPRs after facility unavailable exceptions.

Currently the hypervisor facility unavailable exception is also wired
up to call facility_unavailable_exception().

In practice we should never take a hypervisor facility unavailable
exception for the DSCR. On older bare metal systems we set HFSCR_DSCR
unconditionally in __init_HFSCR, or on newer systems it should be
enabled via the "data-stream-control-register" device tree CPU
feature.

Even if it's not, since commit f3c99f97a3cd ("KVM: PPC: Book3S HV:
Don't access HFSCR, LPIDR or LPCR when running nested"), the KVM code
has unconditionally set HFSCR_DSCR when running guests.

So we should only get a hypervisor facility unavailable for the DSCR
if skiboot has disabled the "data-stream-control-register" feature,
and we are somehow in guest context but not via KVM.

Given all that, it should be unnecessary to add a restore of
non-volatile GPRs after the hypervisor facility exception, because we
never expect to hit that path. But equally we may as well add the
restore, because we never expect to hit that path, and if we ever did,
at least we would correctly restore the registers to their post
emulation state.

In future we can split the non-HV and HV facility unavailable handling
so that there is no emulation in the HV handler, and then remove the
restore for the HV case.

Fixes: 702f09805222 ("powerpc/64s/exception: Remove lite interrupt return")
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200526061808.2472279-1-mpe@ellerman.id.au
1 parent 8659a0e
Raw File
stackleak.c
// SPDX-License-Identifier: GPL-2.0
/*
 * This code fills the used part of the kernel stack with a poison value
 * before returning to userspace. It's part of the STACKLEAK feature
 * ported from grsecurity/PaX.
 *
 * Author: Alexander Popov <alex.popov@linux.com>
 *
 * STACKLEAK reduces the information which kernel stack leak bugs can
 * reveal and blocks some uninitialized stack variable attacks.
 */

#include <linux/stackleak.h>
#include <linux/kprobes.h>

#ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE
#include <linux/jump_label.h>
#include <linux/sysctl.h>

static DEFINE_STATIC_KEY_FALSE(stack_erasing_bypass);

int stack_erasing_sysctl(struct ctl_table *table, int write,
			void __user *buffer, size_t *lenp, loff_t *ppos)
{
	int ret = 0;
	int state = !static_branch_unlikely(&stack_erasing_bypass);
	int prev_state = state;

	table->data = &state;
	table->maxlen = sizeof(int);
	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
	state = !!state;
	if (ret || !write || state == prev_state)
		return ret;

	if (state)
		static_branch_disable(&stack_erasing_bypass);
	else
		static_branch_enable(&stack_erasing_bypass);

	pr_warn("stackleak: kernel stack erasing is %s\n",
					state ? "enabled" : "disabled");
	return ret;
}

#define skip_erasing()	static_branch_unlikely(&stack_erasing_bypass)
#else
#define skip_erasing()	false
#endif /* CONFIG_STACKLEAK_RUNTIME_DISABLE */

asmlinkage void notrace stackleak_erase(void)
{
	/* It would be nice not to have 'kstack_ptr' and 'boundary' on stack */
	unsigned long kstack_ptr = current->lowest_stack;
	unsigned long boundary = (unsigned long)end_of_stack(current);
	unsigned int poison_count = 0;
	const unsigned int depth = STACKLEAK_SEARCH_DEPTH / sizeof(unsigned long);

	if (skip_erasing())
		return;

	/* Check that 'lowest_stack' value is sane */
	if (unlikely(kstack_ptr - boundary >= THREAD_SIZE))
		kstack_ptr = boundary;

	/* Search for the poison value in the kernel stack */
	while (kstack_ptr > boundary && poison_count <= depth) {
		if (*(unsigned long *)kstack_ptr == STACKLEAK_POISON)
			poison_count++;
		else
			poison_count = 0;

		kstack_ptr -= sizeof(unsigned long);
	}

	/*
	 * One 'long int' at the bottom of the thread stack is reserved and
	 * should not be poisoned (see CONFIG_SCHED_STACK_END_CHECK=y).
	 */
	if (kstack_ptr == boundary)
		kstack_ptr += sizeof(unsigned long);

#ifdef CONFIG_STACKLEAK_METRICS
	current->prev_lowest_stack = kstack_ptr;
#endif

	/*
	 * Now write the poison value to the kernel stack. Start from
	 * 'kstack_ptr' and move up till the new 'boundary'. We assume that
	 * the stack pointer doesn't change when we write poison.
	 */
	if (on_thread_stack())
		boundary = current_stack_pointer;
	else
		boundary = current_top_of_stack();

	while (kstack_ptr < boundary) {
		*(unsigned long *)kstack_ptr = STACKLEAK_POISON;
		kstack_ptr += sizeof(unsigned long);
	}

	/* Reset the 'lowest_stack' value for the next syscall */
	current->lowest_stack = current_top_of_stack() - THREAD_SIZE/64;
}
NOKPROBE_SYMBOL(stackleak_erase);

void __used notrace stackleak_track_stack(void)
{
	/*
	 * N.B. stackleak_erase() fills the kernel stack with the poison value,
	 * which has the register width. That code assumes that the value
	 * of 'lowest_stack' is aligned on the register width boundary.
	 *
	 * That is true for x86 and x86_64 because of the kernel stack
	 * alignment on these platforms (for details, see 'cc_stack_align' in
	 * arch/x86/Makefile). Take care of that when you port STACKLEAK to
	 * new platforms.
	 */
	unsigned long sp = (unsigned long)&sp;

	/*
	 * Having CONFIG_STACKLEAK_TRACK_MIN_SIZE larger than
	 * STACKLEAK_SEARCH_DEPTH makes the poison search in
	 * stackleak_erase() unreliable. Let's prevent that.
	 */
	BUILD_BUG_ON(CONFIG_STACKLEAK_TRACK_MIN_SIZE > STACKLEAK_SEARCH_DEPTH);

	if (sp < current->lowest_stack &&
	    sp >= (unsigned long)task_stack_page(current) +
						sizeof(unsigned long)) {
		current->lowest_stack = sp;
	}
}
EXPORT_SYMBOL(stackleak_track_stack);
back to top