Revision - e4eed03 - thp: avoid atomic64_read in pmd_read_atomic for 32bit PAE

Revision e4eed03fd06578571c01d4f1478c874bb432c815 authored by Andrea Arcangeli on 20 June 2012, 19:52:57 UTC, committed by Linus Torvalds on 20 June 2012, 21:39:35 UTC

thp: avoid atomic64_read in pmd_read_atomic for 32bit PAE

In the x86 32bit PAE CONFIG_TRANSPARENT_HUGEPAGE=y case while holding the
mmap_sem for reading, cmpxchg8b cannot be used to read pmd contents under
Xen.

So instead of dealing only with "consistent" pmdvals in
pmd_none_or_trans_huge_or_clear_bad() (which would be conceptually
simpler) we let pmd_none_or_trans_huge_or_clear_bad() deal with pmdvals
where the low 32bit and high 32bit could be inconsistent (to avoid having
to use cmpxchg8b).

The only guarantee we get from pmd_read_atomic is that if the low part of
the pmd was found null, the high part will be null too (so the pmd will be
considered unstable).  And if the low part of the pmd is found "stable"
later, then it means the whole pmd was read atomically (because after a
pmd is stable, neither MADV_DONTNEED nor page faults can alter it anymore,
and we read the high part after the low part).

In the 32bit PAE x86 case, it is enough to read the low part of the pmdval
atomically to declare the pmd as "stable" and that's true for THP and no
THP, furthermore in the THP case we also have a barrier() that will
prevent any inconsistent pmdvals to be cached by a later re-read of the
*pmd.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: Jonathan Nieder <jrnieder@gmail.com>
Cc: Ulrich Obergfell <uobergfe@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Hugh Dickins <hughd@google.com>
Cc: Larry Woodman <lwoodman@redhat.com>
Cc: Petr Matousek <pmatouse@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Jan Beulich <jbeulich@suse.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@gmail.com>
Tested-by: Andrew Jones <drjones@redhat.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

1 parent abca7c4

Files
Changes

Permalinks

blk-softirq.c

/*
 * Functions related to softirq rq completions
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/interrupt.h>
#include <linux/cpu.h>
#include <linux/sched.h>

#include "blk.h"

static DEFINE_PER_CPU(struct list_head, blk_cpu_done);

/*
 * Softirq action handler - move entries to local list and loop over them
 * while passing them to the queue registered handler.
 */
static void blk_done_softirq(struct softirq_action *h)
{
	struct list_head *cpu_list, local_list;

	local_irq_disable();
	cpu_list = &__get_cpu_var(blk_cpu_done);
	list_replace_init(cpu_list, &local_list);
	local_irq_enable();

	while (!list_empty(&local_list)) {
		struct request *rq;

		rq = list_entry(local_list.next, struct request, csd.list);
		list_del_init(&rq->csd.list);
		rq->q->softirq_done_fn(rq);
	}
}

#if defined(CONFIG_SMP) && defined(CONFIG_USE_GENERIC_SMP_HELPERS)
static void trigger_softirq(void *data)
{
	struct request *rq = data;
	unsigned long flags;
	struct list_head *list;

	local_irq_save(flags);
	list = &__get_cpu_var(blk_cpu_done);
	list_add_tail(&rq->csd.list, list);

	if (list->next == &rq->csd.list)
		raise_softirq_irqoff(BLOCK_SOFTIRQ);

	local_irq_restore(flags);
}

/*
 * Setup and invoke a run of 'trigger_softirq' on the given cpu.
 */
static int raise_blk_irq(int cpu, struct request *rq)
{
	if (cpu_online(cpu)) {
		struct call_single_data *data = &rq->csd;

		data->func = trigger_softirq;
		data->info = rq;
		data->flags = 0;

		__smp_call_function_single(cpu, data, 0);
		return 0;
	}

	return 1;
}
#else /* CONFIG_SMP && CONFIG_USE_GENERIC_SMP_HELPERS */
static int raise_blk_irq(int cpu, struct request *rq)
{
	return 1;
}
#endif

static int __cpuinit blk_cpu_notify(struct notifier_block *self,
				    unsigned long action, void *hcpu)
{
	/*
	 * If a CPU goes away, splice its entries to the current CPU
	 * and trigger a run of the softirq
	 */
	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
		int cpu = (unsigned long) hcpu;

		local_irq_disable();
		list_splice_init(&per_cpu(blk_cpu_done, cpu),
				 &__get_cpu_var(blk_cpu_done));
		raise_softirq_irqoff(BLOCK_SOFTIRQ);
		local_irq_enable();
	}

	return NOTIFY_OK;
}

static struct notifier_block __cpuinitdata blk_cpu_notifier = {
	.notifier_call	= blk_cpu_notify,
};

void __blk_complete_request(struct request *req)
{
	int ccpu, cpu;
	struct request_queue *q = req->q;
	unsigned long flags;
	bool shared = false;

	BUG_ON(!q->softirq_done_fn);

	local_irq_save(flags);
	cpu = smp_processor_id();

	/*
	 * Select completion CPU
	 */
	if (req->cpu != -1) {
		ccpu = req->cpu;
		if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags))
			shared = cpus_share_cache(cpu, ccpu);
	} else
		ccpu = cpu;

	/*
	 * If current CPU and requested CPU share a cache, run the softirq on
	 * the current CPU. One might concern this is just like
	 * QUEUE_FLAG_SAME_FORCE, but actually not. blk_complete_request() is
	 * running in interrupt handler, and currently I/O controller doesn't
	 * support multiple interrupts, so current CPU is unique actually. This
	 * avoids IPI sending from current CPU to the first CPU of a group.
	 */
	if (ccpu == cpu || shared) {
		struct list_head *list;
do_local:
		list = &__get_cpu_var(blk_cpu_done);
		list_add_tail(&req->csd.list, list);

		/*
		 * if the list only contains our just added request,
		 * signal a raise of the softirq. If there are already
		 * entries there, someone already raised the irq but it
		 * hasn't run yet.
		 */
		if (list->next == &req->csd.list)
			raise_softirq_irqoff(BLOCK_SOFTIRQ);
	} else if (raise_blk_irq(ccpu, req))
		goto do_local;

	local_irq_restore(flags);
}

/**
 * blk_complete_request - end I/O on a request
 * @req:      the request being processed
 *
 * Description:
 *     Ends all I/O on a request. It does not handle partial completions,
 *     unless the driver actually implements this in its completion callback
 *     through requeueing. The actual completion happens out-of-order,
 *     through a softirq handler. The user must have registered a completion
 *     callback through blk_queue_softirq_done().
 **/
void blk_complete_request(struct request *req)
{
	if (unlikely(blk_should_fake_timeout(req->q)))
		return;
	if (!blk_mark_rq_complete(req))
		__blk_complete_request(req);
}
EXPORT_SYMBOL(blk_complete_request);

static __init int blk_softirq_init(void)
{
	int i;

	for_each_possible_cpu(i)
		INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i));

	open_softirq(BLOCK_SOFTIRQ, blk_done_softirq);
	register_hotcpu_notifier(&blk_cpu_notifier);
	return 0;
}
subsys_initcall(blk_softirq_init);

Showing with 0 additions and 0 deletions (0 / 0 diffs computed)

Computing file changes ...