Revision - 251b97f - mm: dirty page accounting vs VM_MIXEDMAP

Revision 251b97f552b1ad414cc5a9ccc8e4e94503edd5fc authored by Peter Zijlstra on 04 July 2008, 16:59:24 UTC, committed by Linus Torvalds on 04 July 2008, 17:40:04 UTC

mm: dirty page accounting vs VM_MIXEDMAP

Dirty page accounting accurately measures the amound of dirty pages in
writable shared mappings by mapping the pages RO (as indicated by
vma_wants_writenotify).  We then trap on first write and call
set_page_dirty() on the page, after which we map the page RW and
continue execution.

When we launder dirty pages, we call clear_page_dirty_for_io() which
clears both the dirty flag, and maps the page RO again before we start
writeout so that the story can repeat itself.

vma_wants_writenotify() excludes VM_PFNMAP on the basis that we cannot
do the regular dirty page stuff on raw PFNs and the memory isn't going
anywhere anyway.

The recently introduced VM_MIXEDMAP mixes both !pfn_valid() and
pfn_valid() pages in a single mapping.

We can't do dirty page accounting on !pfn_valid() pages as stated
above, and mapping them RO causes them to be COW'ed on write, which
breaks VM_SHARED semantics.

Excluding VM_MIXEDMAP in vma_wants_writenotify() would mean we don't do
the regular dirty page accounting for the pfn_valid() pages, which
would bring back all the head-aches from inaccurate dirty page
accounting.

So instead, we let the !pfn_valid() pages get mapped RO, but fix them
up unconditionally in the fault path.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Acked-by: Hugh Dickins <hugh@veritas.com>
Cc: "Jared Hulbert" <jaredeh@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

1 parent cde5353

Files
Changes

Permalinks

kernel_lock.c

/*
 * lib/kernel_lock.c
 *
 * This is the traditional BKL - big kernel lock. Largely
 * relegated to obsolescence, but used by various less
 * important (or lazy) subsystems.
 */
#include <linux/smp_lock.h>
#include <linux/module.h>
#include <linux/kallsyms.h>
#include <linux/semaphore.h>

/*
 * The 'big kernel lock'
 *
 * This spinlock is taken and released recursively by lock_kernel()
 * and unlock_kernel().  It is transparently dropped and reacquired
 * over schedule().  It is used to protect legacy code that hasn't
 * been migrated to a proper locking design yet.
 *
 * Don't use in new code.
 */
static  __cacheline_aligned_in_smp DEFINE_SPINLOCK(kernel_flag);


/*
 * Acquire/release the underlying lock from the scheduler.
 *
 * This is called with preemption disabled, and should
 * return an error value if it cannot get the lock and
 * TIF_NEED_RESCHED gets set.
 *
 * If it successfully gets the lock, it should increment
 * the preemption count like any spinlock does.
 *
 * (This works on UP too - _raw_spin_trylock will never
 * return false in that case)
 */
int __lockfunc __reacquire_kernel_lock(void)
{
	while (!_raw_spin_trylock(&kernel_flag)) {
		if (test_thread_flag(TIF_NEED_RESCHED))
			return -EAGAIN;
		cpu_relax();
	}
	preempt_disable();
	return 0;
}

void __lockfunc __release_kernel_lock(void)
{
	_raw_spin_unlock(&kernel_flag);
	preempt_enable_no_resched();
}

/*
 * These are the BKL spinlocks - we try to be polite about preemption.
 * If SMP is not on (ie UP preemption), this all goes away because the
 * _raw_spin_trylock() will always succeed.
 */
#ifdef CONFIG_PREEMPT
static inline void __lock_kernel(void)
{
	preempt_disable();
	if (unlikely(!_raw_spin_trylock(&kernel_flag))) {
		/*
		 * If preemption was disabled even before this
		 * was called, there's nothing we can be polite
		 * about - just spin.
		 */
		if (preempt_count() > 1) {
			_raw_spin_lock(&kernel_flag);
			return;
		}

		/*
		 * Otherwise, let's wait for the kernel lock
		 * with preemption enabled..
		 */
		do {
			preempt_enable();
			while (spin_is_locked(&kernel_flag))
				cpu_relax();
			preempt_disable();
		} while (!_raw_spin_trylock(&kernel_flag));
	}
}

#else

/*
 * Non-preemption case - just get the spinlock
 */
static inline void __lock_kernel(void)
{
	_raw_spin_lock(&kernel_flag);
}
#endif

static inline void __unlock_kernel(void)
{
	/*
	 * the BKL is not covered by lockdep, so we open-code the
	 * unlocking sequence (and thus avoid the dep-chain ops):
	 */
	_raw_spin_unlock(&kernel_flag);
	preempt_enable();
}

/*
 * Getting the big kernel lock.
 *
 * This cannot happen asynchronously, so we only need to
 * worry about other CPU's.
 */
void __lockfunc lock_kernel(void)
{
	int depth = current->lock_depth+1;
	if (likely(!depth))
		__lock_kernel();
	current->lock_depth = depth;
}

void __lockfunc unlock_kernel(void)
{
	BUG_ON(current->lock_depth < 0);
	if (likely(--current->lock_depth < 0))
		__unlock_kernel();
}

EXPORT_SYMBOL(lock_kernel);
EXPORT_SYMBOL(unlock_kernel);

Showing with 0 additions and 0 deletions (0 / 0 diffs computed)

Computing file changes ...