Revision 8c0bec2151a47906bf779c6715a10ce04453ab77 authored by Jiaying Zhang on 31 August 2011, 15:50:51 UTC, committed by Theodore Ts'o on 31 August 2011, 15:50:51 UTC
The i_mutex lock and flush_completed_IO() added by commit 2581fdc810
in ext4_evict_inode() cause lockdep to complain about potential
deadlocks in several places.  In most/all of these LOCKDEP complaints
it looks like it's a false positive, since many of the potential
circular locking cases can't take place by the time the
ext4_evict_inode() is called; but since at the very least it may mask
real problems, we need to address this.

This change removes the flush_completed_IO() and i_mutex lock in
ext4_evict_inode().  Instead, we take a different approach to resolve
the software lockup that commit 2581fdc810 intends to fix.  Rather
than having the ext4-dio-unwritten thread wait to grab the i_mutex
lock of an inode, we use mutex_trylock() instead, and simply requeue
the work item if we fail to grab the inode's i_mutex lock.

This should speed up work queue processing in general and also
prevent the following deadlock scenario: During a page fault,
shrink_icache_memory is called, which in turn evicts another inode B.
Inode B has some pending io_end work so it calls ext4_ioend_wait()
that waits for inode B's i_ioend_count to become zero.  However, inode
B's ioend work was queued behind some of inode A's ioend work on the
same cpu's ext4-dio-unwritten workqueue.  As the ext4-dio-unwritten
thread on that cpu is processing inode A's ioend work, it tries to
grab inode A's i_mutex lock.  Since the i_mutex lock of inode A is
still held when the page fault happened, we enter a deadlock.

Signed-off-by: Jiaying Zhang <jiayingz@google.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
1 parent fcb8ce5
Raw File
min_addr.c
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/security.h>
#include <linux/sysctl.h>

/*
 * Effective amount of vm to protect from userspace access, taking both
 * DAC and the LSM into account; recomputed by update_mmap_min_addr().
 */
unsigned long mmap_min_addr;
/*
 * Amount of vm to protect from userspace using CAP_SYS_RAWIO (DAC);
 * initialized to CONFIG_DEFAULT_MMAP_MIN_ADDR.
 */
unsigned long dac_mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR;
/*
 * The amount of vm to protect from userspace using the LSM has no variable
 * of its own: it is the constant CONFIG_LSM_MMAP_MIN_ADDR.
 */

/*
 * Recompute mmap_min_addr as the larger of dac_mmap_min_addr and, when it
 * is configured, CONFIG_LSM_MMAP_MIN_ADDR.  Without the LSM option the DAC
 * value is used as is.
 */
static void update_mmap_min_addr(void)
{
	unsigned long limit = dac_mmap_min_addr;

#ifdef CONFIG_LSM_MMAP_MIN_ADDR
	/* Never let the effective limit drop below the LSM's minimum. */
	if (limit <= CONFIG_LSM_MMAP_MIN_ADDR)
		limit = CONFIG_LSM_MMAP_MIN_ADDR;
#endif

	mmap_min_addr = limit;
}

/*
 * sysctl handler for the DAC mmap minimum address.
 *
 * Writes are refused with -EPERM unless the caller has CAP_SYS_RAWIO.
 * Otherwise the request is passed to proc_doulongvec_minmax(), which sets
 * dac_mmap_min_addr to the new value, and update_mmap_min_addr() is then
 * called so non MAP_FIXED hints get rounded properly.
 *
 * Returns the result of proc_doulongvec_minmax(), or -EPERM.
 */
int mmap_min_addr_handler(struct ctl_table *table, int write,
			  void __user *buffer, size_t *lenp, loff_t *ppos)
{
	int rc = -EPERM;

	/* Reads are always allowed; the capability is only checked on write. */
	if (!write || capable(CAP_SYS_RAWIO)) {
		rc = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
		update_mmap_min_addr();
	}

	return rc;
}

/*
 * Initcall that computes the initial value of mmap_min_addr by calling
 * update_mmap_min_addr().  Always returns 0.  Registered below with
 * pure_initcall().
 */
static int __init init_mmap_min_addr(void)
{
	update_mmap_min_addr();

	return 0;
}
pure_initcall(init_mmap_min_addr);
back to top