Revision 7906d00cd1f687268f0a3599442d113767795ae6 authored by Andrea Arcangeli on 28 July 2008, 22:46:26 UTC, committed by Linus Torvalds on 28 July 2008, 23:30:21 UTC
mm_take_all_locks holds off reclaim from an entire mm_struct. This allows mmu notifiers to register into the mm at any time with the guarantee that no mmu operation is in progress on the mm. This operation locks against the VM for all pte/vma/mm related operations that could ever happen on a certain mm. This includes vmtruncate, try_to_unmap, and all page faults. The caller must take the mmap_sem in write mode before calling mm_take_all_locks(). The caller isn't allowed to release the mmap_sem until mm_drop_all_locks() returns. mmap_sem in write mode is required in order to block all operations that could modify pagetables and free pages without need of altering the vma layout (for example populate_range() with nonlinear vmas). It's also needed in write mode to avoid new anon_vmas to be associated with existing vmas. A single task can't take more than one mm_take_all_locks() in a row or it would deadlock. mm_take_all_locks() and mm_drop_all_locks are expensive operations that may have to take thousand of locks. mm_take_all_locks() can fail if it's interrupted by signals. When mmu_notifier_register returns, we must be sure that the driver is notified if some task is in the middle of a vmtruncate for the 'mm' where the mmu notifier was registered (mmu_notifier_invalidate_range_start/end is run around the vmtruncation but mmu_notifier_register can run after mmu_notifier_invalidate_range_start and before mmu_notifier_invalidate_range_end). Same problem for rmap paths. And we've to remove page pinning to avoid replicating the tlb_gather logic inside KVM (and GRU doesn't work well with page pinning regardless of needing tlb_gather), so without mm_take_all_locks when vmtruncate frees the page, kvm would have no way to notice that it mapped into sptes a page that is going into the freelist without a chance of any further mmu_notifier notification. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Andrea Arcangeli <andrea@qumranet.com> Acked-by: Linus Torvalds <torvalds@linux-foundation.org> Cc: Christoph Lameter <cl@linux-foundation.org> Cc: Jack Steiner <steiner@sgi.com> Cc: Robin Holt <holt@sgi.com> Cc: Nick Piggin <npiggin@suse.de> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Kanoj Sarcar <kanojsarcar@yahoo.com> Cc: Roland Dreier <rdreier@cisco.com> Cc: Steve Wise <swise@opengridcomputing.com> Cc: Avi Kivity <avi@qumranet.com> Cc: Hugh Dickins <hugh@veritas.com> Cc: Rusty Russell <rusty@rustcorp.com.au> Cc: Anthony Liguori <aliguori@us.ibm.com> Cc: Chris Wright <chrisw@redhat.com> Cc: Marcelo Tosatti <marcelo@kvack.org> Cc: Eric Dumazet <dada1@cosmosbay.com> Cc: "Paul E. McKenney" <paulmck@us.ibm.com> Cc: Izik Eidus <izike@qumranet.com> Cc: Anthony Liguori <aliguori@us.ibm.com> Cc: Rik van Riel <riel@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
1 parent 6beeac7
fifo.c
/*
* linux/fs/fifo.c
*
* written by Paul H. Hargrove
*
* Fixes:
* 10-06-1999, AV: fixed OOM handling in fifo_open(), moved
* initialization there, switched to external
* allocation of pipe_inode_info.
*/
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/sched.h>
#include <linux/pipe_fs_i.h>
static void wait_for_partner(struct inode* inode, unsigned int *cnt)
{
int cur = *cnt;
while (cur == *cnt) {
pipe_wait(inode->i_pipe);
if (signal_pending(current))
break;
}
}
static void wake_up_partner(struct inode* inode)
{
wake_up_interruptible(&inode->i_pipe->wait);
}
static int fifo_open(struct inode *inode, struct file *filp)
{
struct pipe_inode_info *pipe;
int ret;
mutex_lock(&inode->i_mutex);
pipe = inode->i_pipe;
if (!pipe) {
ret = -ENOMEM;
pipe = alloc_pipe_info(inode);
if (!pipe)
goto err_nocleanup;
inode->i_pipe = pipe;
}
filp->f_version = 0;
/* We can only do regular read/write on fifos */
filp->f_mode &= (FMODE_READ | FMODE_WRITE);
switch (filp->f_mode) {
case 1:
/*
* O_RDONLY
* POSIX.1 says that O_NONBLOCK means return with the FIFO
* opened, even when there is no process writing the FIFO.
*/
filp->f_op = &read_pipefifo_fops;
pipe->r_counter++;
if (pipe->readers++ == 0)
wake_up_partner(inode);
if (!pipe->writers) {
if ((filp->f_flags & O_NONBLOCK)) {
/* suppress POLLHUP until we have
* seen a writer */
filp->f_version = pipe->w_counter;
} else
{
wait_for_partner(inode, &pipe->w_counter);
if(signal_pending(current))
goto err_rd;
}
}
break;
case 2:
/*
* O_WRONLY
* POSIX.1 says that O_NONBLOCK means return -1 with
* errno=ENXIO when there is no process reading the FIFO.
*/
ret = -ENXIO;
if ((filp->f_flags & O_NONBLOCK) && !pipe->readers)
goto err;
filp->f_op = &write_pipefifo_fops;
pipe->w_counter++;
if (!pipe->writers++)
wake_up_partner(inode);
if (!pipe->readers) {
wait_for_partner(inode, &pipe->r_counter);
if (signal_pending(current))
goto err_wr;
}
break;
case 3:
/*
* O_RDWR
* POSIX.1 leaves this case "undefined" when O_NONBLOCK is set.
* This implementation will NEVER block on a O_RDWR open, since
* the process can at least talk to itself.
*/
filp->f_op = &rdwr_pipefifo_fops;
pipe->readers++;
pipe->writers++;
pipe->r_counter++;
pipe->w_counter++;
if (pipe->readers == 1 || pipe->writers == 1)
wake_up_partner(inode);
break;
default:
ret = -EINVAL;
goto err;
}
/* Ok! */
mutex_unlock(&inode->i_mutex);
return 0;
err_rd:
if (!--pipe->readers)
wake_up_interruptible(&pipe->wait);
ret = -ERESTARTSYS;
goto err;
err_wr:
if (!--pipe->writers)
wake_up_interruptible(&pipe->wait);
ret = -ERESTARTSYS;
goto err;
err:
if (!pipe->readers && !pipe->writers)
free_pipe_info(inode);
err_nocleanup:
mutex_unlock(&inode->i_mutex);
return ret;
}
/*
* Dummy default file-operations: the only thing this does
* is contain the open that then fills in the correct operations
* depending on the access mode of the file...
*/
const struct file_operations def_fifo_fops = {
.open = fifo_open, /* will set read_ or write_pipefifo_fops */
};
Computing file changes ...