Revision 7906d00cd1f687268f0a3599442d113767795ae6 authored by Andrea Arcangeli on 28 July 2008, 22:46:26 UTC, committed by Linus Torvalds on 28 July 2008, 23:30:21 UTC
mm_take_all_locks holds off reclaim from an entire mm_struct. This allows mmu notifiers to register into the mm at any time with the guarantee that no mmu operation is in progress on the mm. This operation locks against the VM for all pte/vma/mm related operations that could ever happen on a certain mm. This includes vmtruncate, try_to_unmap, and all page faults. The caller must take the mmap_sem in write mode before calling mm_take_all_locks(). The caller isn't allowed to release the mmap_sem until mm_drop_all_locks() returns. mmap_sem in write mode is required in order to block all operations that could modify pagetables and free pages without need of altering the vma layout (for example populate_range() with nonlinear vmas). It's also needed in write mode to avoid new anon_vmas to be associated with existing vmas. A single task can't take more than one mm_take_all_locks() in a row or it would deadlock. mm_take_all_locks() and mm_drop_all_locks are expensive operations that may have to take thousand of locks. mm_take_all_locks() can fail if it's interrupted by signals. When mmu_notifier_register returns, we must be sure that the driver is notified if some task is in the middle of a vmtruncate for the 'mm' where the mmu notifier was registered (mmu_notifier_invalidate_range_start/end is run around the vmtruncation but mmu_notifier_register can run after mmu_notifier_invalidate_range_start and before mmu_notifier_invalidate_range_end). Same problem for rmap paths. And we've to remove page pinning to avoid replicating the tlb_gather logic inside KVM (and GRU doesn't work well with page pinning regardless of needing tlb_gather), so without mm_take_all_locks when vmtruncate frees the page, kvm would have no way to notice that it mapped into sptes a page that is going into the freelist without a chance of any further mmu_notifier notification. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Andrea Arcangeli <andrea@qumranet.com> Acked-by: Linus Torvalds <torvalds@linux-foundation.org> Cc: Christoph Lameter <cl@linux-foundation.org> Cc: Jack Steiner <steiner@sgi.com> Cc: Robin Holt <holt@sgi.com> Cc: Nick Piggin <npiggin@suse.de> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Kanoj Sarcar <kanojsarcar@yahoo.com> Cc: Roland Dreier <rdreier@cisco.com> Cc: Steve Wise <swise@opengridcomputing.com> Cc: Avi Kivity <avi@qumranet.com> Cc: Hugh Dickins <hugh@veritas.com> Cc: Rusty Russell <rusty@rustcorp.com.au> Cc: Anthony Liguori <aliguori@us.ibm.com> Cc: Chris Wright <chrisw@redhat.com> Cc: Marcelo Tosatti <marcelo@kvack.org> Cc: Eric Dumazet <dada1@cosmosbay.com> Cc: "Paul E. McKenney" <paulmck@us.ibm.com> Cc: Izik Eidus <izike@qumranet.com> Cc: Anthony Liguori <aliguori@us.ibm.com> Cc: Rik van Riel <riel@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
1 parent 6beeac7
stat.c
/*
* linux/fs/stat.c
*
* Copyright (C) 1991, 1992 Linus Torvalds
*/
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/highuid.h>
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/pagemap.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
void generic_fillattr(struct inode *inode, struct kstat *stat)
{
stat->dev = inode->i_sb->s_dev;
stat->ino = inode->i_ino;
stat->mode = inode->i_mode;
stat->nlink = inode->i_nlink;
stat->uid = inode->i_uid;
stat->gid = inode->i_gid;
stat->rdev = inode->i_rdev;
stat->atime = inode->i_atime;
stat->mtime = inode->i_mtime;
stat->ctime = inode->i_ctime;
stat->size = i_size_read(inode);
stat->blocks = inode->i_blocks;
stat->blksize = (1 << inode->i_blkbits);
}
EXPORT_SYMBOL(generic_fillattr);
int vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
{
struct inode *inode = dentry->d_inode;
int retval;
retval = security_inode_getattr(mnt, dentry);
if (retval)
return retval;
if (inode->i_op->getattr)
return inode->i_op->getattr(mnt, dentry, stat);
generic_fillattr(inode, stat);
return 0;
}
EXPORT_SYMBOL(vfs_getattr);
int vfs_stat_fd(int dfd, char __user *name, struct kstat *stat)
{
struct path path;
int error;
error = user_path_at(dfd, name, LOOKUP_FOLLOW, &path);
if (!error) {
error = vfs_getattr(path.mnt, path.dentry, stat);
path_put(&path);
}
return error;
}
int vfs_stat(char __user *name, struct kstat *stat)
{
return vfs_stat_fd(AT_FDCWD, name, stat);
}
EXPORT_SYMBOL(vfs_stat);
int vfs_lstat_fd(int dfd, char __user *name, struct kstat *stat)
{
struct path path;
int error;
error = user_path_at(dfd, name, 0, &path);
if (!error) {
error = vfs_getattr(path.mnt, path.dentry, stat);
path_put(&path);
}
return error;
}
int vfs_lstat(char __user *name, struct kstat *stat)
{
return vfs_lstat_fd(AT_FDCWD, name, stat);
}
EXPORT_SYMBOL(vfs_lstat);
int vfs_fstat(unsigned int fd, struct kstat *stat)
{
struct file *f = fget(fd);
int error = -EBADF;
if (f) {
error = vfs_getattr(f->f_path.mnt, f->f_path.dentry, stat);
fput(f);
}
return error;
}
EXPORT_SYMBOL(vfs_fstat);
#ifdef __ARCH_WANT_OLD_STAT
/*
* For backward compatibility? Maybe this should be moved
* into arch/i386 instead?
*/
static int cp_old_stat(struct kstat *stat, struct __old_kernel_stat __user * statbuf)
{
static int warncount = 5;
struct __old_kernel_stat tmp;
if (warncount > 0) {
warncount--;
printk(KERN_WARNING "VFS: Warning: %s using old stat() call. Recompile your binary.\n",
current->comm);
} else if (warncount < 0) {
/* it's laughable, but... */
warncount = 0;
}
memset(&tmp, 0, sizeof(struct __old_kernel_stat));
tmp.st_dev = old_encode_dev(stat->dev);
tmp.st_ino = stat->ino;
if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino)
return -EOVERFLOW;
tmp.st_mode = stat->mode;
tmp.st_nlink = stat->nlink;
if (tmp.st_nlink != stat->nlink)
return -EOVERFLOW;
SET_UID(tmp.st_uid, stat->uid);
SET_GID(tmp.st_gid, stat->gid);
tmp.st_rdev = old_encode_dev(stat->rdev);
#if BITS_PER_LONG == 32
if (stat->size > MAX_NON_LFS)
return -EOVERFLOW;
#endif
tmp.st_size = stat->size;
tmp.st_atime = stat->atime.tv_sec;
tmp.st_mtime = stat->mtime.tv_sec;
tmp.st_ctime = stat->ctime.tv_sec;
return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? -EFAULT : 0;
}
asmlinkage long sys_stat(char __user * filename, struct __old_kernel_stat __user * statbuf)
{
struct kstat stat;
int error = vfs_stat_fd(AT_FDCWD, filename, &stat);
if (!error)
error = cp_old_stat(&stat, statbuf);
return error;
}
asmlinkage long sys_lstat(char __user * filename, struct __old_kernel_stat __user * statbuf)
{
struct kstat stat;
int error = vfs_lstat_fd(AT_FDCWD, filename, &stat);
if (!error)
error = cp_old_stat(&stat, statbuf);
return error;
}
asmlinkage long sys_fstat(unsigned int fd, struct __old_kernel_stat __user * statbuf)
{
struct kstat stat;
int error = vfs_fstat(fd, &stat);
if (!error)
error = cp_old_stat(&stat, statbuf);
return error;
}
#endif /* __ARCH_WANT_OLD_STAT */
static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf)
{
struct stat tmp;
#if BITS_PER_LONG == 32
if (!old_valid_dev(stat->dev) || !old_valid_dev(stat->rdev))
return -EOVERFLOW;
#else
if (!new_valid_dev(stat->dev) || !new_valid_dev(stat->rdev))
return -EOVERFLOW;
#endif
memset(&tmp, 0, sizeof(tmp));
#if BITS_PER_LONG == 32
tmp.st_dev = old_encode_dev(stat->dev);
#else
tmp.st_dev = new_encode_dev(stat->dev);
#endif
tmp.st_ino = stat->ino;
if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino)
return -EOVERFLOW;
tmp.st_mode = stat->mode;
tmp.st_nlink = stat->nlink;
if (tmp.st_nlink != stat->nlink)
return -EOVERFLOW;
SET_UID(tmp.st_uid, stat->uid);
SET_GID(tmp.st_gid, stat->gid);
#if BITS_PER_LONG == 32
tmp.st_rdev = old_encode_dev(stat->rdev);
#else
tmp.st_rdev = new_encode_dev(stat->rdev);
#endif
#if BITS_PER_LONG == 32
if (stat->size > MAX_NON_LFS)
return -EOVERFLOW;
#endif
tmp.st_size = stat->size;
tmp.st_atime = stat->atime.tv_sec;
tmp.st_mtime = stat->mtime.tv_sec;
tmp.st_ctime = stat->ctime.tv_sec;
#ifdef STAT_HAVE_NSEC
tmp.st_atime_nsec = stat->atime.tv_nsec;
tmp.st_mtime_nsec = stat->mtime.tv_nsec;
tmp.st_ctime_nsec = stat->ctime.tv_nsec;
#endif
tmp.st_blocks = stat->blocks;
tmp.st_blksize = stat->blksize;
return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? -EFAULT : 0;
}
asmlinkage long sys_newstat(char __user *filename, struct stat __user *statbuf)
{
struct kstat stat;
int error = vfs_stat_fd(AT_FDCWD, filename, &stat);
if (!error)
error = cp_new_stat(&stat, statbuf);
return error;
}
asmlinkage long sys_newlstat(char __user *filename, struct stat __user *statbuf)
{
struct kstat stat;
int error = vfs_lstat_fd(AT_FDCWD, filename, &stat);
if (!error)
error = cp_new_stat(&stat, statbuf);
return error;
}
#if !defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_SYS_NEWFSTATAT)
asmlinkage long sys_newfstatat(int dfd, char __user *filename,
struct stat __user *statbuf, int flag)
{
struct kstat stat;
int error = -EINVAL;
if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0)
goto out;
if (flag & AT_SYMLINK_NOFOLLOW)
error = vfs_lstat_fd(dfd, filename, &stat);
else
error = vfs_stat_fd(dfd, filename, &stat);
if (!error)
error = cp_new_stat(&stat, statbuf);
out:
return error;
}
#endif
asmlinkage long sys_newfstat(unsigned int fd, struct stat __user *statbuf)
{
struct kstat stat;
int error = vfs_fstat(fd, &stat);
if (!error)
error = cp_new_stat(&stat, statbuf);
return error;
}
asmlinkage long sys_readlinkat(int dfd, const char __user *pathname,
char __user *buf, int bufsiz)
{
struct path path;
int error;
if (bufsiz <= 0)
return -EINVAL;
error = user_path_at(dfd, pathname, 0, &path);
if (!error) {
struct inode *inode = path.dentry->d_inode;
error = -EINVAL;
if (inode->i_op && inode->i_op->readlink) {
error = security_inode_readlink(path.dentry);
if (!error) {
touch_atime(path.mnt, path.dentry);
error = inode->i_op->readlink(path.dentry,
buf, bufsiz);
}
}
path_put(&path);
}
return error;
}
asmlinkage long sys_readlink(const char __user *path, char __user *buf,
int bufsiz)
{
return sys_readlinkat(AT_FDCWD, path, buf, bufsiz);
}
/* ---------- LFS-64 ----------- */
#ifdef __ARCH_WANT_STAT64
static long cp_new_stat64(struct kstat *stat, struct stat64 __user *statbuf)
{
struct stat64 tmp;
memset(&tmp, 0, sizeof(struct stat64));
#ifdef CONFIG_MIPS
/* mips has weird padding, so we don't get 64 bits there */
if (!new_valid_dev(stat->dev) || !new_valid_dev(stat->rdev))
return -EOVERFLOW;
tmp.st_dev = new_encode_dev(stat->dev);
tmp.st_rdev = new_encode_dev(stat->rdev);
#else
tmp.st_dev = huge_encode_dev(stat->dev);
tmp.st_rdev = huge_encode_dev(stat->rdev);
#endif
tmp.st_ino = stat->ino;
if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino)
return -EOVERFLOW;
#ifdef STAT64_HAS_BROKEN_ST_INO
tmp.__st_ino = stat->ino;
#endif
tmp.st_mode = stat->mode;
tmp.st_nlink = stat->nlink;
tmp.st_uid = stat->uid;
tmp.st_gid = stat->gid;
tmp.st_atime = stat->atime.tv_sec;
tmp.st_atime_nsec = stat->atime.tv_nsec;
tmp.st_mtime = stat->mtime.tv_sec;
tmp.st_mtime_nsec = stat->mtime.tv_nsec;
tmp.st_ctime = stat->ctime.tv_sec;
tmp.st_ctime_nsec = stat->ctime.tv_nsec;
tmp.st_size = stat->size;
tmp.st_blocks = stat->blocks;
tmp.st_blksize = stat->blksize;
return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? -EFAULT : 0;
}
asmlinkage long sys_stat64(char __user * filename, struct stat64 __user * statbuf)
{
struct kstat stat;
int error = vfs_stat(filename, &stat);
if (!error)
error = cp_new_stat64(&stat, statbuf);
return error;
}
asmlinkage long sys_lstat64(char __user * filename, struct stat64 __user * statbuf)
{
struct kstat stat;
int error = vfs_lstat(filename, &stat);
if (!error)
error = cp_new_stat64(&stat, statbuf);
return error;
}
asmlinkage long sys_fstat64(unsigned long fd, struct stat64 __user * statbuf)
{
struct kstat stat;
int error = vfs_fstat(fd, &stat);
if (!error)
error = cp_new_stat64(&stat, statbuf);
return error;
}
asmlinkage long sys_fstatat64(int dfd, char __user *filename,
struct stat64 __user *statbuf, int flag)
{
struct kstat stat;
int error = -EINVAL;
if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0)
goto out;
if (flag & AT_SYMLINK_NOFOLLOW)
error = vfs_lstat_fd(dfd, filename, &stat);
else
error = vfs_stat_fd(dfd, filename, &stat);
if (!error)
error = cp_new_stat64(&stat, statbuf);
out:
return error;
}
#endif /* __ARCH_WANT_STAT64 */
void inode_add_bytes(struct inode *inode, loff_t bytes)
{
spin_lock(&inode->i_lock);
inode->i_blocks += bytes >> 9;
bytes &= 511;
inode->i_bytes += bytes;
if (inode->i_bytes >= 512) {
inode->i_blocks++;
inode->i_bytes -= 512;
}
spin_unlock(&inode->i_lock);
}
EXPORT_SYMBOL(inode_add_bytes);
void inode_sub_bytes(struct inode *inode, loff_t bytes)
{
spin_lock(&inode->i_lock);
inode->i_blocks -= bytes >> 9;
bytes &= 511;
if (inode->i_bytes < bytes) {
inode->i_blocks--;
inode->i_bytes += 512;
}
inode->i_bytes -= bytes;
spin_unlock(&inode->i_lock);
}
EXPORT_SYMBOL(inode_sub_bytes);
loff_t inode_get_bytes(struct inode *inode)
{
loff_t ret;
spin_lock(&inode->i_lock);
ret = (((loff_t)inode->i_blocks) << 9) + inode->i_bytes;
spin_unlock(&inode->i_lock);
return ret;
}
EXPORT_SYMBOL(inode_get_bytes);
void inode_set_bytes(struct inode *inode, loff_t bytes)
{
/* Caller is here responsible for sufficient locking
* (ie. inode->i_lock) */
inode->i_blocks = bytes >> 9;
inode->i_bytes = bytes & 511;
}
EXPORT_SYMBOL(inode_set_bytes);
Computing file changes ...