Revision b91e1302ad9b80c174a4855533f7e3aa2873355e authored by Linus Torvalds on 27 December 2016, 19:40:38 UTC, committed by Linus Torvalds on 29 December 2016, 19:03:15 UTC
In commit 62906027091f ("mm: add PageWaiters indicating tasks are waiting for a page bit") Nick Piggin made our page locking no longer unconditionally touch the hashed page waitqueue, which not only helps performance in general, but is particularly helpful on NUMA machines where the hashed wait queues can bounce around a lot. However, the "clear lock bit atomically and then test the waiters bit" sequence turns out to be much more expensive than it needs to be, because you get a nasty stall when trying to access the same word that just got updated atomically. On architectures where locking is done with LL/SC, this would be trivial to fix with a new primitive that clears one bit and tests another atomically, but that ends up not working on x86, where the only atomic operations that return the result end up being cmpxchg and xadd. The atomic bit operations return the old value of the same bit we changed, not the value of an unrelated bit. On x86, we could put the lock bit in the high bit of the byte, and use "xadd" with that bit (where the overflow ends up not touching other bits), and look at the other bits of the result. However, an even simpler model is to just use a regular atomic "and" to clear the lock bit, and then the sign bit in eflags will indicate the resulting state of the unrelated bit #7. So by moving the PageWaiters bit up to bit #7, we can atomically clear the lock bit and test the waiters bit on x86 too. And architectures with LL/SC (which is all the usual RISC suspects), the particular bit doesn't matter, so they are fine with this approach too. This avoids the extra access to the same atomic word, and thus avoids the costly stall at page unlock time. The only downside is that the interface ends up being a bit odd and specialized: clear a bit in a byte, and test the sign bit. Nick doesn't love the resulting name of the new primitive, but I'd rather make the name be descriptive and very clear about the limitation imposed by trying to work across all relevant architectures than make it be some generic thing that doesn't make the odd semantics explicit. So this introduces the new architecture primitive clear_bit_unlock_is_negative_byte(); and adds the trivial implementation for x86. We have a generic non-optimized fallback (that just does a "clear_bit()"+"test_bit(7)" combination) which can be overridden by any architecture that can do better. According to Nick, Power has the same hickup x86 has, for example, but some other architectures may not even care. All these optimizations mean that my page locking stress-test (which is just executing a lot of small short-lived shell scripts: "make test" in the git source tree) no longer makes our page locking look horribly bad. Before all these optimizations, just the unlock_page() costs were just over 3% of all CPU overhead on "make test". After this, it's down to 0.66%, so just a quarter of the cost it used to be. (The difference on NUMA is bigger, but there this micro-optimization is likely less noticeable, since the big issue on NUMA was not the accesses to 'struct page', but the waitqueue accesses that were already removed by Nick's earlier commit). Acked-by: Nick Piggin <npiggin@gmail.com> Cc: Dave Hansen <dave.hansen@linux.intel.com> Cc: Bob Peterson <rpeterso@redhat.com> Cc: Steven Whitehouse <swhiteho@redhat.com> Cc: Andrew Lutomirski <luto@kernel.org> Cc: Andreas Gruenbacher <agruenba@redhat.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Mel Gorman <mgorman@techsingularity.net> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
1 parent 2d706e7
file.c
/* AFS filesystem file handling
*
* Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/gfp.h>
#include "internal.h"
static int afs_readpage(struct file *file, struct page *page);
static void afs_invalidatepage(struct page *page, unsigned int offset,
unsigned int length);
static int afs_releasepage(struct page *page, gfp_t gfp_flags);
static int afs_launder_page(struct page *page);
static int afs_readpages(struct file *filp, struct address_space *mapping,
struct list_head *pages, unsigned nr_pages);
const struct file_operations afs_file_operations = {
.open = afs_open,
.release = afs_release,
.llseek = generic_file_llseek,
.read_iter = generic_file_read_iter,
.write_iter = afs_file_write,
.mmap = generic_file_readonly_mmap,
.splice_read = generic_file_splice_read,
.fsync = afs_fsync,
.lock = afs_lock,
.flock = afs_flock,
};
const struct inode_operations afs_file_inode_operations = {
.getattr = afs_getattr,
.setattr = afs_setattr,
.permission = afs_permission,
};
const struct address_space_operations afs_fs_aops = {
.readpage = afs_readpage,
.readpages = afs_readpages,
.set_page_dirty = afs_set_page_dirty,
.launder_page = afs_launder_page,
.releasepage = afs_releasepage,
.invalidatepage = afs_invalidatepage,
.write_begin = afs_write_begin,
.write_end = afs_write_end,
.writepage = afs_writepage,
.writepages = afs_writepages,
};
/*
* open an AFS file or directory and attach a key to it
*/
int afs_open(struct inode *inode, struct file *file)
{
struct afs_vnode *vnode = AFS_FS_I(inode);
struct key *key;
int ret;
_enter("{%x:%u},", vnode->fid.vid, vnode->fid.vnode);
key = afs_request_key(vnode->volume->cell);
if (IS_ERR(key)) {
_leave(" = %ld [key]", PTR_ERR(key));
return PTR_ERR(key);
}
ret = afs_validate(vnode, key);
if (ret < 0) {
_leave(" = %d [val]", ret);
return ret;
}
file->private_data = key;
_leave(" = 0");
return 0;
}
/*
* release an AFS file or directory and discard its key
*/
int afs_release(struct inode *inode, struct file *file)
{
struct afs_vnode *vnode = AFS_FS_I(inode);
_enter("{%x:%u},", vnode->fid.vid, vnode->fid.vnode);
key_put(file->private_data);
_leave(" = 0");
return 0;
}
#ifdef CONFIG_AFS_FSCACHE
/*
* deal with notification that a page was read from the cache
*/
static void afs_file_readpage_read_complete(struct page *page,
void *data,
int error)
{
_enter("%p,%p,%d", page, data, error);
/* if the read completes with an error, we just unlock the page and let
* the VM reissue the readpage */
if (!error)
SetPageUptodate(page);
unlock_page(page);
}
#endif
/*
* read page from file, directory or symlink, given a key to use
*/
int afs_page_filler(void *data, struct page *page)
{
struct inode *inode = page->mapping->host;
struct afs_vnode *vnode = AFS_FS_I(inode);
struct key *key = data;
size_t len;
off_t offset;
int ret;
_enter("{%x},{%lu},{%lu}", key_serial(key), inode->i_ino, page->index);
BUG_ON(!PageLocked(page));
ret = -ESTALE;
if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
goto error;
/* is it cached? */
#ifdef CONFIG_AFS_FSCACHE
ret = fscache_read_or_alloc_page(vnode->cache,
page,
afs_file_readpage_read_complete,
NULL,
GFP_KERNEL);
#else
ret = -ENOBUFS;
#endif
switch (ret) {
/* read BIO submitted (page in cache) */
case 0:
break;
/* page not yet cached */
case -ENODATA:
_debug("cache said ENODATA");
goto go_on;
/* page will not be cached */
case -ENOBUFS:
_debug("cache said ENOBUFS");
default:
go_on:
offset = page->index << PAGE_SHIFT;
len = min_t(size_t, i_size_read(inode) - offset, PAGE_SIZE);
/* read the contents of the file from the server into the
* page */
ret = afs_vnode_fetch_data(vnode, key, offset, len, page);
if (ret < 0) {
if (ret == -ENOENT) {
_debug("got NOENT from server"
" - marking file deleted and stale");
set_bit(AFS_VNODE_DELETED, &vnode->flags);
ret = -ESTALE;
}
#ifdef CONFIG_AFS_FSCACHE
fscache_uncache_page(vnode->cache, page);
#endif
BUG_ON(PageFsCache(page));
goto error;
}
SetPageUptodate(page);
/* send the page to the cache */
#ifdef CONFIG_AFS_FSCACHE
if (PageFsCache(page) &&
fscache_write_page(vnode->cache, page, GFP_KERNEL) != 0) {
fscache_uncache_page(vnode->cache, page);
BUG_ON(PageFsCache(page));
}
#endif
unlock_page(page);
}
_leave(" = 0");
return 0;
error:
SetPageError(page);
unlock_page(page);
_leave(" = %d", ret);
return ret;
}
/*
* read page from file, directory or symlink, given a file to nominate the key
* to be used
*/
static int afs_readpage(struct file *file, struct page *page)
{
struct key *key;
int ret;
if (file) {
key = file->private_data;
ASSERT(key != NULL);
ret = afs_page_filler(key, page);
} else {
struct inode *inode = page->mapping->host;
key = afs_request_key(AFS_FS_S(inode->i_sb)->volume->cell);
if (IS_ERR(key)) {
ret = PTR_ERR(key);
} else {
ret = afs_page_filler(key, page);
key_put(key);
}
}
return ret;
}
/*
* read a set of pages
*/
static int afs_readpages(struct file *file, struct address_space *mapping,
struct list_head *pages, unsigned nr_pages)
{
struct key *key = file->private_data;
struct afs_vnode *vnode;
int ret = 0;
_enter("{%d},{%lu},,%d",
key_serial(key), mapping->host->i_ino, nr_pages);
ASSERT(key != NULL);
vnode = AFS_FS_I(mapping->host);
if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
_leave(" = -ESTALE");
return -ESTALE;
}
/* attempt to read as many of the pages as possible */
#ifdef CONFIG_AFS_FSCACHE
ret = fscache_read_or_alloc_pages(vnode->cache,
mapping,
pages,
&nr_pages,
afs_file_readpage_read_complete,
NULL,
mapping_gfp_mask(mapping));
#else
ret = -ENOBUFS;
#endif
switch (ret) {
/* all pages are being read from the cache */
case 0:
BUG_ON(!list_empty(pages));
BUG_ON(nr_pages != 0);
_leave(" = 0 [reading all]");
return 0;
/* there were pages that couldn't be read from the cache */
case -ENODATA:
case -ENOBUFS:
break;
/* other error */
default:
_leave(" = %d", ret);
return ret;
}
/* load the missing pages from the network */
ret = read_cache_pages(mapping, pages, afs_page_filler, key);
_leave(" = %d [netting]", ret);
return ret;
}
/*
* write back a dirty page
*/
static int afs_launder_page(struct page *page)
{
_enter("{%lu}", page->index);
return 0;
}
/*
* invalidate part or all of a page
* - release a page and clean up its private data if offset is 0 (indicating
* the entire page)
*/
static void afs_invalidatepage(struct page *page, unsigned int offset,
unsigned int length)
{
struct afs_writeback *wb = (struct afs_writeback *) page_private(page);
_enter("{%lu},%u,%u", page->index, offset, length);
BUG_ON(!PageLocked(page));
/* we clean up only if the entire page is being invalidated */
if (offset == 0 && length == PAGE_SIZE) {
#ifdef CONFIG_AFS_FSCACHE
if (PageFsCache(page)) {
struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
fscache_wait_on_page_write(vnode->cache, page);
fscache_uncache_page(vnode->cache, page);
}
#endif
if (PagePrivate(page)) {
if (wb && !PageWriteback(page)) {
set_page_private(page, 0);
afs_put_writeback(wb);
}
if (!page_private(page))
ClearPagePrivate(page);
}
}
_leave("");
}
/*
* release a page and clean up its private state if it's not busy
* - return true if the page can now be released, false if not
*/
static int afs_releasepage(struct page *page, gfp_t gfp_flags)
{
struct afs_writeback *wb = (struct afs_writeback *) page_private(page);
struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
_enter("{{%x:%u}[%lu],%lx},%x",
vnode->fid.vid, vnode->fid.vnode, page->index, page->flags,
gfp_flags);
/* deny if page is being written to the cache and the caller hasn't
* elected to wait */
#ifdef CONFIG_AFS_FSCACHE
if (!fscache_maybe_release_page(vnode->cache, page, gfp_flags)) {
_leave(" = F [cache busy]");
return 0;
}
#endif
if (PagePrivate(page)) {
if (wb) {
set_page_private(page, 0);
afs_put_writeback(wb);
}
ClearPagePrivate(page);
}
/* indicate that the page can be released */
_leave(" = T");
return 1;
}
![swh spinner](/static/img/swh-spinner.gif)
Computing file changes ...