Revision c61ea31dac0319ec64b33725917bda81fc293a25 authored by David Howells on 11 May 2010, 15:51:39 UTC, committed by Linus Torvalds on 11 May 2010, 17:07:53 UTC
Fix an occasional EIO returned by a call to vfs_unlink():

	[ 4868.465413] CacheFiles: I/O Error: Unlink failed
	[ 4868.465444] FS-Cache: Cache cachefiles stopped due to I/O error
	[ 4947.320011] CacheFiles: File cache on md3 unregistering
	[ 4947.320041] FS-Cache: Withdrawing cache "mycache"
	[ 5127.348683] FS-Cache: Cache "mycache" added (type cachefiles)
	[ 5127.348716] CacheFiles: File cache on md3 registered
	[ 7076.871081] CacheFiles: I/O Error: Unlink failed
	[ 7076.871130] FS-Cache: Cache cachefiles stopped due to I/O error
	[ 7116.780891] CacheFiles: File cache on md3 unregistering
	[ 7116.780937] FS-Cache: Withdrawing cache "mycache"
	[ 7296.813394] FS-Cache: Cache "mycache" added (type cachefiles)
	[ 7296.813432] CacheFiles: File cache on md3 registered

What happens is this:

 (1) A cached NFS file is seen to have become out of date, so NFS retires the
     object and immediately acquires a new object with the same key.

 (2) Retirement of the old object is done asynchronously - so the lookup/create
     to generate the new object may be done first.

     This can be a problem as the old object and the new object must exist at
     the same point in the backing filesystem (i.e. they must have the same
     pathname).

 (3) The lookup for the new object sees that a backing file already exists,
     checks to see whether it is valid and sees that it isn't.  It then deletes
     that file and creates a new one on disk.

 (4) The retirement phase for the old file is then performed.  It tries to
     delete the dentry it has, but ext4_unlink() returns -EIO because the inode
     attached to that dentry no longer matches the inode number associated with
     the filename in the parent directory.

The trace below shows this quite well.

	[md5sum] ==> __fscache_relinquish_cookie(ffff88002d12fb58{NFS.fh,ffff88002ce62100},1)
	[md5sum] ==> __fscache_acquire_cookie({NFS.server},{NFS.fh},ffff88002ce62100)

NFS has retired the old cookie and asked for a new one.

	[kslowd] ==> fscache_object_state_machine({OBJ52,OBJECT_ACTIVE,24})
	[kslowd] <== fscache_object_state_machine() [->OBJECT_DYING]
	[kslowd] ==> fscache_object_state_machine({OBJ53,OBJECT_INIT,0})
	[kslowd] <== fscache_object_state_machine() [->OBJECT_LOOKING_UP]
	[kslowd] ==> fscache_object_state_machine({OBJ52,OBJECT_DYING,24})
	[kslowd] <== fscache_object_state_machine() [->OBJECT_RECYCLING]

The old object (OBJ52) is going through the terminal states to get rid of it,
whilst the new object - (OBJ53) - is coming into being.

	[kslowd] ==> fscache_object_state_machine({OBJ53,OBJECT_LOOKING_UP,0})
	[kslowd] ==> cachefiles_walk_to_object({ffff88003029d8b8},OBJ53,@68,)
	[kslowd] lookup '@68'
	[kslowd] next -> ffff88002ce41bd0 positive
	[kslowd] advance
	[kslowd] lookup 'Es0g00og0_Nd_XCYe3BOzvXrsBLMlN6aw16M1htaA'
	[kslowd] next -> ffff8800369faac8 positive

The new object has looked up the subdir in which the file would be in (getting
dentry ffff88002ce41bd0) and then looked up the file itself (getting dentry
ffff8800369faac8).

	[kslowd] validate 'Es0g00og0_Nd_XCYe3BOzvXrsBLMlN6aw16M1htaA'
	[kslowd] ==> cachefiles_bury_object(,'@68','Es0g00og0_Nd_XCYe3BOzvXrsBLMlN6aw16M1htaA')
	[kslowd] remove ffff8800369faac8 from ffff88002ce41bd0
	[kslowd] unlink stale object
	[kslowd] <== cachefiles_bury_object() = 0

It then checks the file's xattrs to see if it's valid.  NFS says that the
auxiliary data indicate the file is out of date (obvious to us - that's why NFS
ditched the old version and got a new one).  CacheFiles then deletes the old
file (dentry ffff8800369faac8).

	[kslowd] redo lookup
	[kslowd] lookup 'Es0g00og0_Nd_XCYe3BOzvXrsBLMlN6aw16M1htaA'
	[kslowd] next -> ffff88002cd94288 negative
	[kslowd] create -> ffff88002cd94288{ffff88002cdaf238{ino=148247}}

CacheFiles then redoes the lookup and gets a negative result in a new dentry
(ffff88002cd94288) which it then creates a file for.

	[kslowd] ==> cachefiles_mark_object_active(,OBJ53)
	[kslowd] <== cachefiles_mark_object_active() = 0
	[kslowd] === OBTAINED_OBJECT ===
	[kslowd] <== cachefiles_walk_to_object() = 0 [148247]
	[kslowd] <== fscache_object_state_machine() [->OBJECT_AVAILABLE]

The new object is then marked active and the state machine moves to the
available state - at which point NFS can start filling the object.

	[kslowd] ==> fscache_object_state_machine({OBJ52,OBJECT_RECYCLING,20})
	[kslowd] ==> fscache_release_object()
	[kslowd] ==> cachefiles_drop_object({OBJ52,2})
	[kslowd] ==> cachefiles_delete_object(,OBJ52{ffff8800369faac8})

The old object, meanwhile, goes on with being retired.  If allocation occurs
first, cachefiles_delete_object() has to wait for dir->d_inode->i_mutex to
become available before it can continue.

	[kslowd] ==> cachefiles_bury_object(,'@68','Es0g00og0_Nd_XCYe3BOzvXrsBLMlN6aw16M1htaA')
	[kslowd] remove ffff8800369faac8 from ffff88002ce41bd0
	[kslowd] unlink stale object
	EXT4-fs warning (device sda6): ext4_unlink: Inode number mismatch in unlink (148247!=148193)
	CacheFiles: I/O Error: Unlink failed
	FS-Cache: Cache cachefiles stopped due to I/O error

CacheFiles then tries to delete the file for the old object, but the dentry it
has (ffff8800369faac8) no longer points to a valid inode for that directory
entry, and so ext4_unlink() returns -EIO when de->inode does not match i_ino.

	[kslowd] <== cachefiles_bury_object() = -5
	[kslowd] <== cachefiles_delete_object() = -5
	[kslowd] <== fscache_object_state_machine() [->OBJECT_DEAD]
	[kslowd] ==> fscache_object_state_machine({OBJ53,OBJECT_AVAILABLE,0})
	[kslowd] <== fscache_object_state_machine() [->OBJECT_ACTIVE]

(Note that the above trace includes extra information beyond that produced by
the upstream code).

The fix is to note when an object that is being retired has had its object
deleted preemptively by a replacement object that is being created, and to
skip the second removal attempt in such a case.

Reported-by: Greg M <gregm@servu.net.au>
Reported-by: Mark Moseley <moseleymark@gmail.com>
Reported-by: Romain DEGEZ <romain.degez@smartjog.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
1 parent 7d6fb7b
Raw File
scatterlist.c
/*
 * Copyright (C) 2007 Jens Axboe <jens.axboe@oracle.com>
 *
 * Scatterlist handling helpers.
 *
 * This source code is licensed under the GNU General Public License,
 * Version 2. See the file COPYING for more details.
 */
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/scatterlist.h>
#include <linux/highmem.h>

/**
 * sg_next - return the next scatterlist entry in a list
 * @sg:		The current sg entry
 *
 * Description:
 *   Usually the next entry will be @sg@ + 1, but if this sg element is part
 *   of a chained scatterlist, it could jump to the start of a new
 *   scatterlist array.
 *
 **/
struct scatterlist *sg_next(struct scatterlist *sg)
{
#ifdef CONFIG_DEBUG_SG
	BUG_ON(sg->sg_magic != SG_MAGIC);
#endif
	if (sg_is_last(sg))
		return NULL;

	sg++;
	if (unlikely(sg_is_chain(sg)))
		sg = sg_chain_ptr(sg);

	return sg;
}
EXPORT_SYMBOL(sg_next);

/**
 * sg_last - return the last scatterlist entry in a list
 * @sgl:	First entry in the scatterlist
 * @nents:	Number of entries in the scatterlist
 *
 * Description:
 *   Should only be used casually, it (currently) scans the entire list
 *   to get the last entry.
 *
 *   Note that the @sgl@ pointer passed in need not be the first one,
 *   the important bit is that @nents@ denotes the number of entries that
 *   exist from @sgl@.
 *
 **/
struct scatterlist *sg_last(struct scatterlist *sgl, unsigned int nents)
{
#ifndef ARCH_HAS_SG_CHAIN
	struct scatterlist *ret = &sgl[nents - 1];
#else
	struct scatterlist *sg, *ret = NULL;
	unsigned int i;

	for_each_sg(sgl, sg, nents, i)
		ret = sg;

#endif
#ifdef CONFIG_DEBUG_SG
	BUG_ON(sgl[0].sg_magic != SG_MAGIC);
	BUG_ON(!sg_is_last(ret));
#endif
	return ret;
}
EXPORT_SYMBOL(sg_last);

/**
 * sg_init_table - Initialize SG table
 * @sgl:	   The SG table
 * @nents:	   Number of entries in table
 *
 * Notes:
 *   If this is part of a chained sg table, sg_mark_end() should be
 *   used only on the last table part.
 *
 **/
void sg_init_table(struct scatterlist *sgl, unsigned int nents)
{
	memset(sgl, 0, sizeof(*sgl) * nents);
#ifdef CONFIG_DEBUG_SG
	{
		unsigned int i;
		for (i = 0; i < nents; i++)
			sgl[i].sg_magic = SG_MAGIC;
	}
#endif
	sg_mark_end(&sgl[nents - 1]);
}
EXPORT_SYMBOL(sg_init_table);

/**
 * sg_init_one - Initialize a single entry sg list
 * @sg:		 SG entry
 * @buf:	 Virtual address for IO
 * @buflen:	 IO length
 *
 **/
void sg_init_one(struct scatterlist *sg, const void *buf, unsigned int buflen)
{
	sg_init_table(sg, 1);
	sg_set_buf(sg, buf, buflen);
}
EXPORT_SYMBOL(sg_init_one);

/*
 * The default behaviour of sg_alloc_table() is to use these kmalloc/kfree
 * helpers.
 */
static struct scatterlist *sg_kmalloc(unsigned int nents, gfp_t gfp_mask)
{
	if (nents == SG_MAX_SINGLE_ALLOC)
		return (struct scatterlist *) __get_free_page(gfp_mask);
	else
		return kmalloc(nents * sizeof(struct scatterlist), gfp_mask);
}

static void sg_kfree(struct scatterlist *sg, unsigned int nents)
{
	if (nents == SG_MAX_SINGLE_ALLOC)
		free_page((unsigned long) sg);
	else
		kfree(sg);
}

/**
 * __sg_free_table - Free a previously mapped sg table
 * @table:	The sg table header to use
 * @max_ents:	The maximum number of entries per single scatterlist
 * @free_fn:	Free function
 *
 *  Description:
 *    Free an sg table previously allocated and setup with
 *    __sg_alloc_table().  The @max_ents value must be identical to
 *    that previously used with __sg_alloc_table().
 *
 **/
void __sg_free_table(struct sg_table *table, unsigned int max_ents,
		     sg_free_fn *free_fn)
{
	struct scatterlist *sgl, *next;

	if (unlikely(!table->sgl))
		return;

	sgl = table->sgl;
	while (table->orig_nents) {
		unsigned int alloc_size = table->orig_nents;
		unsigned int sg_size;

		/*
		 * If we have more than max_ents segments left,
		 * then assign 'next' to the sg table after the current one.
		 * sg_size is then one less than alloc size, since the last
		 * element is the chain pointer.
		 */
		if (alloc_size > max_ents) {
			next = sg_chain_ptr(&sgl[max_ents - 1]);
			alloc_size = max_ents;
			sg_size = alloc_size - 1;
		} else {
			sg_size = alloc_size;
			next = NULL;
		}

		table->orig_nents -= sg_size;
		free_fn(sgl, alloc_size);
		sgl = next;
	}

	table->sgl = NULL;
}
EXPORT_SYMBOL(__sg_free_table);

/**
 * sg_free_table - Free a previously allocated sg table
 * @table:	The mapped sg table header
 *
 **/
void sg_free_table(struct sg_table *table)
{
	__sg_free_table(table, SG_MAX_SINGLE_ALLOC, sg_kfree);
}
EXPORT_SYMBOL(sg_free_table);

/**
 * __sg_alloc_table - Allocate and initialize an sg table with given allocator
 * @table:	The sg table header to use
 * @nents:	Number of entries in sg list
 * @max_ents:	The maximum number of entries the allocator returns per call
 * @gfp_mask:	GFP allocation mask
 * @alloc_fn:	Allocator to use
 *
 * Description:
 *   This function returns a @table @nents long. The allocator is
 *   defined to return scatterlist chunks of maximum size @max_ents.
 *   Thus if @nents is bigger than @max_ents, the scatterlists will be
 *   chained in units of @max_ents.
 *
 * Notes:
 *   If this function returns non-0 (eg failure), the caller must call
 *   __sg_free_table() to cleanup any leftover allocations.
 *
 **/
int __sg_alloc_table(struct sg_table *table, unsigned int nents,
		     unsigned int max_ents, gfp_t gfp_mask,
		     sg_alloc_fn *alloc_fn)
{
	struct scatterlist *sg, *prv;
	unsigned int left;

#ifndef ARCH_HAS_SG_CHAIN
	BUG_ON(nents > max_ents);
#endif

	memset(table, 0, sizeof(*table));

	left = nents;
	prv = NULL;
	do {
		unsigned int sg_size, alloc_size = left;

		if (alloc_size > max_ents) {
			alloc_size = max_ents;
			sg_size = alloc_size - 1;
		} else
			sg_size = alloc_size;

		left -= sg_size;

		sg = alloc_fn(alloc_size, gfp_mask);
		if (unlikely(!sg))
			return -ENOMEM;

		sg_init_table(sg, alloc_size);
		table->nents = table->orig_nents += sg_size;

		/*
		 * If this is the first mapping, assign the sg table header.
		 * If this is not the first mapping, chain previous part.
		 */
		if (prv)
			sg_chain(prv, max_ents, sg);
		else
			table->sgl = sg;

		/*
		 * If no more entries after this one, mark the end
		 */
		if (!left)
			sg_mark_end(&sg[sg_size - 1]);

		/*
		 * only really needed for mempool backed sg allocations (like
		 * SCSI), a possible improvement here would be to pass the
		 * table pointer into the allocator and let that clear these
		 * flags
		 */
		gfp_mask &= ~__GFP_WAIT;
		gfp_mask |= __GFP_HIGH;
		prv = sg;
	} while (left);

	return 0;
}
EXPORT_SYMBOL(__sg_alloc_table);

/**
 * sg_alloc_table - Allocate and initialize an sg table
 * @table:	The sg table header to use
 * @nents:	Number of entries in sg list
 * @gfp_mask:	GFP allocation mask
 *
 *  Description:
 *    Allocate and initialize an sg table. If @nents@ is larger than
 *    SG_MAX_SINGLE_ALLOC a chained sg table will be setup.
 *
 **/
int sg_alloc_table(struct sg_table *table, unsigned int nents, gfp_t gfp_mask)
{
	int ret;

	ret = __sg_alloc_table(table, nents, SG_MAX_SINGLE_ALLOC,
			       gfp_mask, sg_kmalloc);
	if (unlikely(ret))
		__sg_free_table(table, SG_MAX_SINGLE_ALLOC, sg_kfree);

	return ret;
}
EXPORT_SYMBOL(sg_alloc_table);

/**
 * sg_miter_start - start mapping iteration over a sg list
 * @miter: sg mapping iter to be started
 * @sgl: sg list to iterate over
 * @nents: number of sg entries
 *
 * Description:
 *   Starts mapping iterator @miter.
 *
 * Context:
 *   Don't care.
 */
void sg_miter_start(struct sg_mapping_iter *miter, struct scatterlist *sgl,
		    unsigned int nents, unsigned int flags)
{
	memset(miter, 0, sizeof(struct sg_mapping_iter));

	miter->__sg = sgl;
	miter->__nents = nents;
	miter->__offset = 0;
	WARN_ON(!(flags & (SG_MITER_TO_SG | SG_MITER_FROM_SG)));
	miter->__flags = flags;
}
EXPORT_SYMBOL(sg_miter_start);

/**
 * sg_miter_next - proceed mapping iterator to the next mapping
 * @miter: sg mapping iter to proceed
 *
 * Description:
 *   Proceeds @miter@ to the next mapping.  @miter@ should have been
 *   started using sg_miter_start().  On successful return,
 *   @miter@->page, @miter@->addr and @miter@->length point to the
 *   current mapping.
 *
 * Context:
 *   IRQ disabled if SG_MITER_ATOMIC.  IRQ must stay disabled till
 *   @miter@ is stopped.  May sleep if !SG_MITER_ATOMIC.
 *
 * Returns:
 *   true if @miter contains the next mapping.  false if end of sg
 *   list is reached.
 */
bool sg_miter_next(struct sg_mapping_iter *miter)
{
	unsigned int off, len;

	/* check for end and drop resources from the last iteration */
	if (!miter->__nents)
		return false;

	sg_miter_stop(miter);

	/* get to the next sg if necessary.  __offset is adjusted by stop */
	while (miter->__offset == miter->__sg->length) {
		if (--miter->__nents) {
			miter->__sg = sg_next(miter->__sg);
			miter->__offset = 0;
		} else
			return false;
	}

	/* map the next page */
	off = miter->__sg->offset + miter->__offset;
	len = miter->__sg->length - miter->__offset;

	miter->page = nth_page(sg_page(miter->__sg), off >> PAGE_SHIFT);
	off &= ~PAGE_MASK;
	miter->length = min_t(unsigned int, len, PAGE_SIZE - off);
	miter->consumed = miter->length;

	if (miter->__flags & SG_MITER_ATOMIC)
		miter->addr = kmap_atomic(miter->page, KM_BIO_SRC_IRQ) + off;
	else
		miter->addr = kmap(miter->page) + off;

	return true;
}
EXPORT_SYMBOL(sg_miter_next);

/**
 * sg_miter_stop - stop mapping iteration
 * @miter: sg mapping iter to be stopped
 *
 * Description:
 *   Stops mapping iterator @miter.  @miter should have been started
 *   started using sg_miter_start().  A stopped iteration can be
 *   resumed by calling sg_miter_next() on it.  This is useful when
 *   resources (kmap) need to be released during iteration.
 *
 * Context:
 *   IRQ disabled if the SG_MITER_ATOMIC is set.  Don't care otherwise.
 */
void sg_miter_stop(struct sg_mapping_iter *miter)
{
	WARN_ON(miter->consumed > miter->length);

	/* drop resources from the last iteration */
	if (miter->addr) {
		miter->__offset += miter->consumed;

		if (miter->__flags & SG_MITER_TO_SG)
			flush_kernel_dcache_page(miter->page);

		if (miter->__flags & SG_MITER_ATOMIC) {
			WARN_ON(!irqs_disabled());
			kunmap_atomic(miter->addr, KM_BIO_SRC_IRQ);
		} else
			kunmap(miter->page);

		miter->page = NULL;
		miter->addr = NULL;
		miter->length = 0;
		miter->consumed = 0;
	}
}
EXPORT_SYMBOL(sg_miter_stop);

/**
 * sg_copy_buffer - Copy data between a linear buffer and an SG list
 * @sgl:		 The SG list
 * @nents:		 Number of SG entries
 * @buf:		 Where to copy from
 * @buflen:		 The number of bytes to copy
 * @to_buffer: 		 transfer direction (non zero == from an sg list to a
 * 			 buffer, 0 == from a buffer to an sg list
 *
 * Returns the number of copied bytes.
 *
 **/
static size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents,
			     void *buf, size_t buflen, int to_buffer)
{
	unsigned int offset = 0;
	struct sg_mapping_iter miter;
	unsigned long flags;
	unsigned int sg_flags = SG_MITER_ATOMIC;

	if (to_buffer)
		sg_flags |= SG_MITER_FROM_SG;
	else
		sg_flags |= SG_MITER_TO_SG;

	sg_miter_start(&miter, sgl, nents, sg_flags);

	local_irq_save(flags);

	while (sg_miter_next(&miter) && offset < buflen) {
		unsigned int len;

		len = min(miter.length, buflen - offset);

		if (to_buffer)
			memcpy(buf + offset, miter.addr, len);
		else
			memcpy(miter.addr, buf + offset, len);

		offset += len;
	}

	sg_miter_stop(&miter);

	local_irq_restore(flags);
	return offset;
}

/**
 * sg_copy_from_buffer - Copy from a linear buffer to an SG list
 * @sgl:		 The SG list
 * @nents:		 Number of SG entries
 * @buf:		 Where to copy from
 * @buflen:		 The number of bytes to copy
 *
 * Returns the number of copied bytes.
 *
 **/
size_t sg_copy_from_buffer(struct scatterlist *sgl, unsigned int nents,
			   void *buf, size_t buflen)
{
	return sg_copy_buffer(sgl, nents, buf, buflen, 0);
}
EXPORT_SYMBOL(sg_copy_from_buffer);

/**
 * sg_copy_to_buffer - Copy from an SG list to a linear buffer
 * @sgl:		 The SG list
 * @nents:		 Number of SG entries
 * @buf:		 Where to copy to
 * @buflen:		 The number of bytes to copy
 *
 * Returns the number of copied bytes.
 *
 **/
size_t sg_copy_to_buffer(struct scatterlist *sgl, unsigned int nents,
			 void *buf, size_t buflen)
{
	return sg_copy_buffer(sgl, nents, buf, buflen, 1);
}
EXPORT_SYMBOL(sg_copy_to_buffer);
back to top