Revision e34a314c5e49fe6b763568f6576b19f1299c33c2 authored by Dave Chinner on 27 January 2011, 01:13:35 UTC, committed by Alex Elder on 28 January 2011, 15:01:33 UTC
After test 139, kmemleak shows:

unreferenced object 0xffff880078b405d8 (size 400):
  comm "xfs_io", pid 4904, jiffies 4294909383 (age 1186.728s)
  hex dump (first 32 bytes):
    60 c1 17 79 00 88 ff ff 60 c1 17 79 00 88 ff ff  `..y....`..y....
    00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
  backtrace:
    [<ffffffff81afb04d>] kmemleak_alloc+0x2d/0x60
    [<ffffffff8115c6cf>] kmem_cache_alloc+0x13f/0x2b0
    [<ffffffff814aaa97>] kmem_zone_alloc+0x77/0xf0
    [<ffffffff814aab2e>] kmem_zone_zalloc+0x1e/0x50
    [<ffffffff8147cd6b>] xfs_efi_init+0x4b/0xb0
    [<ffffffff814a4ee8>] xfs_trans_get_efi+0x58/0x90
    [<ffffffff81455fab>] xfs_bmap_finish+0x8b/0x1d0
    [<ffffffff814851b4>] xfs_itruncate_finish+0x2c4/0x5d0
    [<ffffffff814a970f>] xfs_setattr+0x8df/0xa70
    [<ffffffff814b5c7b>] xfs_vn_setattr+0x1b/0x20
    [<ffffffff8117dc00>] notify_change+0x170/0x2e0
    [<ffffffff81163bf6>] do_truncate+0x66/0xa0
    [<ffffffff81163d0b>] sys_ftruncate+0xdb/0xe0
    [<ffffffff8103a002>] system_call_fastpath+0x16/0x1b
    [<ffffffffffffffff>] 0xffffffffffffffff

The cause of the leak is that the "remove" parameter of IOP_UNPIN()
is never set when a CIL push is aborted. This means that the EFI
item is never freed if it was in the push being cancelled. The
problem is specific to delayed logging, but has uncovered a couple
of problems with the handling of IOP_UNPIN(remove).

Firstly, we cannot safely call xfs_trans_del_item() from IOP_UNPIN()
in the CIL commit failure path or the iclog write failure path
because for delayed loging we have no transaction context. Hence we
must only call xfs_trans_del_item() if the log item being unpinned
has an active log item descriptor.

Secondly, xfs_trans_uncommit() does not handle log item descriptor
freeing during the traversal of log items on a transaction. It can
reference a freed log item descriptor when unpinning an EFI item.
Hence it needs to use a safe list traversal method to allow items to
be removed from the transaction during IOP_UNPIN().

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Alex Elder <aelder@sgi.com>
1 parent 7db37c5
Raw File
xfs_attr.h
/*
 * Copyright (c) 2000,2002-2003,2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */
#ifndef __XFS_ATTR_H__
#define	__XFS_ATTR_H__

struct xfs_inode;
struct xfs_da_args;
struct xfs_attr_list_context;

/*
 * Large attribute lists are structured around Btrees where all the data
 * elements are in the leaf nodes.  Attribute names are hashed into an int,
 * then that int is used as the index into the Btree.  Since the hashval
 * of an attribute name may not be unique, we may have duplicate keys.
 * The internal links in the Btree are logical block offsets into the file.
 *
 * Small attribute lists use a different format and are packed as tightly
 * as possible so as to fit into the literal area of the inode.
 */

/*========================================================================
 * External interfaces
 *========================================================================*/


#define ATTR_DONTFOLLOW	0x0001	/* -- unused, from IRIX -- */
#define ATTR_ROOT	0x0002	/* use attrs in root (trusted) namespace */
#define ATTR_TRUST	0x0004	/* -- unused, from IRIX -- */
#define ATTR_SECURE	0x0008	/* use attrs in security namespace */
#define ATTR_CREATE	0x0010	/* pure create: fail if attr already exists */
#define ATTR_REPLACE	0x0020	/* pure set: fail if attr does not exist */

#define ATTR_KERNOTIME	0x1000	/* [kernel] don't update inode timestamps */
#define ATTR_KERNOVAL	0x2000	/* [kernel] get attr size only, not value */

#define XFS_ATTR_FLAGS \
	{ ATTR_DONTFOLLOW, 	"DONTFOLLOW" }, \
	{ ATTR_ROOT,		"ROOT" }, \
	{ ATTR_TRUST,		"TRUST" }, \
	{ ATTR_SECURE,		"SECURE" }, \
	{ ATTR_CREATE,		"CREATE" }, \
	{ ATTR_REPLACE,		"REPLACE" }, \
	{ ATTR_KERNOTIME,	"KERNOTIME" }, \
	{ ATTR_KERNOVAL,	"KERNOVAL" }

/*
 * The maximum size (into the kernel or returned from the kernel) of an
 * attribute value or the buffer used for an attr_list() call.  Larger
 * sizes will result in an ERANGE return code.
 */
#define	ATTR_MAX_VALUELEN	(64*1024)	/* max length of a value */

/*
 * Define how lists of attribute names are returned to the user from
 * the attr_list() call.  A large, 32bit aligned, buffer is passed in
 * along with its size.  We put an array of offsets at the top that each
 * reference an attrlist_ent_t and pack the attrlist_ent_t's at the bottom.
 */
typedef struct attrlist {
	__s32	al_count;	/* number of entries in attrlist */
	__s32	al_more;	/* T/F: more attrs (do call again) */
	__s32	al_offset[1];	/* byte offsets of attrs [var-sized] */
} attrlist_t;

/*
 * Show the interesting info about one attribute.  This is what the
 * al_offset[i] entry points to.
 */
typedef struct attrlist_ent {	/* data from attr_list() */
	__u32	a_valuelen;	/* number bytes in value of attr */
	char	a_name[1];	/* attr name (NULL terminated) */
} attrlist_ent_t;

/*
 * Given a pointer to the (char*) buffer containing the attr_list() result,
 * and an index, return a pointer to the indicated attribute in the buffer.
 */
#define	ATTR_ENTRY(buffer, index)		\
	((attrlist_ent_t *)			\
	 &((char *)buffer)[ ((attrlist_t *)(buffer))->al_offset[index] ])

/*
 * Kernel-internal version of the attrlist cursor.
 */
typedef struct attrlist_cursor_kern {
	__u32	hashval;	/* hash value of next entry to add */
	__u32	blkno;		/* block containing entry (suggestion) */
	__u32	offset;		/* offset in list of equal-hashvals */
	__u16	pad1;		/* padding to match user-level */
	__u8	pad2;		/* padding to match user-level */
	__u8	initted;	/* T/F: cursor has been initialized */
} attrlist_cursor_kern_t;


/*========================================================================
 * Structure used to pass context around among the routines.
 *========================================================================*/


typedef int (*put_listent_func_t)(struct xfs_attr_list_context *, int,
			      unsigned char *, int, int, unsigned char *);

typedef struct xfs_attr_list_context {
	struct xfs_inode		*dp;		/* inode */
	struct attrlist_cursor_kern	*cursor;	/* position in list */
	char				*alist;		/* output buffer */
	int				seen_enough;	/* T/F: seen enough of list? */
	ssize_t				count;		/* num used entries */
	int				dupcnt;		/* count dup hashvals seen */
	int				bufsize;	/* total buffer size */
	int				firstu;		/* first used byte in buffer */
	int				flags;		/* from VOP call */
	int				resynch;	/* T/F: resynch with cursor */
	int				put_value;	/* T/F: need value for listent */
	put_listent_func_t		put_listent;	/* list output fmt function */
	int				index;		/* index into output buffer */
} xfs_attr_list_context_t;


/*========================================================================
 * Function prototypes for the kernel.
 *========================================================================*/

/*
 * Overall external interface routines.
 */
int xfs_attr_inactive(struct xfs_inode *dp);
int xfs_attr_rmtval_get(struct xfs_da_args *args);
int xfs_attr_list_int(struct xfs_attr_list_context *);

#endif	/* __XFS_ATTR_H__ */
back to top