Revision 8e7f82deb0c0386a03b62e30082574347f8b57d5 authored by Filipe Manana on 12 September 2023, 10:45:39 UTC, committed by David Sterba on 14 September 2023, 21:24:42 UTC
When opening a directory (opendir(3)) or rewinding it (rewinddir(3)), we
are not holding the directory's inode locked, and this can result in later
attempting to add two entries to the directory with the same index number,
resulting in a transaction abort, with -EEXIST (-17), when inserting the
second delayed dir index. This results in a trace like the following:

  Sep 11 22:34:59 myhostname kernel: BTRFS error (device dm-3): err add delayed dir index item(name: cockroach-stderr.log) into the insertion tree of the delayed node(root id: 5, inode id: 4539217, errno: -17)
  Sep 11 22:34:59 myhostname kernel: ------------[ cut here ]------------
  Sep 11 22:34:59 myhostname kernel: kernel BUG at fs/btrfs/delayed-inode.c:1504!
  Sep 11 22:34:59 myhostname kernel: invalid opcode: 0000 [#1] PREEMPT SMP NOPTI
  Sep 11 22:34:59 myhostname kernel: CPU: 0 PID: 7159 Comm: cockroach Not tainted 6.4.15-200.fc38.x86_64 #1
  Sep 11 22:34:59 myhostname kernel: Hardware name: ASUS ESC500 G3/P9D WS, BIOS 2402 06/27/2018
  Sep 11 22:34:59 myhostname kernel: RIP: 0010:btrfs_insert_delayed_dir_index+0x1da/0x260
  Sep 11 22:34:59 myhostname kernel: Code: eb dd 48 (...)
  Sep 11 22:34:59 myhostname kernel: RSP: 0000:ffffa9980e0fbb28 EFLAGS: 00010282
  Sep 11 22:34:59 myhostname kernel: RAX: 0000000000000000 RBX: ffff8b10b8f4a3c0 RCX: 0000000000000000
  Sep 11 22:34:59 myhostname kernel: RDX: 0000000000000000 RSI: ffff8b177ec21540 RDI: ffff8b177ec21540
  Sep 11 22:34:59 myhostname kernel: RBP: ffff8b110cf80888 R08: 0000000000000000 R09: ffffa9980e0fb938
  Sep 11 22:34:59 myhostname kernel: R10: 0000000000000003 R11: ffffffff86146508 R12: 0000000000000014
  Sep 11 22:34:59 myhostname kernel: R13: ffff8b1131ae5b40 R14: ffff8b10b8f4a418 R15: 00000000ffffffef
  Sep 11 22:34:59 myhostname kernel: FS:  00007fb14a7fe6c0(0000) GS:ffff8b177ec00000(0000) knlGS:0000000000000000
  Sep 11 22:34:59 myhostname kernel: CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  Sep 11 22:34:59 myhostname kernel: CR2: 000000c00143d000 CR3: 00000001b3b4e002 CR4: 00000000001706f0
  Sep 11 22:34:59 myhostname kernel: Call Trace:
  Sep 11 22:34:59 myhostname kernel:  <TASK>
  Sep 11 22:34:59 myhostname kernel:  ? die+0x36/0x90
  Sep 11 22:34:59 myhostname kernel:  ? do_trap+0xda/0x100
  Sep 11 22:34:59 myhostname kernel:  ? btrfs_insert_delayed_dir_index+0x1da/0x260
  Sep 11 22:34:59 myhostname kernel:  ? do_error_trap+0x6a/0x90
  Sep 11 22:34:59 myhostname kernel:  ? btrfs_insert_delayed_dir_index+0x1da/0x260
  Sep 11 22:34:59 myhostname kernel:  ? exc_invalid_op+0x50/0x70
  Sep 11 22:34:59 myhostname kernel:  ? btrfs_insert_delayed_dir_index+0x1da/0x260
  Sep 11 22:34:59 myhostname kernel:  ? asm_exc_invalid_op+0x1a/0x20
  Sep 11 22:34:59 myhostname kernel:  ? btrfs_insert_delayed_dir_index+0x1da/0x260
  Sep 11 22:34:59 myhostname kernel:  ? btrfs_insert_delayed_dir_index+0x1da/0x260
  Sep 11 22:34:59 myhostname kernel:  btrfs_insert_dir_item+0x200/0x280
  Sep 11 22:34:59 myhostname kernel:  btrfs_add_link+0xab/0x4f0
  Sep 11 22:34:59 myhostname kernel:  ? ktime_get_real_ts64+0x47/0xe0
  Sep 11 22:34:59 myhostname kernel:  btrfs_create_new_inode+0x7cd/0xa80
  Sep 11 22:34:59 myhostname kernel:  btrfs_symlink+0x190/0x4d0
  Sep 11 22:34:59 myhostname kernel:  ? schedule+0x5e/0xd0
  Sep 11 22:34:59 myhostname kernel:  ? __d_lookup+0x7e/0xc0
  Sep 11 22:34:59 myhostname kernel:  vfs_symlink+0x148/0x1e0
  Sep 11 22:34:59 myhostname kernel:  do_symlinkat+0x130/0x140
  Sep 11 22:34:59 myhostname kernel:  __x64_sys_symlinkat+0x3d/0x50
  Sep 11 22:34:59 myhostname kernel:  do_syscall_64+0x5d/0x90
  Sep 11 22:34:59 myhostname kernel:  ? syscall_exit_to_user_mode+0x2b/0x40
  Sep 11 22:34:59 myhostname kernel:  ? do_syscall_64+0x6c/0x90
  Sep 11 22:34:59 myhostname kernel:  entry_SYSCALL_64_after_hwframe+0x72/0xdc

The race leading to the problem happens like this:

1) Directory inode X is loaded into memory, its ->index_cnt field is
   initialized to (u64)-1 (at btrfs_alloc_inode());

2) Task A is adding a new file to directory X, holding its vfs inode lock,
   and calls btrfs_set_inode_index() to get an index number for the entry.

   Because the inode's index_cnt field is set to (u64)-1 it calls
   btrfs_inode_delayed_dir_index_count() which fails because no dir index
   entries were added yet to the delayed inode and then it calls
   btrfs_set_inode_index_count(). This functions finds the last dir index
   key and then sets index_cnt to that index value + 1. It found that the
   last index key has an offset of 100. However before it assigns a value
   of 101 to index_cnt...

3) Task B calls opendir(3), ending up at btrfs_opendir(), where the VFS
   lock for inode X is not taken, so it calls btrfs_get_dir_last_index()
   and sees index_cnt still with a value of (u64)-1. Because of that it
   calls btrfs_inode_delayed_dir_index_count() which fails since no dir
   index entries were added to the delayed inode yet, and then it also
   calls btrfs_set_inode_index_count(). This also finds that the last
   index key has an offset of 100, and before it assigns the value 101
   to the index_cnt field of inode X...

4) Task A assigns a value of 101 to index_cnt. And then the code flow
   goes to btrfs_set_inode_index() where it increments index_cnt from
   101 to 102. Task A then creates a delayed dir index entry with a
   sequence number of 101 and adds it to the delayed inode;

5) Task B assigns 101 to the index_cnt field of inode X;

6) At some later point when someone tries to add a new entry to the
   directory, btrfs_set_inode_index() will return 101 again and shortly
   after an attempt to add another delayed dir index key with index
   number 101 will fail with -EEXIST resulting in a transaction abort.

Fix this by locking the inode at btrfs_get_dir_last_index(), which is only
only used when opening a directory or attempting to lseek on it.

Reported-by: ken <ken@bllue.org>
Link: https://lore.kernel.org/linux-btrfs/CAE6xmH+Lp=Q=E61bU+v9eWX8gYfLvu6jLYxjxjFpo3zHVPR0EQ@mail.gmail.com/
Reported-by: syzbot+d13490c82ad5353c779d@syzkaller.appspotmail.com
Link: https://lore.kernel.org/linux-btrfs/00000000000036e1290603e097e0@google.com/
Fixes: 9b378f6ad48c ("btrfs: fix infinite directory reads")
CC: stable@vger.kernel.org # 6.5+
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
1 parent e60aa5d
Raw File
crc32.c
/*
 * Aug 8, 2011 Bob Pearson with help from Joakim Tjernlund and George Spelvin
 * cleaned up code to current version of sparse and added the slicing-by-8
 * algorithm to the closely similar existing slicing-by-4 algorithm.
 *
 * Oct 15, 2000 Matt Domsch <Matt_Domsch@dell.com>
 * Nicer crc32 functions/docs submitted by linux@horizon.com.  Thanks!
 * Code was from the public domain, copyright abandoned.  Code was
 * subsequently included in the kernel, thus was re-licensed under the
 * GNU GPL v2.
 *
 * Oct 12, 2000 Matt Domsch <Matt_Domsch@dell.com>
 * Same crc32 function was used in 5 other places in the kernel.
 * I made one version, and deleted the others.
 * There are various incantations of crc32().  Some use a seed of 0 or ~0.
 * Some xor at the end with ~0.  The generic crc32() function takes
 * seed as an argument, and doesn't xor at the end.  Then individual
 * users can do whatever they need.
 *   drivers/net/smc9194.c uses seed ~0, doesn't xor with ~0.
 *   fs/jffs2 uses seed 0, doesn't xor with ~0.
 *   fs/partitions/efi.c uses seed ~0, xor's with ~0.
 *
 * This source code is licensed under the GNU General Public License,
 * Version 2.  See the file COPYING for more details.
 */

/* see: Documentation/staging/crc32.rst for a description of algorithms */

#include <linux/crc32.h>
#include <linux/crc32poly.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/sched.h>
#include "crc32defs.h"

#if CRC_LE_BITS > 8
# define tole(x) ((__force u32) cpu_to_le32(x))
#else
# define tole(x) (x)
#endif

#if CRC_BE_BITS > 8
# define tobe(x) ((__force u32) cpu_to_be32(x))
#else
# define tobe(x) (x)
#endif

#include "crc32table.h"

MODULE_AUTHOR("Matt Domsch <Matt_Domsch@dell.com>");
MODULE_DESCRIPTION("Various CRC32 calculations");
MODULE_LICENSE("GPL");

#if CRC_LE_BITS > 8 || CRC_BE_BITS > 8

/* implements slicing-by-4 or slicing-by-8 algorithm */
static inline u32 __pure
crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 (*tab)[256])
{
# ifdef __LITTLE_ENDIAN
#  define DO_CRC(x) crc = t0[(crc ^ (x)) & 255] ^ (crc >> 8)
#  define DO_CRC4 (t3[(q) & 255] ^ t2[(q >> 8) & 255] ^ \
		   t1[(q >> 16) & 255] ^ t0[(q >> 24) & 255])
#  define DO_CRC8 (t7[(q) & 255] ^ t6[(q >> 8) & 255] ^ \
		   t5[(q >> 16) & 255] ^ t4[(q >> 24) & 255])
# else
#  define DO_CRC(x) crc = t0[((crc >> 24) ^ (x)) & 255] ^ (crc << 8)
#  define DO_CRC4 (t0[(q) & 255] ^ t1[(q >> 8) & 255] ^ \
		   t2[(q >> 16) & 255] ^ t3[(q >> 24) & 255])
#  define DO_CRC8 (t4[(q) & 255] ^ t5[(q >> 8) & 255] ^ \
		   t6[(q >> 16) & 255] ^ t7[(q >> 24) & 255])
# endif
	const u32 *b;
	size_t    rem_len;
# ifdef CONFIG_X86
	size_t i;
# endif
	const u32 *t0=tab[0], *t1=tab[1], *t2=tab[2], *t3=tab[3];
# if CRC_LE_BITS != 32
	const u32 *t4 = tab[4], *t5 = tab[5], *t6 = tab[6], *t7 = tab[7];
# endif
	u32 q;

	/* Align it */
	if (unlikely((long)buf & 3 && len)) {
		do {
			DO_CRC(*buf++);
		} while ((--len) && ((long)buf)&3);
	}

# if CRC_LE_BITS == 32
	rem_len = len & 3;
	len = len >> 2;
# else
	rem_len = len & 7;
	len = len >> 3;
# endif

	b = (const u32 *)buf;
# ifdef CONFIG_X86
	--b;
	for (i = 0; i < len; i++) {
# else
	for (--b; len; --len) {
# endif
		q = crc ^ *++b; /* use pre increment for speed */
# if CRC_LE_BITS == 32
		crc = DO_CRC4;
# else
		crc = DO_CRC8;
		q = *++b;
		crc ^= DO_CRC4;
# endif
	}
	len = rem_len;
	/* And the last few bytes */
	if (len) {
		u8 *p = (u8 *)(b + 1) - 1;
# ifdef CONFIG_X86
		for (i = 0; i < len; i++)
			DO_CRC(*++p); /* use pre increment for speed */
# else
		do {
			DO_CRC(*++p); /* use pre increment for speed */
		} while (--len);
# endif
	}
	return crc;
#undef DO_CRC
#undef DO_CRC4
#undef DO_CRC8
}
#endif


/**
 * crc32_le_generic() - Calculate bitwise little-endian Ethernet AUTODIN II
 *			CRC32/CRC32C
 * @crc: seed value for computation.  ~0 for Ethernet, sometimes 0 for other
 *	 uses, or the previous crc32/crc32c value if computing incrementally.
 * @p: pointer to buffer over which CRC32/CRC32C is run
 * @len: length of buffer @p
 * @tab: little-endian Ethernet table
 * @polynomial: CRC32/CRC32c LE polynomial
 */
static inline u32 __pure crc32_le_generic(u32 crc, unsigned char const *p,
					  size_t len, const u32 (*tab)[256],
					  u32 polynomial)
{
#if CRC_LE_BITS == 1
	int i;
	while (len--) {
		crc ^= *p++;
		for (i = 0; i < 8; i++)
			crc = (crc >> 1) ^ ((crc & 1) ? polynomial : 0);
	}
# elif CRC_LE_BITS == 2
	while (len--) {
		crc ^= *p++;
		crc = (crc >> 2) ^ tab[0][crc & 3];
		crc = (crc >> 2) ^ tab[0][crc & 3];
		crc = (crc >> 2) ^ tab[0][crc & 3];
		crc = (crc >> 2) ^ tab[0][crc & 3];
	}
# elif CRC_LE_BITS == 4
	while (len--) {
		crc ^= *p++;
		crc = (crc >> 4) ^ tab[0][crc & 15];
		crc = (crc >> 4) ^ tab[0][crc & 15];
	}
# elif CRC_LE_BITS == 8
	/* aka Sarwate algorithm */
	while (len--) {
		crc ^= *p++;
		crc = (crc >> 8) ^ tab[0][crc & 255];
	}
# else
	crc = (__force u32) __cpu_to_le32(crc);
	crc = crc32_body(crc, p, len, tab);
	crc = __le32_to_cpu((__force __le32)crc);
#endif
	return crc;
}

#if CRC_LE_BITS == 1
u32 __pure __weak crc32_le(u32 crc, unsigned char const *p, size_t len)
{
	return crc32_le_generic(crc, p, len, NULL, CRC32_POLY_LE);
}
u32 __pure __weak __crc32c_le(u32 crc, unsigned char const *p, size_t len)
{
	return crc32_le_generic(crc, p, len, NULL, CRC32C_POLY_LE);
}
#else
u32 __pure __weak crc32_le(u32 crc, unsigned char const *p, size_t len)
{
	return crc32_le_generic(crc, p, len, crc32table_le, CRC32_POLY_LE);
}
u32 __pure __weak __crc32c_le(u32 crc, unsigned char const *p, size_t len)
{
	return crc32_le_generic(crc, p, len, crc32ctable_le, CRC32C_POLY_LE);
}
#endif
EXPORT_SYMBOL(crc32_le);
EXPORT_SYMBOL(__crc32c_le);

u32 __pure crc32_le_base(u32, unsigned char const *, size_t) __alias(crc32_le);
u32 __pure __crc32c_le_base(u32, unsigned char const *, size_t) __alias(__crc32c_le);
u32 __pure crc32_be_base(u32, unsigned char const *, size_t) __alias(crc32_be);

/*
 * This multiplies the polynomials x and y modulo the given modulus.
 * This follows the "little-endian" CRC convention that the lsbit
 * represents the highest power of x, and the msbit represents x^0.
 */
static u32 __attribute_const__ gf2_multiply(u32 x, u32 y, u32 modulus)
{
	u32 product = x & 1 ? y : 0;
	int i;

	for (i = 0; i < 31; i++) {
		product = (product >> 1) ^ (product & 1 ? modulus : 0);
		x >>= 1;
		product ^= x & 1 ? y : 0;
	}

	return product;
}

/**
 * crc32_generic_shift - Append @len 0 bytes to crc, in logarithmic time
 * @crc: The original little-endian CRC (i.e. lsbit is x^31 coefficient)
 * @len: The number of bytes. @crc is multiplied by x^(8*@len)
 * @polynomial: The modulus used to reduce the result to 32 bits.
 *
 * It's possible to parallelize CRC computations by computing a CRC
 * over separate ranges of a buffer, then summing them.
 * This shifts the given CRC by 8*len bits (i.e. produces the same effect
 * as appending len bytes of zero to the data), in time proportional
 * to log(len).
 */
static u32 __attribute_const__ crc32_generic_shift(u32 crc, size_t len,
						   u32 polynomial)
{
	u32 power = polynomial;	/* CRC of x^32 */
	int i;

	/* Shift up to 32 bits in the simple linear way */
	for (i = 0; i < 8 * (int)(len & 3); i++)
		crc = (crc >> 1) ^ (crc & 1 ? polynomial : 0);

	len >>= 2;
	if (!len)
		return crc;

	for (;;) {
		/* "power" is x^(2^i), modulo the polynomial */
		if (len & 1)
			crc = gf2_multiply(crc, power, polynomial);

		len >>= 1;
		if (!len)
			break;

		/* Square power, advancing to x^(2^(i+1)) */
		power = gf2_multiply(power, power, polynomial);
	}

	return crc;
}

u32 __attribute_const__ crc32_le_shift(u32 crc, size_t len)
{
	return crc32_generic_shift(crc, len, CRC32_POLY_LE);
}

u32 __attribute_const__ __crc32c_le_shift(u32 crc, size_t len)
{
	return crc32_generic_shift(crc, len, CRC32C_POLY_LE);
}
EXPORT_SYMBOL(crc32_le_shift);
EXPORT_SYMBOL(__crc32c_le_shift);

/**
 * crc32_be_generic() - Calculate bitwise big-endian Ethernet AUTODIN II CRC32
 * @crc: seed value for computation.  ~0 for Ethernet, sometimes 0 for
 *	other uses, or the previous crc32 value if computing incrementally.
 * @p: pointer to buffer over which CRC32 is run
 * @len: length of buffer @p
 * @tab: big-endian Ethernet table
 * @polynomial: CRC32 BE polynomial
 */
static inline u32 __pure crc32_be_generic(u32 crc, unsigned char const *p,
					  size_t len, const u32 (*tab)[256],
					  u32 polynomial)
{
#if CRC_BE_BITS == 1
	int i;
	while (len--) {
		crc ^= *p++ << 24;
		for (i = 0; i < 8; i++)
			crc =
			    (crc << 1) ^ ((crc & 0x80000000) ? polynomial :
					  0);
	}
# elif CRC_BE_BITS == 2
	while (len--) {
		crc ^= *p++ << 24;
		crc = (crc << 2) ^ tab[0][crc >> 30];
		crc = (crc << 2) ^ tab[0][crc >> 30];
		crc = (crc << 2) ^ tab[0][crc >> 30];
		crc = (crc << 2) ^ tab[0][crc >> 30];
	}
# elif CRC_BE_BITS == 4
	while (len--) {
		crc ^= *p++ << 24;
		crc = (crc << 4) ^ tab[0][crc >> 28];
		crc = (crc << 4) ^ tab[0][crc >> 28];
	}
# elif CRC_BE_BITS == 8
	while (len--) {
		crc ^= *p++ << 24;
		crc = (crc << 8) ^ tab[0][crc >> 24];
	}
# else
	crc = (__force u32) __cpu_to_be32(crc);
	crc = crc32_body(crc, p, len, tab);
	crc = __be32_to_cpu((__force __be32)crc);
# endif
	return crc;
}

#if CRC_BE_BITS == 1
u32 __pure __weak crc32_be(u32 crc, unsigned char const *p, size_t len)
{
	return crc32_be_generic(crc, p, len, NULL, CRC32_POLY_BE);
}
#else
u32 __pure __weak crc32_be(u32 crc, unsigned char const *p, size_t len)
{
	return crc32_be_generic(crc, p, len, crc32table_be, CRC32_POLY_BE);
}
#endif
EXPORT_SYMBOL(crc32_be);
back to top