Revision 8e7f82deb0c0386a03b62e30082574347f8b57d5 authored by Filipe Manana on 12 September 2023, 10:45:39 UTC, committed by David Sterba on 14 September 2023, 21:24:42 UTC
When opening a directory (opendir(3)) or rewinding it (rewinddir(3)), we
are not holding the directory's inode locked, and this can result in later
attempting to add two entries to the directory with the same index number,
resulting in a transaction abort, with -EEXIST (-17), when inserting the
second delayed dir index. This results in a trace like the following:

  Sep 11 22:34:59 myhostname kernel: BTRFS error (device dm-3): err add delayed dir index item(name: cockroach-stderr.log) into the insertion tree of the delayed node(root id: 5, inode id: 4539217, errno: -17)
  Sep 11 22:34:59 myhostname kernel: ------------[ cut here ]------------
  Sep 11 22:34:59 myhostname kernel: kernel BUG at fs/btrfs/delayed-inode.c:1504!
  Sep 11 22:34:59 myhostname kernel: invalid opcode: 0000 [#1] PREEMPT SMP NOPTI
  Sep 11 22:34:59 myhostname kernel: CPU: 0 PID: 7159 Comm: cockroach Not tainted 6.4.15-200.fc38.x86_64 #1
  Sep 11 22:34:59 myhostname kernel: Hardware name: ASUS ESC500 G3/P9D WS, BIOS 2402 06/27/2018
  Sep 11 22:34:59 myhostname kernel: RIP: 0010:btrfs_insert_delayed_dir_index+0x1da/0x260
  Sep 11 22:34:59 myhostname kernel: Code: eb dd 48 (...)
  Sep 11 22:34:59 myhostname kernel: RSP: 0000:ffffa9980e0fbb28 EFLAGS: 00010282
  Sep 11 22:34:59 myhostname kernel: RAX: 0000000000000000 RBX: ffff8b10b8f4a3c0 RCX: 0000000000000000
  Sep 11 22:34:59 myhostname kernel: RDX: 0000000000000000 RSI: ffff8b177ec21540 RDI: ffff8b177ec21540
  Sep 11 22:34:59 myhostname kernel: RBP: ffff8b110cf80888 R08: 0000000000000000 R09: ffffa9980e0fb938
  Sep 11 22:34:59 myhostname kernel: R10: 0000000000000003 R11: ffffffff86146508 R12: 0000000000000014
  Sep 11 22:34:59 myhostname kernel: R13: ffff8b1131ae5b40 R14: ffff8b10b8f4a418 R15: 00000000ffffffef
  Sep 11 22:34:59 myhostname kernel: FS:  00007fb14a7fe6c0(0000) GS:ffff8b177ec00000(0000) knlGS:0000000000000000
  Sep 11 22:34:59 myhostname kernel: CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  Sep 11 22:34:59 myhostname kernel: CR2: 000000c00143d000 CR3: 00000001b3b4e002 CR4: 00000000001706f0
  Sep 11 22:34:59 myhostname kernel: Call Trace:
  Sep 11 22:34:59 myhostname kernel:  <TASK>
  Sep 11 22:34:59 myhostname kernel:  ? die+0x36/0x90
  Sep 11 22:34:59 myhostname kernel:  ? do_trap+0xda/0x100
  Sep 11 22:34:59 myhostname kernel:  ? btrfs_insert_delayed_dir_index+0x1da/0x260
  Sep 11 22:34:59 myhostname kernel:  ? do_error_trap+0x6a/0x90
  Sep 11 22:34:59 myhostname kernel:  ? btrfs_insert_delayed_dir_index+0x1da/0x260
  Sep 11 22:34:59 myhostname kernel:  ? exc_invalid_op+0x50/0x70
  Sep 11 22:34:59 myhostname kernel:  ? btrfs_insert_delayed_dir_index+0x1da/0x260
  Sep 11 22:34:59 myhostname kernel:  ? asm_exc_invalid_op+0x1a/0x20
  Sep 11 22:34:59 myhostname kernel:  ? btrfs_insert_delayed_dir_index+0x1da/0x260
  Sep 11 22:34:59 myhostname kernel:  ? btrfs_insert_delayed_dir_index+0x1da/0x260
  Sep 11 22:34:59 myhostname kernel:  btrfs_insert_dir_item+0x200/0x280
  Sep 11 22:34:59 myhostname kernel:  btrfs_add_link+0xab/0x4f0
  Sep 11 22:34:59 myhostname kernel:  ? ktime_get_real_ts64+0x47/0xe0
  Sep 11 22:34:59 myhostname kernel:  btrfs_create_new_inode+0x7cd/0xa80
  Sep 11 22:34:59 myhostname kernel:  btrfs_symlink+0x190/0x4d0
  Sep 11 22:34:59 myhostname kernel:  ? schedule+0x5e/0xd0
  Sep 11 22:34:59 myhostname kernel:  ? __d_lookup+0x7e/0xc0
  Sep 11 22:34:59 myhostname kernel:  vfs_symlink+0x148/0x1e0
  Sep 11 22:34:59 myhostname kernel:  do_symlinkat+0x130/0x140
  Sep 11 22:34:59 myhostname kernel:  __x64_sys_symlinkat+0x3d/0x50
  Sep 11 22:34:59 myhostname kernel:  do_syscall_64+0x5d/0x90
  Sep 11 22:34:59 myhostname kernel:  ? syscall_exit_to_user_mode+0x2b/0x40
  Sep 11 22:34:59 myhostname kernel:  ? do_syscall_64+0x6c/0x90
  Sep 11 22:34:59 myhostname kernel:  entry_SYSCALL_64_after_hwframe+0x72/0xdc

The race leading to the problem happens like this:

1) Directory inode X is loaded into memory, its ->index_cnt field is
   initialized to (u64)-1 (at btrfs_alloc_inode());

2) Task A is adding a new file to directory X, holding its vfs inode lock,
   and calls btrfs_set_inode_index() to get an index number for the entry.

   Because the inode's index_cnt field is set to (u64)-1 it calls
   btrfs_inode_delayed_dir_index_count() which fails because no dir index
   entries were added yet to the delayed inode and then it calls
   btrfs_set_inode_index_count(). This functions finds the last dir index
   key and then sets index_cnt to that index value + 1. It found that the
   last index key has an offset of 100. However before it assigns a value
   of 101 to index_cnt...

3) Task B calls opendir(3), ending up at btrfs_opendir(), where the VFS
   lock for inode X is not taken, so it calls btrfs_get_dir_last_index()
   and sees index_cnt still with a value of (u64)-1. Because of that it
   calls btrfs_inode_delayed_dir_index_count() which fails since no dir
   index entries were added to the delayed inode yet, and then it also
   calls btrfs_set_inode_index_count(). This also finds that the last
   index key has an offset of 100, and before it assigns the value 101
   to the index_cnt field of inode X...

4) Task A assigns a value of 101 to index_cnt. And then the code flow
   goes to btrfs_set_inode_index() where it increments index_cnt from
   101 to 102. Task A then creates a delayed dir index entry with a
   sequence number of 101 and adds it to the delayed inode;

5) Task B assigns 101 to the index_cnt field of inode X;

6) At some later point when someone tries to add a new entry to the
   directory, btrfs_set_inode_index() will return 101 again and shortly
   after an attempt to add another delayed dir index key with index
   number 101 will fail with -EEXIST resulting in a transaction abort.

Fix this by locking the inode at btrfs_get_dir_last_index(), which is only
only used when opening a directory or attempting to lseek on it.

Reported-by: ken <ken@bllue.org>
Link: https://lore.kernel.org/linux-btrfs/CAE6xmH+Lp=Q=E61bU+v9eWX8gYfLvu6jLYxjxjFpo3zHVPR0EQ@mail.gmail.com/
Reported-by: syzbot+d13490c82ad5353c779d@syzkaller.appspotmail.com
Link: https://lore.kernel.org/linux-btrfs/00000000000036e1290603e097e0@google.com/
Fixes: 9b378f6ad48c ("btrfs: fix infinite directory reads")
CC: stable@vger.kernel.org # 6.5+
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
1 parent e60aa5d
Raw File
dhry.h
/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
/*
 ****************************************************************************
 *
 *                   "DHRYSTONE" Benchmark Program
 *                   -----------------------------
 *
 *  Version:    C, Version 2.1
 *
 *  File:       dhry.h (part 1 of 3)
 *
 *  Date:       May 25, 1988
 *
 *  Author:     Reinhold P. Weicker
 *                      Siemens AG, AUT E 51
 *                      Postfach 3220
 *                      8520 Erlangen
 *                      Germany (West)
 *                              Phone:  [+49]-9131-7-20330
 *                                      (8-17 Central European Time)
 *                              Usenet: ..!mcsun!unido!estevax!weicker
 *
 *              Original Version (in Ada) published in
 *              "Communications of the ACM" vol. 27., no. 10 (Oct. 1984),
 *              pp. 1013 - 1030, together with the statistics
 *              on which the distribution of statements etc. is based.
 *
 *              In this C version, the following C library functions are used:
 *              - strcpy, strcmp (inside the measurement loop)
 *              - printf, scanf (outside the measurement loop)
 *              In addition, Berkeley UNIX system calls "times ()" or "time ()"
 *              are used for execution time measurement. For measurements
 *              on other systems, these calls have to be changed.
 *
 *  Collection of Results:
 *              Reinhold Weicker (address see above) and
 *
 *              Rick Richardson
 *              PC Research. Inc.
 *              94 Apple Orchard Drive
 *              Tinton Falls, NJ 07724
 *                      Phone:  (201) 389-8963 (9-17 EST)
 *                      Usenet: ...!uunet!pcrat!rick
 *
 *      Please send results to Rick Richardson and/or Reinhold Weicker.
 *      Complete information should be given on hardware and software used.
 *      Hardware information includes: Machine type, CPU, type and size
 *      of caches; for microprocessors: clock frequency, memory speed
 *      (number of wait states).
 *      Software information includes: Compiler (and runtime library)
 *      manufacturer and version, compilation switches, OS version.
 *      The Operating System version may give an indication about the
 *      compiler; Dhrystone itself performs no OS calls in the measurement loop.
 *
 *      The complete output generated by the program should be mailed
 *      such that at least some checks for correctness can be made.
 *
 ***************************************************************************
 *
 *  History:    This version C/2.1 has been made for two reasons:
 *
 *              1) There is an obvious need for a common C version of
 *              Dhrystone, since C is at present the most popular system
 *              programming language for the class of processors
 *              (microcomputers, minicomputers) where Dhrystone is used most.
 *              There should be, as far as possible, only one C version of
 *              Dhrystone such that results can be compared without
 *              restrictions. In the past, the C versions distributed
 *              by Rick Richardson (Version 1.1) and by Reinhold Weicker
 *              had small (though not significant) differences.
 *
 *              2) As far as it is possible without changes to the Dhrystone
 *              statistics, optimizing compilers should be prevented from
 *              removing significant statements.
 *
 *              This C version has been developed in cooperation with
 *              Rick Richardson (Tinton Falls, NJ), it incorporates many
 *              ideas from the "Version 1.1" distributed previously by
 *              him over the UNIX network Usenet.
 *              I also thank Chaim Benedelac (National Semiconductor),
 *              David Ditzel (SUN), Earl Killian and John Mashey (MIPS),
 *              Alan Smith and Rafael Saavedra-Barrera (UC at Berkeley)
 *              for their help with comments on earlier versions of the
 *              benchmark.
 *
 *  Changes:    In the initialization part, this version follows mostly
 *              Rick Richardson's version distributed via Usenet, not the
 *              version distributed earlier via floppy disk by Reinhold Weicker.
 *              As a concession to older compilers, names have been made
 *              unique within the first 8 characters.
 *              Inside the measurement loop, this version follows the
 *              version previously distributed by Reinhold Weicker.
 *
 *              At several places in the benchmark, code has been added,
 *              but within the measurement loop only in branches that
 *              are not executed. The intention is that optimizing compilers
 *              should be prevented from moving code out of the measurement
 *              loop, or from removing code altogether. Since the statements
 *              that are executed within the measurement loop have NOT been
 *              changed, the numbers defining the "Dhrystone distribution"
 *              (distribution of statements, operand types and locality)
 *              still hold. Except for sophisticated optimizing compilers,
 *              execution times for this version should be the same as
 *              for previous versions.
 *
 *              Since it has proven difficult to subtract the time for the
 *              measurement loop overhead in a correct way, the loop check
 *              has been made a part of the benchmark. This does have
 *              an impact - though a very minor one - on the distribution
 *              statistics which have been updated for this version.
 *
 *              All changes within the measurement loop are described
 *              and discussed in the companion paper "Rationale for
 *              Dhrystone version 2".
 *
 *              Because of the self-imposed limitation that the order and
 *              distribution of the executed statements should not be
 *              changed, there are still cases where optimizing compilers
 *              may not generate code for some statements. To a certain
 *              degree, this is unavoidable for small synthetic benchmarks.
 *              Users of the benchmark are advised to check code listings
 *              whether code is generated for all statements of Dhrystone.
 *
 *              Version 2.1 is identical to version 2.0 distributed via
 *              the UNIX network Usenet in March 1988 except that it corrects
 *              some minor deficiencies that were found by users of version 2.0.
 *              The only change within the measurement loop is that a
 *              non-executed "else" part was added to the "if" statement in
 *              Func_3, and a non-executed "else" part removed from Proc_3.
 *
 ***************************************************************************
 *
 *  Compilation model and measurement (IMPORTANT):
 *
 *  This C version of Dhrystone consists of three files:
 *  - dhry.h (this file, containing global definitions and comments)
 *  - dhry_1.c (containing the code corresponding to Ada package Pack_1)
 *  - dhry_2.c (containing the code corresponding to Ada package Pack_2)
 *
 *  The following "ground rules" apply for measurements:
 *  - Separate compilation
 *  - No procedure merging
 *  - Otherwise, compiler optimizations are allowed but should be indicated
 *  - Default results are those without register declarations
 *  See the companion paper "Rationale for Dhrystone Version 2" for a more
 *  detailed discussion of these ground rules.
 *
 *  For 16-Bit processors (e.g. 80186, 80286), times for all compilation
 *  models ("small", "medium", "large" etc.) should be given if possible,
 *  together with a definition of these models for the compiler system used.
 *
 **************************************************************************
 *
 *  Dhrystone (C version) statistics:
 *
 *  [Comment from the first distribution, updated for version 2.
 *   Note that because of language differences, the numbers are slightly
 *   different from the Ada version.]
 *
 *  The following program contains statements of a high level programming
 *  language (here: C) in a distribution considered representative:
 *
 *    assignments                  52 (51.0 %)
 *    control statements           33 (32.4 %)
 *    procedure, function calls    17 (16.7 %)
 *
 *  103 statements are dynamically executed. The program is balanced with
 *  respect to the three aspects:
 *
 *    - statement type
 *    - operand type
 *    - operand locality
 *         operand global, local, parameter, or constant.
 *
 *  The combination of these three aspects is balanced only approximately.
 *
 *  1. Statement Type:
 *  -----------------             number
 *
 *     V1 = V2                     9
 *       (incl. V1 = F(..)
 *     V = Constant               12
 *     Assignment,                 7
 *       with array element
 *     Assignment,                 6
 *       with record component
 *                                --
 *                                34       34
 *
 *     X = Y +|-|"&&"|"|" Z        5
 *     X = Y +|-|"==" Constant     6
 *     X = X +|- 1                 3
 *     X = Y *|/ Z                 2
 *     X = Expression,             1
 *           two operators
 *     X = Expression,             1
 *           three operators
 *                                --
 *                                18       18
 *
 *     if ....                    14
 *       with "else"      7
 *       without "else"   7
 *           executed        3
 *           not executed    4
 *     for ...                     7  |  counted every time
 *     while ...                   4  |  the loop condition
 *     do ... while                1  |  is evaluated
 *     switch ...                  1
 *     break                       1
 *     declaration with            1
 *       initialization
 *                                --
 *                                34       34
 *
 *     P (...)  procedure call    11
 *       user procedure      10
 *       library procedure    1
 *     X = F (...)
 *             function  call      6
 *       user function        5
 *       library function     1
 *                                --
 *                                17       17
 *                                        ---
 *                                        103
 *
 *    The average number of parameters in procedure or function calls
 *    is 1.82 (not counting the function values as implicit parameters).
 *
 *
 *  2. Operators
 *  ------------
 *                          number    approximate
 *                                    percentage
 *
 *    Arithmetic             32          50.8
 *
 *       +                     21          33.3
 *       -                      7          11.1
 *       *                      3           4.8
 *       / (int div)            1           1.6
 *
 *    Comparison             27           42.8
 *
 *       ==                     9           14.3
 *       /=                     4            6.3
 *       >                      1            1.6
 *       <                      3            4.8
 *       >=                     1            1.6
 *       <=                     9           14.3
 *
 *    Logic                   4            6.3
 *
 *       && (AND-THEN)          1            1.6
 *       |  (OR)                1            1.6
 *       !  (NOT)               2            3.2
 *
 *                           --          -----
 *                           63          100.1
 *
 *
 *  3. Operand Type (counted once per operand reference):
 *  ---------------
 *                          number    approximate
 *                                    percentage
 *
 *     Integer               175        72.3 %
 *     Character              45        18.6 %
 *     Pointer                12         5.0 %
 *     String30                6         2.5 %
 *     Array                   2         0.8 %
 *     Record                  2         0.8 %
 *                           ---       -------
 *                           242       100.0 %
 *
 *  When there is an access path leading to the final operand (e.g. a record
 *  component), only the final data type on the access path is counted.
 *
 *
 *  4. Operand Locality:
 *  -------------------
 *                                number    approximate
 *                                          percentage
 *
 *     local variable              114        47.1 %
 *     global variable              22         9.1 %
 *     parameter                    45        18.6 %
 *        value                        23         9.5 %
 *        reference                    22         9.1 %
 *     function result               6         2.5 %
 *     constant                     55        22.7 %
 *                                 ---       -------
 *                                 242       100.0 %
 *
 *
 *  The program does not compute anything meaningful, but it is syntactically
 *  and semantically correct. All variables have a value assigned to them
 *  before they are used as a source operand.
 *
 *  There has been no explicit effort to account for the effects of a
 *  cache, or to balance the use of long or short displacements for code or
 *  data.
 *
 ***************************************************************************
 */

typedef enum {
	Ident_1,
	Ident_2,
	Ident_3,
	Ident_4,
	Ident_5
} Enumeration;	/* for boolean and enumeration types in Ada, Pascal */

/* General definitions: */

typedef int One_Thirty;
typedef int One_Fifty;
typedef char Capital_Letter;
typedef int Boolean;
typedef char Str_30[31];
typedef int Arr_1_Dim[50];
typedef int Arr_2_Dim[50][50];

typedef struct record {
	struct record *Ptr_Comp;
	Enumeration    Discr;
	union {
		struct {
			Enumeration Enum_Comp;
			int Int_Comp;
			char Str_Comp[31];
		} var_1;
		struct {
			Enumeration E_Comp_2;
			char Str_2_Comp[31];
		} var_2;
		struct {
			char Ch_1_Comp;
			char Ch_2_Comp;
		} var_3;
	} variant;
} Rec_Type, *Rec_Pointer;


extern int Int_Glob;
extern char Ch_1_Glob;

void Proc_6(Enumeration  Enum_Val_Par, Enumeration *Enum_Ref_Par);
void Proc_7(One_Fifty Int_1_Par_Val, One_Fifty Int_2_Par_Val,
	    One_Fifty *Int_Par_Ref);
void Proc_8(Arr_1_Dim Arr_1_Par_Ref, Arr_2_Dim Arr_2_Par_Ref,
	    int Int_1_Par_Val, int Int_2_Par_Val);
Enumeration Func_1(Capital_Letter Ch_1_Par_Val, Capital_Letter Ch_2_Par_Val);
Boolean Func_2(Str_30 Str_1_Par_Ref, Str_30 Str_2_Par_Ref);

int dhry(int n);
back to top