Revision a5ae50dea9111db63d30d700766dd5509602f7ad authored by Filipe Manana on 20 February 2020, 13:29:49 UTC, committed by David Sterba on 21 February 2020, 15:21:19 UTC
While logging the prealloc extents of an inode during a fast fsync we call
btrfs_truncate_inode_items(), through btrfs_log_prealloc_extents(), while
holding a read lock on a leaf of the inode's root (not the log root, the
fs/subvol root), and then that function locks the file range in the inode's
iotree. This can lead to a deadlock when:

* the fsync is ranged

* the file has prealloc extents beyond eof

* writeback for a range different from the fsync range starts
  during the fsync

* the size of the file is not sector size aligned

The deadlock is possible because, when finishing an ordered extent, we first
lock a file range and then try to COW the fs/subvol tree to insert an extent
item.

The following diagram shows how the deadlock can happen.

           CPU 1                                        CPU 2

  btrfs_sync_file()
    --> for range [0, 1MiB)

    --> inode has a size of
        1MiB and has 1 prealloc
        extent beyond the
        i_size, starting at offset
        4MiB

    flushes all delalloc for the
    range [0MiB, 1MiB) and waits
    for the respective ordered
    extents to complete

                                              --> before task at CPU 1 locks the
                                                  inode, a write into file range
                                                  [1MiB, 2MiB + 1KiB) is made

                                              --> i_size is updated to 2MiB + 1KiB

                                              --> writeback is started for that
                                                  range, [1MiB, 2MiB + 4KiB)
                                                  --> end offset rounded up to
                                                      be sector size aligned

    btrfs_log_dentry_safe()
      btrfs_log_inode_parent()
        btrfs_log_inode()

          btrfs_log_changed_extents()
            btrfs_log_prealloc_extents()
              --> does a search on the
                  inode's root
              --> holds a read lock on
                  leaf X

                                              btrfs_finish_ordered_io()
                                                --> locks range [1MiB, 2MiB + 4KiB)
                                                    --> end offset rounded up
                                                        to be sector size aligned

                                                --> tries to COW leaf X, through
                                                    insert_reserved_file_extent()
                                                    --> already locked by the
                                                        task at CPU 1

              btrfs_truncate_inode_items()

                --> gets an i_size of
                    2MiB + 1KiB, which is
                    not sector size
                    aligned

                --> tries to lock file
                    range [2MiB, (u64)-1)
                    --> the start range
                        is rounded down
                        from 2MiB + 1K
                        to 2MiB to be sector
                        size aligned

                    --> but the subrange
                        [2MiB, 2MiB + 4KiB) is
                        already locked by
                        task at CPU 2 which
                        is waiting to get a
                        write lock on leaf X
                        for which we are
                        holding a read lock

                                *** deadlock ***

This results in a stack trace like the following, triggered by test case
generic/561 from fstests:

  [ 2779.973608] INFO: task kworker/u8:6:247 blocked for more than 120 seconds.
  [ 2779.979536]       Not tainted 5.6.0-rc2-btrfs-next-53 #1
  [ 2779.984503] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
  [ 2779.990136] kworker/u8:6    D    0   247      2 0x80004000
  [ 2779.990457] Workqueue: btrfs-endio-write btrfs_work_helper [btrfs]
  [ 2779.990466] Call Trace:
  [ 2779.990491]  ? __schedule+0x384/0xa30
  [ 2779.990521]  schedule+0x33/0xe0
  [ 2779.990616]  btrfs_tree_read_lock+0x19e/0x2e0 [btrfs]
  [ 2779.990632]  ? remove_wait_queue+0x60/0x60
  [ 2779.990730]  btrfs_read_lock_root_node+0x2f/0x40 [btrfs]
  [ 2779.990782]  btrfs_search_slot+0x510/0x1000 [btrfs]
  [ 2779.990869]  btrfs_lookup_file_extent+0x4a/0x70 [btrfs]
  [ 2779.990944]  __btrfs_drop_extents+0x161/0x1060 [btrfs]
  [ 2779.990987]  ? mark_held_locks+0x6d/0xc0
  [ 2779.990994]  ? __slab_alloc.isra.49+0x99/0x100
  [ 2779.991060]  ? insert_reserved_file_extent.constprop.19+0x64/0x300 [btrfs]
  [ 2779.991145]  insert_reserved_file_extent.constprop.19+0x97/0x300 [btrfs]
  [ 2779.991222]  ? start_transaction+0xdd/0x5c0 [btrfs]
  [ 2779.991291]  btrfs_finish_ordered_io+0x4f4/0x840 [btrfs]
  [ 2779.991405]  btrfs_work_helper+0xaa/0x720 [btrfs]
  [ 2779.991432]  process_one_work+0x26d/0x6a0
  [ 2779.991460]  worker_thread+0x4f/0x3e0
  [ 2779.991481]  ? process_one_work+0x6a0/0x6a0
  [ 2779.991489]  kthread+0x103/0x140
  [ 2779.991499]  ? kthread_create_worker_on_cpu+0x70/0x70
  [ 2779.991515]  ret_from_fork+0x3a/0x50
  (...)
  [ 2780.026211] INFO: task fsstress:17375 blocked for more than 120 seconds.
  [ 2780.027480]       Not tainted 5.6.0-rc2-btrfs-next-53 #1
  [ 2780.028482] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
  [ 2780.030035] fsstress        D    0 17375  17373 0x00004000
  [ 2780.030038] Call Trace:
  [ 2780.030044]  ? __schedule+0x384/0xa30
  [ 2780.030052]  schedule+0x33/0xe0
  [ 2780.030075]  lock_extent_bits+0x20c/0x320 [btrfs]
  [ 2780.030094]  ? btrfs_truncate_inode_items+0xf4/0x1150 [btrfs]
  [ 2780.030098]  ? rcu_read_lock_sched_held+0x59/0xa0
  [ 2780.030102]  ? remove_wait_queue+0x60/0x60
  [ 2780.030122]  btrfs_truncate_inode_items+0x133/0x1150 [btrfs]
  [ 2780.030151]  ? btrfs_set_path_blocking+0xb2/0x160 [btrfs]
  [ 2780.030165]  ? btrfs_search_slot+0x379/0x1000 [btrfs]
  [ 2780.030195]  btrfs_log_changed_extents.isra.8+0x841/0x93e [btrfs]
  [ 2780.030202]  ? do_raw_spin_unlock+0x49/0xc0
  [ 2780.030215]  ? btrfs_get_num_csums+0x10/0x10 [btrfs]
  [ 2780.030239]  btrfs_log_inode+0xf83/0x1124 [btrfs]
  [ 2780.030251]  ? __mutex_unlock_slowpath+0x45/0x2a0
  [ 2780.030275]  btrfs_log_inode_parent+0x2a0/0xe40 [btrfs]
  [ 2780.030282]  ? dget_parent+0xa1/0x370
  [ 2780.030309]  btrfs_log_dentry_safe+0x4a/0x70 [btrfs]
  [ 2780.030329]  btrfs_sync_file+0x3f3/0x490 [btrfs]
  [ 2780.030339]  do_fsync+0x38/0x60
  [ 2780.030343]  __x64_sys_fdatasync+0x13/0x20
  [ 2780.030345]  do_syscall_64+0x5c/0x280
  [ 2780.030348]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
  [ 2780.030356] RIP: 0033:0x7f2d80f6d5f0
  [ 2780.030361] Code: Bad RIP value.
  [ 2780.030362] RSP: 002b:00007ffdba3c8548 EFLAGS: 00000246 ORIG_RAX: 000000000000004b
  [ 2780.030364] RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00007f2d80f6d5f0
  [ 2780.030365] RDX: 00007ffdba3c84b0 RSI: 00007ffdba3c84b0 RDI: 0000000000000003
  [ 2780.030367] RBP: 000000000000004a R08: 0000000000000001 R09: 00007ffdba3c855c
  [ 2780.030368] R10: 0000000000000078 R11: 0000000000000246 R12: 00000000000001f4
  [ 2780.030369] R13: 0000000051eb851f R14: 00007ffdba3c85f0 R15: 0000557a49220d90

So fix this by making btrfs_truncate_inode_items() not lock the range in
the inode's iotree when the target root is a log root. Locking the range is
unnecessary for log roots, because the protection provided by the inode's
lock and the log_mutex is all that's needed.

Fixes: 28553fa992cb28 ("Btrfs: fix race between shrinking truncate and fiemap")
CC: stable@vger.kernel.org # 4.4+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
1 parent e75fd33
History
File Mode Size
842
crypto
dim
fonts
kunit
livepatch
lz4
lzo
math
mpi
raid6
reed_solomon
vdso
xz
zlib_deflate
zlib_inflate
zstd
.gitignore -rw-r--r-- 98 bytes
Kconfig -rw-r--r-- 14.7 KB
Kconfig.debug -rw-r--r-- 71.2 KB
Kconfig.kasan -rw-r--r-- 5.9 KB
Kconfig.kgdb -rw-r--r-- 4.1 KB
Kconfig.ubsan -rw-r--r-- 1.7 KB
Makefile -rw-r--r-- 9.6 KB
argv_split.c -rw-r--r-- 2.1 KB
ashldi3.c -rw-r--r-- 541 bytes
ashrdi3.c -rw-r--r-- 565 bytes
asn1_decoder.c -rw-r--r-- 13.2 KB
assoc_array.c -rw-r--r-- 51.8 KB
atomic64.c -rw-r--r-- 4.5 KB
atomic64_test.c -rw-r--r-- 6.4 KB
audit.c -rw-r--r-- 1.8 KB
bcd.c -rw-r--r-- 297 bytes
bch.c -rw-r--r-- 36.1 KB
bitmap.c -rw-r--r-- 36.3 KB
bitrev.c -rw-r--r-- 1.9 KB
bsearch.c -rw-r--r-- 1.4 KB
btree.c -rw-r--r-- 19.2 KB
bucket_locks.c -rw-r--r-- 1.4 KB
bug.c -rw-r--r-- 5.8 KB
build_OID_registry -rwxr-xr-x 4.5 KB
bust_spinlocks.c -rw-r--r-- 676 bytes
check_signature.c -rw-r--r-- 635 bytes
checksum.c -rw-r--r-- 4.8 KB
clz_ctz.c -rw-r--r-- 1.2 KB
clz_tab.c -rw-r--r-- 891 bytes
cmdline.c -rw-r--r-- 5.1 KB
cmpdi2.c -rw-r--r-- 501 bytes
compat_audit.c -rw-r--r-- 832 bytes
cpu_rmap.c -rw-r--r-- 7.6 KB
cpumask.c -rw-r--r-- 6.0 KB
crc-ccitt.c -rw-r--r-- 5.6 KB
crc-itu-t.c -rw-r--r-- 2.7 KB
crc-t10dif.c -rw-r--r-- 2.9 KB
crc16.c -rw-r--r-- 2.7 KB
crc32.c -rw-r--r-- 9.3 KB
crc32defs.h -rw-r--r-- 1.6 KB
crc32test.c -rw-r--r-- 37.5 KB
crc4.c -rw-r--r-- 1003 bytes
crc64.c -rw-r--r-- 1.7 KB
crc7.c -rw-r--r-- 2.5 KB
crc8.c -rw-r--r-- 2.4 KB
ctype.c -rw-r--r-- 1.4 KB
debug_info.c -rw-r--r-- 777 bytes
debug_locks.c -rw-r--r-- 1.2 KB
debugobjects.c -rw-r--r-- 34.2 KB
dec_and_lock.c -rw-r--r-- 1.2 KB
decompress.c -rw-r--r-- 1.7 KB
decompress_bunzip2.c -rw-r--r-- 23.5 KB
decompress_inflate.c -rw-r--r-- 4.5 KB
decompress_unlz4.c -rw-r--r-- 4.0 KB
decompress_unlzma.c -rw-r--r-- 15.8 KB
decompress_unlzo.c -rw-r--r-- 6.4 KB
decompress_unxz.c -rw-r--r-- 10.9 KB
devres.c -rw-r--r-- 12.3 KB
digsig.c -rw-r--r-- 5.5 KB
dump_stack.c -rw-r--r-- 3.2 KB
dynamic_debug.c -rw-r--r-- 26.1 KB
dynamic_queue_limits.c -rw-r--r-- 4.3 KB
earlycpio.c -rw-r--r-- 3.6 KB
errname.c -rw-r--r-- 3.6 KB
error-inject.c -rw-r--r-- 5.4 KB
errseq.c -rw-r--r-- 6.6 KB
extable.c -rw-r--r-- 3.0 KB
fault-inject.c -rw-r--r-- 5.8 KB
fdt.c -rw-r--r-- 69 bytes
fdt_empty_tree.c -rw-r--r-- 80 bytes
fdt_ro.c -rw-r--r-- 72 bytes
fdt_rw.c -rw-r--r-- 72 bytes
fdt_strerror.c -rw-r--r-- 78 bytes
fdt_sw.c -rw-r--r-- 72 bytes
fdt_wip.c -rw-r--r-- 73 bytes
find_bit.c -rw-r--r-- 5.4 KB
find_bit_benchmark.c -rw-r--r-- 3.9 KB
flex_proportions.c -rw-r--r-- 6.9 KB
gen_crc32table.c -rw-r--r-- 3.3 KB
gen_crc64table.c -rw-r--r-- 1.4 KB
genalloc.c -rw-r--r-- 26.1 KB
generic-radix-tree.c -rw-r--r-- 5.3 KB
glob.c -rw-r--r-- 3.5 KB
globtest.c -rw-r--r-- 4.2 KB
hexdump.c -rw-r--r-- 7.3 KB
hweight.c -rw-r--r-- 1.9 KB
idr.c -rw-r--r-- 17.3 KB
inflate.c -rw-r--r-- 38.7 KB
interval_tree.c -rw-r--r-- 540 bytes
interval_tree_test.c -rw-r--r-- 3.4 KB
iomap.c -rw-r--r-- 9.1 KB
iomap_copy.c -rw-r--r-- 2.2 KB
iommu-helper.c -rw-r--r-- 755 bytes
ioremap.c -rw-r--r-- 6.1 KB
iov_iter.c -rw-r--r-- 42.0 KB
irq_poll.c -rw-r--r-- 5.4 KB
irq_regs.c -rw-r--r-- 394 bytes
is_single_threaded.c -rw-r--r-- 1.2 KB
kasprintf.c -rw-r--r-- 1.4 KB
kfifo.c -rw-r--r-- 12.1 KB
klist.c -rw-r--r-- 10.4 KB
kobject.c -rw-r--r-- 27.9 KB
kobject_uevent.c -rw-r--r-- 18.8 KB
kstrtox.c -rw-r--r-- 10.5 KB
kstrtox.h -rw-r--r-- 293 bytes
libcrc32c.c -rw-r--r-- 2.0 KB
list-test.c -rw-r--r-- 17.4 KB
list_debug.c -rw-r--r-- 1.8 KB
list_sort.c -rw-r--r-- 8.4 KB
llist.c -rw-r--r-- 2.5 KB
locking-selftest-hardirq.h -rw-r--r-- 246 bytes
locking-selftest-mutex.h -rw-r--r-- 159 bytes
locking-selftest-rlock-hardirq.h -rw-r--r-- 74 bytes
locking-selftest-rlock-softirq.h -rw-r--r-- 74 bytes
locking-selftest-rlock.h -rw-r--r-- 197 bytes
locking-selftest-rsem.h -rw-r--r-- 202 bytes
locking-selftest-rtmutex.h -rw-r--r-- 162 bytes
locking-selftest-softirq.h -rw-r--r-- 246 bytes
locking-selftest-spin-hardirq.h -rw-r--r-- 73 bytes
locking-selftest-spin-softirq.h -rw-r--r-- 73 bytes
locking-selftest-spin.h -rw-r--r-- 157 bytes
locking-selftest-wlock-hardirq.h -rw-r--r-- 74 bytes
locking-selftest-wlock-softirq.h -rw-r--r-- 74 bytes
locking-selftest-wlock.h -rw-r--r-- 197 bytes
locking-selftest-wsem.h -rw-r--r-- 202 bytes
locking-selftest.c -rw-r--r-- 43.7 KB
lockref.c -rw-r--r-- 4.5 KB
logic_pio.c -rw-r--r-- 8.4 KB
lru_cache.c -rw-r--r-- 18.8 KB
lshrdi3.c -rw-r--r-- 559 bytes
memcat_p.c -rw-r--r-- 753 bytes
memory-notifier-error-inject.c -rw-r--r-- 1.1 KB
memregion.c -rw-r--r-- 400 bytes
memweight.c -rw-r--r-- 1.0 KB
muldi3.c -rw-r--r-- 1.7 KB
net_utils.c -rw-r--r-- 640 bytes
netdev-notifier-error-inject.c -rw-r--r-- 1.5 KB
nlattr.c -rw-r--r-- 22.6 KB
nmi_backtrace.c -rw-r--r-- 3.0 KB
nodemask.c -rw-r--r-- 653 bytes
notifier-error-inject.c -rw-r--r-- 2.5 KB
notifier-error-inject.h -rw-r--r-- 653 bytes
objagg.c -rw-r--r-- 28.3 KB
of-reconfig-notifier-error-inject.c -rw-r--r-- 1.3 KB
oid_registry.c -rw-r--r-- 3.7 KB
once.c -rw-r--r-- 1.4 KB
packing.c -rw-r--r-- 6.5 KB
parman.c -rw-r--r-- 10.6 KB
parser.c -rw-r--r-- 8.1 KB
pci_iomap.c -rw-r--r-- 4.2 KB
percpu-refcount.c -rw-r--r-- 13.5 KB
percpu_counter.c -rw-r--r-- 5.8 KB
percpu_test.c -rw-r--r-- 3.2 KB
plist.c -rw-r--r-- 5.9 KB
pm-notifier-error-inject.c -rw-r--r-- 1.2 KB
radix-tree.c -rw-r--r-- 43.3 KB
random32.c -rw-r--r-- 12.8 KB
ratelimit.c -rw-r--r-- 1.6 KB
rbtree.c -rw-r--r-- 17.1 KB
rbtree_test.c -rw-r--r-- 9.4 KB
refcount.c -rw-r--r-- 4.8 KB
rhashtable.c -rw-r--r-- 29.4 KB
sbitmap.c -rw-r--r-- 16.3 KB
scatterlist.c -rw-r--r-- 25.2 KB
seq_buf.c -rw-r--r-- 9.9 KB
sg_pool.c -rw-r--r-- 4.2 KB
sg_split.c -rw-r--r-- 5.0 KB
sha1.c -rw-r--r-- 6.1 KB
show_mem.c -rw-r--r-- 1.1 KB
siphash.c -rw-r--r-- 12.0 KB
smp_processor_id.c -rw-r--r-- 1.4 KB
sort.c -rw-r--r-- 8.4 KB
stackdepot.c -rw-r--r-- 8.6 KB
stmp_device.c -rw-r--r-- 1.9 KB
string.c -rw-r--r-- 23.5 KB
string_helpers.c -rw-r--r-- 13.9 KB
strncpy_from_user.c -rw-r--r-- 3.2 KB
strnlen_user.c -rw-r--r-- 3.3 KB
syscall.c -rw-r--r-- 2.5 KB
test-kstrtox.c -rw-r--r-- 17.3 KB
test-string_helpers.c -rw-r--r-- 10.3 KB
test_bitfield.c -rw-r--r-- 4.3 KB
test_bitmap.c -rw-r--r-- 13.7 KB
test_blackhole_dev.c -rw-r--r-- 2.5 KB
test_bpf.c -rw-r--r-- 161.1 KB
test_debug_virtual.c -rw-r--r-- 981 bytes
test_firmware.c -rw-r--r-- 22.9 KB
test_hash.c -rw-r--r-- 6.3 KB
test_hexdump.c -rw-r--r-- 6.3 KB
test_ida.c -rw-r--r-- 4.3 KB
test_kasan.c -rw-r--r-- 16.8 KB
test_kmod.c -rw-r--r-- 30.0 KB
test_list_sort.c -rw-r--r-- 3.3 KB
test_memcat_p.c -rw-r--r-- 2.2 KB
test_meminit.c -rw-r--r-- 9.7 KB
test_module.c -rw-r--r-- 794 bytes
test_objagg.c -rw-r--r-- 24.6 KB
test_overflow.c -rw-r--r-- 22.3 KB
test_parman.c -rw-r--r-- 11.2 KB
test_printf.c -rw-r--r-- 16.0 KB
test_rhashtable.c -rw-r--r-- 20.0 KB
test_siphash.c -rw-r--r-- 7.5 KB
test_sort.c -rw-r--r-- 870 bytes
test_stackinit.c -rw-r--r-- 10.9 KB
test_static_key_base.c -rw-r--r-- 1.6 KB
test_static_keys.c -rw-r--r-- 5.6 KB
test_string.c -rw-r--r-- 3.8 KB
test_strscpy.c -rw-r--r-- 4.0 KB
test_sysctl.c -rw-r--r-- 3.7 KB
test_ubsan.c -rw-r--r-- 2.4 KB
test_user_copy.c -rw-r--r-- 9.1 KB
test_uuid.c -rw-r--r-- 3.4 KB
test_vmalloc.c -rw-r--r-- 10.6 KB
test_xarray.c -rw-r--r-- 42.0 KB
textsearch.c -rw-r--r-- 9.3 KB
timerqueue.c -rw-r--r-- 2.5 KB
ts_bm.c -rw-r--r-- 5.1 KB
ts_fsm.c -rw-r--r-- 10.4 KB
ts_kmp.c -rw-r--r-- 4.1 KB
ubsan.c -rw-r--r-- 10.0 KB
ubsan.h -rw-r--r-- 1.6 KB
ucmpdi2.c -rw-r--r-- 568 bytes
ucs2_string.c -rw-r--r-- 2.5 KB
usercopy.c -rw-r--r-- 2.0 KB
uuid.c -rw-r--r-- 2.6 KB
vsprintf.c -rw-r--r-- 80.9 KB
win_minmax.c -rw-r--r-- 3.4 KB
xarray.c -rw-r--r-- 52.8 KB
xxhash.c -rw-r--r-- 12.7 KB

back to top