Revision a9ce385344f916cd1c36a33905e564f5581beae9 authored by Jens Axboe on 15 September 2023, 19:14:23 UTC, committed by Mike Snitzer on 15 September 2023, 19:39:59 UTC
dm looks up the table for IO based on the request type, with an
assumption that if the request is marked REQ_NOWAIT, it's fine to
attempt to submit that IO while under RCU read lock protection. This
is not OK, as REQ_NOWAIT just means that we should not be sleeping
waiting on other IO; it does not mean that we can't potentially
schedule.

A simple test case demonstrates this quite nicely:

int main(int argc, char *argv[])
{
        struct iovec iov;
        int fd;

        fd = open("/dev/dm-0", O_RDONLY | O_DIRECT);
        posix_memalign(&iov.iov_base, 4096, 4096);
        iov.iov_len = 4096;
        preadv2(fd, &iov, 1, 0, RWF_NOWAIT);
        return 0;
}

which will instantly spew:

BUG: sleeping function called from invalid context at include/linux/sched/mm.h:306
in_atomic(): 0, irqs_disabled(): 0, non_block: 0, pid: 5580, name: dm-nowait
preempt_count: 0, expected: 0
RCU nest depth: 1, expected: 0
INFO: lockdep is turned off.
CPU: 7 PID: 5580 Comm: dm-nowait Not tainted 6.6.0-rc1-g39956d2dcd81 #132
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
Call Trace:
 <TASK>
 dump_stack_lvl+0x11d/0x1b0
 __might_resched+0x3c3/0x5e0
 ? preempt_count_sub+0x150/0x150
 mempool_alloc+0x1e2/0x390
 ? mempool_resize+0x7d0/0x7d0
 ? lock_sync+0x190/0x190
 ? lock_release+0x4b7/0x670
 ? internal_get_user_pages_fast+0x868/0x2d40
 bio_alloc_bioset+0x417/0x8c0
 ? bvec_alloc+0x200/0x200
 ? internal_get_user_pages_fast+0xb8c/0x2d40
 bio_alloc_clone+0x53/0x100
 dm_submit_bio+0x27f/0x1a20
 ? lock_release+0x4b7/0x670
 ? blk_try_enter_queue+0x1a0/0x4d0
 ? dm_dax_direct_access+0x260/0x260
 ? rcu_is_watching+0x12/0xb0
 ? blk_try_enter_queue+0x1cc/0x4d0
 __submit_bio+0x239/0x310
 ? __bio_queue_enter+0x700/0x700
 ? kvm_clock_get_cycles+0x40/0x60
 ? ktime_get+0x285/0x470
 submit_bio_noacct_nocheck+0x4d9/0xb80
 ? should_fail_request+0x80/0x80
 ? preempt_count_sub+0x150/0x150
 ? lock_release+0x4b7/0x670
 ? __bio_add_page+0x143/0x2d0
 ? iov_iter_revert+0x27/0x360
 submit_bio_noacct+0x53e/0x1b30
 submit_bio_wait+0x10a/0x230
 ? submit_bio_wait_endio+0x40/0x40
 __blkdev_direct_IO_simple+0x4f8/0x780
 ? blkdev_bio_end_io+0x4c0/0x4c0
 ? stack_trace_save+0x90/0xc0
 ? __bio_clone+0x3c0/0x3c0
 ? lock_release+0x4b7/0x670
 ? lock_sync+0x190/0x190
 ? atime_needs_update+0x3bf/0x7e0
 ? timestamp_truncate+0x21b/0x2d0
 ? inode_owner_or_capable+0x240/0x240
 blkdev_direct_IO.part.0+0x84a/0x1810
 ? rcu_is_watching+0x12/0xb0
 ? lock_release+0x4b7/0x670
 ? blkdev_read_iter+0x40d/0x530
 ? reacquire_held_locks+0x4e0/0x4e0
 ? __blkdev_direct_IO_simple+0x780/0x780
 ? rcu_is_watching+0x12/0xb0
 ? __mark_inode_dirty+0x297/0xd50
 ? preempt_count_add+0x72/0x140
 blkdev_read_iter+0x2a4/0x530
 do_iter_readv_writev+0x2f2/0x3c0
 ? generic_copy_file_range+0x1d0/0x1d0
 ? fsnotify_perm.part.0+0x25d/0x630
 ? security_file_permission+0xd8/0x100
 do_iter_read+0x31b/0x880
 ? import_iovec+0x10b/0x140
 vfs_readv+0x12d/0x1a0
 ? vfs_iter_read+0xb0/0xb0
 ? rcu_is_watching+0x12/0xb0
 ? rcu_is_watching+0x12/0xb0
 ? lock_release+0x4b7/0x670
 do_preadv+0x1b3/0x260
 ? do_readv+0x370/0x370
 __x64_sys_preadv2+0xef/0x150
 do_syscall_64+0x39/0xb0
 entry_SYSCALL_64_after_hwframe+0x63/0xcd
RIP: 0033:0x7f5af41ad806
Code: 41 54 41 89 fc 55 44 89 c5 53 48 89 cb 48 83 ec 18 80 3d e4 dd 0d 00 00 74 7a 45 89 c1 49 89 ca 45 31 c0 b8 47 01 00 00 0f 05 <48> 3d 00 f0 ff ff 0f 87 be 00 00 00 48 85 c0 79 4a 48 8b 0d da 55
RSP: 002b:00007ffd3145c7f0 EFLAGS: 00000246 ORIG_RAX: 0000000000000147
RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f5af41ad806
RDX: 0000000000000001 RSI: 00007ffd3145c850 RDI: 0000000000000003
RBP: 0000000000000008 R08: 0000000000000000 R09: 0000000000000008
R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000003
R13: 00007ffd3145c850 R14: 000055f5f0431dd8 R15: 0000000000000001
 </TASK>

where in fact it is dm itself that attempts to allocate a bio clone with
GFP_NOIO under the RCU read lock, regardless of the request type.

Fix this by getting rid of the special casing for REQ_NOWAIT, and just
use the normal SRCU-protected table lookup. Get rid of the bio-based
table locking helpers at the same time, as they are now unused.

Cc: stable@vger.kernel.org
Fixes: 563a225c9fd2 ("dm: introduce dm_{get,put}_live_table_bio called from dm_submit_bio")
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Mike Snitzer <snitzer@kernel.org>
1 parent f6007dc
History
File Mode Size
damon
kasan
kfence
kmsan
Kconfig -rw-r--r-- 39.3 KB
Kconfig.debug -rw-r--r-- 10.3 KB
Makefile -rw-r--r-- 5.1 KB
backing-dev.c -rw-r--r-- 27.6 KB
balloon_compaction.c -rw-r--r-- 8.2 KB
bootmem_info.c -rw-r--r-- 3.4 KB
cma.c -rw-r--r-- 15.7 KB
cma.h -rw-r--r-- 1.4 KB
cma_debug.c -rw-r--r-- 4.5 KB
cma_sysfs.c -rw-r--r-- 2.4 KB
compaction.c -rw-r--r-- 88.7 KB
debug.c -rw-r--r-- 7.0 KB
debug_page_alloc.c -rw-r--r-- 1.6 KB
debug_page_ref.c -rw-r--r-- 1.4 KB
debug_vm_pgtable.c -rw-r--r-- 40.1 KB
dmapool.c -rw-r--r-- 13.1 KB
dmapool_test.c -rw-r--r-- 2.9 KB
early_ioremap.c -rw-r--r-- 6.7 KB
fadvise.c -rw-r--r-- 5.5 KB
fail_page_alloc.c -rw-r--r-- 1.6 KB
failslab.c -rw-r--r-- 1.6 KB
filemap.c -rw-r--r-- 120.1 KB
folio-compat.c -rw-r--r-- 3.1 KB
gup.c -rw-r--r-- 96.1 KB
gup_test.c -rw-r--r-- 9.0 KB
gup_test.h -rw-r--r-- 1.2 KB
highmem.c -rw-r--r-- 20.2 KB
hmm.c -rw-r--r-- 17.1 KB
huge_memory.c -rw-r--r-- 87.4 KB
hugetlb.c -rw-r--r-- 205.0 KB
hugetlb_cgroup.c -rw-r--r-- 24.7 KB
hugetlb_vmemmap.c -rw-r--r-- 16.6 KB
hugetlb_vmemmap.h -rw-r--r-- 1.6 KB
hwpoison-inject.c -rw-r--r-- 2.8 KB
init-mm.c -rw-r--r-- 1.8 KB
internal.h -rw-r--r-- 35.5 KB
interval_tree.c -rw-r--r-- 3.1 KB
io-mapping.c -rw-r--r-- 993 bytes
ioremap.c -rw-r--r-- 1.7 KB
khugepaged.c -rw-r--r-- 71.4 KB
kmemleak.c -rw-r--r-- 61.1 KB
ksm.c -rw-r--r-- 96.9 KB
list_lru.c -rw-r--r-- 13.8 KB
maccess.c -rw-r--r-- 5.9 KB
madvise.c -rw-r--r-- 38.9 KB
mapping_dirty_helpers.c -rw-r--r-- 10.4 KB
memblock.c -rw-r--r-- 62.9 KB
memcontrol.c -rw-r--r-- 203.4 KB
memfd.c -rw-r--r-- 9.8 KB
memory-failure.c -rw-r--r-- 72.5 KB
memory-tiers.c -rw-r--r-- 18.2 KB
memory.c -rw-r--r-- 166.4 KB
memory_hotplug.c -rw-r--r-- 66.3 KB
mempolicy.c -rw-r--r-- 79.0 KB
mempool.c -rw-r--r-- 16.1 KB
memremap.c -rw-r--r-- 15.0 KB
memtest.c -rw-r--r-- 3.4 KB
migrate.c -rw-r--r-- 68.5 KB
migrate_device.c -rw-r--r-- 26.8 KB
mincore.c -rw-r--r-- 7.1 KB
mlock.c -rw-r--r-- 19.6 KB
mm_init.c -rw-r--r-- 78.4 KB
mm_slot.h -rw-r--r-- 1.4 KB
mmap.c -rw-r--r-- 103.7 KB
mmap_lock.c -rw-r--r-- 6.2 KB
mmu_gather.c -rw-r--r-- 9.9 KB
mmu_notifier.c -rw-r--r-- 34.5 KB
mmzone.c -rw-r--r-- 2.5 KB
mprotect.c -rw-r--r-- 22.6 KB
mremap.c -rw-r--r-- 29.1 KB
msync.c -rw-r--r-- 2.9 KB
nommu.c -rw-r--r-- 44.4 KB
oom_kill.c -rw-r--r-- 33.2 KB
page-writeback.c -rw-r--r-- 93.5 KB
page_alloc.c -rw-r--r-- 185.1 KB
page_counter.c -rw-r--r-- 6.8 KB
page_ext.c -rw-r--r-- 13.6 KB
page_idle.c -rw-r--r-- 5.3 KB
page_io.c -rw-r--r-- 13.6 KB
page_isolation.c -rw-r--r-- 21.4 KB
page_owner.c -rw-r--r-- 18.0 KB
page_poison.c -rw-r--r-- 2.5 KB
page_reporting.c -rw-r--r-- 11.6 KB
page_reporting.h -rw-r--r-- 1.6 KB
page_table_check.c -rw-r--r-- 5.8 KB
page_vma_mapped.c -rw-r--r-- 9.2 KB
pagewalk.c -rw-r--r-- 17.5 KB
percpu-internal.h -rw-r--r-- 7.3 KB
percpu-km.c -rw-r--r-- 3.2 KB
percpu-stats.c -rw-r--r-- 5.8 KB
percpu-vm.c -rw-r--r-- 11.7 KB
percpu.c -rw-r--r-- 102.0 KB
pgalloc-track.h -rw-r--r-- 1.3 KB
pgtable-generic.c -rw-r--r-- 11.1 KB
process_vm_access.c -rw-r--r-- 8.2 KB
ptdump.c -rw-r--r-- 4.2 KB
readahead.c -rw-r--r-- 25.4 KB
rmap.c -rw-r--r-- 73.5 KB
rodata_test.c -rw-r--r-- 1.2 KB
secretmem.c -rw-r--r-- 6.4 KB
shmem.c -rw-r--r-- 127.3 KB
shmem_quota.c -rw-r--r-- 9.4 KB
show_mem.c -rw-r--r-- 11.9 KB
shrinker_debug.c -rw-r--r-- 6.4 KB
shuffle.c -rw-r--r-- 4.6 KB
shuffle.h -rw-r--r-- 1.2 KB
slab.c -rw-r--r-- 101.4 KB
slab.h -rw-r--r-- 23.5 KB
slab_common.c -rw-r--r-- 39.5 KB
slub.c -rw-r--r-- 160.8 KB
sparse-vmemmap.c -rw-r--r-- 11.9 KB
sparse.c -rw-r--r-- 25.6 KB
swap.c -rw-r--r-- 30.9 KB
swap.h -rw-r--r-- 3.9 KB
swap_cgroup.c -rw-r--r-- 5.2 KB
swap_slots.c -rw-r--r-- 9.2 KB
swap_state.c -rw-r--r-- 24.0 KB
swapfile.c -rw-r--r-- 91.8 KB
truncate.c -rw-r--r-- 25.8 KB
usercopy.c -rw-r--r-- 8.1 KB
userfaultfd.c -rw-r--r-- 21.0 KB
util.c -rw-r--r-- 28.7 KB
vmalloc.c -rw-r--r-- 116.1 KB
vmpressure.c -rw-r--r-- 14.1 KB
vmscan.c -rw-r--r-- 222.4 KB
vmstat.c -rw-r--r-- 55.5 KB
workingset.c -rw-r--r-- 26.6 KB
z3fold.c -rw-r--r-- 36.8 KB
zbud.c -rw-r--r-- 12.8 KB
zpool.c -rw-r--r-- 9.9 KB
zsmalloc.c -rw-r--r-- 56.1 KB
zswap.c -rw-r--r-- 42.1 KB

back to top