Revision a9ce385344f916cd1c36a33905e564f5581beae9 authored by Jens Axboe on 15 September 2023, 19:14:23 UTC, committed by Mike Snitzer on 15 September 2023, 19:39:59 UTC
dm looks up the table for IO based on the request type, with an assumption that if the request is marked REQ_NOWAIT, it's fine to attempt to submit that IO while under RCU read lock protection. This is not OK, as REQ_NOWAIT just means that we should not be sleeping waiting on other IO, it does not mean that we can't potentially schedule.

A simple test case demonstrates this quite nicely:

    int main(int argc, char *argv[])
    {
            struct iovec iov;
            int fd;

            fd = open("/dev/dm-0", O_RDONLY | O_DIRECT);
            posix_memalign(&iov.iov_base, 4096, 4096);
            iov.iov_len = 4096;
            preadv2(fd, &iov, 1, 0, RWF_NOWAIT);
            return 0;
    }

which will instantly spew:

    BUG: sleeping function called from invalid context at include/linux/sched/mm.h:306
    in_atomic(): 0, irqs_disabled(): 0, non_block: 0, pid: 5580, name: dm-nowait
    preempt_count: 0, expected: 0
    RCU nest depth: 1, expected: 0
    INFO: lockdep is turned off.
    CPU: 7 PID: 5580 Comm: dm-nowait Not tainted 6.6.0-rc1-g39956d2dcd81 #132
    Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
    Call Trace:
    <TASK>
    dump_stack_lvl+0x11d/0x1b0
    __might_resched+0x3c3/0x5e0
    ? preempt_count_sub+0x150/0x150
    mempool_alloc+0x1e2/0x390
    ? mempool_resize+0x7d0/0x7d0
    ? lock_sync+0x190/0x190
    ? lock_release+0x4b7/0x670
    ? internal_get_user_pages_fast+0x868/0x2d40
    bio_alloc_bioset+0x417/0x8c0
    ? bvec_alloc+0x200/0x200
    ? internal_get_user_pages_fast+0xb8c/0x2d40
    bio_alloc_clone+0x53/0x100
    dm_submit_bio+0x27f/0x1a20
    ? lock_release+0x4b7/0x670
    ? blk_try_enter_queue+0x1a0/0x4d0
    ? dm_dax_direct_access+0x260/0x260
    ? rcu_is_watching+0x12/0xb0
    ? blk_try_enter_queue+0x1cc/0x4d0
    __submit_bio+0x239/0x310
    ? __bio_queue_enter+0x700/0x700
    ? kvm_clock_get_cycles+0x40/0x60
    ? ktime_get+0x285/0x470
    submit_bio_noacct_nocheck+0x4d9/0xb80
    ? should_fail_request+0x80/0x80
    ? preempt_count_sub+0x150/0x150
    ? lock_release+0x4b7/0x670
    ? __bio_add_page+0x143/0x2d0
    ? iov_iter_revert+0x27/0x360
    submit_bio_noacct+0x53e/0x1b30
    submit_bio_wait+0x10a/0x230
    ? submit_bio_wait_endio+0x40/0x40
    __blkdev_direct_IO_simple+0x4f8/0x780
    ? blkdev_bio_end_io+0x4c0/0x4c0
    ? stack_trace_save+0x90/0xc0
    ? __bio_clone+0x3c0/0x3c0
    ? lock_release+0x4b7/0x670
    ? lock_sync+0x190/0x190
    ? atime_needs_update+0x3bf/0x7e0
    ? timestamp_truncate+0x21b/0x2d0
    ? inode_owner_or_capable+0x240/0x240
    blkdev_direct_IO.part.0+0x84a/0x1810
    ? rcu_is_watching+0x12/0xb0
    ? lock_release+0x4b7/0x670
    ? blkdev_read_iter+0x40d/0x530
    ? reacquire_held_locks+0x4e0/0x4e0
    ? __blkdev_direct_IO_simple+0x780/0x780
    ? rcu_is_watching+0x12/0xb0
    ? __mark_inode_dirty+0x297/0xd50
    ? preempt_count_add+0x72/0x140
    blkdev_read_iter+0x2a4/0x530
    do_iter_readv_writev+0x2f2/0x3c0
    ? generic_copy_file_range+0x1d0/0x1d0
    ? fsnotify_perm.part.0+0x25d/0x630
    ? security_file_permission+0xd8/0x100
    do_iter_read+0x31b/0x880
    ? import_iovec+0x10b/0x140
    vfs_readv+0x12d/0x1a0
    ? vfs_iter_read+0xb0/0xb0
    ? rcu_is_watching+0x12/0xb0
    ? rcu_is_watching+0x12/0xb0
    ? lock_release+0x4b7/0x670
    do_preadv+0x1b3/0x260
    ? do_readv+0x370/0x370
    __x64_sys_preadv2+0xef/0x150
    do_syscall_64+0x39/0xb0
    entry_SYSCALL_64_after_hwframe+0x63/0xcd
    RIP: 0033:0x7f5af41ad806
    Code: 41 54 41 89 fc 55 44 89 c5 53 48 89 cb 48 83 ec 18 80 3d e4 dd 0d 00 00 74 7a 45 89 c1 49 89 ca 45 31 c0 b8 47 01 00 00 0f 05 <48> 3d 00 f0 ff ff 0f 87 be 00 00 00 48 85 c0 79 4a 48 8b 0d da 55
    RSP: 002b:00007ffd3145c7f0 EFLAGS: 00000246 ORIG_RAX: 0000000000000147
    RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f5af41ad806
    RDX: 0000000000000001 RSI: 00007ffd3145c850 RDI: 0000000000000003
    RBP: 0000000000000008 R08: 0000000000000000 R09: 0000000000000008
    R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000003
    R13: 00007ffd3145c850 R14: 000055f5f0431dd8 R15: 0000000000000001
    </TASK>

where in fact it is dm itself that attempts to allocate a bio clone with GFP_NOIO under the rcu read lock, regardless of the request type.

Fix this by getting rid of the special casing for REQ_NOWAIT, and just use the normal SRCU protected table lookup.
Get rid of the bio based table locking helpers at the same time, as they are now unused.

Cc: stable@vger.kernel.org
Fixes: 563a225c9fd2 ("dm: introduce dm_{get,put}_live_table_bio called from dm_submit_bio")
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Mike Snitzer <snitzer@kernel.org>
1 parent f6007dc
File | Mode | Size |
---|---|---|
damon | ||
kasan | ||
kfence | ||
kmsan | ||
Kconfig | -rw-r--r-- | 39.3 KB |
Kconfig.debug | -rw-r--r-- | 10.3 KB |
Makefile | -rw-r--r-- | 5.1 KB |
backing-dev.c | -rw-r--r-- | 27.6 KB |
balloon_compaction.c | -rw-r--r-- | 8.2 KB |
bootmem_info.c | -rw-r--r-- | 3.4 KB |
cma.c | -rw-r--r-- | 15.7 KB |
cma.h | -rw-r--r-- | 1.4 KB |
cma_debug.c | -rw-r--r-- | 4.5 KB |
cma_sysfs.c | -rw-r--r-- | 2.4 KB |
compaction.c | -rw-r--r-- | 88.7 KB |
debug.c | -rw-r--r-- | 7.0 KB |
debug_page_alloc.c | -rw-r--r-- | 1.6 KB |
debug_page_ref.c | -rw-r--r-- | 1.4 KB |
debug_vm_pgtable.c | -rw-r--r-- | 40.1 KB |
dmapool.c | -rw-r--r-- | 13.1 KB |
dmapool_test.c | -rw-r--r-- | 2.9 KB |
early_ioremap.c | -rw-r--r-- | 6.7 KB |
fadvise.c | -rw-r--r-- | 5.5 KB |
fail_page_alloc.c | -rw-r--r-- | 1.6 KB |
failslab.c | -rw-r--r-- | 1.6 KB |
filemap.c | -rw-r--r-- | 120.1 KB |
folio-compat.c | -rw-r--r-- | 3.1 KB |
gup.c | -rw-r--r-- | 96.1 KB |
gup_test.c | -rw-r--r-- | 9.0 KB |
gup_test.h | -rw-r--r-- | 1.2 KB |
highmem.c | -rw-r--r-- | 20.2 KB |
hmm.c | -rw-r--r-- | 17.1 KB |
huge_memory.c | -rw-r--r-- | 87.4 KB |
hugetlb.c | -rw-r--r-- | 205.0 KB |
hugetlb_cgroup.c | -rw-r--r-- | 24.7 KB |
hugetlb_vmemmap.c | -rw-r--r-- | 16.6 KB |
hugetlb_vmemmap.h | -rw-r--r-- | 1.6 KB |
hwpoison-inject.c | -rw-r--r-- | 2.8 KB |
init-mm.c | -rw-r--r-- | 1.8 KB |
internal.h | -rw-r--r-- | 35.5 KB |
interval_tree.c | -rw-r--r-- | 3.1 KB |
io-mapping.c | -rw-r--r-- | 993 bytes |
ioremap.c | -rw-r--r-- | 1.7 KB |
khugepaged.c | -rw-r--r-- | 71.4 KB |
kmemleak.c | -rw-r--r-- | 61.1 KB |
ksm.c | -rw-r--r-- | 96.9 KB |
list_lru.c | -rw-r--r-- | 13.8 KB |
maccess.c | -rw-r--r-- | 5.9 KB |
madvise.c | -rw-r--r-- | 38.9 KB |
mapping_dirty_helpers.c | -rw-r--r-- | 10.4 KB |
memblock.c | -rw-r--r-- | 62.9 KB |
memcontrol.c | -rw-r--r-- | 203.4 KB |
memfd.c | -rw-r--r-- | 9.8 KB |
memory-failure.c | -rw-r--r-- | 72.5 KB |
memory-tiers.c | -rw-r--r-- | 18.2 KB |
memory.c | -rw-r--r-- | 166.4 KB |
memory_hotplug.c | -rw-r--r-- | 66.3 KB |
mempolicy.c | -rw-r--r-- | 79.0 KB |
mempool.c | -rw-r--r-- | 16.1 KB |
memremap.c | -rw-r--r-- | 15.0 KB |
memtest.c | -rw-r--r-- | 3.4 KB |
migrate.c | -rw-r--r-- | 68.5 KB |
migrate_device.c | -rw-r--r-- | 26.8 KB |
mincore.c | -rw-r--r-- | 7.1 KB |
mlock.c | -rw-r--r-- | 19.6 KB |
mm_init.c | -rw-r--r-- | 78.4 KB |
mm_slot.h | -rw-r--r-- | 1.4 KB |
mmap.c | -rw-r--r-- | 103.7 KB |
mmap_lock.c | -rw-r--r-- | 6.2 KB |
mmu_gather.c | -rw-r--r-- | 9.9 KB |
mmu_notifier.c | -rw-r--r-- | 34.5 KB |
mmzone.c | -rw-r--r-- | 2.5 KB |
mprotect.c | -rw-r--r-- | 22.6 KB |
mremap.c | -rw-r--r-- | 29.1 KB |
msync.c | -rw-r--r-- | 2.9 KB |
nommu.c | -rw-r--r-- | 44.4 KB |
oom_kill.c | -rw-r--r-- | 33.2 KB |
page-writeback.c | -rw-r--r-- | 93.5 KB |
page_alloc.c | -rw-r--r-- | 185.1 KB |
page_counter.c | -rw-r--r-- | 6.8 KB |
page_ext.c | -rw-r--r-- | 13.6 KB |
page_idle.c | -rw-r--r-- | 5.3 KB |
page_io.c | -rw-r--r-- | 13.6 KB |
page_isolation.c | -rw-r--r-- | 21.4 KB |
page_owner.c | -rw-r--r-- | 18.0 KB |
page_poison.c | -rw-r--r-- | 2.5 KB |
page_reporting.c | -rw-r--r-- | 11.6 KB |
page_reporting.h | -rw-r--r-- | 1.6 KB |
page_table_check.c | -rw-r--r-- | 5.8 KB |
page_vma_mapped.c | -rw-r--r-- | 9.2 KB |
pagewalk.c | -rw-r--r-- | 17.5 KB |
percpu-internal.h | -rw-r--r-- | 7.3 KB |
percpu-km.c | -rw-r--r-- | 3.2 KB |
percpu-stats.c | -rw-r--r-- | 5.8 KB |
percpu-vm.c | -rw-r--r-- | 11.7 KB |
percpu.c | -rw-r--r-- | 102.0 KB |
pgalloc-track.h | -rw-r--r-- | 1.3 KB |
pgtable-generic.c | -rw-r--r-- | 11.1 KB |
process_vm_access.c | -rw-r--r-- | 8.2 KB |
ptdump.c | -rw-r--r-- | 4.2 KB |
readahead.c | -rw-r--r-- | 25.4 KB |
rmap.c | -rw-r--r-- | 73.5 KB |
rodata_test.c | -rw-r--r-- | 1.2 KB |
secretmem.c | -rw-r--r-- | 6.4 KB |
shmem.c | -rw-r--r-- | 127.3 KB |
shmem_quota.c | -rw-r--r-- | 9.4 KB |
show_mem.c | -rw-r--r-- | 11.9 KB |
shrinker_debug.c | -rw-r--r-- | 6.4 KB |
shuffle.c | -rw-r--r-- | 4.6 KB |
shuffle.h | -rw-r--r-- | 1.2 KB |
slab.c | -rw-r--r-- | 101.4 KB |
slab.h | -rw-r--r-- | 23.5 KB |
slab_common.c | -rw-r--r-- | 39.5 KB |
slub.c | -rw-r--r-- | 160.8 KB |
sparse-vmemmap.c | -rw-r--r-- | 11.9 KB |
sparse.c | -rw-r--r-- | 25.6 KB |
swap.c | -rw-r--r-- | 30.9 KB |
swap.h | -rw-r--r-- | 3.9 KB |
swap_cgroup.c | -rw-r--r-- | 5.2 KB |
swap_slots.c | -rw-r--r-- | 9.2 KB |
swap_state.c | -rw-r--r-- | 24.0 KB |
swapfile.c | -rw-r--r-- | 91.8 KB |
truncate.c | -rw-r--r-- | 25.8 KB |
usercopy.c | -rw-r--r-- | 8.1 KB |
userfaultfd.c | -rw-r--r-- | 21.0 KB |
util.c | -rw-r--r-- | 28.7 KB |
vmalloc.c | -rw-r--r-- | 116.1 KB |
vmpressure.c | -rw-r--r-- | 14.1 KB |
vmscan.c | -rw-r--r-- | 222.4 KB |
vmstat.c | -rw-r--r-- | 55.5 KB |
workingset.c | -rw-r--r-- | 26.6 KB |
z3fold.c | -rw-r--r-- | 36.8 KB |
zbud.c | -rw-r--r-- | 12.8 KB |
zpool.c | -rw-r--r-- | 9.9 KB |
zsmalloc.c | -rw-r--r-- | 56.1 KB |
zswap.c | -rw-r--r-- | 42.1 KB |
Computing file changes ...