Revision 0f640dca08330dfc7820d610578e5935b5e654b2 authored by Mike Snitzer on 31 January 2013, 14:11:14 UTC, committed by Alasdair G Kergon on 31 January 2013, 14:11:14 UTC
thin_io_hints() is blindly copying the queue limits from the thin-pool, which can lead to incorrect limits being set. The fix here simply deletes the thin_io_hints() hook, which leaves the existing stacking infrastructure to set the limits correctly.

When a thin-pool uses an MD device for the data device, a thin device from the thin-pool must respect MD's constraints about disallowing a bio from spanning multiple chunks. Otherwise we can see problems. If the raid0 chunksize is 1152K and thin-pool chunksize is 256K I see the following md/raid0 error (with extra debug tracing added to thin_endio) when mkfs.xfs is executed against the thin device:

    md/raid0:md99: make_request bug: can't convert block across chunks or bigger than 1152k 6688 127
    device-mapper: thin: bio sector=2080 err=-5 bi_size=130560 bi_rw=17 bi_vcnt=32 bi_idx=0

This extra DM debugging shows that the failing bio is spanning across the first and second logical 1152K chunk (sector 2080 + 255 takes the bio beyond the first chunk's boundary of sector 2304). So the bio splitting that DM is doing clearly isn't respecting the MD limits.

max_hw_sectors_kb is 127 for both the thin-pool and thin device (queue_max_hw_sectors returns 255 so we'll excuse sysfs's lack of precision). So this explains why bi_size is 130560.

But the thin device's max_hw_sectors_kb should be 4 (PAGE_SIZE) given that it doesn't have a .merge function (for bio_add_page to consult indirectly via dm_merge_bvec) yet the thin-pool does sit above an MD device that has a compulsory merge_bvec_fn. This scenario is exactly why DM must resort to sending single PAGE_SIZE bios to the underlying layer. Some additional context for this is available in the header for commit 8cbeb67a ("dm: avoid unsupported spanning of md stripe boundaries").
Long story short, the reason a thin device doesn't properly get configured to have a max_hw_sectors_kb of 4 (PAGE_SIZE) is that thin_io_hints() is blindly copying the queue limits from the thin-pool device directly to the thin device's queue limits.

Fix this by eliminating thin_io_hints. Doing so is safe, because the block layer's queue limits stacking already enables the upper level thin device to inherit the thin-pool device's discard and minimum_io_size and optimal_io_size limits that get set in pool_io_hints. But avoiding the queue limits copy allows the thin and thin-pool limits to be different where it is important, namely max_hw_sectors_kb.

Reported-by: Daniel Browning <db@kavod.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@vger.kernel.org
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
1 parent 949db15
File | Mode | Size |
---|---|---|
debug | ||
events | ||
gcov | ||
irq | ||
power | ||
sched | ||
time | ||
trace | ||
.gitignore | -rw-r--r-- | 63 bytes |
Kconfig.freezer | -rw-r--r-- | 52 bytes |
Kconfig.hz | -rw-r--r-- | 1.7 KB |
Kconfig.locks | -rw-r--r-- | 4.3 KB |
Kconfig.preempt | -rw-r--r-- | 2.1 KB |
Makefile | -rw-r--r-- | 7.2 KB |
acct.c | -rw-r--r-- | 16.6 KB |
async.c | -rw-r--r-- | 10.9 KB |
audit.c | -rw-r--r-- | 40.7 KB |
audit.h | -rw-r--r-- | 5.8 KB |
audit_tree.c | -rw-r--r-- | 22.2 KB |
audit_watch.c | -rw-r--r-- | 13.9 KB |
auditfilter.c | -rw-r--r-- | 36.1 KB |
auditsc.c | -rw-r--r-- | 71.9 KB |
backtracetest.c | -rw-r--r-- | 2.1 KB |
bounds.c | -rw-r--r-- | 600 bytes |
capability.c | -rw-r--r-- | 11.7 KB |
cgroup.c | -rw-r--r-- | 147.5 KB |
cgroup_freezer.c | -rw-r--r-- | 12.9 KB |
compat.c | -rw-r--r-- | 30.9 KB |
configs.c | -rw-r--r-- | 2.8 KB |
context_tracking.c | -rw-r--r-- | 2.1 KB |
cpu.c | -rw-r--r-- | 17.4 KB |
cpu_pm.c | -rw-r--r-- | 6.5 KB |
cpuset.c | -rw-r--r-- | 73.8 KB |
crash_dump.c | -rw-r--r-- | 1.2 KB |
cred.c | -rw-r--r-- | 21.2 KB |
delayacct.c | -rw-r--r-- | 5.0 KB |
dma.c | -rw-r--r-- | 3.6 KB |
elfcore.c | -rw-r--r-- | 459 bytes |
exec_domain.c | -rw-r--r-- | 4.3 KB |
exit.c | -rw-r--r-- | 42.9 KB |
extable.c | -rw-r--r-- | 3.9 KB |
fork.c | -rw-r--r-- | 45.5 KB |
freezer.c | -rw-r--r-- | 4.0 KB |
futex.c | -rw-r--r-- | 71.2 KB |
futex_compat.c | -rw-r--r-- | 4.5 KB |
groups.c | -rw-r--r-- | 6.0 KB |
hrtimer.c | -rw-r--r-- | 47.0 KB |
hung_task.c | -rw-r--r-- | 5.3 KB |
irq_work.c | -rw-r--r-- | 3.1 KB |
itimer.c | -rw-r--r-- | 7.3 KB |
jump_label.c | -rw-r--r-- | 10.8 KB |
kallsyms.c | -rw-r--r-- | 14.7 KB |
kcmp.c | -rw-r--r-- | 4.3 KB |
kexec.c | -rw-r--r-- | 39.6 KB |
kfifo.c | -rw-r--r-- | 12.8 KB |
kmod.c | -rw-r--r-- | 18.9 KB |
kprobes.c | -rw-r--r-- | 59.1 KB |
ksysfs.c | -rw-r--r-- | 5.5 KB |
kthread.c | -rw-r--r-- | 16.9 KB |
latencytop.c | -rw-r--r-- | 7.6 KB |
lglock.c | -rw-r--r-- | 1.9 KB |
lockdep.c | -rw-r--r-- | 103.8 KB |
lockdep_internals.h | -rw-r--r-- | 4.5 KB |
lockdep_proc.c | -rw-r--r-- | 17.0 KB |
lockdep_states.h | -rw-r--r-- | 233 bytes |
modsign_certificate.S | -rw-r--r-- | 467 bytes |
modsign_pubkey.c | -rw-r--r-- | 2.6 KB |
module-internal.h | -rw-r--r-- | 495 bytes |
module.c | -rw-r--r-- | 95.9 KB |
module_signing.c | -rw-r--r-- | 5.9 KB |
mutex-debug.c | -rw-r--r-- | 2.9 KB |
mutex-debug.h | -rw-r--r-- | 1.7 KB |
mutex.c | -rw-r--r-- | 13.2 KB |
mutex.h | -rw-r--r-- | 1.3 KB |
notifier.c | -rw-r--r-- | 16.0 KB |
nsproxy.c | -rw-r--r-- | 6.3 KB |
padata.c | -rw-r--r-- | 27.1 KB |
panic.c | -rw-r--r-- | 11.2 KB |
params.c | -rw-r--r-- | 21.7 KB |
pid.c | -rw-r--r-- | 14.8 KB |
pid_namespace.c | -rw-r--r-- | 8.8 KB |
posix-cpu-timers.c | -rw-r--r-- | 40.7 KB |
posix-timers.c | -rw-r--r-- | 29.1 KB |
printk.c | -rw-r--r-- | 70.2 KB |
profile.c | -rw-r--r-- | 16.7 KB |
ptrace.c | -rw-r--r-- | 27.6 KB |
range.c | -rw-r--r-- | 2.9 KB |
rcu.h | -rw-r--r-- | 3.8 KB |
rcupdate.c | -rw-r--r-- | 11.8 KB |
rcutiny.c | -rw-r--r-- | 10.3 KB |
rcutiny_plugin.h | -rw-r--r-- | 31.6 KB |
rcutorture.c | -rw-r--r-- | 62.2 KB |
rcutree.c | -rw-r--r-- | 94.4 KB |
rcutree.h | -rw-r--r-- | 22.7 KB |
rcutree_plugin.h | -rw-r--r-- | 73.5 KB |
rcutree_trace.c | -rw-r--r-- | 12.9 KB |
relay.c | -rw-r--r-- | 32.8 KB |
res_counter.c | -rw-r--r-- | 4.3 KB |
resource.c | -rw-r--r-- | 27.8 KB |
rtmutex-debug.c | -rw-r--r-- | 4.7 KB |
rtmutex-debug.h | -rw-r--r-- | 1.4 KB |
rtmutex-tester.c | -rw-r--r-- | 8.7 KB |
rtmutex.c | -rw-r--r-- | 26.3 KB |
rtmutex.h | -rw-r--r-- | 1.1 KB |
rtmutex_common.h | -rw-r--r-- | 3.3 KB |
rwsem.c | -rw-r--r-- | 2.6 KB |
seccomp.c | -rw-r--r-- | 13.9 KB |
semaphore.c | -rw-r--r-- | 7.2 KB |
signal.c | -rw-r--r-- | 87.8 KB |
smp.c | -rw-r--r-- | 21.7 KB |
smpboot.c | -rw-r--r-- | 6.5 KB |
smpboot.h | -rw-r--r-- | 564 bytes |
softirq.c | -rw-r--r-- | 20.8 KB |
spinlock.c | -rw-r--r-- | 9.5 KB |
srcu.c | -rw-r--r-- | 19.6 KB |
stacktrace.c | -rw-r--r-- | 1.1 KB |
stop_machine.c | -rw-r--r-- | 15.5 KB |
sys.c | -rw-r--r-- | 52.2 KB |
sys_ni.c | -rw-r--r-- | 5.9 KB |
sysctl.c | -rw-r--r-- | 59.5 KB |
sysctl_binary.c | -rw-r--r-- | 51.5 KB |
task_work.c | -rw-r--r-- | 2.2 KB |
taskstats.c | -rw-r--r-- | 16.4 KB |
test_kprobes.c | -rw-r--r-- | 8.5 KB |
time.c | -rw-r--r-- | 18.6 KB |
timeconst.pl | -rw-r--r-- | 7.2 KB |
timer.c | -rw-r--r-- | 49.6 KB |
tracepoint.c | -rw-r--r-- | 19.7 KB |
tsacct.c | -rw-r--r-- | 4.4 KB |
uid16.c | -rw-r--r-- | 6.0 KB |
up.c | -rw-r--r-- | 413 bytes |
user-return-notifier.c | -rw-r--r-- | 1.3 KB |
user.c | -rw-r--r-- | 5.2 KB |
user_namespace.c | -rw-r--r-- | 21.2 KB |
utsname.c | -rw-r--r-- | 2.8 KB |
utsname_sysctl.c | -rw-r--r-- | 3.0 KB |
wait.c | -rw-r--r-- | 8.3 KB |
watchdog.c | -rw-r--r-- | 13.9 KB |
workqueue.c | -rw-r--r-- | 106.5 KB |
workqueue_sched.h | -rw-r--r-- | 311 bytes |
Computing file changes ...