Revision 0f640dca08330dfc7820d610578e5935b5e654b2 authored by Mike Snitzer on 31 January 2013, 14:11:14 UTC, committed by Alasdair G Kergon on 31 January 2013, 14:11:14 UTC
thin_io_hints() is blindly copying the queue limits from the thin-pool
which can lead to incorrect limits being set.  The fix here simply
deletes the thin_io_hints() hook which leaves the existing stacking
infrastructure to set the limits correctly.

When a thin-pool uses an MD device for the data device a thin device
from the thin-pool must respect MD's constraints about disallowing a bio
from spanning multiple chunks.  Otherwise we can see problems.  If the raid0
chunksize is 1152K and thin-pool chunksize is 256K I see the following
md/raid0 error (with extra debug tracing added to thin_endio) when
mkfs.xfs is executed against the thin device:

md/raid0:md99: make_request bug: can't convert block across chunks or bigger than 1152k 6688 127
device-mapper: thin: bio sector=2080 err=-5 bi_size=130560 bi_rw=17 bi_vcnt=32 bi_idx=0

This extra DM debugging shows that the failing bio is spanning across
the first and second logical 1152K chunk (sector 2080 + 255 takes the
bio beyond the first chunk's boundary of sector 2304).  So the bio
splitting that DM is doing clearly isn't respecting the MD limits.

max_hw_sectors_kb is 127 for both the thin-pool and thin device
(queue_max_hw_sectors returns 255 so we'll excuse sysfs's lack of
precision).  So this explains why bi_size is 130560.

But the thin device's max_hw_sectors_kb should be 4 (PAGE_SIZE) given
that it doesn't have a .merge function (for bio_add_page to consult
indirectly via dm_merge_bvec) yet the thin-pool does sit above an MD
device that has a compulsory merge_bvec_fn.  This scenario is exactly
why DM must resort to sending single PAGE_SIZE bios to the underlying
layer. Some additional context for this is available in the header for
commit 8cbeb67a ("dm: avoid unsupported spanning of md stripe boundaries").

Long story short, the reason a thin device doesn't properly get
configured to have a max_hw_sectors_kb of 4 (PAGE_SIZE) is that
thin_io_hints() is blindly copying the queue limits from the thin-pool
device directly to the thin device's queue limits.

Fix this by eliminating thin_io_hints.  Doing so is safe because the
block layer's queue limits stacking already enables the upper level thin
device to inherit the thin-pool device's discard and minimum_io_size and
optimal_io_size limits that get set in pool_io_hints.  But avoiding the
queue limits copy allows the thin and thin-pool limits to be different
where it is important, namely max_hw_sectors_kb.

Reported-by: Daniel Browning <db@kavod.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@vger.kernel.org
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
1 parent 949db15
Raw File
fadvise.c
/*
 * mm/fadvise.c
 *
 * Copyright (C) 2002, Linus Torvalds
 *
 * 11Jan2003	Andrew Morton
 *		Initial version.
 */

#include <linux/kernel.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/backing-dev.h>
#include <linux/pagevec.h>
#include <linux/fadvise.h>
#include <linux/writeback.h>
#include <linux/syscalls.h>

#include <asm/unistd.h>

/*
 * POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could
 * deactivate the pages and clear PG_Referenced.
 */
SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
{
	struct fd f = fdget(fd);
	struct address_space *mapping;
	struct backing_dev_info *bdi;
	loff_t endbyte;			/* inclusive */
	pgoff_t start_index;
	pgoff_t end_index;
	unsigned long nrpages;
	int ret = 0;

	if (!f.file)
		return -EBADF;

	if (S_ISFIFO(f.file->f_path.dentry->d_inode->i_mode)) {
		ret = -ESPIPE;
		goto out;
	}

	mapping = f.file->f_mapping;
	if (!mapping || len < 0) {
		ret = -EINVAL;
		goto out;
	}

	if (mapping->a_ops->get_xip_mem) {
		switch (advice) {
		case POSIX_FADV_NORMAL:
		case POSIX_FADV_RANDOM:
		case POSIX_FADV_SEQUENTIAL:
		case POSIX_FADV_WILLNEED:
		case POSIX_FADV_NOREUSE:
		case POSIX_FADV_DONTNEED:
			/* no bad return value, but ignore advice */
			break;
		default:
			ret = -EINVAL;
		}
		goto out;
	}

	/* Careful about overflows. Len == 0 means "as much as possible" */
	endbyte = offset + len;
	if (!len || endbyte < len)
		endbyte = -1;
	else
		endbyte--;		/* inclusive */

	bdi = mapping->backing_dev_info;

	switch (advice) {
	case POSIX_FADV_NORMAL:
		f.file->f_ra.ra_pages = bdi->ra_pages;
		spin_lock(&f.file->f_lock);
		f.file->f_mode &= ~FMODE_RANDOM;
		spin_unlock(&f.file->f_lock);
		break;
	case POSIX_FADV_RANDOM:
		spin_lock(&f.file->f_lock);
		f.file->f_mode |= FMODE_RANDOM;
		spin_unlock(&f.file->f_lock);
		break;
	case POSIX_FADV_SEQUENTIAL:
		f.file->f_ra.ra_pages = bdi->ra_pages * 2;
		spin_lock(&f.file->f_lock);
		f.file->f_mode &= ~FMODE_RANDOM;
		spin_unlock(&f.file->f_lock);
		break;
	case POSIX_FADV_WILLNEED:
		/* First and last PARTIAL page! */
		start_index = offset >> PAGE_CACHE_SHIFT;
		end_index = endbyte >> PAGE_CACHE_SHIFT;

		/* Careful about overflow on the "+1" */
		nrpages = end_index - start_index + 1;
		if (!nrpages)
			nrpages = ~0UL;

		/*
		 * Ignore return value because fadvise() shall return
		 * success even if filesystem can't retrieve a hint,
		 */
		force_page_cache_readahead(mapping, f.file, start_index,
					   nrpages);
		break;
	case POSIX_FADV_NOREUSE:
		break;
	case POSIX_FADV_DONTNEED:
		if (!bdi_write_congested(mapping->backing_dev_info))
			__filemap_fdatawrite_range(mapping, offset, endbyte,
						   WB_SYNC_NONE);

		/* First and last FULL page! */
		start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT;
		end_index = (endbyte >> PAGE_CACHE_SHIFT);

		if (end_index >= start_index)
			invalidate_mapping_pages(mapping, start_index,
						end_index);
		break;
	default:
		ret = -EINVAL;
	}
out:
	fdput(f);
	return ret;
}
#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
asmlinkage long SyS_fadvise64_64(long fd, loff_t offset, loff_t len, long advice)
{
	return SYSC_fadvise64_64((int) fd, offset, len, (int) advice);
}
SYSCALL_ALIAS(sys_fadvise64_64, SyS_fadvise64_64);
#endif

#ifdef __ARCH_WANT_SYS_FADVISE64

SYSCALL_DEFINE(fadvise64)(int fd, loff_t offset, size_t len, int advice)
{
	return sys_fadvise64_64(fd, offset, len, advice);
}
#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
asmlinkage long SyS_fadvise64(long fd, loff_t offset, long len, long advice)
{
	return SYSC_fadvise64((int) fd, offset, (size_t)len, (int)advice);
}
SYSCALL_ALIAS(sys_fadvise64, SyS_fadvise64);
#endif

#endif
back to top