Revision - 0f640dc - dm thin: fix queue limits stacking

Revision 0f640dca08330dfc7820d610578e5935b5e654b2 authored by Mike Snitzer on 31 January 2013, 14:11:14 UTC, committed by Alasdair G Kergon on 31 January 2013, 14:11:14 UTC

dm thin: fix queue limits stacking

thin_io_hints() is blindly copying the queue limits from the thin-pool
which can lead to incorrect limits being set.  The fix here simply
deletes the thin_io_hints() hook which leaves the existing stacking
infrastructure to set the limits correctly.

When a thin-pool uses an MD device for the data device a thin device
from the thin-pool must respect MD's constraints about disallowing a bio
from spanning multiple chunks.  Otherwise we can see problems.  If the raid0
chunksize is 1152K and thin-pool chunksize is 256K I see the following
md/raid0 error (with extra debug tracing added to thin_endio) when
mkfs.xfs is executed against the thin device:

md/raid0:md99: make_request bug: can't convert block across chunks or bigger than 1152k 6688 127
device-mapper: thin: bio sector=2080 err=-5 bi_size=130560 bi_rw=17 bi_vcnt=32 bi_idx=0

This extra DM debugging shows that the failing bio is spanning across
the first and second logical 1152K chunk (sector 2080 + 255 takes the
bio beyond the first chunk's boundary of sector 2304).  So the bio
splitting that DM is doing clearly isn't respecting the MD limits.

max_hw_sectors_kb is 127 for both the thin-pool and thin device
(queue_max_hw_sectors returns 255 so we'll excuse sysfs's lack of
precision).  So this explains why bi_size is 130560.

But the thin device's max_hw_sectors_kb should be 4 (PAGE_SIZE) given
that it doesn't have a .merge function (for bio_add_page to consult
indirectly via dm_merge_bvec) yet the thin-pool does sit above an MD
device that has a compulsory merge_bvec_fn.  This scenario is exactly
why DM must resort to sending single PAGE_SIZE bios to the underlying
layer. Some additional context for this is available in the header for
commit 8cbeb67a ("dm: avoid unsupported spanning of md stripe boundaries").

Long story short, the reason a thin device doesn't properly get
configured to have a max_hw_sectors_kb of 4 (PAGE_SIZE) is that
thin_io_hints() is blindly copying the queue limits from the thin-pool
device directly to the thin device's queue limits.

Fix this by eliminating thin_io_hints.  Doing so is safe because the
block layer's queue limits stacking already enables the upper level thin
device to inherit the thin-pool device's discard and minimum_io_size and
optimal_io_size limits that get set in pool_io_hints.  But avoiding the
queue limits copy allows the thin and thin-pool limits to be different
where it is important, namely max_hw_sectors_kb.

Reported-by: Daniel Browning <db@kavod.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@vger.kernel.org
Signed-off-by: Alasdair G Kergon <agk@redhat.com>

1 parent 949db15

Files
Changes

Permalinks

balloon_compaction.c

/*
 * mm/balloon_compaction.c
 *
 * Common interface for making balloon pages movable by compaction.
 *
 * Copyright (C) 2012, Red Hat, Inc.  Rafael Aquini <aquini@redhat.com>
 */
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/export.h>
#include <linux/balloon_compaction.h>

/*
 * balloon_devinfo_alloc - allocates a balloon device information descriptor.
 * @balloon_dev_descriptor: pointer to reference the balloon device which
 *                          this struct balloon_dev_info will be servicing.
 *
 * Driver must call it to properly allocate and initialize an instance of
 * struct balloon_dev_info which will be used to reference a balloon device
 * as well as to keep track of the balloon device page list.
 */
struct balloon_dev_info *balloon_devinfo_alloc(void *balloon_dev_descriptor)
{
	struct balloon_dev_info *b_dev_info;
	b_dev_info = kmalloc(sizeof(*b_dev_info), GFP_KERNEL);
	if (!b_dev_info)
		return ERR_PTR(-ENOMEM);

	b_dev_info->balloon_device = balloon_dev_descriptor;
	b_dev_info->mapping = NULL;
	b_dev_info->isolated_pages = 0;
	spin_lock_init(&b_dev_info->pages_lock);
	INIT_LIST_HEAD(&b_dev_info->pages);

	return b_dev_info;
}
EXPORT_SYMBOL_GPL(balloon_devinfo_alloc);

/*
 * balloon_page_enqueue - allocates a new page and inserts it into the balloon
 *			  page list.
 * @b_dev_info: balloon device decriptor where we will insert a new page to
 *
 * Driver must call it to properly allocate a new enlisted balloon page
 * before definetively removing it from the guest system.
 * This function returns the page address for the recently enqueued page or
 * NULL in the case we fail to allocate a new page this turn.
 */
struct page *balloon_page_enqueue(struct balloon_dev_info *b_dev_info)
{
	unsigned long flags;
	struct page *page = alloc_page(balloon_mapping_gfp_mask() |
					__GFP_NOMEMALLOC | __GFP_NORETRY);
	if (!page)
		return NULL;

	/*
	 * Block others from accessing the 'page' when we get around to
	 * establishing additional references. We should be the only one
	 * holding a reference to the 'page' at this point.
	 */
	BUG_ON(!trylock_page(page));
	spin_lock_irqsave(&b_dev_info->pages_lock, flags);
	balloon_page_insert(page, b_dev_info->mapping, &b_dev_info->pages);
	spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
	unlock_page(page);
	return page;
}
EXPORT_SYMBOL_GPL(balloon_page_enqueue);

/*
 * balloon_page_dequeue - removes a page from balloon's page list and returns
 *			  the its address to allow the driver release the page.
 * @b_dev_info: balloon device decriptor where we will grab a page from.
 *
 * Driver must call it to properly de-allocate a previous enlisted balloon page
 * before definetively releasing it back to the guest system.
 * This function returns the page address for the recently dequeued page or
 * NULL in the case we find balloon's page list temporarily empty due to
 * compaction isolated pages.
 */
struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info)
{
	struct page *page, *tmp;
	unsigned long flags;
	bool dequeued_page;

	dequeued_page = false;
	list_for_each_entry_safe(page, tmp, &b_dev_info->pages, lru) {
		/*
		 * Block others from accessing the 'page' while we get around
		 * establishing additional references and preparing the 'page'
		 * to be released by the balloon driver.
		 */
		if (trylock_page(page)) {
			spin_lock_irqsave(&b_dev_info->pages_lock, flags);
			/*
			 * Raise the page refcount here to prevent any wrong
			 * attempt to isolate this page, in case of coliding
			 * with balloon_page_isolate() just after we release
			 * the page lock.
			 *
			 * balloon_page_free() will take care of dropping
			 * this extra refcount later.
			 */
			get_page(page);
			balloon_page_delete(page);
			spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
			unlock_page(page);
			dequeued_page = true;
			break;
		}
	}

	if (!dequeued_page) {
		/*
		 * If we are unable to dequeue a balloon page because the page
		 * list is empty and there is no isolated pages, then something
		 * went out of track and some balloon pages are lost.
		 * BUG() here, otherwise the balloon driver may get stuck into
		 * an infinite loop while attempting to release all its pages.
		 */
		spin_lock_irqsave(&b_dev_info->pages_lock, flags);
		if (unlikely(list_empty(&b_dev_info->pages) &&
			     !b_dev_info->isolated_pages))
			BUG();
		spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
		page = NULL;
	}
	return page;
}
EXPORT_SYMBOL_GPL(balloon_page_dequeue);

#ifdef CONFIG_BALLOON_COMPACTION
/*
 * balloon_mapping_alloc - allocates a special ->mapping for ballooned pages.
 * @b_dev_info: holds the balloon device information descriptor.
 * @a_ops: balloon_mapping address_space_operations descriptor.
 *
 * Driver must call it to properly allocate and initialize an instance of
 * struct address_space which will be used as the special page->mapping for
 * balloon device enlisted page instances.
 */
struct address_space *balloon_mapping_alloc(struct balloon_dev_info *b_dev_info,
				const struct address_space_operations *a_ops)
{
	struct address_space *mapping;

	mapping = kmalloc(sizeof(*mapping), GFP_KERNEL);
	if (!mapping)
		return ERR_PTR(-ENOMEM);

	/*
	 * Give a clean 'zeroed' status to all elements of this special
	 * balloon page->mapping struct address_space instance.
	 */
	address_space_init_once(mapping);

	/*
	 * Set mapping->flags appropriately, to allow balloon pages
	 * ->mapping identification.
	 */
	mapping_set_balloon(mapping);
	mapping_set_gfp_mask(mapping, balloon_mapping_gfp_mask());

	/* balloon's page->mapping->a_ops callback descriptor */
	mapping->a_ops = a_ops;

	/*
	 * Establish a pointer reference back to the balloon device descriptor
	 * this particular page->mapping will be servicing.
	 * This is used by compaction / migration procedures to identify and
	 * access the balloon device pageset while isolating / migrating pages.
	 *
	 * As some balloon drivers can register multiple balloon devices
	 * for a single guest, this also helps compaction / migration to
	 * properly deal with multiple balloon pagesets, when required.
	 */
	mapping->private_data = b_dev_info;
	b_dev_info->mapping = mapping;

	return mapping;
}
EXPORT_SYMBOL_GPL(balloon_mapping_alloc);

static inline void __isolate_balloon_page(struct page *page)
{
	struct balloon_dev_info *b_dev_info = page->mapping->private_data;
	unsigned long flags;
	spin_lock_irqsave(&b_dev_info->pages_lock, flags);
	list_del(&page->lru);
	b_dev_info->isolated_pages++;
	spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
}

static inline void __putback_balloon_page(struct page *page)
{
	struct balloon_dev_info *b_dev_info = page->mapping->private_data;
	unsigned long flags;
	spin_lock_irqsave(&b_dev_info->pages_lock, flags);
	list_add(&page->lru, &b_dev_info->pages);
	b_dev_info->isolated_pages--;
	spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
}

static inline int __migrate_balloon_page(struct address_space *mapping,
		struct page *newpage, struct page *page, enum migrate_mode mode)
{
	return page->mapping->a_ops->migratepage(mapping, newpage, page, mode);
}

/* __isolate_lru_page() counterpart for a ballooned page */
bool balloon_page_isolate(struct page *page)
{
	/*
	 * Avoid burning cycles with pages that are yet under __free_pages(),
	 * or just got freed under us.
	 *
	 * In case we 'win' a race for a balloon page being freed under us and
	 * raise its refcount preventing __free_pages() from doing its job
	 * the put_page() at the end of this block will take care of
	 * release this page, thus avoiding a nasty leakage.
	 */
	if (likely(get_page_unless_zero(page))) {
		/*
		 * As balloon pages are not isolated from LRU lists, concurrent
		 * compaction threads can race against page migration functions
		 * as well as race against the balloon driver releasing a page.
		 *
		 * In order to avoid having an already isolated balloon page
		 * being (wrongly) re-isolated while it is under migration,
		 * or to avoid attempting to isolate pages being released by
		 * the balloon driver, lets be sure we have the page lock
		 * before proceeding with the balloon page isolation steps.
		 */
		if (likely(trylock_page(page))) {
			/*
			 * A ballooned page, by default, has just one refcount.
			 * Prevent concurrent compaction threads from isolating
			 * an already isolated balloon page by refcount check.
			 */
			if (__is_movable_balloon_page(page) &&
			    page_count(page) == 2) {
				__isolate_balloon_page(page);
				unlock_page(page);
				return true;
			}
			unlock_page(page);
		}
		put_page(page);
	}
	return false;
}

/* putback_lru_page() counterpart for a ballooned page */
void balloon_page_putback(struct page *page)
{
	/*
	 * 'lock_page()' stabilizes the page and prevents races against
	 * concurrent isolation threads attempting to re-isolate it.
	 */
	lock_page(page);

	if (__is_movable_balloon_page(page)) {
		__putback_balloon_page(page);
		/* drop the extra ref count taken for page isolation */
		put_page(page);
	} else {
		WARN_ON(1);
		dump_page(page);
	}
	unlock_page(page);
}

/* move_to_new_page() counterpart for a ballooned page */
int balloon_page_migrate(struct page *newpage,
			 struct page *page, enum migrate_mode mode)
{
	struct address_space *mapping;
	int rc = -EAGAIN;

	/*
	 * Block others from accessing the 'newpage' when we get around to
	 * establishing additional references. We should be the only one
	 * holding a reference to the 'newpage' at this point.
	 */
	BUG_ON(!trylock_page(newpage));

	if (WARN_ON(!__is_movable_balloon_page(page))) {
		dump_page(page);
		unlock_page(newpage);
		return rc;
	}

	mapping = page->mapping;
	if (mapping)
		rc = __migrate_balloon_page(mapping, newpage, page, mode);

	unlock_page(newpage);
	return rc;
}
#endif /* CONFIG_BALLOON_COMPACTION */

Showing with 0 additions and 0 deletions (0 / 0 diffs computed)

Computing file changes ...