Revision 0f640dca08330dfc7820d610578e5935b5e654b2 authored by Mike Snitzer on 31 January 2013, 14:11:14 UTC, committed by Alasdair G Kergon on 31 January 2013, 14:11:14 UTC
thin_io_hints() is blindly copying the queue limits from the thin-pool which can lead to incorrect limits being set. The fix here simply deletes the thin_io_hints() hook which leaves the existing stacking infrastructure to set the limits correctly. When a thin-pool uses an MD device for the data device a thin device from the thin-pool must respect MD's constraints about disallowing a bio from spanning multiple chunks. Otherwise we can see problems. If the raid0 chunksize is 1152K and thin-pool chunksize is 256K I see the following md/raid0 error (with extra debug tracing added to thin_endio) when mkfs.xfs is executed against the thin device: md/raid0:md99: make_request bug: can't convert block across chunks or bigger than 1152k 6688 127 device-mapper: thin: bio sector=2080 err=-5 bi_size=130560 bi_rw=17 bi_vcnt=32 bi_idx=0 This extra DM debugging shows that the failing bio is spanning across the first and second logical 1152K chunk (sector 2080 + 255 takes the bio beyond the first chunk's boundary of sector 2304). So the bio splitting that DM is doing clearly isn't respecting the MD limits. max_hw_sectors_kb is 127 for both the thin-pool and thin device (queue_max_hw_sectors returns 255 so we'll excuse sysfs's lack of precision). So this explains why bi_size is 130560. But the thin device's max_hw_sectors_kb should be 4 (PAGE_SIZE) given that it doesn't have a .merge function (for bio_add_page to consult indirectly via dm_merge_bvec) yet the thin-pool does sit above an MD device that has a compulsory merge_bvec_fn. This scenario is exactly why DM must resort to sending single PAGE_SIZE bios to the underlying layer. Some additional context for this is available in the header for commit 8cbeb67a ("dm: avoid unsupported spanning of md stripe boundaries"). Long story short, the reason a thin device doesn't properly get configured to have a max_hw_sectors_kb of 4 (PAGE_SIZE) is that thin_io_hints() is blindly copying the queue limits from the thin-pool device directly to the thin device's queue limits. Fix this by eliminating thin_io_hints. Doing so is safe because the block layer's queue limits stacking already enables the upper level thin device to inherit the thin-pool device's discard and minimum_io_size and optimal_io_size limits that get set in pool_io_hints. But avoiding the queue limits copy allows the thin and thin-pool limits to be different where it is important, namely max_hw_sectors_kb. Reported-by: Daniel Browning <db@kavod.com> Signed-off-by: Mike Snitzer <snitzer@redhat.com> Cc: stable@vger.kernel.org Signed-off-by: Alasdair G Kergon <agk@redhat.com>
1 parent 949db15
balloon_compaction.c
/*
* mm/balloon_compaction.c
*
* Common interface for making balloon pages movable by compaction.
*
* Copyright (C) 2012, Red Hat, Inc. Rafael Aquini <aquini@redhat.com>
*/
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/export.h>
#include <linux/balloon_compaction.h>
/*
* balloon_devinfo_alloc - allocates a balloon device information descriptor.
* @balloon_dev_descriptor: pointer to reference the balloon device which
* this struct balloon_dev_info will be servicing.
*
* Driver must call it to properly allocate and initialize an instance of
* struct balloon_dev_info which will be used to reference a balloon device
* as well as to keep track of the balloon device page list.
*/
struct balloon_dev_info *balloon_devinfo_alloc(void *balloon_dev_descriptor)
{
struct balloon_dev_info *b_dev_info;
b_dev_info = kmalloc(sizeof(*b_dev_info), GFP_KERNEL);
if (!b_dev_info)
return ERR_PTR(-ENOMEM);
b_dev_info->balloon_device = balloon_dev_descriptor;
b_dev_info->mapping = NULL;
b_dev_info->isolated_pages = 0;
spin_lock_init(&b_dev_info->pages_lock);
INIT_LIST_HEAD(&b_dev_info->pages);
return b_dev_info;
}
EXPORT_SYMBOL_GPL(balloon_devinfo_alloc);
/*
* balloon_page_enqueue - allocates a new page and inserts it into the balloon
* page list.
* @b_dev_info: balloon device decriptor where we will insert a new page to
*
* Driver must call it to properly allocate a new enlisted balloon page
* before definetively removing it from the guest system.
* This function returns the page address for the recently enqueued page or
* NULL in the case we fail to allocate a new page this turn.
*/
struct page *balloon_page_enqueue(struct balloon_dev_info *b_dev_info)
{
unsigned long flags;
struct page *page = alloc_page(balloon_mapping_gfp_mask() |
__GFP_NOMEMALLOC | __GFP_NORETRY);
if (!page)
return NULL;
/*
* Block others from accessing the 'page' when we get around to
* establishing additional references. We should be the only one
* holding a reference to the 'page' at this point.
*/
BUG_ON(!trylock_page(page));
spin_lock_irqsave(&b_dev_info->pages_lock, flags);
balloon_page_insert(page, b_dev_info->mapping, &b_dev_info->pages);
spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
unlock_page(page);
return page;
}
EXPORT_SYMBOL_GPL(balloon_page_enqueue);
/*
* balloon_page_dequeue - removes a page from balloon's page list and returns
* the its address to allow the driver release the page.
* @b_dev_info: balloon device decriptor where we will grab a page from.
*
* Driver must call it to properly de-allocate a previous enlisted balloon page
* before definetively releasing it back to the guest system.
* This function returns the page address for the recently dequeued page or
* NULL in the case we find balloon's page list temporarily empty due to
* compaction isolated pages.
*/
struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info)
{
struct page *page, *tmp;
unsigned long flags;
bool dequeued_page;
dequeued_page = false;
list_for_each_entry_safe(page, tmp, &b_dev_info->pages, lru) {
/*
* Block others from accessing the 'page' while we get around
* establishing additional references and preparing the 'page'
* to be released by the balloon driver.
*/
if (trylock_page(page)) {
spin_lock_irqsave(&b_dev_info->pages_lock, flags);
/*
* Raise the page refcount here to prevent any wrong
* attempt to isolate this page, in case of coliding
* with balloon_page_isolate() just after we release
* the page lock.
*
* balloon_page_free() will take care of dropping
* this extra refcount later.
*/
get_page(page);
balloon_page_delete(page);
spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
unlock_page(page);
dequeued_page = true;
break;
}
}
if (!dequeued_page) {
/*
* If we are unable to dequeue a balloon page because the page
* list is empty and there is no isolated pages, then something
* went out of track and some balloon pages are lost.
* BUG() here, otherwise the balloon driver may get stuck into
* an infinite loop while attempting to release all its pages.
*/
spin_lock_irqsave(&b_dev_info->pages_lock, flags);
if (unlikely(list_empty(&b_dev_info->pages) &&
!b_dev_info->isolated_pages))
BUG();
spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
page = NULL;
}
return page;
}
EXPORT_SYMBOL_GPL(balloon_page_dequeue);
#ifdef CONFIG_BALLOON_COMPACTION
/*
* balloon_mapping_alloc - allocates a special ->mapping for ballooned pages.
* @b_dev_info: holds the balloon device information descriptor.
* @a_ops: balloon_mapping address_space_operations descriptor.
*
* Driver must call it to properly allocate and initialize an instance of
* struct address_space which will be used as the special page->mapping for
* balloon device enlisted page instances.
*/
struct address_space *balloon_mapping_alloc(struct balloon_dev_info *b_dev_info,
const struct address_space_operations *a_ops)
{
struct address_space *mapping;
mapping = kmalloc(sizeof(*mapping), GFP_KERNEL);
if (!mapping)
return ERR_PTR(-ENOMEM);
/*
* Give a clean 'zeroed' status to all elements of this special
* balloon page->mapping struct address_space instance.
*/
address_space_init_once(mapping);
/*
* Set mapping->flags appropriately, to allow balloon pages
* ->mapping identification.
*/
mapping_set_balloon(mapping);
mapping_set_gfp_mask(mapping, balloon_mapping_gfp_mask());
/* balloon's page->mapping->a_ops callback descriptor */
mapping->a_ops = a_ops;
/*
* Establish a pointer reference back to the balloon device descriptor
* this particular page->mapping will be servicing.
* This is used by compaction / migration procedures to identify and
* access the balloon device pageset while isolating / migrating pages.
*
* As some balloon drivers can register multiple balloon devices
* for a single guest, this also helps compaction / migration to
* properly deal with multiple balloon pagesets, when required.
*/
mapping->private_data = b_dev_info;
b_dev_info->mapping = mapping;
return mapping;
}
EXPORT_SYMBOL_GPL(balloon_mapping_alloc);
static inline void __isolate_balloon_page(struct page *page)
{
struct balloon_dev_info *b_dev_info = page->mapping->private_data;
unsigned long flags;
spin_lock_irqsave(&b_dev_info->pages_lock, flags);
list_del(&page->lru);
b_dev_info->isolated_pages++;
spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
}
static inline void __putback_balloon_page(struct page *page)
{
struct balloon_dev_info *b_dev_info = page->mapping->private_data;
unsigned long flags;
spin_lock_irqsave(&b_dev_info->pages_lock, flags);
list_add(&page->lru, &b_dev_info->pages);
b_dev_info->isolated_pages--;
spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
}
static inline int __migrate_balloon_page(struct address_space *mapping,
struct page *newpage, struct page *page, enum migrate_mode mode)
{
return page->mapping->a_ops->migratepage(mapping, newpage, page, mode);
}
/* __isolate_lru_page() counterpart for a ballooned page */
bool balloon_page_isolate(struct page *page)
{
/*
* Avoid burning cycles with pages that are yet under __free_pages(),
* or just got freed under us.
*
* In case we 'win' a race for a balloon page being freed under us and
* raise its refcount preventing __free_pages() from doing its job
* the put_page() at the end of this block will take care of
* release this page, thus avoiding a nasty leakage.
*/
if (likely(get_page_unless_zero(page))) {
/*
* As balloon pages are not isolated from LRU lists, concurrent
* compaction threads can race against page migration functions
* as well as race against the balloon driver releasing a page.
*
* In order to avoid having an already isolated balloon page
* being (wrongly) re-isolated while it is under migration,
* or to avoid attempting to isolate pages being released by
* the balloon driver, lets be sure we have the page lock
* before proceeding with the balloon page isolation steps.
*/
if (likely(trylock_page(page))) {
/*
* A ballooned page, by default, has just one refcount.
* Prevent concurrent compaction threads from isolating
* an already isolated balloon page by refcount check.
*/
if (__is_movable_balloon_page(page) &&
page_count(page) == 2) {
__isolate_balloon_page(page);
unlock_page(page);
return true;
}
unlock_page(page);
}
put_page(page);
}
return false;
}
/* putback_lru_page() counterpart for a ballooned page */
void balloon_page_putback(struct page *page)
{
/*
* 'lock_page()' stabilizes the page and prevents races against
* concurrent isolation threads attempting to re-isolate it.
*/
lock_page(page);
if (__is_movable_balloon_page(page)) {
__putback_balloon_page(page);
/* drop the extra ref count taken for page isolation */
put_page(page);
} else {
WARN_ON(1);
dump_page(page);
}
unlock_page(page);
}
/* move_to_new_page() counterpart for a ballooned page */
int balloon_page_migrate(struct page *newpage,
struct page *page, enum migrate_mode mode)
{
struct address_space *mapping;
int rc = -EAGAIN;
/*
* Block others from accessing the 'newpage' when we get around to
* establishing additional references. We should be the only one
* holding a reference to the 'newpage' at this point.
*/
BUG_ON(!trylock_page(newpage));
if (WARN_ON(!__is_movable_balloon_page(page))) {
dump_page(page);
unlock_page(newpage);
return rc;
}
mapping = page->mapping;
if (mapping)
rc = __migrate_balloon_page(mapping, newpage, page, mode);
unlock_page(newpage);
return rc;
}
#endif /* CONFIG_BALLOON_COMPACTION */
![swh spinner](/static/img/swh-spinner.gif)
Computing file changes ...