Revision 2bbd00aef0671bfe3c2ca5ba67097246257de125 authored by Johannes Weiner on 26 February 2021, 01:16:47 UTC, committed by Linus Torvalds on 26 February 2021, 17:41:00 UTC
On NOHZ, the periodic vmstat flushers on each CPU can go to sleep and
won't wake up until stat changes are detected in the per-cpu deltas of the
zone vmstat counters.

In commit 75ef71840539 ("mm, vmstat: add infrastructure for per-node
vmstats") per-node counters were introduced, and subsequently most stats
were moved from the zone to the node level.  However, the node counters
weren't added to the NOHZ wakeup detection.

In theory this can cause per-cpu errors to remain in the user-reported
stats indefinitely.  In practice this only affects a handful of sub
counters (file_mapped, dirty and writeback e.g.) because other page state
changes at the node level likely involve a change at the zone level as
well (alloc and free, lru ops).  Also, nobody has complained.

Fix it up for completeness: wake up vmstat refreshing on node changes.
Also remove the BUILD_BUG_ONs that assert counter size; we haven't relied
on it since we added sizeof() to the range calculation in commit
13c9aaf7fa01 ("mm/vmstat.c: fix NUMA statistics updates").

Link: https://lkml.kernel.org/r/20210202184342.118513-1-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
1 parent a052d4d
Raw File
dm-dust.c
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2018 Red Hat, Inc.
 *
 * This is a test "dust" device, which fails reads on specified
 * sectors, emulating the behavior of a hard disk drive sending
 * a "Read Medium Error" sense.
 *
 */

#include <linux/device-mapper.h>
#include <linux/module.h>
#include <linux/rbtree.h>

#define DM_MSG_PREFIX "dust"

struct badblock {
	struct rb_node node;
	sector_t bb;
	unsigned char wr_fail_cnt;
};

struct dust_device {
	struct dm_dev *dev;
	struct rb_root badblocklist;
	unsigned long long badblock_count;
	spinlock_t dust_lock;
	unsigned int blksz;
	int sect_per_block_shift;
	unsigned int sect_per_block;
	sector_t start;
	bool fail_read_on_bb:1;
	bool quiet_mode:1;
};

static struct badblock *dust_rb_search(struct rb_root *root, sector_t blk)
{
	struct rb_node *node = root->rb_node;

	while (node) {
		struct badblock *bblk = rb_entry(node, struct badblock, node);

		if (bblk->bb > blk)
			node = node->rb_left;
		else if (bblk->bb < blk)
			node = node->rb_right;
		else
			return bblk;
	}

	return NULL;
}

static bool dust_rb_insert(struct rb_root *root, struct badblock *new)
{
	struct badblock *bblk;
	struct rb_node **link = &root->rb_node, *parent = NULL;
	sector_t value = new->bb;

	while (*link) {
		parent = *link;
		bblk = rb_entry(parent, struct badblock, node);

		if (bblk->bb > value)
			link = &(*link)->rb_left;
		else if (bblk->bb < value)
			link = &(*link)->rb_right;
		else
			return false;
	}

	rb_link_node(&new->node, parent, link);
	rb_insert_color(&new->node, root);

	return true;
}

static int dust_remove_block(struct dust_device *dd, unsigned long long block)
{
	struct badblock *bblock;
	unsigned long flags;

	spin_lock_irqsave(&dd->dust_lock, flags);
	bblock = dust_rb_search(&dd->badblocklist, block);

	if (bblock == NULL) {
		if (!dd->quiet_mode) {
			DMERR("%s: block %llu not found in badblocklist",
			      __func__, block);
		}
		spin_unlock_irqrestore(&dd->dust_lock, flags);
		return -EINVAL;
	}

	rb_erase(&bblock->node, &dd->badblocklist);
	dd->badblock_count--;
	if (!dd->quiet_mode)
		DMINFO("%s: badblock removed at block %llu", __func__, block);
	kfree(bblock);
	spin_unlock_irqrestore(&dd->dust_lock, flags);

	return 0;
}

static int dust_add_block(struct dust_device *dd, unsigned long long block,
			  unsigned char wr_fail_cnt)
{
	struct badblock *bblock;
	unsigned long flags;

	bblock = kmalloc(sizeof(*bblock), GFP_KERNEL);
	if (bblock == NULL) {
		if (!dd->quiet_mode)
			DMERR("%s: badblock allocation failed", __func__);
		return -ENOMEM;
	}

	spin_lock_irqsave(&dd->dust_lock, flags);
	bblock->bb = block;
	bblock->wr_fail_cnt = wr_fail_cnt;
	if (!dust_rb_insert(&dd->badblocklist, bblock)) {
		if (!dd->quiet_mode) {
			DMERR("%s: block %llu already in badblocklist",
			      __func__, block);
		}
		spin_unlock_irqrestore(&dd->dust_lock, flags);
		kfree(bblock);
		return -EINVAL;
	}

	dd->badblock_count++;
	if (!dd->quiet_mode) {
		DMINFO("%s: badblock added at block %llu with write fail count %u",
		       __func__, block, wr_fail_cnt);
	}
	spin_unlock_irqrestore(&dd->dust_lock, flags);

	return 0;
}

static int dust_query_block(struct dust_device *dd, unsigned long long block, char *result,
			    unsigned int maxlen, unsigned int *sz_ptr)
{
	struct badblock *bblock;
	unsigned long flags;
	unsigned int sz = *sz_ptr;

	spin_lock_irqsave(&dd->dust_lock, flags);
	bblock = dust_rb_search(&dd->badblocklist, block);
	if (bblock != NULL)
		DMEMIT("%s: block %llu found in badblocklist", __func__, block);
	else
		DMEMIT("%s: block %llu not found in badblocklist", __func__, block);
	spin_unlock_irqrestore(&dd->dust_lock, flags);

	return 1;
}

static int __dust_map_read(struct dust_device *dd, sector_t thisblock)
{
	struct badblock *bblk = dust_rb_search(&dd->badblocklist, thisblock);

	if (bblk)
		return DM_MAPIO_KILL;

	return DM_MAPIO_REMAPPED;
}

static int dust_map_read(struct dust_device *dd, sector_t thisblock,
			 bool fail_read_on_bb)
{
	unsigned long flags;
	int r = DM_MAPIO_REMAPPED;

	if (fail_read_on_bb) {
		thisblock >>= dd->sect_per_block_shift;
		spin_lock_irqsave(&dd->dust_lock, flags);
		r = __dust_map_read(dd, thisblock);
		spin_unlock_irqrestore(&dd->dust_lock, flags);
	}

	return r;
}

static int __dust_map_write(struct dust_device *dd, sector_t thisblock)
{
	struct badblock *bblk = dust_rb_search(&dd->badblocklist, thisblock);

	if (bblk && bblk->wr_fail_cnt > 0) {
		bblk->wr_fail_cnt--;
		return DM_MAPIO_KILL;
	}

	if (bblk) {
		rb_erase(&bblk->node, &dd->badblocklist);
		dd->badblock_count--;
		kfree(bblk);
		if (!dd->quiet_mode) {
			sector_div(thisblock, dd->sect_per_block);
			DMINFO("block %llu removed from badblocklist by write",
			       (unsigned long long)thisblock);
		}
	}

	return DM_MAPIO_REMAPPED;
}

static int dust_map_write(struct dust_device *dd, sector_t thisblock,
			  bool fail_read_on_bb)
{
	unsigned long flags;
	int r = DM_MAPIO_REMAPPED;

	if (fail_read_on_bb) {
		thisblock >>= dd->sect_per_block_shift;
		spin_lock_irqsave(&dd->dust_lock, flags);
		r = __dust_map_write(dd, thisblock);
		spin_unlock_irqrestore(&dd->dust_lock, flags);
	}

	return r;
}

static int dust_map(struct dm_target *ti, struct bio *bio)
{
	struct dust_device *dd = ti->private;
	int r;

	bio_set_dev(bio, dd->dev->bdev);
	bio->bi_iter.bi_sector = dd->start + dm_target_offset(ti, bio->bi_iter.bi_sector);

	if (bio_data_dir(bio) == READ)
		r = dust_map_read(dd, bio->bi_iter.bi_sector, dd->fail_read_on_bb);
	else
		r = dust_map_write(dd, bio->bi_iter.bi_sector, dd->fail_read_on_bb);

	return r;
}

static bool __dust_clear_badblocks(struct rb_root *tree,
				   unsigned long long count)
{
	struct rb_node *node = NULL, *nnode = NULL;

	nnode = rb_first(tree);
	if (nnode == NULL) {
		BUG_ON(count != 0);
		return false;
	}

	while (nnode) {
		node = nnode;
		nnode = rb_next(node);
		rb_erase(node, tree);
		count--;
		kfree(node);
	}
	BUG_ON(count != 0);
	BUG_ON(tree->rb_node != NULL);

	return true;
}

static int dust_clear_badblocks(struct dust_device *dd, char *result, unsigned int maxlen,
				unsigned int *sz_ptr)
{
	unsigned long flags;
	struct rb_root badblocklist;
	unsigned long long badblock_count;
	unsigned int sz = *sz_ptr;

	spin_lock_irqsave(&dd->dust_lock, flags);
	badblocklist = dd->badblocklist;
	badblock_count = dd->badblock_count;
	dd->badblocklist = RB_ROOT;
	dd->badblock_count = 0;
	spin_unlock_irqrestore(&dd->dust_lock, flags);

	if (!__dust_clear_badblocks(&badblocklist, badblock_count))
		DMEMIT("%s: no badblocks found", __func__);
	else
		DMEMIT("%s: badblocks cleared", __func__);

	return 1;
}

static int dust_list_badblocks(struct dust_device *dd, char *result, unsigned int maxlen,
				unsigned int *sz_ptr)
{
	unsigned long flags;
	struct rb_root badblocklist;
	struct rb_node *node;
	struct badblock *bblk;
	unsigned int sz = *sz_ptr;
	unsigned long long num = 0;

	spin_lock_irqsave(&dd->dust_lock, flags);
	badblocklist = dd->badblocklist;
	for (node = rb_first(&badblocklist); node; node = rb_next(node)) {
		bblk = rb_entry(node, struct badblock, node);
		DMEMIT("%llu\n", bblk->bb);
		num++;
	}

	spin_unlock_irqrestore(&dd->dust_lock, flags);
	if (!num)
		DMEMIT("No blocks in badblocklist");

	return 1;
}

/*
 * Target parameters:
 *
 * <device_path> <offset> <blksz>
 *
 * device_path: path to the block device
 * offset: offset to data area from start of device_path
 * blksz: block size (minimum 512, maximum 1073741824, must be a power of 2)
 */
static int dust_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
	struct dust_device *dd;
	unsigned long long tmp;
	char dummy;
	unsigned int blksz;
	unsigned int sect_per_block;
	sector_t DUST_MAX_BLKSZ_SECTORS = 2097152;
	sector_t max_block_sectors = min(ti->len, DUST_MAX_BLKSZ_SECTORS);

	if (argc != 3) {
		ti->error = "Invalid argument count";
		return -EINVAL;
	}

	if (kstrtouint(argv[2], 10, &blksz) || !blksz) {
		ti->error = "Invalid block size parameter";
		return -EINVAL;
	}

	if (blksz < 512) {
		ti->error = "Block size must be at least 512";
		return -EINVAL;
	}

	if (!is_power_of_2(blksz)) {
		ti->error = "Block size must be a power of 2";
		return -EINVAL;
	}

	if (to_sector(blksz) > max_block_sectors) {
		ti->error = "Block size is too large";
		return -EINVAL;
	}

	sect_per_block = (blksz >> SECTOR_SHIFT);

	if (sscanf(argv[1], "%llu%c", &tmp, &dummy) != 1 || tmp != (sector_t)tmp) {
		ti->error = "Invalid device offset sector";
		return -EINVAL;
	}

	dd = kzalloc(sizeof(struct dust_device), GFP_KERNEL);
	if (dd == NULL) {
		ti->error = "Cannot allocate context";
		return -ENOMEM;
	}

	if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &dd->dev)) {
		ti->error = "Device lookup failed";
		kfree(dd);
		return -EINVAL;
	}

	dd->sect_per_block = sect_per_block;
	dd->blksz = blksz;
	dd->start = tmp;

	dd->sect_per_block_shift = __ffs(sect_per_block);

	/*
	 * Whether to fail a read on a "bad" block.
	 * Defaults to false; enabled later by message.
	 */
	dd->fail_read_on_bb = false;

	/*
	 * Initialize bad block list rbtree.
	 */
	dd->badblocklist = RB_ROOT;
	dd->badblock_count = 0;
	spin_lock_init(&dd->dust_lock);

	dd->quiet_mode = false;

	BUG_ON(dm_set_target_max_io_len(ti, dd->sect_per_block) != 0);

	ti->num_discard_bios = 1;
	ti->num_flush_bios = 1;
	ti->private = dd;

	return 0;
}

static void dust_dtr(struct dm_target *ti)
{
	struct dust_device *dd = ti->private;

	__dust_clear_badblocks(&dd->badblocklist, dd->badblock_count);
	dm_put_device(ti, dd->dev);
	kfree(dd);
}

static int dust_message(struct dm_target *ti, unsigned int argc, char **argv,
			char *result, unsigned int maxlen)
{
	struct dust_device *dd = ti->private;
	sector_t size = i_size_read(dd->dev->bdev->bd_inode) >> SECTOR_SHIFT;
	bool invalid_msg = false;
	int r = -EINVAL;
	unsigned long long tmp, block;
	unsigned char wr_fail_cnt;
	unsigned int tmp_ui;
	unsigned long flags;
	unsigned int sz = 0;
	char dummy;

	if (argc == 1) {
		if (!strcasecmp(argv[0], "addbadblock") ||
		    !strcasecmp(argv[0], "removebadblock") ||
		    !strcasecmp(argv[0], "queryblock")) {
			DMERR("%s requires an additional argument", argv[0]);
		} else if (!strcasecmp(argv[0], "disable")) {
			DMINFO("disabling read failures on bad sectors");
			dd->fail_read_on_bb = false;
			r = 0;
		} else if (!strcasecmp(argv[0], "enable")) {
			DMINFO("enabling read failures on bad sectors");
			dd->fail_read_on_bb = true;
			r = 0;
		} else if (!strcasecmp(argv[0], "countbadblocks")) {
			spin_lock_irqsave(&dd->dust_lock, flags);
			DMEMIT("countbadblocks: %llu badblock(s) found",
			       dd->badblock_count);
			spin_unlock_irqrestore(&dd->dust_lock, flags);
			r = 1;
		} else if (!strcasecmp(argv[0], "clearbadblocks")) {
			r = dust_clear_badblocks(dd, result, maxlen, &sz);
		} else if (!strcasecmp(argv[0], "quiet")) {
			if (!dd->quiet_mode)
				dd->quiet_mode = true;
			else
				dd->quiet_mode = false;
			r = 0;
		} else if (!strcasecmp(argv[0], "listbadblocks")) {
			r = dust_list_badblocks(dd, result, maxlen, &sz);
		} else {
			invalid_msg = true;
		}
	} else if (argc == 2) {
		if (sscanf(argv[1], "%llu%c", &tmp, &dummy) != 1)
			return r;

		block = tmp;
		sector_div(size, dd->sect_per_block);
		if (block > size) {
			DMERR("selected block value out of range");
			return r;
		}

		if (!strcasecmp(argv[0], "addbadblock"))
			r = dust_add_block(dd, block, 0);
		else if (!strcasecmp(argv[0], "removebadblock"))
			r = dust_remove_block(dd, block);
		else if (!strcasecmp(argv[0], "queryblock"))
			r = dust_query_block(dd, block, result, maxlen, &sz);
		else
			invalid_msg = true;

	} else if (argc == 3) {
		if (sscanf(argv[1], "%llu%c", &tmp, &dummy) != 1)
			return r;

		if (sscanf(argv[2], "%u%c", &tmp_ui, &dummy) != 1)
			return r;

		block = tmp;
		if (tmp_ui > 255) {
			DMERR("selected write fail count out of range");
			return r;
		}
		wr_fail_cnt = tmp_ui;
		sector_div(size, dd->sect_per_block);
		if (block > size) {
			DMERR("selected block value out of range");
			return r;
		}

		if (!strcasecmp(argv[0], "addbadblock"))
			r = dust_add_block(dd, block, wr_fail_cnt);
		else
			invalid_msg = true;

	} else
		DMERR("invalid number of arguments '%d'", argc);

	if (invalid_msg)
		DMERR("unrecognized message '%s' received", argv[0]);

	return r;
}

static void dust_status(struct dm_target *ti, status_type_t type,
			unsigned int status_flags, char *result, unsigned int maxlen)
{
	struct dust_device *dd = ti->private;
	unsigned int sz = 0;

	switch (type) {
	case STATUSTYPE_INFO:
		DMEMIT("%s %s %s", dd->dev->name,
		       dd->fail_read_on_bb ? "fail_read_on_bad_block" : "bypass",
		       dd->quiet_mode ? "quiet" : "verbose");
		break;

	case STATUSTYPE_TABLE:
		DMEMIT("%s %llu %u", dd->dev->name,
		       (unsigned long long)dd->start, dd->blksz);
		break;
	}
}

static int dust_prepare_ioctl(struct dm_target *ti, struct block_device **bdev)
{
	struct dust_device *dd = ti->private;
	struct dm_dev *dev = dd->dev;

	*bdev = dev->bdev;

	/*
	 * Only pass ioctls through if the device sizes match exactly.
	 */
	if (dd->start ||
	    ti->len != i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT)
		return 1;

	return 0;
}

static int dust_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn,
				void *data)
{
	struct dust_device *dd = ti->private;

	return fn(ti, dd->dev, dd->start, ti->len, data);
}

static struct target_type dust_target = {
	.name = "dust",
	.version = {1, 0, 0},
	.module = THIS_MODULE,
	.ctr = dust_ctr,
	.dtr = dust_dtr,
	.iterate_devices = dust_iterate_devices,
	.map = dust_map,
	.message = dust_message,
	.status = dust_status,
	.prepare_ioctl = dust_prepare_ioctl,
};

static int __init dm_dust_init(void)
{
	int r = dm_register_target(&dust_target);

	if (r < 0)
		DMERR("dm_register_target failed %d", r);

	return r;
}

static void __exit dm_dust_exit(void)
{
	dm_unregister_target(&dust_target);
}

module_init(dm_dust_init);
module_exit(dm_dust_exit);

MODULE_DESCRIPTION(DM_NAME " dust test target");
MODULE_AUTHOR("Bryan Gurney <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");
back to top