Revision d7831a0bdf06b9f722b947bb0c205ff7d77cebd8 authored by Richard Kennedy on 30 June 2009, 18:41:35 UTC, committed by Linus Torvalds on 01 July 2009, 01:56:01 UTC
balance_dirty_pages can overreact and move all of the dirty pages to
writeback unnecessarily.

balance_dirty_pages makes its decision to throttle based on the number of
dirty plus writeback pages that are over the calculated limit, so it will
continue to move pages even when there are plenty of pages in writeback
and fewer than the threshold still dirty.

This allows it to overshoot its limits and move all the dirty pages to
writeback while waiting for the drives to catch up and empty the writeback
list.
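
A toy model of the two decision rules may make this clearer (illustrative
only, not the actual diff; nr_dirty, nr_writeback and thresh stand in for
the per-bdi counters used by balance_dirty_pages):

	#include <stdbool.h>
	#include <stdio.h>

	/* old rule: kick writeback whenever any pages are still dirty and
	 * the combined dirty + writeback count is over the limit */
	static bool kick_writeback_old(long nr_dirty, long nr_writeback,
				       long thresh)
	{
		return nr_dirty > 0 && nr_dirty + nr_writeback > thresh;
	}

	/* proposed rule: only kick writeback while the dirty pages by
	 * themselves exceed the threshold, otherwise let the drives catch
	 * up with what is already queued */
	static bool kick_writeback_new(long nr_dirty, long thresh)
	{
		return nr_dirty > thresh;
	}

	int main(void)
	{
		/* plenty already in writeback, little still dirty */
		long dirty = 100, writeback = 2000, thresh = 1000;

		printf("old kicks: %d, new kicks: %d\n",
		       kick_writeback_old(dirty, writeback, thresh),
		       kick_writeback_new(dirty, thresh));
		return 0;
	}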

A simple fio test easily demonstrates this problem.

fio --name=f1 --directory=/disk1 --size=2G --rw=write --name=f2 --directory=/disk2 --size=1G --rw=write --startdelay=10

This is the simplest fix I could find, but I'm not entirely sure that it
alone will be enough for all cases.  It certainly is an improvement on my
desktop machine writing to 2 disks.

Do we need something more for machines with large arrays where
bdi_threshold * number_of_drives is greater than the dirty_ratio?

Signed-off-by: Richard Kennedy <richard@rsk.demon.co.uk>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
1 parent df279ca
faulty.c
/*
 * faulty.c : Multiple Devices driver for Linux
 *
 * Copyright (C) 2004 Neil Brown
 *
 * faulty-device-simulator personality for md
 *
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */


/*
 * The "faulty" personality causes some requests to fail.
 *
 * Possible failure modes are:
 *   reads fail "randomly" but succeed on retry
 *   writes fail "randomly" but succeed on retry
 *   reads for some address fail and then persist until a write
 *   reads for some address fail and then persist irrespective of write
 *   writes for some address fail and persist
 *   all writes fail
 *
 * Multiple modes can be active at a time, but only
 * one can be set at array creation.  Others can be added later.
 * A mode can be one-shot or recurrent, with the recurrence being
 * once in every N requests.
 * The bottom 5 bits of the "layout" indicate the mode.  The
 * remainder indicate a period, or 0 for one-shot.
 *
 * There is an implementation limit on the number of concurrently
 * persisting-faulty blocks. When a new fault is requested that would
 * exceed the limit, it is ignored.
 * All current faults can be cleared using a layout of "0".
 *
 * Requests are always sent to the device.  If they are to fail,
 * we clone the bio and insert a new bi_end_io into the chain.
 */
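
/*
 * Worked example (added for illustration, not part of the original
 * header): to request ReadTransient faults roughly once in every 10
 * requests, encode the layout as
 *
 *	layout = (period << ModeShift) | mode
 *	       = (10 << 5) | ReadTransient
 *	       = 321
 *
 * A period of 0 in the upper bits makes the fault one-shot instead.
 * The special modes ClearFaults (30) and ClearErrors (31) reset the
 * recorded faulty sectors and the per-mode counters respectively.
 */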

#define	WriteTransient	0
#define	ReadTransient	1
#define	WritePersistent	2
#define	ReadPersistent	3
#define	WriteAll	4 /* doesn't go to device */
#define	ReadFixable	5
#define	Modes	6

#define	ClearErrors	31
#define	ClearFaults	30

#define AllPersist	100 /* internal use only */
#define	NoPersist	101

#define	ModeMask	0x1f
#define	ModeShift	5

#define MaxFault	50
#include <linux/blkdev.h>
#include <linux/raid/md_u.h>
#include "md.h"
#include <linux/seq_file.h>


/*
 * bi_end_io for the clone of a request that was chosen to fail:
 * propagate the clone's final size and sector back to the original
 * bio, drop the clone and complete the original with an I/O error.
 */
static void faulty_fail(struct bio *bio, int error)
{
	struct bio *b = bio->bi_private;

	b->bi_size = bio->bi_size;
	b->bi_sector = bio->bi_sector;

	bio_put(bio);

	bio_io_error(b);
}

typedef struct faulty_conf {
	int period[Modes];
	atomic_t counters[Modes];
	sector_t faults[MaxFault];
	int	modes[MaxFault];
	int nfaults;
	mdk_rdev_t *rdev;
} conf_t;

/*
 * Count down the per-mode counter and report whether this request
 * should fail.  Recurrent modes reload the counter from the configured
 * period whenever it reaches zero; one-shot modes simply run it out.
 */
static int check_mode(conf_t *conf, int mode)
{
	if (conf->period[mode] == 0 &&
	    atomic_read(&conf->counters[mode]) <= 0)
		return 0; /* no failure, no decrement */


	if (atomic_dec_and_test(&conf->counters[mode])) {
		if (conf->period[mode])
			atomic_set(&conf->counters[mode], conf->period[mode]);
		return 1;
	}
	return 0;
}

/*
 * Return 1 if a recorded persistent fault covers any sector in
 * [start, end) for this transfer direction.  A write that hits a
 * ReadFixable sector "fixes" it by downgrading the fault to NoPersist.
 */
static int check_sector(conf_t *conf, sector_t start, sector_t end, int dir)
{
	int i;
	for (i=0; i<conf->nfaults; i++)
		if (conf->faults[i] >= start &&
		    conf->faults[i] < end) {
			/* found it ... */
			switch (conf->modes[i] * 2 + dir) {
			case WritePersistent*2+WRITE: return 1;
			case ReadPersistent*2+READ: return 1;
			case ReadFixable*2+READ: return 1;
			case ReadFixable*2+WRITE:
				conf->modes[i] = NoPersist;
				return 0;
			case AllPersist*2+READ:
			case AllPersist*2+WRITE: return 1;
			default:
				return 0;
			}
		}
	return 0;
}

/*
 * Record a persistent fault at sector 'start', merging it with any
 * fault already recorded there (read and write faults on the same
 * sector combine into AllPersist).  Cleared (NoPersist) slots are
 * reused; once MaxFault entries are in use, new faults are ignored.
 */
static void add_sector(conf_t *conf, sector_t start, int mode)
{
	int i;
	int n = conf->nfaults;
	for (i=0; i<conf->nfaults; i++)
		if (conf->faults[i] == start) {
			switch(mode) {
			case NoPersist: conf->modes[i] = mode; return;
			case WritePersistent:
				if (conf->modes[i] == ReadPersistent ||
				    conf->modes[i] == ReadFixable)
					conf->modes[i] = AllPersist;
				else
					conf->modes[i] = WritePersistent;
				return;
			case ReadPersistent:
				if (conf->modes[i] == WritePersistent)
					conf->modes[i] = AllPersist;
				else
					conf->modes[i] = ReadPersistent;
				return;
			case ReadFixable:
				if (conf->modes[i] == WritePersistent ||
				    conf->modes[i] == ReadPersistent)
					conf->modes[i] = AllPersist;
				else
					conf->modes[i] = ReadFixable;
				return;
			}
		} else if (conf->modes[i] == NoPersist)
			n = i;

	if (n >= MaxFault)
		return;
	conf->faults[n] = start;
	conf->modes[n] = mode;
	if (conf->nfaults == n)
		conf->nfaults = n+1;
}

/*
 * md make_request entry point: decide whether this bio should fail,
 * either because a transient/persistent mode fires or because it hits
 * a previously recorded faulty sector, then route it to the underlying
 * device (WriteAll failures never reach the device at all).
 */
static int make_request(struct request_queue *q, struct bio *bio)
{
	mddev_t *mddev = q->queuedata;
	conf_t *conf = (conf_t*)mddev->private;
	int failit = 0;

	if (bio_data_dir(bio) == WRITE) {
		/* write request */
		if (atomic_read(&conf->counters[WriteAll])) {
			/* special case - don't decrement, don't generic_make_request,
			 * just fail immediately
			 */
			bio_endio(bio, -EIO);
			return 0;
		}

		if (check_sector(conf, bio->bi_sector, bio->bi_sector+(bio->bi_size>>9),
				 WRITE))
			failit = 1;
		if (check_mode(conf, WritePersistent)) {
			add_sector(conf, bio->bi_sector, WritePersistent);
			failit = 1;
		}
		if (check_mode(conf, WriteTransient))
			failit = 1;
	} else {
		/* read request */
		if (check_sector(conf, bio->bi_sector, bio->bi_sector + (bio->bi_size>>9),
				 READ))
			failit = 1;
		if (check_mode(conf, ReadTransient))
			failit = 1;
		if (check_mode(conf, ReadPersistent)) {
			add_sector(conf, bio->bi_sector, ReadPersistent);
			failit = 1;
		}
		if (check_mode(conf, ReadFixable)) {
			add_sector(conf, bio->bi_sector, ReadFixable);
			failit = 1;
		}
	}
	if (failit) {
		struct bio *b = bio_clone(bio, GFP_NOIO);
		b->bi_bdev = conf->rdev->bdev;
		b->bi_private = bio;
		b->bi_end_io = faulty_fail;
		generic_make_request(b);
		return 0;
	} else {
		bio->bi_bdev = conf->rdev->bdev;
		return 1;
	}
}

static void status(struct seq_file *seq, mddev_t *mddev)
{
	conf_t *conf = (conf_t*)mddev->private;
	int n;

	if ((n=atomic_read(&conf->counters[WriteTransient])) != 0)
		seq_printf(seq, " WriteTransient=%d(%d)",
			   n, conf->period[WriteTransient]);

	if ((n=atomic_read(&conf->counters[ReadTransient])) != 0)
		seq_printf(seq, " ReadTransient=%d(%d)",
			   n, conf->period[ReadTransient]);

	if ((n=atomic_read(&conf->counters[WritePersistent])) != 0)
		seq_printf(seq, " WritePersistent=%d(%d)",
			   n, conf->period[WritePersistent]);

	if ((n=atomic_read(&conf->counters[ReadPersistent])) != 0)
		seq_printf(seq, " ReadPersistent=%d(%d)",
			   n, conf->period[ReadPersistent]);


	if ((n=atomic_read(&conf->counters[ReadFixable])) != 0)
		seq_printf(seq, " ReadFixable=%d(%d)",
			   n, conf->period[ReadFixable]);

	if ((n=atomic_read(&conf->counters[WriteAll])) != 0)
		seq_printf(seq, " WriteAll");

	seq_printf(seq, " nfaults=%d", conf->nfaults);
}


/*
 * Apply a new "layout": decode the mode from the low bits and the
 * period from the rest, then either clear the recorded faults or the
 * error counters, or arm the requested failure mode.
 */
static int reshape(mddev_t *mddev)
{
	int mode = mddev->new_layout & ModeMask;
	int count = mddev->new_layout >> ModeShift;
	conf_t *conf = mddev->private;

	if (mddev->new_layout < 0)
		return 0;

	/* new layout */
	if (mode == ClearFaults)
		conf->nfaults = 0;
	else if (mode == ClearErrors) {
		int i;
		for (i=0 ; i < Modes ; i++) {
			conf->period[i] = 0;
			atomic_set(&conf->counters[i], 0);
		}
	} else if (mode < Modes) {
		conf->period[mode] = count;
		if (!count) count++;
		atomic_set(&conf->counters[mode], count);
	} else
		return -EINVAL;
	mddev->new_layout = -1;
	mddev->layout = -1; /* makes sure further changes come through */
	return 0;
}

static sector_t faulty_size(mddev_t *mddev, sector_t sectors, int raid_disks)
{
	WARN_ONCE(raid_disks,
		  "%s does not support generic reshape\n", __func__);

	if (sectors == 0)
		return mddev->dev_sectors;

	return sectors;
}

static int run(mddev_t *mddev)
{
	mdk_rdev_t *rdev;
	int i;
	conf_t *conf;

	if (md_check_no_bitmap(mddev))
		return -EINVAL;

	conf = kmalloc(sizeof(*conf), GFP_KERNEL);
	if (!conf)
		return -ENOMEM;

	for (i=0; i<Modes; i++) {
		atomic_set(&conf->counters[i], 0);
		conf->period[i] = 0;
	}
	conf->nfaults = 0;

	list_for_each_entry(rdev, &mddev->disks, same_set)
		conf->rdev = rdev;

	md_set_array_sectors(mddev, faulty_size(mddev, 0, 0));
	mddev->private = conf;

	reshape(mddev);

	return 0;
}

static int stop(mddev_t *mddev)
{
	conf_t *conf = (conf_t *)mddev->private;

	kfree(conf);
	mddev->private = NULL;
	return 0;
}

static struct mdk_personality faulty_personality =
{
	.name		= "faulty",
	.level		= LEVEL_FAULTY,
	.owner		= THIS_MODULE,
	.make_request	= make_request,
	.run		= run,
	.stop		= stop,
	.status		= status,
	.check_reshape	= reshape,
	.size		= faulty_size,
};

static int __init raid_init(void)
{
	return register_md_personality(&faulty_personality);
}

static void raid_exit(void)
{
	unregister_md_personality(&faulty_personality);
}

module_init(raid_init);
module_exit(raid_exit);
MODULE_LICENSE("GPL");
MODULE_ALIAS("md-personality-10"); /* faulty */
MODULE_ALIAS("md-faulty");
MODULE_ALIAS("md-level--5");