Revision d80e731ecab420ddcb79ee9d0ac427acbc187b4b authored by Oleg Nesterov on 24 February 2012, 19:07:11 UTC, committed by Linus Torvalds on 24 February 2012, 19:42:50 UTC
This patch is intentionally incomplete to simplify the review.
It ignores ep_unregister_pollwait() which plays with the same wqh.
See the next change.

epoll assumes that the EPOLL_CTL_ADD'ed file controls everything
f_op->poll() needs. In particular it assumes that the wait queue
can't go away until eventpoll_release(). This is not true in case
of signalfd, the task which does EPOLL_CTL_ADD uses its ->sighand
which is not connected to the file.

This patch adds the special event, POLLFREE, currently only for
epoll. It expects that init_poll_funcptr()'ed hook should do the
necessary cleanup. Perhaps it should be defined as EPOLLFREE in
eventpoll.

__cleanup_sighand() is changed to do wake_up_poll(POLLFREE) if
->signalfd_wqh is not empty, we add the new signalfd_cleanup()
helper.

ep_poll_callback(POLLFREE) simply does list_del_init(task_list).
This make this poll entry inconsistent, but we don't care. If you
share epoll fd which contains our sigfd with another process you
should blame yourself. signalfd is "really special". I simply do
not know how we can define the "right" semantics if it used with
epoll.

The main problem is, epoll calls signalfd_poll() once to establish
the connection with the wait queue, after that signalfd_poll(NULL)
returns the different/inconsistent results depending on who does
EPOLL_CTL_MOD/signalfd_read/etc. IOW: apart from sigmask, signalfd
has nothing to do with the file, it works with the current thread.

In short: this patch is the hack which tries to fix the symptoms.
It also assumes that nobody can take tasklist_lock under epoll
locks, this seems to be true.

Note:

	- we do not have wake_up_all_poll() but wake_up_poll()
	  is fine, poll/epoll doesn't use WQ_FLAG_EXCLUSIVE.

	- signalfd_cleanup() uses POLLHUP along with POLLFREE,
	  we need a couple of simple changes in eventpoll.c to
	  make sure it can't be "lost".

Reported-by: Maxime Bizon <mbizon@freebox.fr>
Cc: <stable@kernel.org>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
1 parent 855a85f
Raw File
thrash.c
/*
 * mm/thrash.c
 *
 * Copyright (C) 2004, Red Hat, Inc.
 * Copyright (C) 2004, Rik van Riel <riel@redhat.com>
 * Released under the GPL, see the file COPYING for details.
 *
 * Simple token based thrashing protection, using the algorithm
 * described in: http://www.cse.ohio-state.edu/hpcs/WWW/HTML/publications/abs05-1.html
 *
 * Sep 2006, Ashwin Chaugule <ashwin.chaugule@celunite.com>
 * Improved algorithm to pass token:
 * Each task has a priority which is incremented if it contended
 * for the token in an interval less than its previous attempt.
 * If the token is acquired, that task's priority is boosted to prevent
 * the token from bouncing around too often and to let the task make
 * some progress in its execution.
 */

#include <linux/jiffies.h>
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/swap.h>
#include <linux/memcontrol.h>

#include <trace/events/vmscan.h>

#define TOKEN_AGING_INTERVAL	(0xFF)

static DEFINE_SPINLOCK(swap_token_lock);
struct mm_struct *swap_token_mm;
static struct mem_cgroup *swap_token_memcg;

#ifdef CONFIG_CGROUP_MEM_RES_CTLR
static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm)
{
	struct mem_cgroup *memcg;

	memcg = try_get_mem_cgroup_from_mm(mm);
	if (memcg)
		css_put(mem_cgroup_css(memcg));

	return memcg;
}
#else
static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm)
{
	return NULL;
}
#endif

void grab_swap_token(struct mm_struct *mm)
{
	int current_interval;
	unsigned int old_prio = mm->token_priority;
	static unsigned int global_faults;
	static unsigned int last_aging;

	global_faults++;

	current_interval = global_faults - mm->faultstamp;

	if (!spin_trylock(&swap_token_lock))
		return;

	/* First come first served */
	if (!swap_token_mm)
		goto replace_token;

	/*
	 * Usually, we don't need priority aging because long interval faults
	 * makes priority decrease quickly. But there is one exception. If the
	 * token owner task is sleeping, it never make long interval faults.
	 * Thus, we need a priority aging mechanism instead. The requirements
	 * of priority aging are
	 *  1) An aging interval is reasonable enough long. Too short aging
	 *     interval makes quick swap token lost and decrease performance.
	 *  2) The swap token owner task have to get priority aging even if
	 *     it's under sleep.
	 */
	if ((global_faults - last_aging) > TOKEN_AGING_INTERVAL) {
		swap_token_mm->token_priority /= 2;
		last_aging = global_faults;
	}

	if (mm == swap_token_mm) {
		mm->token_priority += 2;
		goto update_priority;
	}

	if (current_interval < mm->last_interval)
		mm->token_priority++;
	else {
		if (likely(mm->token_priority > 0))
			mm->token_priority--;
	}

	/* Check if we deserve the token */
	if (mm->token_priority > swap_token_mm->token_priority)
		goto replace_token;

update_priority:
	trace_update_swap_token_priority(mm, old_prio, swap_token_mm);

out:
	mm->faultstamp = global_faults;
	mm->last_interval = current_interval;
	spin_unlock(&swap_token_lock);
	return;

replace_token:
	mm->token_priority += 2;
	trace_replace_swap_token(swap_token_mm, mm);
	swap_token_mm = mm;
	swap_token_memcg = swap_token_memcg_from_mm(mm);
	last_aging = global_faults;
	goto out;
}

/* Called on process exit. */
void __put_swap_token(struct mm_struct *mm)
{
	spin_lock(&swap_token_lock);
	if (likely(mm == swap_token_mm)) {
		trace_put_swap_token(swap_token_mm);
		swap_token_mm = NULL;
		swap_token_memcg = NULL;
	}
	spin_unlock(&swap_token_lock);
}

static bool match_memcg(struct mem_cgroup *a, struct mem_cgroup *b)
{
	if (!a)
		return true;
	if (!b)
		return true;
	if (a == b)
		return true;
	return false;
}

void disable_swap_token(struct mem_cgroup *memcg)
{
	/* memcg reclaim don't disable unrelated mm token. */
	if (match_memcg(memcg, swap_token_memcg)) {
		spin_lock(&swap_token_lock);
		if (match_memcg(memcg, swap_token_memcg)) {
			trace_disable_swap_token(swap_token_mm);
			swap_token_mm = NULL;
			swap_token_memcg = NULL;
		}
		spin_unlock(&swap_token_lock);
	}
}
back to top