Revision d80e731ecab420ddcb79ee9d0ac427acbc187b4b authored by Oleg Nesterov on 24 February 2012, 19:07:11 UTC, committed by Linus Torvalds on 24 February 2012, 19:42:50 UTC
This patch is intentionally incomplete to simplify the review.
It ignores ep_unregister_pollwait() which plays with the same wqh.
See the next change.

epoll assumes that the EPOLL_CTL_ADD'ed file controls everything
f_op->poll() needs. In particular it assumes that the wait queue
can't go away until eventpoll_release(). This is not true in case
of signalfd, the task which does EPOLL_CTL_ADD uses its ->sighand
which is not connected to the file.

This patch adds the special event, POLLFREE, currently only for
epoll. It expects that init_poll_funcptr()'ed hook should do the
necessary cleanup. Perhaps it should be defined as EPOLLFREE in
eventpoll.

__cleanup_sighand() is changed to do wake_up_poll(POLLFREE) if
->signalfd_wqh is not empty, we add the new signalfd_cleanup()
helper.

ep_poll_callback(POLLFREE) simply does list_del_init(task_list).
This make this poll entry inconsistent, but we don't care. If you
share epoll fd which contains our sigfd with another process you
should blame yourself. signalfd is "really special". I simply do
not know how we can define the "right" semantics if it used with
epoll.

The main problem is, epoll calls signalfd_poll() once to establish
the connection with the wait queue, after that signalfd_poll(NULL)
returns the different/inconsistent results depending on who does
EPOLL_CTL_MOD/signalfd_read/etc. IOW: apart from sigmask, signalfd
has nothing to do with the file, it works with the current thread.

In short: this patch is the hack which tries to fix the symptoms.
It also assumes that nobody can take tasklist_lock under epoll
locks, this seems to be true.

Note:

	- we do not have wake_up_all_poll() but wake_up_poll()
	  is fine, poll/epoll doesn't use WQ_FLAG_EXCLUSIVE.

	- signalfd_cleanup() uses POLLHUP along with POLLFREE,
	  we need a couple of simple changes in eventpoll.c to
	  make sure it can't be "lost".

Reported-by: Maxime Bizon <mbizon@freebox.fr>
Cc: <stable@kernel.org>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
1 parent 855a85f
Raw File
quicklist.c
/*
 * Quicklist support.
 *
 * Quicklists are light weight lists of pages that have a defined state
 * on alloc and free. Pages must be in the quicklist specific defined state
 * (zero by default) when the page is freed. It seems that the initial idea
 * for such lists first came from Dave Miller and then various other people
 * improved on it.
 *
 * Copyright (C) 2007 SGI,
 * 	Christoph Lameter <clameter@sgi.com>
 * 		Generalized, added support for multiple lists and
 * 		constructors / destructors.
 */
#include <linux/kernel.h>

#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/quicklist.h>

DEFINE_PER_CPU(struct quicklist [CONFIG_NR_QUICK], quicklist);

#define FRACTION_OF_NODE_MEM	16

static unsigned long max_pages(unsigned long min_pages)
{
	unsigned long node_free_pages, max;
	int node = numa_node_id();
	struct zone *zones = NODE_DATA(node)->node_zones;
	int num_cpus_on_node;

	node_free_pages =
#ifdef CONFIG_ZONE_DMA
		zone_page_state(&zones[ZONE_DMA], NR_FREE_PAGES) +
#endif
#ifdef CONFIG_ZONE_DMA32
		zone_page_state(&zones[ZONE_DMA32], NR_FREE_PAGES) +
#endif
		zone_page_state(&zones[ZONE_NORMAL], NR_FREE_PAGES);

	max = node_free_pages / FRACTION_OF_NODE_MEM;

	num_cpus_on_node = cpumask_weight(cpumask_of_node(node));
	max /= num_cpus_on_node;

	return max(max, min_pages);
}

static long min_pages_to_free(struct quicklist *q,
	unsigned long min_pages, long max_free)
{
	long pages_to_free;

	pages_to_free = q->nr_pages - max_pages(min_pages);

	return min(pages_to_free, max_free);
}

/*
 * Trim down the number of pages in the quicklist
 */
void quicklist_trim(int nr, void (*dtor)(void *),
	unsigned long min_pages, unsigned long max_free)
{
	long pages_to_free;
	struct quicklist *q;

	q = &get_cpu_var(quicklist)[nr];
	if (q->nr_pages > min_pages) {
		pages_to_free = min_pages_to_free(q, min_pages, max_free);

		while (pages_to_free > 0) {
			/*
			 * We pass a gfp_t of 0 to quicklist_alloc here
			 * because we will never call into the page allocator.
			 */
			void *p = quicklist_alloc(nr, 0, NULL);

			if (dtor)
				dtor(p);
			free_page((unsigned long)p);
			pages_to_free--;
		}
	}
	put_cpu_var(quicklist);
}

unsigned long quicklist_total_size(void)
{
	unsigned long count = 0;
	int cpu;
	struct quicklist *ql, *q;

	for_each_online_cpu(cpu) {
		ql = per_cpu(quicklist, cpu);
		for (q = ql; q < ql + CONFIG_NR_QUICK; q++)
			count += q->nr_pages;
	}
	return count;
}

back to top