Revision f0a3a24b532d9a7e56a33c5112b2a212ed6ec580 authored by Roman Gushchin on 12 July 2019, 03:56:27 UTC, committed by Linus Torvalds on 12 July 2019, 18:05:44 UTC
Currently each charged slab page holds a reference to the cgroup to which
it's charged.  Kmem_caches are held by the memcg and are released all
together with the memory cgroup.  It means that none of the kmem_caches are
released as long as at least one reference to the memcg exists, which is
very far from optimal.

Let's rework it in a way that allows releasing individual kmem_caches as
soon as the cgroup is offline, the kmem_cache is empty and there are no
pending allocations.

To make it possible, let's introduce a new percpu refcounter for non-root
kmem caches.  The counter is initialized to the percpu mode, and is
switched to the atomic mode during kmem_cache deactivation.  The counter
is bumped for every charged page and also for every running allocation.
So the kmem_cache can't be released unless all allocations complete.

To shut down non-active empty kmem_caches, let's reuse the work queue,
previously used for the kmem_cache deactivation.  Once the reference
counter reaches 0, let's schedule an asynchronous kmem_cache release.

* I used the following simple approach to test the performance
(stolen from another patchset by T. Harding):

    time find / -name fname-no-exist
    echo 2 > /proc/sys/vm/drop_caches
    repeat 10 times

Results:

        orig		patched

real	0m1.455s	real	0m1.355s
user	0m0.206s	user	0m0.219s
sys	0m0.855s	sys	0m0.807s

real	0m1.487s	real	0m1.699s
user	0m0.221s	user	0m0.256s
sys	0m0.806s	sys	0m0.948s

real	0m1.515s	real	0m1.505s
user	0m0.183s	user	0m0.215s
sys	0m0.876s	sys	0m0.858s

real	0m1.291s	real	0m1.380s
user	0m0.193s	user	0m0.198s
sys	0m0.843s	sys	0m0.786s

real	0m1.364s	real	0m1.374s
user	0m0.180s	user	0m0.182s
sys	0m0.868s	sys	0m0.806s

real	0m1.352s	real	0m1.312s
user	0m0.201s	user	0m0.212s
sys	0m0.820s	sys	0m0.761s

real	0m1.302s	real	0m1.349s
user	0m0.205s	user	0m0.203s
sys	0m0.803s	sys	0m0.792s

real	0m1.334s	real	0m1.301s
user	0m0.194s	user	0m0.201s
sys	0m0.806s	sys	0m0.779s

real	0m1.426s	real	0m1.434s
user	0m0.216s	user	0m0.181s
sys	0m0.824s	sys	0m0.864s

real	0m1.350s	real	0m1.295s
user	0m0.200s	user	0m0.190s
sys	0m0.842s	sys	0m0.811s

So it looks like the difference is not noticeable in this test.

[cai@lca.pw: fix a use-after-free in kmemcg_workfn()]
  Link: http://lkml.kernel.org/r/1560977573-10715-1-git-send-email-cai@lca.pw
Link: http://lkml.kernel.org/r/20190611231813.3148843-9-guro@fb.com
Signed-off-by: Roman Gushchin <guro@fb.com>
Signed-off-by: Qian Cai <cai@lca.pw>
Acked-by: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Waiman Long <longman@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Andrei Vagin <avagin@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
1 parent 63b02ef
Raw File
iomap.c
// SPDX-License-Identifier: GPL-2.0
/*
 * Implement the default iomap interfaces
 *
 * (C) Copyright 2004 Linus Torvalds
 */
#include <linux/pci.h>
#include <linux/io.h>

#include <linux/export.h>

/*
 * Read/write from/to an (offsettable) iomem cookie. It might be a PIO
 * access or a MMIO access, these functions don't care. The info is
 * encoded in the hardware mapping set up by the mapping functions
 * (or the cookie itself, depending on implementation and hw).
 *
 * The generic routines don't assume any hardware mappings, and just
 * encode the PIO/MMIO as part of the cookie. They coldly assume that
 * the MMIO IO mappings are not in the low address range.
 *
 * Architectures for which this is not true can't use this generic
 * implementation and should do their own copy.
 */

#ifndef HAVE_ARCH_PIO_SIZE
/*
 * We encode the physical PIO addresses (0-0xffff) into the
 * pointer by offsetting them with a constant (0x10000) and
 * assuming that all the low addresses are always PIO. That means
 * we can do some sanity checks on the low bits, and don't
 * need to just take things for granted.
 *
 * These defaults are skipped when HAVE_ARCH_PIO_SIZE is defined; the
 * code below still uses all three names, so they must then come from
 * elsewhere.
 */
/* Added to a port number by ioport_map() to form the cookie. */
#define PIO_OFFSET	0x10000UL
/* Applied to a PIO cookie by IO_COND() to recover the port number. */
#define PIO_MASK	0x0ffffUL
/* Cookies at or above this value are treated as MMIO by IO_COND(). */
#define PIO_RESERVED	0x40000UL
#endif

/*
 * Report an access made through a cookie that IO_COND() could classify
 * as neither PIO nor MMIO. Only the first ten calls emit a warning; later
 * ones are silently ignored.
 *
 * @port:   the raw cookie value that was rejected
 * @access: text describing the attempted access
 */
static void bad_io_access(unsigned long port, const char *access)
{
	static int warnings_left = 10;

	if (!warnings_left)
		return;

	warnings_left--;
	WARN(1, KERN_ERR "Bad IO access at port %#lx (%s)\n", port, access);
}

/*
 * Ugly macros are a way of life.
 *
 * IO_COND() dispatches on the numeric value of the cookie @addr:
 *  - at or above PIO_RESERVED: run @is_mmio;
 *  - above PIO_OFFSET (and below PIO_RESERVED): reduce the local
 *    variable "port" to the port number with PIO_MASK, then run @is_pio;
 *  - anything else: call bad_io_access() with the cookie value and the
 *    stringified source text of @is_pio.
 *
 * @is_pio and @is_mmio are pasted in as statements, so they may refer to
 * the local "port" and may "return" from the function using the macro.
 * Because @is_pio is stringified, its exact spelling is visible at runtime.
 *
 * NOTE(review): the PIO test is a strict '>', so a cookie equal to
 * PIO_OFFSET (what ioport_map() produces for port 0) takes the
 * bad_io_access() path - confirm this is intended.
 */
#define IO_COND(addr, is_pio, is_mmio) do {			\
	unsigned long port = (unsigned long __force)addr;	\
	if (port >= PIO_RESERVED) {				\
		is_mmio;					\
	} else if (port > PIO_OFFSET) {				\
		port &= PIO_MASK;				\
		is_pio;						\
	} else							\
		bad_io_access(port, #is_pio );			\
} while (0)

/*
 * Default big-endian port reads, used only when pio_read16be is not
 * already defined: byte-swap the value returned by inw()/inl().
 */
#ifndef pio_read16be
#define pio_read16be(port) swab16(inw(port))
#define pio_read32be(port) swab32(inl(port))
#endif

/*
 * Default big-endian MMIO reads, used only when mmio_read16be is not
 * already defined: byte-swap the value returned by readw()/readl()/readq().
 */
#ifndef mmio_read16be
#define mmio_read16be(addr) swab16(readw(addr))
#define mmio_read32be(addr) swab32(readl(addr))
#define mmio_read64be(addr) swab64(readq(addr))
#endif

/*
 * Read 8 bits through an iomap cookie: inb() for a PIO cookie, readb()
 * for an MMIO cookie. For any other cookie IO_COND() reports a bad access
 * and 0xff is returned.
 */
unsigned int ioread8(void __iomem *addr)
{
	IO_COND(addr, return inb(port), return readb(addr));
	return 0xff;
}
/*
 * Read 16 bits through an iomap cookie: inw() for a PIO cookie, readw()
 * for an MMIO cookie. For any other cookie IO_COND() reports a bad access
 * and 0xffff is returned.
 */
unsigned int ioread16(void __iomem *addr)
{
	IO_COND(addr, return inw(port), return readw(addr));
	return 0xffff;
}
/*
 * Big-endian 16-bit read through an iomap cookie: pio_read16be() for a
 * PIO cookie, mmio_read16be() for an MMIO cookie. For any other cookie
 * IO_COND() reports a bad access and 0xffff is returned.
 */
unsigned int ioread16be(void __iomem *addr)
{
	IO_COND(addr, return pio_read16be(port), return mmio_read16be(addr));
	return 0xffff;
}
/*
 * Read 32 bits through an iomap cookie: inl() for a PIO cookie, readl()
 * for an MMIO cookie. For any other cookie IO_COND() reports a bad access
 * and 0xffffffff is returned.
 */
unsigned int ioread32(void __iomem *addr)
{
	IO_COND(addr, return inl(port), return readl(addr));
	return 0xffffffff;
}
/*
 * Big-endian 32-bit read through an iomap cookie: pio_read32be() for a
 * PIO cookie, mmio_read32be() for an MMIO cookie. For any other cookie
 * IO_COND() reports a bad access and 0xffffffff is returned.
 */
unsigned int ioread32be(void __iomem *addr)
{
	IO_COND(addr, return pio_read32be(port), return mmio_read32be(addr));
	return 0xffffffff;
}
EXPORT_SYMBOL(ioread8);
EXPORT_SYMBOL(ioread16);
EXPORT_SYMBOL(ioread16be);
EXPORT_SYMBOL(ioread32);
EXPORT_SYMBOL(ioread32be);

#ifdef readq
/*
 * Build a 64-bit value from two 32-bit port reads: the low half from
 * @port is read first, then the high half from @port + sizeof(u32).
 */
static u64 pio_read64_lo_hi(unsigned long port)
{
	u64 low_half = inl(port);
	u64 high_half = inl(port + sizeof(u32));

	return (high_half << 32) | low_half;
}

/*
 * Build a 64-bit value from two 32-bit port reads: the high half from
 * @port + sizeof(u32) is read first, then the low half from @port.
 */
static u64 pio_read64_hi_lo(unsigned long port)
{
	u64 high_half = inl(port + sizeof(u32));
	u64 low_half = inl(port);

	return (high_half << 32) | low_half;
}

/*
 * Big-endian 64-bit port read done as two pio_read32be() calls: the low
 * half from @port + sizeof(u32) is read first, then the high half from
 * @port.
 */
static u64 pio_read64be_lo_hi(unsigned long port)
{
	u64 low_half = pio_read32be(port + sizeof(u32));
	u64 high_half = pio_read32be(port);

	return (high_half << 32) | low_half;
}

/*
 * Big-endian 64-bit port read done as two pio_read32be() calls: the high
 * half from @port is read first, then the low half from
 * @port + sizeof(u32).
 */
static u64 pio_read64be_hi_lo(unsigned long port)
{
	u64 high_half = pio_read32be(port);
	u64 low_half = pio_read32be(port + sizeof(u32));

	return (high_half << 32) | low_half;
}

/*
 * Read 64 bits through an iomap cookie: readq() for an MMIO cookie,
 * pio_read64_lo_hi() (low half first) for a PIO cookie. For any other
 * cookie IO_COND() reports a bad access and all-ones is returned.
 */
u64 ioread64_lo_hi(void __iomem *addr)
{
	IO_COND(addr, return pio_read64_lo_hi(port), return readq(addr));
	return 0xffffffffffffffffULL;
}

/*
 * Read 64 bits through an iomap cookie: readq() for an MMIO cookie,
 * pio_read64_hi_lo() (high half first) for a PIO cookie. For any other
 * cookie IO_COND() reports a bad access and all-ones is returned.
 */
u64 ioread64_hi_lo(void __iomem *addr)
{
	IO_COND(addr, return pio_read64_hi_lo(port), return readq(addr));
	return 0xffffffffffffffffULL;
}

/*
 * Big-endian 64-bit read through an iomap cookie: mmio_read64be() for an
 * MMIO cookie, pio_read64be_lo_hi() (low half first) for a PIO cookie.
 * For any other cookie IO_COND() reports a bad access and all-ones is
 * returned.
 */
u64 ioread64be_lo_hi(void __iomem *addr)
{
	IO_COND(addr, return pio_read64be_lo_hi(port),
		return mmio_read64be(addr));
	return 0xffffffffffffffffULL;
}

/*
 * Big-endian 64-bit read through an iomap cookie: mmio_read64be() for an
 * MMIO cookie, pio_read64be_hi_lo() (high half first) for a PIO cookie.
 * For any other cookie IO_COND() reports a bad access and all-ones is
 * returned.
 */
u64 ioread64be_hi_lo(void __iomem *addr)
{
	IO_COND(addr, return pio_read64be_hi_lo(port),
		return mmio_read64be(addr));
	return 0xffffffffffffffffULL;
}

EXPORT_SYMBOL(ioread64_lo_hi);
EXPORT_SYMBOL(ioread64_hi_lo);
EXPORT_SYMBOL(ioread64be_lo_hi);
EXPORT_SYMBOL(ioread64be_hi_lo);

#endif /* readq */

/*
 * Default big-endian port writes, used only when pio_write16be is not
 * already defined: byte-swap the value before handing it to outw()/outl().
 */
#ifndef pio_write16be
#define pio_write16be(val,port) outw(swab16(val),port)
#define pio_write32be(val,port) outl(swab32(val),port)
#endif

/*
 * Default big-endian MMIO writes, used only when mmio_write16be is not
 * already defined: byte-swap the value before handing it to
 * writew()/writel()/writeq().
 */
#ifndef mmio_write16be
#define mmio_write16be(val,port) writew(swab16(val),port)
#define mmio_write32be(val,port) writel(swab32(val),port)
#define mmio_write64be(val,port) writeq(swab64(val),port)
#endif

/*
 * Write 8 bits through an iomap cookie: outb() for a PIO cookie, writeb()
 * for an MMIO cookie. Any other cookie is only reported by IO_COND().
 */
void iowrite8(u8 val, void __iomem *addr)
{
	IO_COND(addr, outb(val,port), writeb(val, addr));
}
/*
 * Write 16 bits through an iomap cookie: outw() for a PIO cookie, writew()
 * for an MMIO cookie. Any other cookie is only reported by IO_COND().
 */
void iowrite16(u16 val, void __iomem *addr)
{
	IO_COND(addr, outw(val,port), writew(val, addr));
}
/*
 * Big-endian 16-bit write through an iomap cookie: pio_write16be() for a
 * PIO cookie, mmio_write16be() for an MMIO cookie. Any other cookie is
 * only reported by IO_COND().
 */
void iowrite16be(u16 val, void __iomem *addr)
{
	IO_COND(addr, pio_write16be(val,port), mmio_write16be(val, addr));
}
/*
 * Write 32 bits through an iomap cookie: outl() for a PIO cookie, writel()
 * for an MMIO cookie. Any other cookie is only reported by IO_COND().
 */
void iowrite32(u32 val, void __iomem *addr)
{
	IO_COND(addr, outl(val,port), writel(val, addr));
}
/*
 * Big-endian 32-bit write through an iomap cookie: pio_write32be() for a
 * PIO cookie, mmio_write32be() for an MMIO cookie. Any other cookie is
 * only reported by IO_COND().
 */
void iowrite32be(u32 val, void __iomem *addr)
{
	IO_COND(addr, pio_write32be(val,port), mmio_write32be(val, addr));
}
EXPORT_SYMBOL(iowrite8);
EXPORT_SYMBOL(iowrite16);
EXPORT_SYMBOL(iowrite16be);
EXPORT_SYMBOL(iowrite32);
EXPORT_SYMBOL(iowrite32be);

#ifdef writeq
/*
 * Emit a 64-bit value as two 32-bit port writes: the low half goes to
 * @port first, then the high half goes to @port + sizeof(u32).
 */
static void pio_write64_lo_hi(u64 val, unsigned long port)
{
	const u32 low_half = val;
	const u32 high_half = val >> 32;

	outl(low_half, port);
	outl(high_half, port + sizeof(u32));
}

/*
 * Emit a 64-bit value as two 32-bit port writes: the high half goes to
 * @port + sizeof(u32) first, then the low half goes to @port.
 */
static void pio_write64_hi_lo(u64 val, unsigned long port)
{
	const u32 low_half = val;
	const u32 high_half = val >> 32;

	outl(high_half, port + sizeof(u32));
	outl(low_half, port);
}

/*
 * Big-endian 64-bit port write done as two pio_write32be() calls: the low
 * half goes to @port + sizeof(u32) first, then the high half goes to
 * @port.
 */
static void pio_write64be_lo_hi(u64 val, unsigned long port)
{
	const u32 low_half = val;
	const u32 high_half = val >> 32;

	pio_write32be(low_half, port + sizeof(u32));
	pio_write32be(high_half, port);
}

/*
 * Big-endian 64-bit port write done as two pio_write32be() calls: the high
 * half goes to @port first, then the low half goes to
 * @port + sizeof(u32).
 */
static void pio_write64be_hi_lo(u64 val, unsigned long port)
{
	const u32 low_half = val;
	const u32 high_half = val >> 32;

	pio_write32be(high_half, port);
	pio_write32be(low_half, port + sizeof(u32));
}

/*
 * Write 64 bits through an iomap cookie: writeq() for an MMIO cookie,
 * pio_write64_lo_hi() (low half first) for a PIO cookie. Any other cookie
 * is only reported by IO_COND().
 */
void iowrite64_lo_hi(u64 val, void __iomem *addr)
{
	IO_COND(addr, pio_write64_lo_hi(val, port),
		writeq(val, addr));
}

/*
 * Write 64 bits through an iomap cookie: writeq() for an MMIO cookie,
 * pio_write64_hi_lo() (high half first) for a PIO cookie. Any other cookie
 * is only reported by IO_COND().
 */
void iowrite64_hi_lo(u64 val, void __iomem *addr)
{
	IO_COND(addr, pio_write64_hi_lo(val, port),
		writeq(val, addr));
}

/*
 * Big-endian 64-bit write through an iomap cookie: mmio_write64be() for an
 * MMIO cookie, pio_write64be_lo_hi() (low half first) for a PIO cookie.
 * Any other cookie is only reported by IO_COND().
 */
void iowrite64be_lo_hi(u64 val, void __iomem *addr)
{
	IO_COND(addr, pio_write64be_lo_hi(val, port),
		mmio_write64be(val, addr));
}

/*
 * Big-endian 64-bit write through an iomap cookie: mmio_write64be() for an
 * MMIO cookie, pio_write64be_hi_lo() (high half first) for a PIO cookie.
 * Any other cookie is only reported by IO_COND().
 */
void iowrite64be_hi_lo(u64 val, void __iomem *addr)
{
	IO_COND(addr, pio_write64be_hi_lo(val, port),
		mmio_write64be(val, addr));
}

EXPORT_SYMBOL(iowrite64_lo_hi);
EXPORT_SYMBOL(iowrite64_hi_lo);
EXPORT_SYMBOL(iowrite64be_lo_hi);
EXPORT_SYMBOL(iowrite64be_hi_lo);

#endif /* writeq */

/*
 * These are the "repeat MMIO read/write" functions.
 * Note the "__raw" accesses, since we don't want to
 * convert to CPU byte order. We write in "IO byte
 * order" (we also don't have IO barriers).
 */
#ifndef mmio_insb
/*
 * Fill @dst with @count bytes, each obtained by a raw read of the same
 * MMIO address @addr. A @count of zero or less reads nothing.
 */
static inline void mmio_insb(void __iomem *addr, u8 *dst, int count)
{
	int i;

	for (i = 0; i < count; i++)
		dst[i] = __raw_readb(addr);
}
/*
 * Fill @dst with @count 16-bit values, each obtained by a raw read of the
 * same MMIO address @addr. A @count of zero or less reads nothing.
 */
static inline void mmio_insw(void __iomem *addr, u16 *dst, int count)
{
	int i;

	for (i = 0; i < count; i++)
		dst[i] = __raw_readw(addr);
}
/*
 * Fill @dst with @count 32-bit values, each obtained by a raw read of the
 * same MMIO address @addr. A @count of zero or less reads nothing.
 */
static inline void mmio_insl(void __iomem *addr, u32 *dst, int count)
{
	int i;

	for (i = 0; i < count; i++)
		dst[i] = __raw_readl(addr);
}
#endif

#ifndef mmio_outsb
/*
 * Send @count bytes from @src, one raw write each, to the same MMIO
 * address @addr. A @count of zero or less writes nothing.
 */
static inline void mmio_outsb(void __iomem *addr, const u8 *src, int count)
{
	int i;

	for (i = 0; i < count; i++)
		__raw_writeb(src[i], addr);
}
/*
 * Send @count 16-bit values from @src, one raw write each, to the same
 * MMIO address @addr. A @count of zero or less writes nothing.
 */
static inline void mmio_outsw(void __iomem *addr, const u16 *src, int count)
{
	int i;

	for (i = 0; i < count; i++)
		__raw_writew(src[i], addr);
}
/*
 * Send @count 32-bit values from @src, one raw write each, to the same
 * MMIO address @addr. A @count of zero or less writes nothing.
 */
static inline void mmio_outsl(void __iomem *addr, const u32 *src, int count)
{
	int i;

	for (i = 0; i < count; i++)
		__raw_writel(src[i], addr);
}
#endif

/*
 * Repeated 8-bit read through an iomap cookie into @dst: insb() for a PIO
 * cookie, mmio_insb() for an MMIO cookie, both given @count. Any other
 * cookie is only reported by IO_COND().
 */
void ioread8_rep(void __iomem *addr, void *dst, unsigned long count)
{
	IO_COND(addr, insb(port,dst,count), mmio_insb(addr, dst, count));
}
/*
 * Repeated 16-bit read through an iomap cookie into @dst: insw() for a PIO
 * cookie, mmio_insw() for an MMIO cookie, both given @count. Any other
 * cookie is only reported by IO_COND().
 */
void ioread16_rep(void __iomem *addr, void *dst, unsigned long count)
{
	IO_COND(addr, insw(port,dst,count), mmio_insw(addr, dst, count));
}
/*
 * Repeated 32-bit read through an iomap cookie into @dst: insl() for a PIO
 * cookie, mmio_insl() for an MMIO cookie, both given @count. Any other
 * cookie is only reported by IO_COND().
 */
void ioread32_rep(void __iomem *addr, void *dst, unsigned long count)
{
	IO_COND(addr, insl(port,dst,count), mmio_insl(addr, dst, count));
}
EXPORT_SYMBOL(ioread8_rep);
EXPORT_SYMBOL(ioread16_rep);
EXPORT_SYMBOL(ioread32_rep);

/*
 * Repeated 8-bit write of @src through an iomap cookie: outsb() for a PIO
 * cookie, mmio_outsb() for an MMIO cookie, both given @count. Any other
 * cookie is only reported by IO_COND().
 */
void iowrite8_rep(void __iomem *addr, const void *src, unsigned long count)
{
	IO_COND(addr, outsb(port, src, count), mmio_outsb(addr, src, count));
}
/*
 * Repeated 16-bit write of @src through an iomap cookie: outsw() for a PIO
 * cookie, mmio_outsw() for an MMIO cookie, both given @count. Any other
 * cookie is only reported by IO_COND().
 */
void iowrite16_rep(void __iomem *addr, const void *src, unsigned long count)
{
	IO_COND(addr, outsw(port, src, count), mmio_outsw(addr, src, count));
}
/*
 * Repeated 32-bit write of @src through an iomap cookie: outsl() for a PIO
 * cookie, mmio_outsl() for an MMIO cookie, both given @count. Any other
 * cookie is only reported by IO_COND().
 */
void iowrite32_rep(void __iomem *addr, const void *src, unsigned long count)
{
	IO_COND(addr, outsl(port, src,count), mmio_outsl(addr, src, count));
}
EXPORT_SYMBOL(iowrite8_rep);
EXPORT_SYMBOL(iowrite16_rep);
EXPORT_SYMBOL(iowrite32_rep);

#ifdef CONFIG_HAS_IOPORT_MAP
/*
 * Create a virtual mapping cookie for an IO port range.
 *
 * The cookie is simply @port offset by PIO_OFFSET. A @port larger than
 * PIO_MASK cannot be encoded and yields NULL. @nr is ignored.
 */
void __iomem *ioport_map(unsigned long port, unsigned int nr)
{
	unsigned long cookie = port + PIO_OFFSET;

	return port <= PIO_MASK ? (void __iomem *)cookie : NULL;
}

/*
 * Counterpart of ioport_map(). The body is intentionally empty: @addr is
 * not touched and nothing is released.
 */
void ioport_unmap(void __iomem *addr)
{
	/* Nothing to do */
}
EXPORT_SYMBOL(ioport_map);
EXPORT_SYMBOL(ioport_unmap);
#endif /* CONFIG_HAS_IOPORT_MAP */

#ifdef CONFIG_PCI
/* Hide the details if this is a MMIO or PIO address space and just do what
 * you expect in the correct way.
 *
 * An MMIO cookie is handed to iounmap(); a PIO cookie needs no action.
 * Any other cookie is reported by IO_COND() via bad_io_access(). @dev is
 * not used here.
 */
void pci_iounmap(struct pci_dev *dev, void __iomem * addr)
{
	IO_COND(addr, /* nothing */, iounmap(addr));
}
EXPORT_SYMBOL(pci_iounmap);
#endif /* CONFIG_PCI */
back to top