Revision f0a3a24b532d9a7e56a33c5112b2a212ed6ec580 authored by Roman Gushchin on 12 July 2019, 03:56:27 UTC, committed by Linus Torvalds on 12 July 2019, 18:05:44 UTC
Currently each charged slab page holds a reference to the cgroup to which it's charged. Kmem_caches are held by the memcg and are released all together with the memory cgroup. It means that none of kmem_caches are released unless at least one reference to the memcg exists, which is very far from optimal. Let's rework it in a way that allows releasing individual kmem_caches as soon as the cgroup is offline, the kmem_cache is empty and there are no pending allocations. To make it possible, let's introduce a new percpu refcounter for non-root kmem caches. The counter is initialized to the percpu mode, and is switched to the atomic mode during kmem_cache deactivation. The counter is bumped for every charged page and also for every running allocation. So the kmem_cache can't be released unless all allocations complete. To shutdown non-active empty kmem_caches, let's reuse the work queue, previously used for the kmem_cache deactivation. Once the reference counter reaches 0, let's schedule an asynchronous kmem_cache release. * I used the following simple approach to test the performance (stolen from another patchset by T. Harding): time find / -name fname-no-exist echo 2 > /proc/sys/vm/drop_caches repeat 10 times Results: orig patched real 0m1.455s real 0m1.355s user 0m0.206s user 0m0.219s sys 0m0.855s sys 0m0.807s real 0m1.487s real 0m1.699s user 0m0.221s user 0m0.256s sys 0m0.806s sys 0m0.948s real 0m1.515s real 0m1.505s user 0m0.183s user 0m0.215s sys 0m0.876s sys 0m0.858s real 0m1.291s real 0m1.380s user 0m0.193s user 0m0.198s sys 0m0.843s sys 0m0.786s real 0m1.364s real 0m1.374s user 0m0.180s user 0m0.182s sys 0m0.868s sys 0m0.806s real 0m1.352s real 0m1.312s user 0m0.201s user 0m0.212s sys 0m0.820s sys 0m0.761s real 0m1.302s real 0m1.349s user 0m0.205s user 0m0.203s sys 0m0.803s sys 0m0.792s real 0m1.334s real 0m1.301s user 0m0.194s user 0m0.201s sys 0m0.806s sys 0m0.779s real 0m1.426s real 0m1.434s user 0m0.216s user 0m0.181s sys 0m0.824s sys 0m0.864s real 0m1.350s real 0m1.295s user 0m0.200s user 0m0.190s sys 0m0.842s sys 0m0.811s So it looks like the difference is not noticeable in this test. [cai@lca.pw: fix an use-after-free in kmemcg_workfn()] Link: http://lkml.kernel.org/r/1560977573-10715-1-git-send-email-cai@lca.pw Link: http://lkml.kernel.org/r/20190611231813.3148843-9-guro@fb.com Signed-off-by: Roman Gushchin <guro@fb.com> Signed-off-by: Qian Cai <cai@lca.pw> Acked-by: Vladimir Davydov <vdavydov.dev@gmail.com> Cc: Christoph Lameter <cl@linux.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Michal Hocko <mhocko@suse.com> Cc: Shakeel Butt <shakeelb@google.com> Cc: Waiman Long <longman@redhat.com> Cc: David Rientjes <rientjes@google.com> Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com> Cc: Pekka Enberg <penberg@kernel.org> Cc: Andrei Vagin <avagin@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
1 parent 63b02ef
iomap.c
// SPDX-License-Identifier: GPL-2.0
/*
* Implement the default iomap interfaces
*
* (C) Copyright 2004 Linus Torvalds
*/
#include <linux/pci.h>
#include <linux/io.h>
#include <linux/export.h>
/*
* Read/write from/to an (offsettable) iomem cookie. It might be a PIO
* access or a MMIO access, these functions don't care. The info is
* encoded in the hardware mapping set up by the mapping functions
* (or the cookie itself, depending on implementation and hw).
*
* The generic routines don't assume any hardware mappings, and just
* encode the PIO/MMIO as part of the cookie. They coldly assume that
* the MMIO IO mappings are not in the low address range.
*
* Architectures for which this is not true can't use this generic
* implementation and should do their own copy.
*/
#ifndef HAVE_ARCH_PIO_SIZE
/*
* We encode the physical PIO addresses (0-0xffff) into the
* pointer by offsetting them with a constant (0x10000) and
* assuming that all the low addresses are always PIO. That means
* we can do some sanity checks on the low bits, and don't
* need to just take things for granted.
*/
#define PIO_OFFSET 0x10000UL
#define PIO_MASK 0x0ffffUL
#define PIO_RESERVED 0x40000UL
#endif
static void bad_io_access(unsigned long port, const char *access)
{
static int count = 10;
if (count) {
count--;
WARN(1, KERN_ERR "Bad IO access at port %#lx (%s)\n", port, access);
}
}
/*
* Ugly macros are a way of life.
*/
#define IO_COND(addr, is_pio, is_mmio) do { \
unsigned long port = (unsigned long __force)addr; \
if (port >= PIO_RESERVED) { \
is_mmio; \
} else if (port > PIO_OFFSET) { \
port &= PIO_MASK; \
is_pio; \
} else \
bad_io_access(port, #is_pio ); \
} while (0)
#ifndef pio_read16be
#define pio_read16be(port) swab16(inw(port))
#define pio_read32be(port) swab32(inl(port))
#endif
#ifndef mmio_read16be
#define mmio_read16be(addr) swab16(readw(addr))
#define mmio_read32be(addr) swab32(readl(addr))
#define mmio_read64be(addr) swab64(readq(addr))
#endif
unsigned int ioread8(void __iomem *addr)
{
IO_COND(addr, return inb(port), return readb(addr));
return 0xff;
}
unsigned int ioread16(void __iomem *addr)
{
IO_COND(addr, return inw(port), return readw(addr));
return 0xffff;
}
unsigned int ioread16be(void __iomem *addr)
{
IO_COND(addr, return pio_read16be(port), return mmio_read16be(addr));
return 0xffff;
}
unsigned int ioread32(void __iomem *addr)
{
IO_COND(addr, return inl(port), return readl(addr));
return 0xffffffff;
}
unsigned int ioread32be(void __iomem *addr)
{
IO_COND(addr, return pio_read32be(port), return mmio_read32be(addr));
return 0xffffffff;
}
EXPORT_SYMBOL(ioread8);
EXPORT_SYMBOL(ioread16);
EXPORT_SYMBOL(ioread16be);
EXPORT_SYMBOL(ioread32);
EXPORT_SYMBOL(ioread32be);
#ifdef readq
static u64 pio_read64_lo_hi(unsigned long port)
{
u64 lo, hi;
lo = inl(port);
hi = inl(port + sizeof(u32));
return lo | (hi << 32);
}
static u64 pio_read64_hi_lo(unsigned long port)
{
u64 lo, hi;
hi = inl(port + sizeof(u32));
lo = inl(port);
return lo | (hi << 32);
}
static u64 pio_read64be_lo_hi(unsigned long port)
{
u64 lo, hi;
lo = pio_read32be(port + sizeof(u32));
hi = pio_read32be(port);
return lo | (hi << 32);
}
static u64 pio_read64be_hi_lo(unsigned long port)
{
u64 lo, hi;
hi = pio_read32be(port);
lo = pio_read32be(port + sizeof(u32));
return lo | (hi << 32);
}
u64 ioread64_lo_hi(void __iomem *addr)
{
IO_COND(addr, return pio_read64_lo_hi(port), return readq(addr));
return 0xffffffffffffffffULL;
}
u64 ioread64_hi_lo(void __iomem *addr)
{
IO_COND(addr, return pio_read64_hi_lo(port), return readq(addr));
return 0xffffffffffffffffULL;
}
u64 ioread64be_lo_hi(void __iomem *addr)
{
IO_COND(addr, return pio_read64be_lo_hi(port),
return mmio_read64be(addr));
return 0xffffffffffffffffULL;
}
u64 ioread64be_hi_lo(void __iomem *addr)
{
IO_COND(addr, return pio_read64be_hi_lo(port),
return mmio_read64be(addr));
return 0xffffffffffffffffULL;
}
EXPORT_SYMBOL(ioread64_lo_hi);
EXPORT_SYMBOL(ioread64_hi_lo);
EXPORT_SYMBOL(ioread64be_lo_hi);
EXPORT_SYMBOL(ioread64be_hi_lo);
#endif /* readq */
#ifndef pio_write16be
#define pio_write16be(val,port) outw(swab16(val),port)
#define pio_write32be(val,port) outl(swab32(val),port)
#endif
#ifndef mmio_write16be
#define mmio_write16be(val,port) writew(swab16(val),port)
#define mmio_write32be(val,port) writel(swab32(val),port)
#define mmio_write64be(val,port) writeq(swab64(val),port)
#endif
void iowrite8(u8 val, void __iomem *addr)
{
IO_COND(addr, outb(val,port), writeb(val, addr));
}
void iowrite16(u16 val, void __iomem *addr)
{
IO_COND(addr, outw(val,port), writew(val, addr));
}
void iowrite16be(u16 val, void __iomem *addr)
{
IO_COND(addr, pio_write16be(val,port), mmio_write16be(val, addr));
}
void iowrite32(u32 val, void __iomem *addr)
{
IO_COND(addr, outl(val,port), writel(val, addr));
}
void iowrite32be(u32 val, void __iomem *addr)
{
IO_COND(addr, pio_write32be(val,port), mmio_write32be(val, addr));
}
EXPORT_SYMBOL(iowrite8);
EXPORT_SYMBOL(iowrite16);
EXPORT_SYMBOL(iowrite16be);
EXPORT_SYMBOL(iowrite32);
EXPORT_SYMBOL(iowrite32be);
#ifdef writeq
static void pio_write64_lo_hi(u64 val, unsigned long port)
{
outl(val, port);
outl(val >> 32, port + sizeof(u32));
}
static void pio_write64_hi_lo(u64 val, unsigned long port)
{
outl(val >> 32, port + sizeof(u32));
outl(val, port);
}
static void pio_write64be_lo_hi(u64 val, unsigned long port)
{
pio_write32be(val, port + sizeof(u32));
pio_write32be(val >> 32, port);
}
static void pio_write64be_hi_lo(u64 val, unsigned long port)
{
pio_write32be(val >> 32, port);
pio_write32be(val, port + sizeof(u32));
}
void iowrite64_lo_hi(u64 val, void __iomem *addr)
{
IO_COND(addr, pio_write64_lo_hi(val, port),
writeq(val, addr));
}
void iowrite64_hi_lo(u64 val, void __iomem *addr)
{
IO_COND(addr, pio_write64_hi_lo(val, port),
writeq(val, addr));
}
void iowrite64be_lo_hi(u64 val, void __iomem *addr)
{
IO_COND(addr, pio_write64be_lo_hi(val, port),
mmio_write64be(val, addr));
}
void iowrite64be_hi_lo(u64 val, void __iomem *addr)
{
IO_COND(addr, pio_write64be_hi_lo(val, port),
mmio_write64be(val, addr));
}
EXPORT_SYMBOL(iowrite64_lo_hi);
EXPORT_SYMBOL(iowrite64_hi_lo);
EXPORT_SYMBOL(iowrite64be_lo_hi);
EXPORT_SYMBOL(iowrite64be_hi_lo);
#endif /* readq */
/*
* These are the "repeat MMIO read/write" functions.
* Note the "__raw" accesses, since we don't want to
* convert to CPU byte order. We write in "IO byte
* order" (we also don't have IO barriers).
*/
#ifndef mmio_insb
static inline void mmio_insb(void __iomem *addr, u8 *dst, int count)
{
while (--count >= 0) {
u8 data = __raw_readb(addr);
*dst = data;
dst++;
}
}
static inline void mmio_insw(void __iomem *addr, u16 *dst, int count)
{
while (--count >= 0) {
u16 data = __raw_readw(addr);
*dst = data;
dst++;
}
}
static inline void mmio_insl(void __iomem *addr, u32 *dst, int count)
{
while (--count >= 0) {
u32 data = __raw_readl(addr);
*dst = data;
dst++;
}
}
#endif
#ifndef mmio_outsb
static inline void mmio_outsb(void __iomem *addr, const u8 *src, int count)
{
while (--count >= 0) {
__raw_writeb(*src, addr);
src++;
}
}
static inline void mmio_outsw(void __iomem *addr, const u16 *src, int count)
{
while (--count >= 0) {
__raw_writew(*src, addr);
src++;
}
}
static inline void mmio_outsl(void __iomem *addr, const u32 *src, int count)
{
while (--count >= 0) {
__raw_writel(*src, addr);
src++;
}
}
#endif
void ioread8_rep(void __iomem *addr, void *dst, unsigned long count)
{
IO_COND(addr, insb(port,dst,count), mmio_insb(addr, dst, count));
}
void ioread16_rep(void __iomem *addr, void *dst, unsigned long count)
{
IO_COND(addr, insw(port,dst,count), mmio_insw(addr, dst, count));
}
void ioread32_rep(void __iomem *addr, void *dst, unsigned long count)
{
IO_COND(addr, insl(port,dst,count), mmio_insl(addr, dst, count));
}
EXPORT_SYMBOL(ioread8_rep);
EXPORT_SYMBOL(ioread16_rep);
EXPORT_SYMBOL(ioread32_rep);
void iowrite8_rep(void __iomem *addr, const void *src, unsigned long count)
{
IO_COND(addr, outsb(port, src, count), mmio_outsb(addr, src, count));
}
void iowrite16_rep(void __iomem *addr, const void *src, unsigned long count)
{
IO_COND(addr, outsw(port, src, count), mmio_outsw(addr, src, count));
}
void iowrite32_rep(void __iomem *addr, const void *src, unsigned long count)
{
IO_COND(addr, outsl(port, src,count), mmio_outsl(addr, src, count));
}
EXPORT_SYMBOL(iowrite8_rep);
EXPORT_SYMBOL(iowrite16_rep);
EXPORT_SYMBOL(iowrite32_rep);
#ifdef CONFIG_HAS_IOPORT_MAP
/* Create a virtual mapping cookie for an IO port range */
void __iomem *ioport_map(unsigned long port, unsigned int nr)
{
if (port > PIO_MASK)
return NULL;
return (void __iomem *) (unsigned long) (port + PIO_OFFSET);
}
void ioport_unmap(void __iomem *addr)
{
/* Nothing to do */
}
EXPORT_SYMBOL(ioport_map);
EXPORT_SYMBOL(ioport_unmap);
#endif /* CONFIG_HAS_IOPORT_MAP */
#ifdef CONFIG_PCI
/* Hide the details if this is a MMIO or PIO address space and just do what
* you expect in the correct way. */
void pci_iounmap(struct pci_dev *dev, void __iomem * addr)
{
IO_COND(addr, /* nothing */, iounmap(addr));
}
EXPORT_SYMBOL(pci_iounmap);
#endif /* CONFIG_PCI */
Computing file changes ...