Revision 6f6acb00514c10be35529402f36ad7a288f08c2e authored by Michal Hocko on 22 May 2014, 18:54:19 UTC, committed by Linus Torvalds on 23 May 2014, 16:37:29 UTC
Commit 284f39afeaa4 ("mm: memcg: push !mm handling out to page cache
charge function") explicitly checks for page cache charges without any
mm context (from kernel thread context[1]).

This seemed to be the only possible case where memory could be charged
without mm context so commit 03583f1a631c ("memcg: remove unnecessary
!mm check from try_get_mem_cgroup_from_mm()") removed the mm check from
get_mem_cgroup_from_mm().  This however caused another NULL ptr
dereference during early boot when loopback kernel thread splices to
tmpfs as reported by Stephan Kulow:

  BUG: unable to handle kernel NULL pointer dereference at 0000000000000360
  IP: get_mem_cgroup_from_mm.isra.42+0x2b/0x60
  Oops: 0000 [#1] SMP
  Modules linked in: btrfs dm_multipath dm_mod scsi_dh multipath raid10 raid456 async_raid6_recov async_memcpy async_pq raid6_pq async_xor xor async_tx raid1 raid0 md_mod parport_pc parport nls_utf8 isofs usb_storage iscsi_ibft iscsi_boot_sysfs arc4 ecb fan thermal nfs lockd fscache nls_iso8859_1 nls_cp437 sg st hid_generic usbhid af_packet sunrpc sr_mod cdrom ata_generic uhci_hcd virtio_net virtio_blk ehci_hcd usbcore ata_piix floppy processor button usb_common virtio_pci virtio_ring virtio edd squashfs loop ppa]
  CPU: 0 PID: 97 Comm: loop1 Not tainted 3.15.0-rc5-5-default #1
  Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
  Call Trace:
    __mem_cgroup_try_charge_swapin+0x40/0xe0
    mem_cgroup_charge_file+0x8b/0xd0
    shmem_getpage_gfp+0x66b/0x7b0
    shmem_file_splice_read+0x18f/0x430
    splice_direct_to_actor+0xa2/0x1c0
    do_lo_receive+0x5a/0x60 [loop]
    loop_thread+0x298/0x720 [loop]
    kthread+0xc6/0xe0
    ret_from_fork+0x7c/0xb0

Also Branimir Maksimovic reported the following oops which is tiggered
for the swapcache charge path from the accounting code for kernel threads:

  CPU: 1 PID: 160 Comm: kworker/u8:5 Tainted: P           OE 3.15.0-rc5-core2-custom #159
  Hardware name: System manufacturer System Product Name/MAXIMUSV GENE, BIOS 1903 08/19/2013
  task: ffff880404e349b0 ti: ffff88040486a000 task.ti: ffff88040486a000
  RIP: get_mem_cgroup_from_mm.isra.42+0x2b/0x60
  Call Trace:
    __mem_cgroup_try_charge_swapin+0x45/0xf0
    mem_cgroup_charge_file+0x9c/0xe0
    shmem_getpage_gfp+0x62c/0x770
    shmem_write_begin+0x38/0x40
    generic_perform_write+0xc5/0x1c0
    __generic_file_aio_write+0x1d1/0x3f0
    generic_file_aio_write+0x4f/0xc0
    do_sync_write+0x5a/0x90
    do_acct_process+0x4b1/0x550
    acct_process+0x6d/0xa0
    do_exit+0x827/0xa70
    kthread+0xc3/0xf0

This patch fixes the issue by reintroducing mm check into
get_mem_cgroup_from_mm.  We could do the same trick in
__mem_cgroup_try_charge_swapin as we do for the regular page cache path
but it is not worth troubles.  The check is not that expensive and it is
better to have get_mem_cgroup_from_mm more robust.

[1] - http://marc.info/?l=linux-mm&m=139463617808941&w=2

Fixes: 03583f1a631c ("memcg: remove unnecessary !mm check from try_get_mem_cgroup_from_mm()")
Reported-and-tested-by: Stephan Kulow <coolo@suse.com>
Reported-by: Branimir Maksimovic <branimir.maksimovic@gmail.com>
Signed-off-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
1 parent 55231e5
Raw File
pgtable-generic.c
/*
 *  mm/pgtable-generic.c
 *
 *  Generic pgtable methods declared in asm-generic/pgtable.h
 *
 *  Copyright (C) 2010  Linus Torvalds
 */

#include <linux/pagemap.h>
#include <asm/tlb.h>
#include <asm-generic/pgtable.h>

/*
 * If a p?d_bad entry is found while walking page tables, report
 * the error, before resetting entry to p?d_none.  Usually (but
 * very seldom) called out from the p?d_none_or_clear_bad macros.
 */

void pgd_clear_bad(pgd_t *pgd)
{
	pgd_ERROR(*pgd);
	pgd_clear(pgd);
}

void pud_clear_bad(pud_t *pud)
{
	pud_ERROR(*pud);
	pud_clear(pud);
}

void pmd_clear_bad(pmd_t *pmd)
{
	pmd_ERROR(*pmd);
	pmd_clear(pmd);
}

#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
/*
 * Only sets the access flags (dirty, accessed), as well as write 
 * permission. Furthermore, we know it always gets set to a "more
 * permissive" setting, which allows most architectures to optimize
 * this. We return whether the PTE actually changed, which in turn
 * instructs the caller to do things like update__mmu_cache.  This
 * used to be done in the caller, but sparc needs minor faults to
 * force that call on sun4c so we changed this macro slightly
 */
int ptep_set_access_flags(struct vm_area_struct *vma,
			  unsigned long address, pte_t *ptep,
			  pte_t entry, int dirty)
{
	int changed = !pte_same(*ptep, entry);
	if (changed) {
		set_pte_at(vma->vm_mm, address, ptep, entry);
		flush_tlb_fix_spurious_fault(vma, address);
	}
	return changed;
}
#endif

#ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
int pmdp_set_access_flags(struct vm_area_struct *vma,
			  unsigned long address, pmd_t *pmdp,
			  pmd_t entry, int dirty)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	int changed = !pmd_same(*pmdp, entry);
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	if (changed) {
		set_pmd_at(vma->vm_mm, address, pmdp, entry);
		flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
	}
	return changed;
#else /* CONFIG_TRANSPARENT_HUGEPAGE */
	BUG();
	return 0;
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
}
#endif

#ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
int ptep_clear_flush_young(struct vm_area_struct *vma,
			   unsigned long address, pte_t *ptep)
{
	int young;
	young = ptep_test_and_clear_young(vma, address, ptep);
	if (young)
		flush_tlb_page(vma, address);
	return young;
}
#endif

#ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
int pmdp_clear_flush_young(struct vm_area_struct *vma,
			   unsigned long address, pmd_t *pmdp)
{
	int young;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
#else
	BUG();
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
	young = pmdp_test_and_clear_young(vma, address, pmdp);
	if (young)
		flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
	return young;
}
#endif

#ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH
pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address,
		       pte_t *ptep)
{
	struct mm_struct *mm = (vma)->vm_mm;
	pte_t pte;
	pte = ptep_get_and_clear(mm, address, ptep);
	if (pte_accessible(mm, pte))
		flush_tlb_page(vma, address);
	return pte;
}
#endif

#ifndef __HAVE_ARCH_PMDP_CLEAR_FLUSH
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address,
		       pmd_t *pmdp)
{
	pmd_t pmd;
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp);
	flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
	return pmd;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif

#ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
			  pmd_t *pmdp)
{
	pmd_t pmd = pmd_mksplitting(*pmdp);
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	set_pmd_at(vma->vm_mm, address, pmdp, pmd);
	/* tlb flush only to serialize against gup-fast */
	flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif

#ifndef __HAVE_ARCH_PGTABLE_DEPOSIT
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
				pgtable_t pgtable)
{
	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	if (!pmd_huge_pte(mm, pmdp))
		INIT_LIST_HEAD(&pgtable->lru);
	else
		list_add(&pgtable->lru, &pmd_huge_pte(mm, pmdp)->lru);
	pmd_huge_pte(mm, pmdp) = pgtable;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif

#ifndef __HAVE_ARCH_PGTABLE_WITHDRAW
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* no "address" argument so destroys page coloring of some arch */
pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
	pgtable_t pgtable;

	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	pgtable = pmd_huge_pte(mm, pmdp);
	if (list_empty(&pgtable->lru))
		pmd_huge_pte(mm, pmdp) = NULL;
	else {
		pmd_huge_pte(mm, pmdp) = list_entry(pgtable->lru.next,
					      struct page, lru);
		list_del(&pgtable->lru);
	}
	return pgtable;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif

#ifndef __HAVE_ARCH_PMDP_INVALIDATE
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
		     pmd_t *pmdp)
{
	pmd_t entry = *pmdp;
	if (pmd_numa(entry))
		entry = pmd_mknonnuma(entry);
	set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(*pmdp));
	flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif
back to top