kernel_optimize_test/mm/debug.c
Kirill A. Shutemov 1d798ca3f1 mm: make compound_head() robust
Hugh has pointed that compound_head() call can be unsafe in some
context. There's one example:

	CPU0					CPU1

isolate_migratepages_block()
  page_count()
    compound_head()
      !!PageTail() == true
					put_page()
					  tail->first_page = NULL
      head = tail->first_page
					alloc_pages(__GFP_COMP)
					   prep_compound_page()
					     tail->first_page = head
					     __SetPageTail(p);
      !!PageTail() == true
    <head == NULL dereferencing>

The race is pure theoretical. I don't it's possible to trigger it in
practice. But who knows.

We can fix the race by changing how encode PageTail() and compound_head()
within struct page to be able to update them in one shot.

The patch introduces page->compound_head into third double word block in
front of compound_dtor and compound_order. Bit 0 encodes PageTail() and
the rest bits are pointer to head page if bit zero is set.

The patch moves page->pmd_huge_pte out of word, just in case if an
architecture defines pgtable_t into something what can have the bit 0
set.

hugetlb_cgroup uses page->lru.next in the second tail page to store
pointer struct hugetlb_cgroup. The patch switch it to use page->private
in the second tail page instead. The space is free since ->first_page is
removed from the union.

The patch also opens possibility to remove HUGETLB_CGROUP_MIN_ORDER
limitation, since there's now space in first tail page to store struct
hugetlb_cgroup pointer. But that's out of scope of the patch.

That means page->compound_head shares storage space with:

 - page->lru.next;
 - page->next;
 - page->rcu_head.next;

That's too long list to be absolutely sure, but looks like nobody uses
bit 0 of the word.

page->rcu_head.next guaranteed[1] to have bit 0 clean as long as we use
call_rcu(), call_rcu_bh(), call_rcu_sched(), or call_srcu(). But future
call_rcu_lazy() is not allowed as it makes use of the bit and we can
get false positive PageTail().

[1] http://lkml.kernel.org/g/20150827163634.GD4029@linux.vnet.ibm.com

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Reviewed-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-11-06 17:50:42 -08:00

241 lines
6.6 KiB
C

/*
* mm/debug.c
*
* mm/ specific debug routines.
*
*/
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/trace_events.h>
#include <linux/memcontrol.h>
static const struct trace_print_flags pageflag_names[] = {
{1UL << PG_locked, "locked" },
{1UL << PG_error, "error" },
{1UL << PG_referenced, "referenced" },
{1UL << PG_uptodate, "uptodate" },
{1UL << PG_dirty, "dirty" },
{1UL << PG_lru, "lru" },
{1UL << PG_active, "active" },
{1UL << PG_slab, "slab" },
{1UL << PG_owner_priv_1, "owner_priv_1" },
{1UL << PG_arch_1, "arch_1" },
{1UL << PG_reserved, "reserved" },
{1UL << PG_private, "private" },
{1UL << PG_private_2, "private_2" },
{1UL << PG_writeback, "writeback" },
{1UL << PG_head, "head" },
{1UL << PG_swapcache, "swapcache" },
{1UL << PG_mappedtodisk, "mappedtodisk" },
{1UL << PG_reclaim, "reclaim" },
{1UL << PG_swapbacked, "swapbacked" },
{1UL << PG_unevictable, "unevictable" },
#ifdef CONFIG_MMU
{1UL << PG_mlocked, "mlocked" },
#endif
#ifdef CONFIG_ARCH_USES_PG_UNCACHED
{1UL << PG_uncached, "uncached" },
#endif
#ifdef CONFIG_MEMORY_FAILURE
{1UL << PG_hwpoison, "hwpoison" },
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
{1UL << PG_compound_lock, "compound_lock" },
#endif
#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
{1UL << PG_young, "young" },
{1UL << PG_idle, "idle" },
#endif
};
static void dump_flags(unsigned long flags,
const struct trace_print_flags *names, int count)
{
const char *delim = "";
unsigned long mask;
int i;
pr_emerg("flags: %#lx(", flags);
/* remove zone id */
flags &= (1UL << NR_PAGEFLAGS) - 1;
for (i = 0; i < count && flags; i++) {
mask = names[i].mask;
if ((flags & mask) != mask)
continue;
flags &= ~mask;
pr_cont("%s%s", delim, names[i].name);
delim = "|";
}
/* check for left over flags */
if (flags)
pr_cont("%s%#lx", delim, flags);
pr_cont(")\n");
}
void dump_page_badflags(struct page *page, const char *reason,
unsigned long badflags)
{
pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
page, atomic_read(&page->_count), page_mapcount(page),
page->mapping, page->index);
BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS);
dump_flags(page->flags, pageflag_names, ARRAY_SIZE(pageflag_names));
if (reason)
pr_alert("page dumped because: %s\n", reason);
if (page->flags & badflags) {
pr_alert("bad because of flags:\n");
dump_flags(page->flags & badflags,
pageflag_names, ARRAY_SIZE(pageflag_names));
}
#ifdef CONFIG_MEMCG
if (page->mem_cgroup)
pr_alert("page->mem_cgroup:%p\n", page->mem_cgroup);
#endif
}
void dump_page(struct page *page, const char *reason)
{
dump_page_badflags(page, reason, 0);
}
EXPORT_SYMBOL(dump_page);
#ifdef CONFIG_DEBUG_VM
static const struct trace_print_flags vmaflags_names[] = {
{VM_READ, "read" },
{VM_WRITE, "write" },
{VM_EXEC, "exec" },
{VM_SHARED, "shared" },
{VM_MAYREAD, "mayread" },
{VM_MAYWRITE, "maywrite" },
{VM_MAYEXEC, "mayexec" },
{VM_MAYSHARE, "mayshare" },
{VM_GROWSDOWN, "growsdown" },
{VM_PFNMAP, "pfnmap" },
{VM_DENYWRITE, "denywrite" },
{VM_LOCKONFAULT, "lockonfault" },
{VM_LOCKED, "locked" },
{VM_IO, "io" },
{VM_SEQ_READ, "seqread" },
{VM_RAND_READ, "randread" },
{VM_DONTCOPY, "dontcopy" },
{VM_DONTEXPAND, "dontexpand" },
{VM_ACCOUNT, "account" },
{VM_NORESERVE, "noreserve" },
{VM_HUGETLB, "hugetlb" },
#if defined(CONFIG_X86)
{VM_PAT, "pat" },
#elif defined(CONFIG_PPC)
{VM_SAO, "sao" },
#elif defined(CONFIG_PARISC) || defined(CONFIG_METAG) || defined(CONFIG_IA64)
{VM_GROWSUP, "growsup" },
#elif !defined(CONFIG_MMU)
{VM_MAPPED_COPY, "mappedcopy" },
#else
{VM_ARCH_1, "arch_1" },
#endif
{VM_DONTDUMP, "dontdump" },
#ifdef CONFIG_MEM_SOFT_DIRTY
{VM_SOFTDIRTY, "softdirty" },
#endif
{VM_MIXEDMAP, "mixedmap" },
{VM_HUGEPAGE, "hugepage" },
{VM_NOHUGEPAGE, "nohugepage" },
{VM_MERGEABLE, "mergeable" },
};
void dump_vma(const struct vm_area_struct *vma)
{
pr_emerg("vma %p start %p end %p\n"
"next %p prev %p mm %p\n"
"prot %lx anon_vma %p vm_ops %p\n"
"pgoff %lx file %p private_data %p\n",
vma, (void *)vma->vm_start, (void *)vma->vm_end, vma->vm_next,
vma->vm_prev, vma->vm_mm,
(unsigned long)pgprot_val(vma->vm_page_prot),
vma->anon_vma, vma->vm_ops, vma->vm_pgoff,
vma->vm_file, vma->vm_private_data);
dump_flags(vma->vm_flags, vmaflags_names, ARRAY_SIZE(vmaflags_names));
}
EXPORT_SYMBOL(dump_vma);
void dump_mm(const struct mm_struct *mm)
{
pr_emerg("mm %p mmap %p seqnum %d task_size %lu\n"
#ifdef CONFIG_MMU
"get_unmapped_area %p\n"
#endif
"mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n"
"pgd %p mm_users %d mm_count %d nr_ptes %lu nr_pmds %lu map_count %d\n"
"hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n"
"pinned_vm %lx shared_vm %lx exec_vm %lx stack_vm %lx\n"
"start_code %lx end_code %lx start_data %lx end_data %lx\n"
"start_brk %lx brk %lx start_stack %lx\n"
"arg_start %lx arg_end %lx env_start %lx env_end %lx\n"
"binfmt %p flags %lx core_state %p\n"
#ifdef CONFIG_AIO
"ioctx_table %p\n"
#endif
#ifdef CONFIG_MEMCG
"owner %p "
#endif
"exe_file %p\n"
#ifdef CONFIG_MMU_NOTIFIER
"mmu_notifier_mm %p\n"
#endif
#ifdef CONFIG_NUMA_BALANCING
"numa_next_scan %lu numa_scan_offset %lu numa_scan_seq %d\n"
#endif
#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)
"tlb_flush_pending %d\n"
#endif
"%s", /* This is here to hold the comma */
mm, mm->mmap, mm->vmacache_seqnum, mm->task_size,
#ifdef CONFIG_MMU
mm->get_unmapped_area,
#endif
mm->mmap_base, mm->mmap_legacy_base, mm->highest_vm_end,
mm->pgd, atomic_read(&mm->mm_users),
atomic_read(&mm->mm_count),
atomic_long_read((atomic_long_t *)&mm->nr_ptes),
mm_nr_pmds((struct mm_struct *)mm),
mm->map_count,
mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm,
mm->pinned_vm, mm->shared_vm, mm->exec_vm, mm->stack_vm,
mm->start_code, mm->end_code, mm->start_data, mm->end_data,
mm->start_brk, mm->brk, mm->start_stack,
mm->arg_start, mm->arg_end, mm->env_start, mm->env_end,
mm->binfmt, mm->flags, mm->core_state,
#ifdef CONFIG_AIO
mm->ioctx_table,
#endif
#ifdef CONFIG_MEMCG
mm->owner,
#endif
mm->exe_file,
#ifdef CONFIG_MMU_NOTIFIER
mm->mmu_notifier_mm,
#endif
#ifdef CONFIG_NUMA_BALANCING
mm->numa_next_scan, mm->numa_scan_offset, mm->numa_scan_seq,
#endif
#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)
mm->tlb_flush_pending,
#endif
"" /* This is here to not have a comma! */
);
dump_flags(mm->def_flags, vmaflags_names,
ARRAY_SIZE(vmaflags_names));
}
#endif /* CONFIG_DEBUG_VM */