s390/mm: add support for 2GB hugepages

This adds support for 2GB hugetlbfs pages on s390.

Reviewed-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
This commit is contained in:
Gerald Schaefer 2016-07-04 14:47:01 +02:00 committed by Martin Schwidefsky
parent 46210c440c
commit d08de8e2d8
8 changed files with 232 additions and 63 deletions

View File

@ -41,7 +41,10 @@ static inline int prepare_hugepage_range(struct file *file,
static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
pte_t *ptep)
{
pte_val(*ptep) = _SEGMENT_ENTRY_EMPTY;
if ((pte_val(*ptep) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3)
pte_val(*ptep) = _REGION3_ENTRY_EMPTY;
else
pte_val(*ptep) = _SEGMENT_ENTRY_EMPTY;
}
static inline void huge_ptep_clear_flush(struct vm_area_struct *vma,

View File

@ -21,6 +21,7 @@
#define HPAGE_SIZE (1UL << HPAGE_SHIFT)
#define HPAGE_MASK (~(HPAGE_SIZE - 1))
#define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT)
#define HUGE_MAX_HSTATE 2
#define ARCH_HAS_SETCLEAR_HUGE_PTE
#define ARCH_HAS_HUGE_PTE_TYPE

View File

@ -306,6 +306,9 @@ static inline int is_module_addr(void *addr)
#define _REGION3_ENTRY_SOFT_DIRTY 0x0000 /* SW region soft dirty bit */
#endif
#define _REGION_ENTRY_BITS 0xfffffffffffff227UL
#define _REGION_ENTRY_BITS_LARGE 0xffffffff8000fe27UL
/* Bits in the segment table entry */
#define _SEGMENT_ENTRY_BITS 0xfffffffffffffe33UL
#define _SEGMENT_ENTRY_BITS_LARGE 0xfffffffffff0ff33UL
@ -573,7 +576,7 @@ static inline int pud_none(pud_t pud)
{
if ((pud_val(pud) & _REGION_ENTRY_TYPE_MASK) < _REGION_ENTRY_TYPE_R3)
return 0;
return (pud_val(pud) & _REGION_ENTRY_INVALID) != 0UL;
return pud_val(pud) == _REGION3_ENTRY_EMPTY;
}
static inline int pud_large(pud_t pud)
@ -593,17 +596,25 @@ static inline unsigned long pud_pfn(pud_t pud)
return (pud_val(pud) & origin_mask) >> PAGE_SHIFT;
}
static inline int pmd_large(pmd_t pmd)
{
return (pmd_val(pmd) & _SEGMENT_ENTRY_LARGE) != 0;
}
static inline int pmd_bad(pmd_t pmd)
{
if (pmd_large(pmd))
return (pmd_val(pmd) & ~_SEGMENT_ENTRY_BITS_LARGE) != 0;
return (pmd_val(pmd) & ~_SEGMENT_ENTRY_BITS) != 0;
}
static inline int pud_bad(pud_t pud)
{
/*
* With dynamic page table levels the pud can be a region table
* entry or a segment table entry. Check for the bit that are
* invalid for either table entry.
*/
unsigned long mask =
~_SEGMENT_ENTRY_ORIGIN & ~_REGION_ENTRY_INVALID &
~_REGION_ENTRY_TYPE_MASK & ~_REGION_ENTRY_LENGTH;
return (pud_val(pud) & mask) != 0;
if ((pud_val(pud) & _REGION_ENTRY_TYPE_MASK) < _REGION_ENTRY_TYPE_R3)
return pmd_bad(__pmd(pud_val(pud)));
if (pud_large(pud))
return (pud_val(pud) & ~_REGION_ENTRY_BITS_LARGE) != 0;
return (pud_val(pud) & ~_REGION_ENTRY_BITS) != 0;
}
static inline int pmd_present(pmd_t pmd)
@ -616,11 +627,6 @@ static inline int pmd_none(pmd_t pmd)
return pmd_val(pmd) == _SEGMENT_ENTRY_INVALID;
}
static inline int pmd_large(pmd_t pmd)
{
return (pmd_val(pmd) & _SEGMENT_ENTRY_LARGE) != 0;
}
static inline unsigned long pmd_pfn(pmd_t pmd)
{
unsigned long origin_mask;
@ -631,13 +637,6 @@ static inline unsigned long pmd_pfn(pmd_t pmd)
return (pmd_val(pmd) & origin_mask) >> PAGE_SHIFT;
}
static inline int pmd_bad(pmd_t pmd)
{
if (pmd_large(pmd))
return (pmd_val(pmd) & ~_SEGMENT_ENTRY_BITS_LARGE) != 0;
return (pmd_val(pmd) & ~_SEGMENT_ENTRY_BITS) != 0;
}
#define __HAVE_ARCH_PMD_WRITE
static inline int pmd_write(pmd_t pmd)
{
@ -1081,6 +1080,7 @@ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address)
#define pte_page(x) pfn_to_page(pte_pfn(x))
#define pmd_page(pmd) pfn_to_page(pmd_pfn(pmd))
#define pud_page(pud) pfn_to_page(pud_pfn(pud))
/* Find an entry in the lowest level page table.. */
#define pte_offset(pmd, addr) ((pte_t *) pmd_deref(*(pmd)) + pte_index(addr))
@ -1238,6 +1238,19 @@ static inline void __pmdp_idte(unsigned long address, pmd_t *pmdp)
: "cc" );
}
static inline void __pudp_idte(unsigned long address, pud_t *pudp)
{
unsigned long r3o;
r3o = (unsigned long) pudp - pud_index(address) * sizeof(pud_t);
r3o |= _ASCE_TYPE_REGION3;
asm volatile(
" .insn rrf,0xb98e0000,%2,%3,0,0"
: "=m" (*pudp)
: "m" (*pudp), "a" (r3o), "a" ((address & PUD_MASK))
: "cc");
}
static inline void __pmdp_idte_local(unsigned long address, pmd_t *pmdp)
{
unsigned long sto;
@ -1250,8 +1263,22 @@ static inline void __pmdp_idte_local(unsigned long address, pmd_t *pmdp)
: "cc" );
}
static inline void __pudp_idte_local(unsigned long address, pud_t *pudp)
{
unsigned long r3o;
r3o = (unsigned long) pudp - pud_index(address) * sizeof(pud_t);
r3o |= _ASCE_TYPE_REGION3;
asm volatile(
" .insn rrf,0xb98e0000,%2,%3,0,1"
: "=m" (*pudp)
: "m" (*pudp), "a" (r3o), "a" ((address & PUD_MASK))
: "cc");
}
pmd_t pmdp_xchg_direct(struct mm_struct *, unsigned long, pmd_t *, pmd_t);
pmd_t pmdp_xchg_lazy(struct mm_struct *, unsigned long, pmd_t *, pmd_t);
pud_t pudp_xchg_direct(struct mm_struct *, unsigned long, pud_t *, pud_t);
#ifdef CONFIG_TRANSPARENT_HUGEPAGE

View File

@ -430,6 +430,9 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
VM_BUG_ON(pgd_none(*pgd));
pud = pud_offset(pgd, vmaddr);
VM_BUG_ON(pud_none(*pud));
/* large puds cannot yet be handled */
if (pud_large(*pud))
return -EFAULT;
pmd = pmd_offset(pud, vmaddr);
VM_BUG_ON(pmd_none(*pmd));
/* large pmds cannot yet be handled */

View File

@ -128,6 +128,44 @@ static inline int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr,
return 1;
}
static int gup_huge_pud(pud_t *pudp, pud_t pud, unsigned long addr,
unsigned long end, int write, struct page **pages, int *nr)
{
struct page *head, *page;
unsigned long mask;
int refs;
mask = (write ? _REGION_ENTRY_PROTECT : 0) | _REGION_ENTRY_INVALID;
if ((pud_val(pud) & mask) != 0)
return 0;
VM_BUG_ON(!pfn_valid(pud_pfn(pud)));
refs = 0;
head = pud_page(pud);
page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
do {
VM_BUG_ON_PAGE(compound_head(page) != head, page);
pages[*nr] = page;
(*nr)++;
page++;
refs++;
} while (addr += PAGE_SIZE, addr != end);
if (!page_cache_add_speculative(head, refs)) {
*nr -= refs;
return 0;
}
if (unlikely(pud_val(pud) != pud_val(*pudp))) {
*nr -= refs;
while (refs--)
put_page(head);
return 0;
}
return 1;
}
static inline int gup_pud_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr,
unsigned long end, int write, struct page **pages, int *nr)
{
@ -144,7 +182,12 @@ static inline int gup_pud_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr,
next = pud_addr_end(addr, end);
if (pud_none(pud))
return 0;
if (!gup_pmd_range(pudp, pud, addr, next, write, pages, nr))
if (unlikely(pud_large(pud))) {
if (!gup_huge_pud(pudp, pud, addr, next, write, pages,
nr))
return 0;
} else if (!gup_pmd_range(pudp, pud, addr, next, write, pages,
nr))
return 0;
} while (pudp++, addr = next, addr != end);

View File

@ -1,19 +1,22 @@
/*
* IBM System z Huge TLB Page Support for Kernel.
*
* Copyright IBM Corp. 2007
* Copyright IBM Corp. 2007,2016
* Author(s): Gerald Schaefer <gerald.schaefer@de.ibm.com>
*/
#define KMSG_COMPONENT "hugetlb"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
#include <linux/mm.h>
#include <linux/hugetlb.h>
static inline pmd_t __pte_to_pmd(pte_t pte)
static inline unsigned long __pte_to_rste(pte_t pte)
{
pmd_t pmd;
unsigned long rste;
/*
* Convert encoding pte bits pmd bits
* Convert encoding pte bits pmd / pud bits
* lIR.uswrdy.p dy..R...I...wr
* empty 010.000000.0 -> 00..0...1...00
* prot-none, clean, old 111.000000.1 -> 00..1...1...00
@ -33,25 +36,31 @@ static inline pmd_t __pte_to_pmd(pte_t pte)
* u unused, l large
*/
if (pte_present(pte)) {
pmd_val(pmd) = pte_val(pte) & PAGE_MASK;
pmd_val(pmd) |= (pte_val(pte) & _PAGE_READ) >> 4;
pmd_val(pmd) |= (pte_val(pte) & _PAGE_WRITE) >> 4;
pmd_val(pmd) |= (pte_val(pte) & _PAGE_INVALID) >> 5;
pmd_val(pmd) |= (pte_val(pte) & _PAGE_PROTECT);
pmd_val(pmd) |= (pte_val(pte) & _PAGE_DIRTY) << 10;
pmd_val(pmd) |= (pte_val(pte) & _PAGE_YOUNG) << 10;
pmd_val(pmd) |= (pte_val(pte) & _PAGE_SOFT_DIRTY) << 13;
rste = pte_val(pte) & PAGE_MASK;
rste |= (pte_val(pte) & _PAGE_READ) >> 4;
rste |= (pte_val(pte) & _PAGE_WRITE) >> 4;
rste |= (pte_val(pte) & _PAGE_INVALID) >> 5;
rste |= (pte_val(pte) & _PAGE_PROTECT);
rste |= (pte_val(pte) & _PAGE_DIRTY) << 10;
rste |= (pte_val(pte) & _PAGE_YOUNG) << 10;
rste |= (pte_val(pte) & _PAGE_SOFT_DIRTY) << 13;
} else
pmd_val(pmd) = _SEGMENT_ENTRY_INVALID;
return pmd;
rste = _SEGMENT_ENTRY_INVALID;
return rste;
}
static inline pte_t __pmd_to_pte(pmd_t pmd)
static inline pte_t __rste_to_pte(unsigned long rste)
{
int present;
pte_t pte;
if ((rste & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3)
present = pud_present(__pud(rste));
else
present = pmd_present(__pmd(rste));
/*
* Convert encoding pmd bits pte bits
* Convert encoding pmd / pud bits pte bits
* dy..R...I...wr lIR.uswrdy.p
* empty 00..0...1...00 -> 010.000000.0
* prot-none, clean, old 00..1...1...00 -> 111.000000.1
@ -70,16 +79,16 @@ static inline pte_t __pmd_to_pte(pmd_t pmd)
* SW-bits: p present, y young, d dirty, r read, w write, s special,
* u unused, l large
*/
if (pmd_present(pmd)) {
pte_val(pte) = pmd_val(pmd) & _SEGMENT_ENTRY_ORIGIN_LARGE;
if (present) {
pte_val(pte) = rste & _SEGMENT_ENTRY_ORIGIN_LARGE;
pte_val(pte) |= _PAGE_LARGE | _PAGE_PRESENT;
pte_val(pte) |= (pmd_val(pmd) & _SEGMENT_ENTRY_READ) << 4;
pte_val(pte) |= (pmd_val(pmd) & _SEGMENT_ENTRY_WRITE) << 4;
pte_val(pte) |= (pmd_val(pmd) & _SEGMENT_ENTRY_INVALID) << 5;
pte_val(pte) |= (pmd_val(pmd) & _SEGMENT_ENTRY_PROTECT);
pte_val(pte) |= (pmd_val(pmd) & _SEGMENT_ENTRY_DIRTY) >> 10;
pte_val(pte) |= (pmd_val(pmd) & _SEGMENT_ENTRY_YOUNG) >> 10;
pte_val(pte) |= (pmd_val(pmd) & _SEGMENT_ENTRY_SOFT_DIRTY) >> 13;
pte_val(pte) |= (rste & _SEGMENT_ENTRY_READ) << 4;
pte_val(pte) |= (rste & _SEGMENT_ENTRY_WRITE) << 4;
pte_val(pte) |= (rste & _SEGMENT_ENTRY_INVALID) << 5;
pte_val(pte) |= (rste & _SEGMENT_ENTRY_PROTECT);
pte_val(pte) |= (rste & _SEGMENT_ENTRY_DIRTY) >> 10;
pte_val(pte) |= (rste & _SEGMENT_ENTRY_YOUNG) >> 10;
pte_val(pte) |= (rste & _SEGMENT_ENTRY_SOFT_DIRTY) >> 13;
} else
pte_val(pte) = _PAGE_INVALID;
return pte;
@ -88,27 +97,33 @@ static inline pte_t __pmd_to_pte(pmd_t pmd)
void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pte)
{
pmd_t pmd = __pte_to_pmd(pte);
unsigned long rste = __pte_to_rste(pte);
pmd_val(pmd) |= _SEGMENT_ENTRY_LARGE;
*(pmd_t *) ptep = pmd;
/* Set correct table type for 2G hugepages */
if ((pte_val(*ptep) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3)
rste |= _REGION_ENTRY_TYPE_R3 | _REGION3_ENTRY_LARGE;
else
rste |= _SEGMENT_ENTRY_LARGE;
pte_val(*ptep) = rste;
}
pte_t huge_ptep_get(pte_t *ptep)
{
pmd_t pmd = *(pmd_t *) ptep;
return __pmd_to_pte(pmd);
return __rste_to_pte(pte_val(*ptep));
}
pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
unsigned long addr, pte_t *ptep)
{
pte_t pte = huge_ptep_get(ptep);
pmd_t *pmdp = (pmd_t *) ptep;
pmd_t old;
pud_t *pudp = (pud_t *) ptep;
old = pmdp_xchg_direct(mm, addr, pmdp, __pmd(_SEGMENT_ENTRY_EMPTY));
return __pmd_to_pte(old);
if ((pte_val(*ptep) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3)
pudp_xchg_direct(mm, addr, pudp, __pud(_REGION3_ENTRY_EMPTY));
else
pmdp_xchg_direct(mm, addr, pmdp, __pmd(_SEGMENT_ENTRY_EMPTY));
return pte;
}
pte_t *huge_pte_alloc(struct mm_struct *mm,
@ -120,8 +135,12 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
pgdp = pgd_offset(mm, addr);
pudp = pud_alloc(mm, pgdp, addr);
if (pudp)
pmdp = pmd_alloc(mm, pudp, addr);
if (pudp) {
if (sz == PUD_SIZE)
return (pte_t *) pudp;
else if (sz == PMD_SIZE)
pmdp = pmd_alloc(mm, pudp, addr);
}
return (pte_t *) pmdp;
}
@ -134,8 +153,11 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
pgdp = pgd_offset(mm, addr);
if (pgd_present(*pgdp)) {
pudp = pud_offset(pgdp, addr);
if (pud_present(*pudp))
if (pud_present(*pudp)) {
if (pud_large(*pudp))
return (pte_t *) pudp;
pmdp = pmd_offset(pudp, addr);
}
}
return (pte_t *) pmdp;
}
@ -147,5 +169,34 @@ int pmd_huge(pmd_t pmd)
int pud_huge(pud_t pud)
{
return 0;
return pud_large(pud);
}
struct page *
follow_huge_pud(struct mm_struct *mm, unsigned long address,
pud_t *pud, int flags)
{
if (flags & FOLL_GET)
return NULL;
return pud_page(*pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
}
static __init int setup_hugepagesz(char *opt)
{
unsigned long size;
char *string = opt;
size = memparse(opt, &opt);
if (MACHINE_HAS_EDAT1 && size == PMD_SIZE) {
hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT);
} else if (MACHINE_HAS_EDAT2 && size == PUD_SIZE) {
hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
} else {
pr_err("hugepagesz= specifies an unsupported page size %s\n",
string);
return 0;
}
return 1;
}
__setup("hugepagesz=", setup_hugepagesz);

View File

@ -352,6 +352,45 @@ pmd_t pmdp_xchg_lazy(struct mm_struct *mm, unsigned long addr,
}
EXPORT_SYMBOL(pmdp_xchg_lazy);
static inline pud_t pudp_flush_direct(struct mm_struct *mm,
unsigned long addr, pud_t *pudp)
{
pud_t old;
old = *pudp;
if (pud_val(old) & _REGION_ENTRY_INVALID)
return old;
if (!MACHINE_HAS_IDTE) {
/*
* Invalid bit position is the same for pmd and pud, so we can
* re-use _pmd_csp() here
*/
__pmdp_csp((pmd_t *) pudp);
return old;
}
atomic_inc(&mm->context.flush_count);
if (MACHINE_HAS_TLB_LC &&
cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
__pudp_idte_local(addr, pudp);
else
__pudp_idte(addr, pudp);
atomic_dec(&mm->context.flush_count);
return old;
}
pud_t pudp_xchg_direct(struct mm_struct *mm, unsigned long addr,
pud_t *pudp, pud_t new)
{
pud_t old;
preempt_disable();
old = pudp_flush_direct(mm, addr, pudp);
*pudp = new;
preempt_enable();
return old;
}
EXPORT_SYMBOL(pudp_xchg_direct);
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
pgtable_t pgtable)

View File

@ -1022,7 +1022,9 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
((node = hstate_next_node_to_free(hs, mask)) || 1); \
nr_nodes--)
#if defined(CONFIG_X86_64) && ((defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA))
#if (defined(CONFIG_X86_64) || defined(CONFIG_S390)) && \
((defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || \
defined(CONFIG_CMA))
static void destroy_compound_gigantic_page(struct page *page,
unsigned int order)
{