kernel_optimize_test/mm/msync.c
Hugh Dickins 0b14c179a4 [PATCH] unpaged: VM_UNPAGED
Although we tend to associate VM_RESERVED with remap_pfn_range, quite a few
drivers set VM_RESERVED on areas which are then populated by nopage.  The
PageReserved removal in 2.6.15-rc1 changed VM_RESERVED not to free pages in
zap_pte_range, without changing those drivers not to set it: so their pages
just leak away.

Let's not change miscellaneous drivers now: introduce VM_UNPAGED at the core,
to flag the special areas where the ptes may have no struct page, or if they
have then it's not to be touched.  Replace most instances of VM_RESERVED in
core mm by VM_UNPAGED.  Force it on in remap_pfn_range, and the sparc and
sparc64 io_remap_pfn_range.

Revert addition of VM_RESERVED to powerpc vdso, it's not needed there.  Is it
needed anywhere?  It still governs the mm->reserved_vm statistic, and special
vmas not to be merged, and areas not to be core dumped; but could probably be
eliminated later (the drivers are probably specifying it because in 2.4 it
kept swapout off the vma, but in 2.6 we work from the LRU, which these pages
don't get on).

Use the VM_SHM slot for VM_UNPAGED, and define VM_SHM to 0: it serves no
purpose whatsoever, and should be removed from drivers when we clean up.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Acked-by: William Irwin <wli@holomorphy.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-11-22 09:13:42 -08:00

223 lines
5.2 KiB
C

/*
* linux/mm/msync.c
*
* Copyright (C) 1994-1999 Linus Torvalds
*/
/*
* The msync() system call.
*/
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/hugetlb.h>
#include <linux/syscalls.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
static void msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end)
{
pte_t *pte;
spinlock_t *ptl;
int progress = 0;
again:
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
do {
unsigned long pfn;
struct page *page;
if (progress >= 64) {
progress = 0;
if (need_resched() || need_lockbreak(ptl))
break;
}
progress++;
if (!pte_present(*pte))
continue;
if (!pte_maybe_dirty(*pte))
continue;
pfn = pte_pfn(*pte);
if (unlikely(!pfn_valid(pfn))) {
print_bad_pte(vma, *pte, addr);
continue;
}
page = pfn_to_page(pfn);
if (ptep_clear_flush_dirty(vma, addr, pte) ||
page_test_and_clear_dirty(page))
set_page_dirty(page);
progress += 3;
} while (pte++, addr += PAGE_SIZE, addr != end);
pte_unmap_unlock(pte - 1, ptl);
cond_resched();
if (addr != end)
goto again;
}
static inline void msync_pmd_range(struct vm_area_struct *vma, pud_t *pud,
unsigned long addr, unsigned long end)
{
pmd_t *pmd;
unsigned long next;
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
if (pmd_none_or_clear_bad(pmd))
continue;
msync_pte_range(vma, pmd, addr, next);
} while (pmd++, addr = next, addr != end);
}
static inline void msync_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
unsigned long addr, unsigned long end)
{
pud_t *pud;
unsigned long next;
pud = pud_offset(pgd, addr);
do {
next = pud_addr_end(addr, end);
if (pud_none_or_clear_bad(pud))
continue;
msync_pmd_range(vma, pud, addr, next);
} while (pud++, addr = next, addr != end);
}
static void msync_page_range(struct vm_area_struct *vma,
unsigned long addr, unsigned long end)
{
pgd_t *pgd;
unsigned long next;
/* For hugepages we can't go walking the page table normally,
* but that's ok, hugetlbfs is memory based, so we don't need
* to do anything more on an msync().
* Can't do anything with VM_UNPAGED regions either.
*/
if (vma->vm_flags & (VM_HUGETLB|VM_UNPAGED))
return;
BUG_ON(addr >= end);
pgd = pgd_offset(vma->vm_mm, addr);
flush_cache_range(vma, addr, end);
do {
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
continue;
msync_pud_range(vma, pgd, addr, next);
} while (pgd++, addr = next, addr != end);
}
/*
* MS_SYNC syncs the entire file - including mappings.
*
* MS_ASYNC does not start I/O (it used to, up to 2.5.67). Instead, it just
* marks the relevant pages dirty. The application may now run fsync() to
* write out the dirty pages and wait on the writeout and check the result.
* Or the application may run fadvise(FADV_DONTNEED) against the fd to start
* async writeout immediately.
* So my _not_ starting I/O in MS_ASYNC we provide complete flexibility to
* applications.
*/
static int msync_interval(struct vm_area_struct *vma,
unsigned long addr, unsigned long end, int flags)
{
int ret = 0;
struct file *file = vma->vm_file;
if ((flags & MS_INVALIDATE) && (vma->vm_flags & VM_LOCKED))
return -EBUSY;
if (file && (vma->vm_flags & VM_SHARED)) {
msync_page_range(vma, addr, end);
if (flags & MS_SYNC) {
struct address_space *mapping = file->f_mapping;
int err;
ret = filemap_fdatawrite(mapping);
if (file->f_op && file->f_op->fsync) {
/*
* We don't take i_sem here because mmap_sem
* is already held.
*/
err = file->f_op->fsync(file,file->f_dentry,1);
if (err && !ret)
ret = err;
}
err = filemap_fdatawait(mapping);
if (!ret)
ret = err;
}
}
return ret;
}
asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
{
unsigned long end;
struct vm_area_struct *vma;
int unmapped_error, error = -EINVAL;
if (flags & MS_SYNC)
current->flags |= PF_SYNCWRITE;
down_read(&current->mm->mmap_sem);
if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
goto out;
if (start & ~PAGE_MASK)
goto out;
if ((flags & MS_ASYNC) && (flags & MS_SYNC))
goto out;
error = -ENOMEM;
len = (len + ~PAGE_MASK) & PAGE_MASK;
end = start + len;
if (end < start)
goto out;
error = 0;
if (end == start)
goto out;
/*
* If the interval [start,end) covers some unmapped address ranges,
* just ignore them, but return -ENOMEM at the end.
*/
vma = find_vma(current->mm, start);
unmapped_error = 0;
for (;;) {
/* Still start < end. */
error = -ENOMEM;
if (!vma)
goto out;
/* Here start < vma->vm_end. */
if (start < vma->vm_start) {
unmapped_error = -ENOMEM;
start = vma->vm_start;
}
/* Here vma->vm_start <= start < vma->vm_end. */
if (end <= vma->vm_end) {
if (start < end) {
error = msync_interval(vma, start, end, flags);
if (error)
goto out;
}
error = unmapped_error;
goto out;
}
/* Here vma->vm_start <= start < vma->vm_end < end. */
error = msync_interval(vma, start, vma->vm_end, flags);
if (error)
goto out;
start = vma->vm_end;
vma = vma->vm_next;
}
out:
up_read(&current->mm->mmap_sem);
current->flags &= ~PF_SYNCWRITE;
return error;
}