mirror of
https://github.com/torvalds/linux.git
synced 2026-01-25 07:47:50 +00:00
The vDSO getrandom() implementation works with a buffer allocated with a
new system call that has certain requirements:
- It shouldn't be written to core dumps.
* Easy: VM_DONTDUMP.
- It should be zeroed on fork.
* Easy: VM_WIPEONFORK.
- It shouldn't be written to swap.
* Uh-oh: mlock is rlimited.
* Uh-oh: mlock isn't inherited by forks.
- It shouldn't reserve actual memory, but it also shouldn't crash when
page faulting in memory if none is available.
* Uh-oh: VM_NORESERVE means segfaults.
It turns out that the vDSO getrandom() function has three really nice
characteristics that we can exploit to solve this problem:
1) Due to being wiped during fork(), the vDSO code is already robust to
having the contents of the pages it reads zeroed out midway through
the function's execution.
2) In the absolute worst case of whatever contingency we're coding for,
we have the option to fall back to the getrandom() syscall, and
everything is fine.
3) The buffers the function uses are only ever useful for a maximum of
60 seconds -- a sort of cache, rather than a long term allocation.
These characteristics mean that we can introduce VM_DROPPABLE, which
has the following semantics:
a) It never is written out to swap.
b) Under memory pressure, mm can just drop the pages (so that they're
zero when read back again).
c) It is inherited by fork.
d) It doesn't count against the mlock budget, since nothing is locked.
e) If there's not enough memory to service a page fault, it's not fatal,
and no signal is sent.
This way, allocations used by vDSO getrandom() can use:
VM_DROPPABLE | VM_DONTDUMP | VM_WIPEONFORK | VM_NORESERVE
And there will be no problem with OOMing, crashing on overcommitment,
using memory when not in use, not wiping on fork(), coredumps, or
writing out to swap.
In order to let vDSO getrandom() use this, expose these via mmap(2) as
MAP_DROPPABLE.
Note that this involves removing the MADV_FREE special case from
sort_folio(), which according to Yu Zhao is unnecessary and will simply
result in an extra call to shrink_folio_list() in the worst case. The
chunk removed reenables the swapbacked flag, which we don't want for
VM_DROPPABLE, and we can't conditionalize it here because there isn't a
vma reference available.
Finally, the provided self test ensures that this is working as desired.
Cc: linux-mm@kvack.org
Acked-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
834 lines
21 KiB
C
834 lines
21 KiB
C
// SPDX-License-Identifier: GPL-2.0
|
|
/*
|
|
* linux/mm/mlock.c
|
|
*
|
|
* (C) Copyright 1995 Linus Torvalds
|
|
* (C) Copyright 2002 Christoph Hellwig
|
|
*/
|
|
|
|
#include <linux/capability.h>
|
|
#include <linux/mman.h>
|
|
#include <linux/mm.h>
|
|
#include <linux/sched/user.h>
|
|
#include <linux/swap.h>
|
|
#include <linux/swapops.h>
|
|
#include <linux/pagemap.h>
|
|
#include <linux/pagevec.h>
|
|
#include <linux/pagewalk.h>
|
|
#include <linux/mempolicy.h>
|
|
#include <linux/syscalls.h>
|
|
#include <linux/sched.h>
|
|
#include <linux/export.h>
|
|
#include <linux/rmap.h>
|
|
#include <linux/mmzone.h>
|
|
#include <linux/hugetlb.h>
|
|
#include <linux/memcontrol.h>
|
|
#include <linux/mm_inline.h>
|
|
#include <linux/secretmem.h>
|
|
|
|
#include "internal.h"
|
|
|
|
struct mlock_fbatch {
|
|
local_lock_t lock;
|
|
struct folio_batch fbatch;
|
|
};
|
|
|
|
static DEFINE_PER_CPU(struct mlock_fbatch, mlock_fbatch) = {
|
|
.lock = INIT_LOCAL_LOCK(lock),
|
|
};
|
|
|
|
bool can_do_mlock(void)
|
|
{
|
|
if (rlimit(RLIMIT_MEMLOCK) != 0)
|
|
return true;
|
|
if (capable(CAP_IPC_LOCK))
|
|
return true;
|
|
return false;
|
|
}
|
|
EXPORT_SYMBOL(can_do_mlock);
|
|
|
|
/*
|
|
* Mlocked folios are marked with the PG_mlocked flag for efficient testing
|
|
* in vmscan and, possibly, the fault path; and to support semi-accurate
|
|
* statistics.
|
|
*
|
|
* An mlocked folio [folio_test_mlocked(folio)] is unevictable. As such, it
|
|
* will be ostensibly placed on the LRU "unevictable" list (actually no such
|
|
* list exists), rather than the [in]active lists. PG_unevictable is set to
|
|
* indicate the unevictable state.
|
|
*/
|
|
|
|
static struct lruvec *__mlock_folio(struct folio *folio, struct lruvec *lruvec)
|
|
{
|
|
/* There is nothing more we can do while it's off LRU */
|
|
if (!folio_test_clear_lru(folio))
|
|
return lruvec;
|
|
|
|
lruvec = folio_lruvec_relock_irq(folio, lruvec);
|
|
|
|
if (unlikely(folio_evictable(folio))) {
|
|
/*
|
|
* This is a little surprising, but quite possible: PG_mlocked
|
|
* must have got cleared already by another CPU. Could this
|
|
* folio be unevictable? I'm not sure, but move it now if so.
|
|
*/
|
|
if (folio_test_unevictable(folio)) {
|
|
lruvec_del_folio(lruvec, folio);
|
|
folio_clear_unevictable(folio);
|
|
lruvec_add_folio(lruvec, folio);
|
|
|
|
__count_vm_events(UNEVICTABLE_PGRESCUED,
|
|
folio_nr_pages(folio));
|
|
}
|
|
goto out;
|
|
}
|
|
|
|
if (folio_test_unevictable(folio)) {
|
|
if (folio_test_mlocked(folio))
|
|
folio->mlock_count++;
|
|
goto out;
|
|
}
|
|
|
|
lruvec_del_folio(lruvec, folio);
|
|
folio_clear_active(folio);
|
|
folio_set_unevictable(folio);
|
|
folio->mlock_count = !!folio_test_mlocked(folio);
|
|
lruvec_add_folio(lruvec, folio);
|
|
__count_vm_events(UNEVICTABLE_PGCULLED, folio_nr_pages(folio));
|
|
out:
|
|
folio_set_lru(folio);
|
|
return lruvec;
|
|
}
|
|
|
|
static struct lruvec *__mlock_new_folio(struct folio *folio, struct lruvec *lruvec)
|
|
{
|
|
VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
|
|
|
|
lruvec = folio_lruvec_relock_irq(folio, lruvec);
|
|
|
|
/* As above, this is a little surprising, but possible */
|
|
if (unlikely(folio_evictable(folio)))
|
|
goto out;
|
|
|
|
folio_set_unevictable(folio);
|
|
folio->mlock_count = !!folio_test_mlocked(folio);
|
|
__count_vm_events(UNEVICTABLE_PGCULLED, folio_nr_pages(folio));
|
|
out:
|
|
lruvec_add_folio(lruvec, folio);
|
|
folio_set_lru(folio);
|
|
return lruvec;
|
|
}
|
|
|
|
static struct lruvec *__munlock_folio(struct folio *folio, struct lruvec *lruvec)
|
|
{
|
|
int nr_pages = folio_nr_pages(folio);
|
|
bool isolated = false;
|
|
|
|
if (!folio_test_clear_lru(folio))
|
|
goto munlock;
|
|
|
|
isolated = true;
|
|
lruvec = folio_lruvec_relock_irq(folio, lruvec);
|
|
|
|
if (folio_test_unevictable(folio)) {
|
|
/* Then mlock_count is maintained, but might undercount */
|
|
if (folio->mlock_count)
|
|
folio->mlock_count--;
|
|
if (folio->mlock_count)
|
|
goto out;
|
|
}
|
|
/* else assume that was the last mlock: reclaim will fix it if not */
|
|
|
|
munlock:
|
|
if (folio_test_clear_mlocked(folio)) {
|
|
__zone_stat_mod_folio(folio, NR_MLOCK, -nr_pages);
|
|
if (isolated || !folio_test_unevictable(folio))
|
|
__count_vm_events(UNEVICTABLE_PGMUNLOCKED, nr_pages);
|
|
else
|
|
__count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages);
|
|
}
|
|
|
|
/* folio_evictable() has to be checked *after* clearing Mlocked */
|
|
if (isolated && folio_test_unevictable(folio) && folio_evictable(folio)) {
|
|
lruvec_del_folio(lruvec, folio);
|
|
folio_clear_unevictable(folio);
|
|
lruvec_add_folio(lruvec, folio);
|
|
__count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages);
|
|
}
|
|
out:
|
|
if (isolated)
|
|
folio_set_lru(folio);
|
|
return lruvec;
|
|
}
|
|
|
|
/*
 * Tag bits carried in the low bits of a struct folio pointer while it sits
 * on the mlock_fbatch.
 */
#define LRU_FOLIO 0x1
#define NEW_FOLIO 0x2

/* Return @folio's pointer value with LRU_FOLIO added to it. */
static inline struct folio *mlock_lru(struct folio *folio)
{
	unsigned long tagged = (unsigned long)folio + LRU_FOLIO;

	return (struct folio *)tagged;
}
|
|
|
|
static inline struct folio *mlock_new(struct folio *folio)
|
|
{
|
|
return (struct folio *)((unsigned long)folio + NEW_FOLIO);
|
|
}
|
|
|
|
/*
|
|
* mlock_folio_batch() is derived from folio_batch_move_lru(): perhaps that can
|
|
* make use of such folio pointer flags in future, but for now just keep it for
|
|
* mlock. We could use three separate folio batches instead, but one feels
|
|
* better (munlocking a full folio batch does not need to drain mlocking folio
|
|
* batches first).
|
|
*/
|
|
static void mlock_folio_batch(struct folio_batch *fbatch)
|
|
{
|
|
struct lruvec *lruvec = NULL;
|
|
unsigned long mlock;
|
|
struct folio *folio;
|
|
int i;
|
|
|
|
for (i = 0; i < folio_batch_count(fbatch); i++) {
|
|
folio = fbatch->folios[i];
|
|
mlock = (unsigned long)folio & (LRU_FOLIO | NEW_FOLIO);
|
|
folio = (struct folio *)((unsigned long)folio - mlock);
|
|
fbatch->folios[i] = folio;
|
|
|
|
if (mlock & LRU_FOLIO)
|
|
lruvec = __mlock_folio(folio, lruvec);
|
|
else if (mlock & NEW_FOLIO)
|
|
lruvec = __mlock_new_folio(folio, lruvec);
|
|
else
|
|
lruvec = __munlock_folio(folio, lruvec);
|
|
}
|
|
|
|
if (lruvec)
|
|
unlock_page_lruvec_irq(lruvec);
|
|
folios_put(fbatch);
|
|
}
|
|
|
|
void mlock_drain_local(void)
|
|
{
|
|
struct folio_batch *fbatch;
|
|
|
|
local_lock(&mlock_fbatch.lock);
|
|
fbatch = this_cpu_ptr(&mlock_fbatch.fbatch);
|
|
if (folio_batch_count(fbatch))
|
|
mlock_folio_batch(fbatch);
|
|
local_unlock(&mlock_fbatch.lock);
|
|
}
|
|
|
|
void mlock_drain_remote(int cpu)
|
|
{
|
|
struct folio_batch *fbatch;
|
|
|
|
WARN_ON_ONCE(cpu_online(cpu));
|
|
fbatch = &per_cpu(mlock_fbatch.fbatch, cpu);
|
|
if (folio_batch_count(fbatch))
|
|
mlock_folio_batch(fbatch);
|
|
}
|
|
|
|
bool need_mlock_drain(int cpu)
|
|
{
|
|
return folio_batch_count(&per_cpu(mlock_fbatch.fbatch, cpu));
|
|
}
|
|
|
|
/**
|
|
* mlock_folio - mlock a folio already on (or temporarily off) LRU
|
|
* @folio: folio to be mlocked.
|
|
*/
|
|
void mlock_folio(struct folio *folio)
|
|
{
|
|
struct folio_batch *fbatch;
|
|
|
|
local_lock(&mlock_fbatch.lock);
|
|
fbatch = this_cpu_ptr(&mlock_fbatch.fbatch);
|
|
|
|
if (!folio_test_set_mlocked(folio)) {
|
|
int nr_pages = folio_nr_pages(folio);
|
|
|
|
zone_stat_mod_folio(folio, NR_MLOCK, nr_pages);
|
|
__count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);
|
|
}
|
|
|
|
folio_get(folio);
|
|
if (!folio_batch_add(fbatch, mlock_lru(folio)) ||
|
|
folio_test_large(folio) || lru_cache_disabled())
|
|
mlock_folio_batch(fbatch);
|
|
local_unlock(&mlock_fbatch.lock);
|
|
}
|
|
|
|
/**
|
|
* mlock_new_folio - mlock a newly allocated folio not yet on LRU
|
|
* @folio: folio to be mlocked, either normal or a THP head.
|
|
*/
|
|
void mlock_new_folio(struct folio *folio)
|
|
{
|
|
struct folio_batch *fbatch;
|
|
int nr_pages = folio_nr_pages(folio);
|
|
|
|
local_lock(&mlock_fbatch.lock);
|
|
fbatch = this_cpu_ptr(&mlock_fbatch.fbatch);
|
|
folio_set_mlocked(folio);
|
|
|
|
zone_stat_mod_folio(folio, NR_MLOCK, nr_pages);
|
|
__count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);
|
|
|
|
folio_get(folio);
|
|
if (!folio_batch_add(fbatch, mlock_new(folio)) ||
|
|
folio_test_large(folio) || lru_cache_disabled())
|
|
mlock_folio_batch(fbatch);
|
|
local_unlock(&mlock_fbatch.lock);
|
|
}
|
|
|
|
/**
|
|
* munlock_folio - munlock a folio
|
|
* @folio: folio to be munlocked, either normal or a THP head.
|
|
*/
|
|
void munlock_folio(struct folio *folio)
|
|
{
|
|
struct folio_batch *fbatch;
|
|
|
|
local_lock(&mlock_fbatch.lock);
|
|
fbatch = this_cpu_ptr(&mlock_fbatch.fbatch);
|
|
/*
|
|
* folio_test_clear_mlocked(folio) must be left to __munlock_folio(),
|
|
* which will check whether the folio is multiply mlocked.
|
|
*/
|
|
folio_get(folio);
|
|
if (!folio_batch_add(fbatch, folio) ||
|
|
folio_test_large(folio) || lru_cache_disabled())
|
|
mlock_folio_batch(fbatch);
|
|
local_unlock(&mlock_fbatch.lock);
|
|
}
|
|
|
|
static inline unsigned int folio_mlock_step(struct folio *folio,
|
|
pte_t *pte, unsigned long addr, unsigned long end)
|
|
{
|
|
unsigned int count, i, nr = folio_nr_pages(folio);
|
|
unsigned long pfn = folio_pfn(folio);
|
|
pte_t ptent = ptep_get(pte);
|
|
|
|
if (!folio_test_large(folio))
|
|
return 1;
|
|
|
|
count = pfn + nr - pte_pfn(ptent);
|
|
count = min_t(unsigned int, count, (end - addr) >> PAGE_SHIFT);
|
|
|
|
for (i = 0; i < count; i++, pte++) {
|
|
pte_t entry = ptep_get(pte);
|
|
|
|
if (!pte_present(entry))
|
|
break;
|
|
if (pte_pfn(entry) - pfn >= nr)
|
|
break;
|
|
}
|
|
|
|
return i;
|
|
}
|
|
|
|
static inline bool allow_mlock_munlock(struct folio *folio,
|
|
struct vm_area_struct *vma, unsigned long start,
|
|
unsigned long end, unsigned int step)
|
|
{
|
|
/*
|
|
* For unlock, allow munlock large folio which is partially
|
|
* mapped to VMA. As it's possible that large folio is
|
|
* mlocked and VMA is split later.
|
|
*
|
|
* During memory pressure, such kind of large folio can
|
|
* be split. And the pages are not in VM_LOCKed VMA
|
|
* can be reclaimed.
|
|
*/
|
|
if (!(vma->vm_flags & VM_LOCKED))
|
|
return true;
|
|
|
|
/* folio_within_range() cannot take KSM, but any small folio is OK */
|
|
if (!folio_test_large(folio))
|
|
return true;
|
|
|
|
/* folio not in range [start, end), skip mlock */
|
|
if (!folio_within_range(folio, vma, start, end))
|
|
return false;
|
|
|
|
/* folio is not fully mapped, skip mlock */
|
|
if (step != folio_nr_pages(folio))
|
|
return false;
|
|
|
|
return true;
|
|
}
|
|
|
|
/*
 * mlock_pte_range - pmd_entry callback of the mlock page walk.
 * @pmd:  pmd being visited
 * @addr: start address covered by this call
 * @end:  end address covered by this call
 * @walk: walk state; walk->vma is the VMA being processed
 *
 * mlocks (VM_LOCKED set on the VMA) or munlocks (otherwise) every eligible
 * folio mapped in the range. Always returns 0; asks for a retry via
 * ACTION_AGAIN when the page table could not be mapped and locked.
 */
static int mlock_pte_range(pmd_t *pmd, unsigned long addr,
			   unsigned long end, struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	const unsigned long start = addr;
	pte_t *start_pte, *pte;
	struct folio *folio;
	unsigned int step = 1;
	spinlock_t *ptl;

	ptl = pmd_trans_huge_lock(pmd, vma);
	if (ptl) {
		/* Huge pmd: skip non-present entries and the huge zero page. */
		if (pmd_present(*pmd) && !is_huge_zero_pmd(*pmd)) {
			folio = pmd_folio(*pmd);
			if (vma->vm_flags & VM_LOCKED)
				mlock_folio(folio);
			else
				munlock_folio(folio);
		}
		goto out;
	}

	start_pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	if (!start_pte) {
		walk->action = ACTION_AGAIN;
		return 0;
	}

	for (pte = start_pte; addr != end; pte++, addr += PAGE_SIZE) {
		pte_t ptent = ptep_get(pte);

		if (!pte_present(ptent))
			continue;
		folio = vm_normal_folio(vma, addr, ptent);
		if (!folio || folio_is_zone_device(folio))
			continue;

		step = folio_mlock_step(folio, pte, addr, end);
		if (allow_mlock_munlock(folio, vma, start, end, step)) {
			if (vma->vm_flags & VM_LOCKED)
				mlock_folio(folio);
			else
				munlock_folio(folio);
		}

		/* Skip the rest of this folio's PTEs; the loop adds one more. */
		pte += step - 1;
		addr += (step - 1) << PAGE_SHIFT;
	}
	pte_unmap(start_pte);
out:
	spin_unlock(ptl);
	cond_resched();
	return 0;
}
|
|
|
|
/*
|
|
* mlock_vma_pages_range() - mlock any pages already in the range,
|
|
* or munlock all pages in the range.
|
|
* @vma - vma containing range to be mlock()ed or munlock()ed
|
|
* @start - start address in @vma of the range
|
|
* @end - end of range in @vma
|
|
* @newflags - the new set of flags for @vma.
|
|
*
|
|
* Called for mlock(), mlock2() and mlockall(), to set @vma VM_LOCKED;
|
|
* called for munlock() and munlockall(), to clear VM_LOCKED from @vma.
|
|
*/
|
|
static void mlock_vma_pages_range(struct vm_area_struct *vma,
|
|
unsigned long start, unsigned long end, vm_flags_t newflags)
|
|
{
|
|
static const struct mm_walk_ops mlock_walk_ops = {
|
|
.pmd_entry = mlock_pte_range,
|
|
.walk_lock = PGWALK_WRLOCK_VERIFY,
|
|
};
|
|
|
|
/*
|
|
* There is a slight chance that concurrent page migration,
|
|
* or page reclaim finding a page of this now-VM_LOCKED vma,
|
|
* will call mlock_vma_folio() and raise page's mlock_count:
|
|
* double counting, leaving the page unevictable indefinitely.
|
|
* Communicate this danger to mlock_vma_folio() with VM_IO,
|
|
* which is a VM_SPECIAL flag not allowed on VM_LOCKED vmas.
|
|
* mmap_lock is held in write mode here, so this weird
|
|
* combination should not be visible to other mmap_lock users;
|
|
* but WRITE_ONCE so rmap walkers must see VM_IO if VM_LOCKED.
|
|
*/
|
|
if (newflags & VM_LOCKED)
|
|
newflags |= VM_IO;
|
|
vma_start_write(vma);
|
|
vm_flags_reset_once(vma, newflags);
|
|
|
|
lru_add_drain();
|
|
walk_page_range(vma->vm_mm, start, end, &mlock_walk_ops, NULL);
|
|
lru_add_drain();
|
|
|
|
if (newflags & VM_IO) {
|
|
newflags &= ~VM_IO;
|
|
vm_flags_reset_once(vma, newflags);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* mlock_fixup - handle mlock[all]/munlock[all] requests.
|
|
*
|
|
* Filters out "special" vmas -- VM_LOCKED never gets set for these, and
|
|
* munlock is a no-op. However, for some special vmas, we go ahead and
|
|
* populate the ptes.
|
|
*
|
|
* For vmas that pass the filters, merge/split as appropriate.
|
|
*/
|
|
static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma,
|
|
struct vm_area_struct **prev, unsigned long start,
|
|
unsigned long end, vm_flags_t newflags)
|
|
{
|
|
struct mm_struct *mm = vma->vm_mm;
|
|
int nr_pages;
|
|
int ret = 0;
|
|
vm_flags_t oldflags = vma->vm_flags;
|
|
|
|
if (newflags == oldflags || (oldflags & VM_SPECIAL) ||
|
|
is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm) ||
|
|
vma_is_dax(vma) || vma_is_secretmem(vma) || (oldflags & VM_DROPPABLE))
|
|
/* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */
|
|
goto out;
|
|
|
|
vma = vma_modify_flags(vmi, *prev, vma, start, end, newflags);
|
|
if (IS_ERR(vma)) {
|
|
ret = PTR_ERR(vma);
|
|
goto out;
|
|
}
|
|
|
|
/*
|
|
* Keep track of amount of locked VM.
|
|
*/
|
|
nr_pages = (end - start) >> PAGE_SHIFT;
|
|
if (!(newflags & VM_LOCKED))
|
|
nr_pages = -nr_pages;
|
|
else if (oldflags & VM_LOCKED)
|
|
nr_pages = 0;
|
|
mm->locked_vm += nr_pages;
|
|
|
|
/*
|
|
* vm_flags is protected by the mmap_lock held in write mode.
|
|
* It's okay if try_to_unmap_one unmaps a page just after we
|
|
* set VM_LOCKED, populate_vma_page_range will bring it back.
|
|
*/
|
|
if ((newflags & VM_LOCKED) && (oldflags & VM_LOCKED)) {
|
|
/* No work to do, and mlocking twice would be wrong */
|
|
vma_start_write(vma);
|
|
vm_flags_reset(vma, newflags);
|
|
} else {
|
|
mlock_vma_pages_range(vma, start, end, newflags);
|
|
}
|
|
out:
|
|
*prev = vma;
|
|
return ret;
|
|
}
|
|
|
|
static int apply_vma_lock_flags(unsigned long start, size_t len,
|
|
vm_flags_t flags)
|
|
{
|
|
unsigned long nstart, end, tmp;
|
|
struct vm_area_struct *vma, *prev;
|
|
VMA_ITERATOR(vmi, current->mm, start);
|
|
|
|
VM_BUG_ON(offset_in_page(start));
|
|
VM_BUG_ON(len != PAGE_ALIGN(len));
|
|
end = start + len;
|
|
if (end < start)
|
|
return -EINVAL;
|
|
if (end == start)
|
|
return 0;
|
|
vma = vma_iter_load(&vmi);
|
|
if (!vma)
|
|
return -ENOMEM;
|
|
|
|
prev = vma_prev(&vmi);
|
|
if (start > vma->vm_start)
|
|
prev = vma;
|
|
|
|
nstart = start;
|
|
tmp = vma->vm_start;
|
|
for_each_vma_range(vmi, vma, end) {
|
|
int error;
|
|
vm_flags_t newflags;
|
|
|
|
if (vma->vm_start != tmp)
|
|
return -ENOMEM;
|
|
|
|
newflags = vma->vm_flags & ~VM_LOCKED_MASK;
|
|
newflags |= flags;
|
|
/* Here we know that vma->vm_start <= nstart < vma->vm_end. */
|
|
tmp = vma->vm_end;
|
|
if (tmp > end)
|
|
tmp = end;
|
|
error = mlock_fixup(&vmi, vma, &prev, nstart, tmp, newflags);
|
|
if (error)
|
|
return error;
|
|
tmp = vma_iter_end(&vmi);
|
|
nstart = tmp;
|
|
}
|
|
|
|
if (tmp < end)
|
|
return -ENOMEM;
|
|
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* Go through vma areas and sum size of mlocked
|
|
* vma pages, as return value.
|
|
* Note deferred memory locking case(mlock2(,,MLOCK_ONFAULT)
|
|
* is also counted.
|
|
* Return value: previously mlocked page counts
|
|
*/
|
|
static unsigned long count_mm_mlocked_page_nr(struct mm_struct *mm,
|
|
unsigned long start, size_t len)
|
|
{
|
|
struct vm_area_struct *vma;
|
|
unsigned long count = 0;
|
|
unsigned long end;
|
|
VMA_ITERATOR(vmi, mm, start);
|
|
|
|
/* Don't overflow past ULONG_MAX */
|
|
if (unlikely(ULONG_MAX - len < start))
|
|
end = ULONG_MAX;
|
|
else
|
|
end = start + len;
|
|
|
|
for_each_vma_range(vmi, vma, end) {
|
|
if (vma->vm_flags & VM_LOCKED) {
|
|
if (start > vma->vm_start)
|
|
count -= (start - vma->vm_start);
|
|
if (end < vma->vm_end) {
|
|
count += end - vma->vm_start;
|
|
break;
|
|
}
|
|
count += vma->vm_end - vma->vm_start;
|
|
}
|
|
}
|
|
|
|
return count >> PAGE_SHIFT;
|
|
}
|
|
|
|
/*
 * Translate a get_user_pages() return value into the error that posix
 * specifies for mlock(): -EFAULT becomes -ENOMEM, -ENOMEM becomes -EAGAIN,
 * and every other value is passed through unchanged.
 */
static int __mlock_posix_error_return(long retval)
{
	switch (retval) {
	case -EFAULT:
		return -ENOMEM;
	case -ENOMEM:
		return -EAGAIN;
	default:
		return retval;
	}
}
|
|
|
|
/*
 * do_mlock - common implementation of mlock() and mlock2().
 * @start: user address of the range (need not be page aligned)
 * @len:   length of the range in bytes
 * @flags: lock flags to apply to the covered vmas
 *
 * Returns 0 on success, -EPERM if the task may not mlock at all, -EINTR if
 * taking mmap_lock was interrupted, -ENOMEM if the request exceeds the
 * RLIMIT_MEMLOCK-derived limit without CAP_IPC_LOCK, or an error from
 * applying the flags or populating the range.
 */
static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t flags)
{
	struct mm_struct *mm = current->mm;
	unsigned long lock_limit, locked;
	int error = -ENOMEM;

	start = untagged_addr(start);

	if (!can_do_mlock())
		return -EPERM;

	/* Expand the range to whole pages. */
	len = PAGE_ALIGN(len + (offset_in_page(start)));
	start &= PAGE_MASK;

	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
	locked = len >> PAGE_SHIFT;

	if (mmap_write_lock_killable(mm))
		return -EINTR;

	locked += mm->locked_vm;
	if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
		/*
		 * The requested range may overlap areas that are already
		 * mlocked; that part of mm->locked_vm must not be counted
		 * again towards the new total, so subtract it.
		 */
		locked -= count_mm_mlocked_page_nr(mm, start, len);
	}

	/* check against resource limits */
	if (locked <= lock_limit || capable(CAP_IPC_LOCK))
		error = apply_vma_lock_flags(start, len, flags);

	mmap_write_unlock(mm);
	if (error)
		return error;

	error = __mm_populate(start, len, 0);
	return error ? __mlock_posix_error_return(error) : 0;
}
|
|
|
|
/* mlock(2): lock the range via do_mlock() with VM_LOCKED. */
SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
{
	const vm_flags_t vm_flags = VM_LOCKED;

	return do_mlock(start, len, vm_flags);
}
|
|
|
|
/*
 * mlock2(2): like mlock(), but MLOCK_ONFAULT additionally requests
 * VM_LOCKONFAULT. Any other flag bit yields -EINVAL.
 */
SYSCALL_DEFINE3(mlock2, unsigned long, start, size_t, len, int, flags)
{
	if (flags & ~MLOCK_ONFAULT)
		return -EINVAL;

	if (flags & MLOCK_ONFAULT)
		return do_mlock(start, len, VM_LOCKED | VM_LOCKONFAULT);

	return do_mlock(start, len, VM_LOCKED);
}
|
|
|
|
/*
 * munlock(2): clear the lock flags of the page-rounded range under
 * mmap_lock held for write. Returns -EINTR if taking the lock was
 * interrupted, otherwise the result of apply_vma_lock_flags().
 */
SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
{
	struct mm_struct *mm = current->mm;
	int err;

	start = untagged_addr(start);

	/* Expand the range to whole pages. */
	len = PAGE_ALIGN(len + (offset_in_page(start)));
	start &= PAGE_MASK;

	if (mmap_write_lock_killable(mm))
		return -EINTR;
	err = apply_vma_lock_flags(start, len, 0);
	mmap_write_unlock(mm);

	return err;
}
|
|
|
|
/*
|
|
* Take the MCL_* flags passed into mlockall (or 0 if called from munlockall)
|
|
* and translate into the appropriate modifications to mm->def_flags and/or the
|
|
* flags for all current VMAs.
|
|
*
|
|
* There are a couple of subtleties with this. If mlockall() is called multiple
|
|
* times with different flags, the values do not necessarily stack. If mlockall
|
|
* is called once including the MCL_FUTURE flag and then a second time without
|
|
* it, VM_LOCKED and VM_LOCKONFAULT will be cleared from mm->def_flags.
|
|
*/
|
|
static int apply_mlockall_flags(int flags)
|
|
{
|
|
VMA_ITERATOR(vmi, current->mm, 0);
|
|
struct vm_area_struct *vma, *prev = NULL;
|
|
vm_flags_t to_add = 0;
|
|
|
|
current->mm->def_flags &= ~VM_LOCKED_MASK;
|
|
if (flags & MCL_FUTURE) {
|
|
current->mm->def_flags |= VM_LOCKED;
|
|
|
|
if (flags & MCL_ONFAULT)
|
|
current->mm->def_flags |= VM_LOCKONFAULT;
|
|
|
|
if (!(flags & MCL_CURRENT))
|
|
goto out;
|
|
}
|
|
|
|
if (flags & MCL_CURRENT) {
|
|
to_add |= VM_LOCKED;
|
|
if (flags & MCL_ONFAULT)
|
|
to_add |= VM_LOCKONFAULT;
|
|
}
|
|
|
|
for_each_vma(vmi, vma) {
|
|
vm_flags_t newflags;
|
|
|
|
newflags = vma->vm_flags & ~VM_LOCKED_MASK;
|
|
newflags |= to_add;
|
|
|
|
/* Ignore errors */
|
|
mlock_fixup(&vmi, vma, &prev, vma->vm_start, vma->vm_end,
|
|
newflags);
|
|
cond_resched();
|
|
}
|
|
out:
|
|
return 0;
|
|
}
|
|
|
|
/*
 * mlockall(2).
 *
 * Returns -EINVAL for empty, unknown or MCL_ONFAULT-only flags, -EPERM if
 * the task may not mlock at all, -EINTR if taking mmap_lock was
 * interrupted, and -ENOMEM when MCL_CURRENT is requested while total_vm
 * exceeds the RLIMIT_MEMLOCK-derived limit without CAP_IPC_LOCK.
 */
SYSCALL_DEFINE1(mlockall, int, flags)
{
	struct mm_struct *mm = current->mm;
	unsigned long lock_limit;
	int ret = -ENOMEM;

	if (!flags || flags == MCL_ONFAULT ||
	    (flags & ~(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT)))
		return -EINVAL;

	if (!can_do_mlock())
		return -EPERM;

	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

	if (mmap_write_lock_killable(mm))
		return -EINTR;

	if (!(flags & MCL_CURRENT) || mm->total_vm <= lock_limit ||
	    capable(CAP_IPC_LOCK))
		ret = apply_mlockall_flags(flags);
	mmap_write_unlock(mm);

	/* Populate only after mmap_lock has been released. */
	if (!ret && (flags & MCL_CURRENT))
		mm_populate(0, TASK_SIZE);

	return ret;
}
|
|
|
|
SYSCALL_DEFINE0(munlockall)
|
|
{
|
|
int ret;
|
|
|
|
if (mmap_write_lock_killable(current->mm))
|
|
return -EINTR;
|
|
ret = apply_mlockall_flags(0);
|
|
mmap_write_unlock(current->mm);
|
|
return ret;
|
|
}
|
|
|
|
/*
|
|
* Objects with different lifetime than processes (SHM_LOCK and SHM_HUGETLB
|
|
* shm segments) get accounted against the user_struct instead.
|
|
*/
|
|
static DEFINE_SPINLOCK(shmlock_user_lock);
|
|
|
|
int user_shm_lock(size_t size, struct ucounts *ucounts)
|
|
{
|
|
unsigned long lock_limit, locked;
|
|
long memlock;
|
|
int allowed = 0;
|
|
|
|
locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
|
|
lock_limit = rlimit(RLIMIT_MEMLOCK);
|
|
if (lock_limit != RLIM_INFINITY)
|
|
lock_limit >>= PAGE_SHIFT;
|
|
spin_lock(&shmlock_user_lock);
|
|
memlock = inc_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked);
|
|
|
|
if ((memlock == LONG_MAX || memlock > lock_limit) && !capable(CAP_IPC_LOCK)) {
|
|
dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked);
|
|
goto out;
|
|
}
|
|
if (!get_ucounts(ucounts)) {
|
|
dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked);
|
|
allowed = 0;
|
|
goto out;
|
|
}
|
|
allowed = 1;
|
|
out:
|
|
spin_unlock(&shmlock_user_lock);
|
|
return allowed;
|
|
}
|
|
|
|
/*
 * user_shm_unlock - uncharge @size bytes (rounded up to pages) from
 * @ucounts under shmlock_user_lock, then drop a reference on @ucounts.
 */
void user_shm_unlock(size_t size, struct ucounts *ucounts)
{
	const unsigned long nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;

	spin_lock(&shmlock_user_lock);
	dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, nr_pages);
	spin_unlock(&shmlock_user_lock);
	put_ucounts(ucounts);
}
|